KT AIVLE/Daily Review

241111

bestone888 2024. 11. 11. 19:54

241111

미니 프로젝트 4차 day1

In [ ]:
# 4-2. Drop every column of df in which at least half of the values are '_'
# Method 1: collect the matching column names first, then drop them in one call
mostly_placeholder = [name for name in df.columns if (df[name] == '_').mean() >= 0.5]
df.drop(columns = mostly_placeholder, inplace = True)

# Method 2: hard coding
In [ ]:
# 4-4. Convert the '_' placeholder values to missing values (NaN)
# DataFrame.replace returns a new frame, so the result has to be assigned back
# to df; otherwise the replacement is only displayed and then discarded.
df = df.replace({'_': np.nan})
In [ ]:
# 4-8. Fill the missing values of a column with that column's median
# Series.median() skips NaN by default (skipna=True), so the non-null rows do
# not need to be filtered out before computing it.
m = df['age_itg_cd'].median()

# Assign the filled Series back instead of calling fillna(inplace=True) on
# df['age_itg_cd']: an in-place call on a selected column is chained assignment,
# which is deprecated and does not update df under pandas Copy-on-Write.
df['age_itg_cd'] = df['age_itg_cd'].fillna(m)
In [ ]:
# 5-2. One-hot encode only the 'object' columns of df and merge them back into df
# Method 1
# get_dummies is a top-level pandas function (pd.get_dummies), not a DataFrame method.
temp = pd.get_dummies(df.select_dtypes(include = ['object']), drop_first = True, dtype = int)
# Keep the non-object columns unchanged and append the encoded columns next to them.
df = pd.concat([df.select_dtypes(exclude = ['object']), temp], axis = 1)

# Method 2: hard coding
In [ ]:
# LabelEncoder: map the labels of a categorical column to integer codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Learn the label -> integer mapping first, then apply it to the same values.
# NOTE(review): the values are read from temp, not df - confirm that temp still
# holds a 'cust_clas_itg_cd' column at this point.
le.fit(temp['cust_clas_itg_cd'])
df['cust_clas_itg_cd'] = le.transform(temp['cust_clas_itg_cd'])
In [ ]:
# 8-1. LogisticRegression
# Imported here so the cell runs on a fresh kernel, like the other model cells.
from sklearn.linear_model import LogisticRegression

# C: inverse of the regularization strength; a smaller value means stronger
#    regularization (used to limit overfitting)
# max_iter: maximum number of solver iterations
model = LogisticRegression(C = 10, max_iter = 2000)
In [ ]:
# 8-4. RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
In [ ]:
# 8-5. XGBoost
!pip install xgboost

from xgboost import XGBClassifier
from xgboost import XGBRegressor
In [ ]:
# 8-6. LightGBM
!pip install lightgbm

from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
In [ ]:
# 9-1. Deep learning model (binary classification)
# All Keras names are imported from tensorflow.keras. The callbacks used to be
# imported from the standalone `keras` package, which may be a different
# installation than the one bundled with TensorFlow.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout

# Declare the model
clear_session()

# Input width = number of columns of the training data
nfeatures = x_train.shape[1]

model = Sequential([Input(shape = (nfeatures, )),
                   Dense(64, activation = 'relu'),
                   Dropout(0.2),
                   Dense(32, activation = 'relu'),
                   Dropout(0.2),
                   Dense(16, activation = 'relu'),
                   Dropout(0.2),
                   Dense(1, activation = 'sigmoid')])   # single sigmoid unit for the binary target

# Compile and train
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Stop training once val_loss has not improved by at least min_delta for `patience` epochs
es = EarlyStopping(monitor = 'val_loss', patience = 5, min_delta = 0.001)
# Write the model to disk only when val_loss improves
mc = ModelCheckpoint('best_name.keras', monitor = 'val_loss', save_best_only = True)

# Keep only the .history dict (per-epoch metric values) of the returned History object
history = model.fit(x_train, y_train, epochs = 10, validation_split = 0.2,
                    batch_size = 10, callbacks = [es, mc]).history

# Visualization: validation vs. training loss per epoch
plt.plot(history['val_loss'], label = 'Validation')
plt.plot(history['loss'], label = 'Train')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()
In [ ]:
# 9-2. Deep learning model that classifies a one-hot encoded y (multi-class classification)
# One-hot encoding
y_train_ohe = pd.get_dummies(y_train, dtype = int)
y_test_ohe = pd.get_dummies(y_test, dtype = int)

# Declare the model
clear_session()

# Input width = number of columns of the training data
nfeatures = x_train.shape[1]
# The softmax layer needs one unit per one-hot column of the target
n_classes = y_train_ohe.shape[1]

# Bind a new Sequential model to `model`. The previous `model.Sequential([...])`
# looked up an attribute on the old model instead of creating a new one.
model = Sequential([Input(shape = (nfeatures, )),
                   Dense(64, activation = 'relu'),
                   Dropout(0.2),
                   Dense(32, activation = 'relu'),
                   Dropout(0.2),
                   Dense(16, activation = 'relu'),
                   Dropout(0.2),
                   Dense(n_classes, activation = 'softmax')])

# Compile and train
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Stop training once val_loss has not improved by at least min_delta for `patience` epochs
es = EarlyStopping(monitor = 'val_loss', patience = 5, min_delta = 0.001)
# Write the model to disk only when val_loss improves
cp = ModelCheckpoint('best_name.keras', monitor = 'val_loss', save_best_only = True)

# Keep only the .history dict (per-epoch metric values) of the returned History object
history = model.fit(x_train, y_train_ohe, validation_split = 0.2, batch_size = 10,
                   epochs = 10, callbacks = [es, cp]).history

# Visualization: training vs. validation accuracy per epoch
plt.plot(history['accuracy'], label = 'Train')
plt.plot(history['val_accuracy'], label = 'Validation')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()
In [ ]:
# 9-5. Compute the accuracy of the predictions
# Imported here so the cell does not rely on an import made elsewhere.
from sklearn.metrics import accuracy_score

# Take the index of the largest output per row as the predicted class
y_pred = model.predict(x_test)
y_pred = y_pred.argmax(axis = 1)

# y_test_ohe is a DataFrame, so convert it to an array before taking argmax
y_test = y_test_ohe.to_numpy().argmax(axis = 1)

print(accuracy_score(y_test, y_pred))

'KT AIVLE > Daily Review' 카테고리의 다른 글

241107~241108  (0) 2024.11.18
241112  (0) 2024.11.13
241105  (0) 2024.11.05
241104  (0) 2024.11.04
241101  (0) 2024.11.03