241111
Mini Project 4, Day 1
In [ ]:
# 4-2. Drop columns from df where '_' makes up at least half the values
# Method 1: iterate over a copy of the column list, since we drop while looping
for col in list(df.columns):
    if (df[col] == '_').mean() >= 0.5:
        df.drop(columns = col, inplace = True)
# Method 2: hard coding (see the sketch below)
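The notes leave Method 2 empty; a minimal sketch of the hard-coded variant, assuming the offending columns were already identified by inspection (the column names below are placeholders, not from the original notes):

# Hard-coded variant: drop the columns found above by name
# 'col_a' and 'col_b' are hypothetical placeholder names
df.drop(columns = ['col_a', 'col_b'], inplace = True)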
In [ ]:
# 4-4. Replace '_' values with missing values (NaN)
import numpy as np

df = df.replace({'_': np.nan})  # replace() returns a copy, so assign it back
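A quick sanity check that the replacement took effect (assumes df is still in scope):

# Count any remaining '_' entries; should print 0 after the replacement
print((df == '_').sum().sum())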
In [ ]:
# 4-8. Fill missing values in a column with that column's median
# Note: .median() skips NaN by default (skipna = True), so the notnull() filter
# below is not strictly required, but it makes the intent explicit.
m = df.loc[df['age_itg_cd'].notnull(), 'age_itg_cd'].median()
df['age_itg_cd'].fillna(m, inplace = True)
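The same idea generalizes to every numeric column with missing values; a sketch, assuming each such column should be filled with its own median:

# Fill each numeric column's NaNs with that column's median
for col in df.select_dtypes(include = ['number']).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())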
In [ ]:
# 5-2. One-hot encode only the 'object' columns of df, then merge back with the rest
# Method 1
temp = pd.get_dummies(df.select_dtypes(include = ['object']), drop_first = True, dtype = int)
df = pd.concat([df.select_dtypes(exclude = ['object']), temp], axis = 1)
# Method 2: hard coding (see the sketch below)
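Method 2 is again left blank in the notes; a minimal hard-coded sketch, with placeholder column names:

# Hard-coded variant: encode explicitly named object columns
# 'col_a' and 'col_b' are hypothetical placeholders
temp = pd.get_dummies(df[['col_a', 'col_b']], drop_first = True, dtype = int)
df = pd.concat([df.drop(columns = ['col_a', 'col_b']), temp], axis = 1)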
In [ ]:
# LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['cust_clas_itg_cd'] = le.fit_transform(df['cust_clas_itg_cd'])
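To see which integer was assigned to each category, and to reverse the mapping (assumes le was fitted as above):

# classes_ lists the original labels in encoded order: index = encoded value
print(le.classes_)
# inverse_transform recovers the original labels from encoded values
print(le.inverse_transform(df['cust_clas_itg_cd'].head()))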
In [ ]:
# 8-1. LogisticRegression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C = 10, max_iter = 2000)
# C: inverse regularization strength; smaller values mean stronger regularization (guards against overfitting)
# max_iter: maximum number of solver iterations
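A minimal train-and-score sketch, assuming x_train/x_test/y_train/y_test already exist from an earlier split:

from sklearn.metrics import accuracy_score

model.fit(x_train, y_train)             # train on the training split
y_pred = model.predict(x_test)          # predict class labels on the test split
print(accuracy_score(y_test, y_pred))   # fraction of correct predictions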
In [ ]:
# 8-4. RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
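All the tree and boosting models in 8-4 through 8-6 share the same scikit-learn-style API; a sketch with RandomForestClassifier, assuming the usual x_train/y_train split (hyperparameter values are illustrative):

model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
model.fit(x_train, y_train)     # XGBClassifier and LGBMClassifier below are used the same way
y_pred = model.predict(x_test)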
In [ ]:
# 8-5. XGBoost
!pip install xgboost
from xgboost import XGBClassifier
from xgboost import XGBRegressor
In [ ]:
# 8-6. LightGBM
!pip install lightgbm
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
In [ ]:
# 9-1. Deep learning model (binary classification)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
import matplotlib.pyplot as plt

# Define the model (nfeatures = number of input columns, e.g. x_train.shape[1])
clear_session()
model = Sequential([Input(shape = (nfeatures, )),
                    Dense(64, activation = 'relu'),
                    Dropout(0.2),
                    Dense(32, activation = 'relu'),
                    Dropout(0.2),
                    Dense(16, activation = 'relu'),
                    Dropout(0.2),
                    Dense(1, activation = 'sigmoid')])
# Compile and train
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
es = EarlyStopping(monitor = 'val_loss', patience = 5, min_delta = 0.001)
mc = ModelCheckpoint('best_name.keras', monitor = 'val_loss', save_best_only = True)
history = model.fit(x_train, y_train, epochs = 10, validation_split = 0.2,
                    batch_size = 10, callbacks = [es, mc]).history
# Visualize the loss curves
plt.plot(history['val_loss'], label = 'Validation')
plt.plot(history['loss'], label = 'Train')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()
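Section 9-5 below evaluates the multiclass model; for the binary model here, the sigmoid output has to be thresholded first. A sketch, assuming x_test/y_test exist:

from sklearn.metrics import accuracy_score

# predict() returns probabilities in [0, 1]; threshold at 0.5 for class labels
y_prob = model.predict(x_test)
y_pred = (y_prob >= 0.5).astype(int).ravel()
print(accuracy_score(y_test, y_pred))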
In [ ]:
# 9-2. One-hot encode y, then train a multiclass deep learning model
# one-hot encoding
y_train_ohe = pd.get_dummies(y_train, dtype = int)
y_test_ohe = pd.get_dummies(y_test, dtype = int)
# Define the model (output width = number of classes; 2 here)
clear_session()
model = Sequential([Input(shape = (nfeatures, )),
                    Dense(64, activation = 'relu'),
                    Dropout(0.2),
                    Dense(32, activation = 'relu'),
                    Dropout(0.2),
                    Dense(16, activation = 'relu'),
                    Dropout(0.2),
                    Dense(2, activation = 'softmax')])
# Compile and train (categorical_crossentropy expects one-hot targets)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
es = EarlyStopping(monitor = 'val_loss', patience = 5, min_delta = 0.001)
cp = ModelCheckpoint('best_name.keras', monitor = 'val_loss', save_best_only = True)
history = model.fit(x_train, y_train_ohe, validation_split = 0.2, batch_size = 10,
                    epochs = 10, callbacks = [es, cp]).history
# Visualize the accuracy curves
plt.plot(history['accuracy'], label = 'Train')
plt.plot(history['val_accuracy'], label = 'Validation')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()
In [ ]:
# 9-5. Evaluate accuracy on the test set
from sklearn.metrics import accuracy_score

y_pred = model.predict(x_test)                   # softmax probabilities, shape (n, 2)
y_pred = y_pred.argmax(axis = 1)                 # pick the most probable class
y_test = y_test_ohe.to_numpy().argmax(axis = 1)  # y_test_ohe is a DataFrame, so convert it to an array first
print(accuracy_score(y_test, y_pred))
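For per-class detail beyond a single accuracy number, classification_report is a handy follow-up (same y_test/y_pred as above):

from sklearn.metrics import classification_report

# precision, recall, and F1 per class, plus overall averages
print(classification_report(y_test, y_pred))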