KT AIVLE/Daily Review
241018
bestone888
2024. 10. 20. 02:23
In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'
Ensemble
In [151]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/admission_simple.csv'
data = pd.read_csv(path)
data.tail()
Out[151]:
    | GRE | TOEFL | RANK | SOP | LOR | GPA  | RESEARCH | ADMIT
495 | 332 | 108   | 5   | 4.5 | 4.0 | 9.02 | 1        | 1
496 | 337 | 117   | 5   | 5.0 | 5.0 | 9.87 | 1        | 1
497 | 330 | 120   | 5   | 4.5 | 5.0 | 9.56 | 1        | 1
498 | 312 | 103   | 4   | 4.0 | 5.0 | 8.43 | 0        | 0
499 | 327 | 113   | 4   | 4.5 | 4.5 | 9.04 | 0        | 1
In [152]:
# Split the data
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'ADMIT')
y = data['ADMIT']
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3, random_state = 1)
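stratify=y keeps the ADMIT class ratio (roughly) the same in the train and test splits. A minimal sanity check, assuming the variables from the cell above:
# Sketch: class ratios should match across the full data and both splits
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))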
In [153]:
# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [154]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import *
In [155]:
# KNN
model = KNeighborsClassifier()
model.fit(x_train_s, y_train)
y_pred = model.predict(x_test_s)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[77 8]
[14 51]]
precision recall f1-score support
0 0.85 0.91 0.88 85
1 0.86 0.78 0.82 65
accuracy 0.85 150
macro avg 0.86 0.85 0.85 150
weighted avg 0.85 0.85 0.85 150
In [156]:
# Decision Tree
model = DecisionTreeClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[79 6]
[11 54]]
precision recall f1-score support
0 0.88 0.93 0.90 85
1 0.90 0.83 0.86 65
accuracy 0.89 150
macro avg 0.89 0.88 0.88 150
weighted avg 0.89 0.89 0.89 150
In [157]:
# Logistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[73 12]
[15 50]]
precision recall f1-score support
0 0.83 0.86 0.84 85
1 0.81 0.77 0.79 65
accuracy 0.82 150
macro avg 0.82 0.81 0.82 150
weighted avg 0.82 0.82 0.82 150
In [158]:
# Random Forest
model = RandomForestClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[79 6]
[13 52]]
precision recall f1-score support
0 0.86 0.93 0.89 85
1 0.90 0.80 0.85 65
accuracy 0.87 150
macro avg 0.88 0.86 0.87 150
weighted avg 0.88 0.87 0.87 150
In [159]:
# Feature importance from the Random Forest model
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
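The bars above follow the column order of x. A minimal sketch, assuming the fitted Random Forest from the cell above, that sorts the importances before plotting so the chart is easier to read:
# Sketch: sort feature importances before plotting
imp = pd.Series(model.feature_importances_, index=x.columns).sort_values()
plt.figure(figsize=(5, 5))
plt.barh(imp.index, imp.values)
plt.show()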
In [160]:
# XGBoost
model = XGBClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[74 11]
[11 54]]
precision recall f1-score support
0 0.87 0.87 0.87 85
1 0.83 0.83 0.83 65
accuracy 0.85 150
macro avg 0.85 0.85 0.85 150
weighted avg 0.85 0.85 0.85 150
In [161]:
# Feature importance from the XGBoost model
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()

In [162]:
# LightGBM
model = LGBMClassifier(max_depth=5, random_state=1, verbose=-1)
# verbose=-1: suppress all messages during training
# verbose=0 : print only warning messages (default)
# verbose=1 : print more information during training
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[76 9]
[12 53]]
precision recall f1-score support
0 0.86 0.89 0.88 85
1 0.85 0.82 0.83 65
accuracy 0.86 150
macro avg 0.86 0.85 0.86 150
weighted avg 0.86 0.86 0.86 150
In [163]:
# Feature importance from the LightGBM model
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()

Practice 1 (Regression)
In [165]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/Carseats.csv'
data = pd.read_csv(path)
data.tail()
Out[165]:
    | Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US
395 | 12.57 | 138 | 108 | 17 | 203 | 128 | Good   | 33 | 14 | Yes | Yes
396 | 6.14  | 139 | 23  | 3  | 37  | 120 | Medium | 55 | 11 | No  | Yes
397 | 7.41  | 162 | 26  | 12 | 368 | 159 | Medium | 40 | 18 | Yes | Yes
398 | 5.94  | 100 | 79  | 7  | 284 | 95  | Bad    | 50 | 12 | Yes | Yes
399 | 9.71  | 134 | 37  | 0  | 27  | 120 | Good   | 49 | 16 | Yes | Yes
In [166]:
# Split the data and create dummy variables
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'Sales')
y = data['Sales']
# Dummy encoding
dumm_cols = ['ShelveLoc', 'Education', 'Urban', 'US']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True, dtype=int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [167]:
# Scaling: method 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [168]:
# Scaling: method 2
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
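Method 2 is only a shorthand: fit_transform fits the scaler on x_train and transforms it in one call, so both methods yield the same arrays. A minimal check, assuming the imports above:
# Sketch: both scaling approaches produce identical results
s1 = MinMaxScaler().fit(x_train).transform(x_train)   # method 1: fit, then transform
s2 = MinMaxScaler().fit_transform(x_train)            # method 2: single call
print(np.allclose(s1, s2))  # True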
In [169]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import *
In [170]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
0.8416098802896127
0.8657819916636766
In [171]:
# KNN
model = KNeighborsRegressor(n_neighbors=5)
model.fit(x_train_s, y_train)
y_pred = model.predict(x_test_s)
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.8401833333333333
0.3164099528655834
In [172]:
# Decision Tree
model = DecisionTreeRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.9110944370982414
0.307084554928293
In [173]:
# Random Forest
model = RandomForestRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.3799522255378287
0.605314731843458
In [174]:
# XGBoost
model = XGBRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.374096424102783
0.6010697648596808
In [175]:
# LightGBM
model = LGBMRegressor(max_depth=5, random_state=1, verbose=-1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.1793538735127505
0.6946417907320817
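The six regressors above are evaluated one cell at a time; the same result-collection pattern used in Practice 2 below also works here. A minimal sketch with the hypothetical names reg_models and reg_result, assuming the training data from this practice (KNN is skipped because it needs the scaled inputs):
# Sketch: fit each regressor and collect MAE / R2 for comparison
reg_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(max_depth=5, random_state=1),
    'Random Forest': RandomForestRegressor(max_depth=5, random_state=1),
    'XGBoost': XGBRegressor(max_depth=5, random_state=1),
    'LightGBM': LGBMRegressor(max_depth=5, random_state=1, verbose=-1),
}
reg_result = {}
for name, m in reg_models.items():
    m.fit(x_train, y_train)
    pred = m.predict(x_test)
    reg_result[name] = {'MAE': mean_absolute_error(y_test, pred), 'R2': r2_score(y_test, pred)}
print(reg_result)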
Practice 2 (Classification)
In [177]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
data['CHURN'] = data['CHURN'].map({'STAY':0, 'LEAVE': 1})
data.tail()
Out[177]:
      | id    | COLLEGE | INCOME | OVERAGE | LEFTOVER | HOUSE  | HANDSET_PRICE | OVER_15MINS_CALLS_PER_MONTH | AVERAGE_CALL_DURATION | REPORTED_SATISFACTION | REPORTED_USAGE_LEVEL | CONSIDERING_CHANGE_OF_PLAN | CHURN
19995 | 19996 | 0 | 153252 | 0  | 23 | 368403 | 597 | 1  | 6  | very_sat   | little      | actively_looking_into_it | 0
19996 | 19997 | 1 | 107126 | 71 | 82 | 237397 | 609 | 5  | 2  | very_sat   | very_little | no                       | 0
19997 | 19998 | 0 | 78529  | 0  | 66 | 172589 | 275 | 0  | 2  | unsat      | very_high   | considering              | 1
19998 | 19999 | 0 | 78674  | 47 | 41 | 572406 | 288 | 4  | 2  | very_unsat | little      | considering              | 1
19999 | 20000 | 0 | 124697 | 0  | 0  | 845575 | 808 | 24 | 14 | unsat      | high        | actively_looking_into_it | 1
In [178]:
data.drop(columns = 'id', inplace = True)
In [179]:
# Split the data and create dummy variables
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'CHURN')
y = data['CHURN']
# Dummy encoding
dumm_cols = ['REPORTED_SATISFACTION','REPORTED_USAGE_LEVEL','CONSIDERING_CHANGE_OF_PLAN']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True, dtype=int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [180]:
# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [181]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
In [182]:
# KNN
cv_score = cross_val_score(KNeighborsClassifier(), x_train_s, y_train, cv=5)
print(cv_score.mean())
# Collect the result
result = {}
result['KNN'] = cv_score.mean()
0.5930000000000001
In [183]:
# Decision Tree
cv_score = cross_val_score(DecisionTreeClassifier(), x_train, y_train, cv=5)
print(cv_score.mean())
# Collect the result
result['Decision Tree'] = cv_score.mean()
0.6153571428571428
In [184]:
# Logistic Regression
cv_score = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)
print(cv_score.mean())
# Collect the result
result['Logistic Regression'] = cv_score.mean()
0.6350714285714286
In [185]:
# Random Forest
cv_score = cross_val_score(RandomForestClassifier(), x_train, y_train, cv=5)
print(cv_score.mean())
# Collect the result
result['Random Forest'] = cv_score.mean()
0.6929285714285713
In [186]:
# XGBoost
cv_score = cross_val_score(XGBClassifier(), x_train, y_train, cv=5)
print(cv_score.mean())
# Collect the result
result['XGBoost'] = cv_score.mean()
0.6866428571428573
In [187]:
# LightGBM
cv_score = cross_val_score(LGBMClassifier(), x_train, y_train, cv=5)
print(cv_score.mean())
# Collect the result
result['LightGBM'] = cv_score.mean()
0.6962142857142857
In [188]:
# Check the results
for k, v in result.items():
    print(k, v.round(3))
KNN 0.593
Decision Tree 0.615
Logistic Regression 0.635
Random Forest 0.693
XGBoost 0.687
LightGBM 0.696
In [189]:
# Visualize the results
plt.figure(figsize=(5, 5))
plt.barh(list(result), result.values())
plt.show()

# Performance tuning
# Use LightGBM, which scored highest above
In [191]:
# Parameter grid
param = {'max_depth': range(1, 21), 'n_estimators': range(60, 131, 10)}
model = GridSearchCV(LGBMClassifier(verbose=-1),
                     param,
                     cv=5,
                     scoring='accuracy')
model.fit(x_train, y_train)
Out[191]:
GridSearchCV(cv=5, estimator=LGBMClassifier(verbose=-1),
             param_grid={'max_depth': range(1, 21),
                         'n_estimators': range(60, 131, 10)},
             scoring='accuracy')
In [236]:
print(model.best_params_)
print(model.best_score_)
{'max_depth': 4, 'n_estimators': 60}
0.6994999999999999
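Beyond best_params_ and best_score_, the whole grid is stored in model.cv_results_. A minimal sketch for inspecting the top parameter combinations as a DataFrame, assuming the fitted GridSearchCV above:
# Sketch: rank all parameter combinations by mean CV accuracy
cv_df = pd.DataFrame(model.cv_results_)
cols = ['param_max_depth', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
print(cv_df[cols].sort_values('rank_test_score').head())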
In [238]:
# Visualize feature importances
plt.barh(list(x), model.best_estimator_.feature_importances_)
plt.show()

In [240]:
# Performance evaluation
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[1928 1137]
[ 673 2262]]
precision recall f1-score support
0 0.74 0.63 0.68 3065
1 0.67 0.77 0.71 2935
accuracy 0.70 6000
macro avg 0.70 0.70 0.70 6000
weighted avg 0.70 0.70 0.70 6000
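As a closing check, the hold-out accuracy can be compared with the cross-validated best score above; a minimal sketch, assuming accuracy_score is imported in addition to the metrics already used:
# Sketch: compare CV best score with the hold-out test accuracy
from sklearn.metrics import accuracy_score
print('CV best score:', model.best_score_)
print('Test accuracy:', accuracy_score(y_test, y_pred))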