bestone888 2024. 10. 20. 02:23

241018

In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

Ensemble

In [151]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/admission_simple.csv'
data = pd.read_csv(path)
data.tail()
Out[151]:
     GRE  TOEFL  RANK  SOP  LOR   GPA  RESEARCH  ADMIT
495  332    108     5  4.5  4.0  9.02         1      1
496  337    117     5  5.0  5.0  9.87         1      1
497  330    120     5  4.5  5.0  9.56         1      1
498  312    103     4  4.0  5.0  8.43         0      0
499  327    113     4  4.5  4.5  9.04         0      1
In [152]:
# Split the data
from sklearn.model_selection import train_test_split

x = data.drop(columns = 'ADMIT')
y = data['ADMIT']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3, random_state = 1)
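A quick check, not part of the original run, that stratify=y keeps the ADMIT class ratio roughly the same in the train and test splits:

In [ ]:
# Sanity check: with stratify=y the class distribution should match across the splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))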
In [153]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [154]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import *
In [155]:
# KNN
model = KNeighborsClassifier()
model.fit(x_train_s, y_train)
y_pred = model.predict(x_test_s)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[77  8]
 [14 51]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        85
           1       0.86      0.78      0.82        65

    accuracy                           0.85       150
   macro avg       0.86      0.85      0.85       150
weighted avg       0.85      0.85      0.85       150

In [156]:
# Decision Tree
model = DecisionTreeClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[79  6]
 [11 54]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.90        85
           1       0.90      0.83      0.86        65

    accuracy                           0.89       150
   macro avg       0.89      0.88      0.88       150
weighted avg       0.89      0.89      0.89       150

In [157]:
# Logistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[73 12]
 [15 50]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84        85
           1       0.81      0.77      0.79        65

    accuracy                           0.82       150
   macro avg       0.82      0.81      0.82       150
weighted avg       0.82      0.82      0.82       150

In [158]:
# Random Forest
model = RandomForestClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[79  6]
 [13 52]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.89        85
           1       0.90      0.80      0.85        65

    accuracy                           0.87       150
   macro avg       0.88      0.86      0.87       150
weighted avg       0.88      0.87      0.87       150

In [159]:
# Feature importances from the Random Forest
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
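As a small variant not in the original, the bars can be sorted so the most influential features appear at the top (assumes the fitted Random Forest model from the cell above is still in scope):

In [ ]:
# Sort feature importances before plotting (most important at the top)
imp = pd.Series(model.feature_importances_, index=x.columns).sort_values()
plt.figure(figsize=(5, 5))
plt.barh(imp.index, imp.values)
plt.show()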
 
In [160]:
# XGBoost
model = XGBClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[74 11]
 [11 54]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        85
           1       0.83      0.83      0.83        65

    accuracy                           0.85       150
   macro avg       0.85      0.85      0.85       150
weighted avg       0.85      0.85      0.85       150

In [161]:
# Feature importances from XGBoost
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
In [162]:
# LightGBM
model = LGBMClassifier(max_depth=5, random_state=1, verbose=-1)
# verbose=-1: suppress all messages during training
# verbose=0:  print only warning messages (default)
# verbose=1:  print more information during training
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[76  9]
 [12 53]]
              precision    recall  f1-score   support

           0       0.86      0.89      0.88        85
           1       0.85      0.82      0.83        65

    accuracy                           0.86       150
   macro avg       0.86      0.85      0.86       150
weighted avg       0.86      0.86      0.86       150

In [163]:
# Feature importances from LightGBM
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
In [ ]:
 

Practice 1 (Regression)

In [165]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/Carseats.csv'
data = pd.read_csv(path)

data.tail()
Out[165]:
     Sales  CompPrice  Income  Advertising  Population  Price ShelveLoc  Age  Education Urban   US
395  12.57        138     108           17         203    128      Good   33         14   Yes  Yes
396   6.14        139      23            3          37    120    Medium   55         11    No  Yes
397   7.41        162      26           12         368    159    Medium   40         18   Yes  Yes
398   5.94        100      79            7         284     95       Bad   50         12   Yes  Yes
399   9.71        134      37            0          27    120      Good   49         16   Yes  Yes
In [166]:
# Split the data and create dummy variables
from sklearn.model_selection import train_test_split

x = data.drop(columns = 'Sales')
y = data['Sales']

# Dummy encoding
dumm_cols = ['ShelveLoc', 'Education', 'Urban', 'US']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True, dtype=int)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
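A quick look, not part of the original run, at the columns produced by get_dummies (each categorical level becomes a 0/1 column, with the first level dropped):

In [ ]:
# Columns after dummy encoding
print(x.columns.tolist())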
In [167]:
# Normalization, method 1: fit on the training set, then transform both sets
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [168]:
# Normalization, method 2: fit_transform combines fit and transform in one call (equivalent to method 1)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
In [169]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import *
In [170]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
0.8416098802896127
0.8657819916636766
In [171]:
# KNN
model = KNeighborsRegressor(n_neighbors=5)
model.fit(x_train_s, y_train)
y_pred = model.predict(x_test_s)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.8401833333333333
0.3164099528655834
In [172]:
# Decision Tree
model = DecisionTreeRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.9110944370982414
0.307084554928293
In [173]:
# Random Forest
model = RandomForestRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.3799522255378287
0.605314731843458
In [174]:
# XGBoost
model = XGBRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.374096424102783
0.6010697648596808
In [175]:
# LightGBM
model = LGBMRegressor(max_depth=5, random_state=1, verbose=-1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.1793538735127505
0.6946417907320817

Practice 2 (Classification)

In [177]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
data['CHURN'] = data['CHURN'].map({'STAY':0, 'LEAVE': 1})

data.tail()
Out[177]:
          id  COLLEGE  INCOME  OVERAGE  LEFTOVER   HOUSE  HANDSET_PRICE  OVER_15MINS_CALLS_PER_MONTH  AVERAGE_CALL_DURATION REPORTED_SATISFACTION REPORTED_USAGE_LEVEL CONSIDERING_CHANGE_OF_PLAN  CHURN
19995  19996        0  153252        0        23  368403            597                            1                      6              very_sat               little   actively_looking_into_it      0
19996  19997        1  107126       71        82  237397            609                            5                      2              very_sat          very_little                          no      0
19997  19998        0   78529        0        66  172589            275                            0                      2                 unsat            very_high                 considering      1
19998  19999        0   78674       47        41  572406            288                            4                      2            very_unsat               little                 considering      1
19999  20000        0  124697        0         0  845575            808                           24                     14                 unsat                 high   actively_looking_into_it      1
In [178]:
data.drop(columns = 'id', inplace = True)
In [179]:
# Split the data and create dummy variables
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'CHURN')
y = data['CHURN']

# Dummy encoding
dumm_cols = ['REPORTED_SATISFACTION','REPORTED_USAGE_LEVEL','CONSIDERING_CHANGE_OF_PLAN']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True, dtype=int)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [180]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)

x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [181]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
In [182]:
# KNN
cv_score = cross_val_score(KNeighborsClassifier(), x_train_s, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result = {}
result['KNN'] = cv_score.mean()
0.5930000000000001
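For reference, cross_val_score returns one score per fold, so the spread across folds can be checked as well (a sketch reusing cv_score from the cell above):

In [ ]:
# Per-fold scores, their mean, and their standard deviation
print(cv_score)
print(cv_score.mean(), cv_score.std())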
In [183]:
# Decision Tree
cv_score = cross_val_score(DecisionTreeClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['Decision Tree'] = cv_score.mean()
0.6153571428571428
In [184]:
# Logistic Regression
cv_score = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['Logistic Regression'] = cv_score.mean()
0.6350714285714286
In [185]:
# Random Forest
cv_score = cross_val_score(RandomForestClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['Random Forest'] = cv_score.mean()
0.6929285714285713
In [186]:
# XGBoost
cv_score = cross_val_score(XGBClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['XGBoost'] = cv_score.mean()
0.6866428571428573
In [187]:
# LightGBM
cv_score = cross_val_score(LGBMClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['LightGBM'] = cv_score.mean()
0.6962142857142857
In [188]:
# Check the results
for k, v in result.items():
    print(k, v.round(3))
KNN 0.593
Decision Tree 0.615
Logistic Regression 0.635
Random Forest 0.693
XGBoost 0.687
LightGBM 0.696
In [189]:
# Visualize the results
plt.figure(figsize=(5, 5))
plt.barh(list(result), result.values())
plt.show()
# Performance tuning
# Use LightGBM, which achieved the highest CV score
In [191]:
# Parameter grid
param = {'max_depth': range(1, 21), 'n_estimators': range(60, 131, 10)}

model = GridSearchCV(LGBMClassifier(verbose=-1),
                     param,
                     cv=5,
                     scoring='accuracy')

model.fit(x_train, y_train)
Out[191]:
GridSearchCV(cv=5, estimator=LGBMClassifier(verbose=-1),
             param_grid={'max_depth': range(1, 21),
                         'n_estimators': range(60, 131, 10)},
             scoring='accuracy')
In [236]:
print(model.best_params_)
print(model.best_score_)
{'max_depth': 4, 'n_estimators': 60}
0.6994999999999999
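To inspect every grid point rather than just the best one, the full cross-validation results can be viewed as a DataFrame (a sketch using GridSearchCV's cv_results_ attribute):

In [ ]:
# Mean test accuracy for each (max_depth, n_estimators) combination tried
cv_results = pd.DataFrame(model.cv_results_)
cols = ['param_max_depth', 'param_n_estimators', 'mean_test_score']
print(cv_results[cols].sort_values('mean_test_score', ascending=False).head())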
In [238]:
# Visualize feature importances
plt.barh(list(x), model.best_estimator_.feature_importances_)
plt.show()
In [240]:
# Performance evaluation
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[1928 1137]
 [ 673 2262]]
              precision    recall  f1-score   support

           0       0.74      0.63      0.68      3065
           1       0.67      0.77      0.71      2935

    accuracy                           0.70      6000
   macro avg       0.70      0.70      0.70      6000
weighted avg       0.70      0.70      0.70      6000