bestone888 2024. 10. 20. 02:23

241018

In [149]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

Ensemble

In [151]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/admission_simple.csv'
data = pd.read_csv(path)
data.tail()
Out[151]:
     GRE  TOEFL  RANK  SOP  LOR   GPA  RESEARCH  ADMIT
495  332    108     5  4.5  4.0  9.02         1      1
496  337    117     5  5.0  5.0  9.87         1      1
497  330    120     5  4.5  5.0  9.56         1      1
498  312    103     4  4.0  5.0  8.43         0      0
499  327    113     4  4.5  4.5  9.04         0      1
In [152]:
# Split the data
from sklearn.model_selection import train_test_split

x = data.drop(columns = 'ADMIT')
y = data['ADMIT']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3, random_state = 1)
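A quick check, not part of the original run, that stratify=y keeps the ADMIT class ratio roughly the same in the train and test splits:

In [ ]:
# Sanity check: with stratify=y the class distribution should match across the splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))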
In [153]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [154]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import *
In [155]:
# KNN
model = KNeighborsClassifier()
model.fit(x_train_s, y_train)
y_pred = model.predict(x_test_s)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[77  8]
 [14 51]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        85
           1       0.86      0.78      0.82        65

    accuracy                           0.85       150
   macro avg       0.86      0.85      0.85       150
weighted avg       0.85      0.85      0.85       150

In [156]:
# Decision Tree
model = DecisionTreeClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[79  6]
 [11 54]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.90        85
           1       0.90      0.83      0.86        65

    accuracy                           0.89       150
   macro avg       0.89      0.88      0.88       150
weighted avg       0.89      0.89      0.89       150

In [157]:
# Logistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[73 12]
 [15 50]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84        85
           1       0.81      0.77      0.79        65

    accuracy                           0.82       150
   macro avg       0.82      0.81      0.82       150
weighted avg       0.82      0.82      0.82       150

In [158]:
# Random Forest
model = RandomForestClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[79  6]
 [13 52]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.89        85
           1       0.90      0.80      0.85        65

    accuracy                           0.87       150
   macro avg       0.88      0.86      0.87       150
weighted avg       0.88      0.87      0.87       150

In [159]:
# Feature importances from the Random Forest
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
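As a small variant not in the original, the bars can be sorted so the most influential features appear at the top (assumes the fitted Random Forest model from the cell above is still in scope):

In [ ]:
# Sort feature importances before plotting (most important at the top)
imp = pd.Series(model.feature_importances_, index=x.columns).sort_values()
plt.figure(figsize=(5, 5))
plt.barh(imp.index, imp.values)
plt.show()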
 
In [160]:
# XGBoost
model = XGBClassifier(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[74 11]
 [11 54]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        85
           1       0.83      0.83      0.83        65

    accuracy                           0.85       150
   macro avg       0.85      0.85      0.85       150
weighted avg       0.85      0.85      0.85       150

In [161]:
# Feature importances from XGBoost
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
In [162]:
# LightGBM
model = LGBMClassifier(max_depth=5, random_state=1, verbose=-1)
# verbose=-1: suppress all messages during training
# verbose=0:  print only warning messages (default)
# verbose=1:  print more information during training
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[76  9]
 [12 53]]
              precision    recall  f1-score   support

           0       0.86      0.89      0.88        85
           1       0.85      0.82      0.83        65

    accuracy                           0.86       150
   macro avg       0.86      0.85      0.86       150
weighted avg       0.86      0.86      0.86       150

In [163]:
# Feature importances from LightGBM
plt.figure(figsize=(5, 5))
plt.barh(list(x), model.feature_importances_)
plt.show()
In [ ]:
 

Practice 1 (Regression)

In [165]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/Carseats.csv'
data = pd.read_csv(path)

data.tail()
Out[165]:
     Sales  CompPrice  Income  Advertising  Population  Price ShelveLoc  Age  Education Urban   US
395  12.57        138     108           17         203    128      Good   33         14   Yes  Yes
396   6.14        139      23            3          37    120    Medium   55         11    No  Yes
397   7.41        162      26           12         368    159    Medium   40         18   Yes  Yes
398   5.94        100      79            7         284     95       Bad   50         12   Yes  Yes
399   9.71        134      37            0          27    120      Good   49         16   Yes  Yes
In [166]:
# Split the data and create dummy variables
from sklearn.model_selection import train_test_split

x = data.drop(columns = 'Sales')
y = data['Sales']

# Dummy encoding
dumm_cols = ['ShelveLoc', 'Education', 'Urban', 'US']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True, dtype=int)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
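A quick look, not part of the original run, at the columns produced by get_dummies (each categorical level becomes a 0/1 column, with the first level dropped):

In [ ]:
# Columns after dummy encoding
print(x.columns.tolist())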
In [167]:
# Normalization, method 1: fit on the training set, then transform both sets
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [168]:
# Normalization, method 2: fit_transform combines fit and transform in one call (equivalent to method 1)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
In [169]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import *
In [170]:
# Linear Regression
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
0.8416098802896127
0.8657819916636766
In [171]:
# KNN
model = KNeighborsRegressor(n_neighbors=5)
model.fit(x_train_s, y_train)
y_pred = model.predict(x_test_s)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.8401833333333333
0.3164099528655834
In [172]:
# Decision Tree
model = DecisionTreeRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.9110944370982414
0.307084554928293
In [173]:
# Random Forest
model = RandomForestRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.3799522255378287
0.605314731843458
In [174]:
# XGBoost
model = XGBRegressor(max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.374096424102783
0.6010697648596808
In [175]:
# LightGBM
model = LGBMRegressor(max_depth=5, random_state=1, verbose=-1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))
1.1793538735127505
0.6946417907320817

Practice 2 (Classification)

In [177]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
data['CHURN'] = data['CHURN'].map({'STAY':0, 'LEAVE': 1})

data.tail()
Out[177]:
          id  COLLEGE  INCOME  OVERAGE  LEFTOVER   HOUSE  HANDSET_PRICE  OVER_15MINS_CALLS_PER_MONTH  AVERAGE_CALL_DURATION REPORTED_SATISFACTION REPORTED_USAGE_LEVEL CONSIDERING_CHANGE_OF_PLAN  CHURN
19995  19996        0  153252        0        23  368403            597                            1                      6              very_sat               little   actively_looking_into_it      0
19996  19997        1  107126       71        82  237397            609                            5                      2              very_sat          very_little                          no      0
19997  19998        0   78529        0        66  172589            275                            0                      2                 unsat            very_high                 considering      1
19998  19999        0   78674       47        41  572406            288                            4                      2            very_unsat               little                 considering      1
19999  20000        0  124697        0         0  845575            808                           24                     14                 unsat                 high   actively_looking_into_it      1
In [178]:
data.drop(columns = 'id', inplace = True)
In [179]:
# Split the data and create dummy variables
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'CHURN')
y = data['CHURN']

# Dummy encoding
dumm_cols = ['REPORTED_SATISFACTION','REPORTED_USAGE_LEVEL','CONSIDERING_CHANGE_OF_PLAN']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True, dtype=int)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [180]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)

x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
In [181]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
In [182]:
# KNN
cv_score = cross_val_score(KNeighborsClassifier(), x_train_s, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result = {}
result['KNN'] = cv_score.mean()
0.5930000000000001
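For reference, cross_val_score returns one score per fold, so the spread across folds can be checked as well (a sketch reusing cv_score from the cell above):

In [ ]:
# Per-fold scores, their mean, and their standard deviation
print(cv_score)
print(cv_score.mean(), cv_score.std())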
In [183]:
# Decision Tree
cv_score = cross_val_score(DecisionTreeClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['Decision Tree'] = cv_score.mean()
0.6153571428571428
In [184]:
# Logistic Regression
cv_score = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['Logistic Regression'] = cv_score.mean()
0.6350714285714286
In [185]:
# Random Forest
cv_score = cross_val_score(RandomForestClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['Random Forest'] = cv_score.mean()
0.6929285714285713
In [186]:
# XGBoost
cv_score = cross_val_score(XGBClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['XGBoost'] = cv_score.mean()
0.6866428571428573
In [187]:
# LightGBM
cv_score = cross_val_score(LGBMClassifier(), x_train, y_train, cv=5)

print(cv_score.mean())

# Collect the result
result['LightGBM'] = cv_score.mean()
0.6962142857142857
In [188]:
# Check the results
for k, v in result.items():
    print(k, v.round(3))
KNN 0.593
Decision Tree 0.615
Logistic Regression 0.635
Random Forest 0.693
XGBoost 0.687
LightGBM 0.696
In [189]:
# Visualize the results
plt.figure(figsize=(5, 5))
plt.barh(list(result), result.values())
plt.show()
# Performance tuning
# Use LightGBM, which achieved the highest CV score
In [191]:
# Parameter grid
param = {'max_depth': range(1, 21), 'n_estimators': range(60, 131, 10)}

model = GridSearchCV(LGBMClassifier(verbose=-1),
                     param,
                     cv=5,
                     scoring='accuracy')

model.fit(x_train, y_train)
Out[191]:
GridSearchCV(cv=5, estimator=LGBMClassifier(verbose=-1),
             param_grid={'max_depth': range(1, 21),
                         'n_estimators': range(60, 131, 10)},
             scoring='accuracy')
In [236]:
print(model.best_params_)
print(model.best_score_)
{'max_depth': 4, 'n_estimators': 60}
0.6994999999999999
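To inspect every grid point rather than just the best one, the full cross-validation results can be viewed as a DataFrame (a sketch using GridSearchCV's cv_results_ attribute):

In [ ]:
# Mean test accuracy for each (max_depth, n_estimators) combination tried
cv_results = pd.DataFrame(model.cv_results_)
cols = ['param_max_depth', 'param_n_estimators', 'mean_test_score']
print(cv_results[cols].sort_values('mean_test_score', ascending=False).head())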
In [238]:
# Visualize feature importances
plt.barh(list(x), model.best_estimator_.feature_importances_)
plt.show()
In [240]:
# Performance evaluation
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[1928 1137]
 [ 673 2262]]
              precision    recall  f1-score   support

           0       0.74      0.63      0.68      3065
           1       0.67      0.77      0.71      2935

    accuracy                           0.70      6000
   macro avg       0.70      0.70      0.70      6000
weighted avg       0.70      0.70      0.70      6000