KT AIVLE/Daily Review

241017

bestone888 2024. 10. 17. 19:45


In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format='retina'

1. Logistic Regression

In [131]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/diabetes.csv'
data = pd.read_csv(path)
In [132]:
data.tail()
Out[132]:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
763           10      101             76             48      180  32.9                     0.171   63        0
764            2      122             70             27        0  36.8                     0.340   27        0
765            5      121             72             23      112  26.2                     0.245   30        0
766            1      126             60              0        0  30.1                     0.349   47        1
767            1       93             70             31        0  30.4                     0.315   23        0
In [133]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [134]:
data.corr(numeric_only = True)
Out[134]:
                          Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  DiabetesPedigreeFunction       Age   Outcome
Pregnancies                  1.000000  0.129459       0.141282      -0.081672 -0.073535  0.017683                 -0.033523  0.544341  0.221898
Glucose                      0.129459  1.000000       0.152590       0.057328  0.331357  0.221071                  0.137337  0.263514  0.466581
BloodPressure                0.141282  0.152590       1.000000       0.207371  0.088933  0.281805                  0.041265  0.239528  0.065068
SkinThickness               -0.081672  0.057328       0.207371       1.000000  0.436783  0.392573                  0.183928 -0.113970  0.074752
Insulin                     -0.073535  0.331357       0.088933       0.436783  1.000000  0.197859                  0.185071 -0.042163  0.130548
BMI                          0.017683  0.221071       0.281805       0.392573  0.197859  1.000000                  0.140647  0.036242  0.292695
DiabetesPedigreeFunction    -0.033523  0.137337       0.041265       0.183928  0.185071  0.140647                  1.000000  0.033561  0.173844
Age                          0.544341  0.263514       0.239528      -0.113970 -0.042163  0.036242                  0.033561  1.000000  0.238356
Outcome                      0.221898  0.466581       0.065068       0.074752  0.130548  0.292695                  0.173844  0.238356  1.000000
In [135]:
from sklearn.model_selection import train_test_split

x = data.drop(columns = 'Outcome')
y = data['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y,
                                                            train_size = 0.7,
                                                            random_state = 1)
In [136]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model = LogisticRegression()

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[132  18]
 [ 36  45]]
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       150
           1       0.71      0.56      0.62        81

    accuracy                           0.77       231
   macro avg       0.75      0.72      0.73       231
weighted avg       0.76      0.77      0.76       231

In [137]:
# Regression coefficients
print(model.coef_)    # coefficients
print(model.intercept_)    # intercept
[[ 1.50114669e-01  3.31625358e-02 -1.64284589e-02 -1.29767759e-03
   1.26907089e-04  9.45353144e-02  9.45280772e-01  1.66363363e-02]]
[-8.3819741]
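These two arrays fully determine the model: for a feature vector x, the decision function is z = x · coef + intercept. A minimal sketch to verify this by hand, reusing the fitted model and x_test from above:

# Reconstruct the decision function manually: z = X @ w + b
z_manual = x_test.values @ model.coef_.ravel() + model.intercept_[0]
print(np.allclose(z_manual, model.decision_function(x_test)))    # expected: True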
In [138]:
# z values (decision function)
z = model.decision_function(x_test)
z[:10]
Out[138]:
array([-1.38003159,  1.38522403, -0.74469448, -3.1947857 , -1.1868971 ,
       -1.16430342,  2.76414362, -0.31485228, -0.72954185, -2.02736112])
In [139]:
# Apply the logistic (sigmoid) function: maps z to a probability in (0, 1)
from scipy.special import expit
expit(z)[:10].round(3)
Out[139]:
array([0.201, 0.8  , 0.322, 0.039, 0.234, 0.238, 0.941, 0.422, 0.325,
       0.116])
In [140]:
y_pred[:10]
Out[140]:
array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)
In [141]:
# Check predicted probabilities
p = model.predict_proba(x_test)
p1 = p[:, 1]    # probability of class 1

print(p1[:10].round(3))
[0.201 0.8   0.322 0.039 0.234 0.238 0.941 0.422 0.325 0.116]
In [142]:
# New predictions with a 0.5 threshold
y_pred2 = [1 if p > 0.5 else 0 for p in p1]
print(y_pred[:10])    # predictions from logistic regression (model.predict)
print(y_pred2[:10])    # predictions from the 0.5 threshold
[0 1 0 0 0 0 1 0 0 0]
[0, 1, 0, 0, 0, 0, 1, 0, 0, 0]
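Because the 0.5 cutoff reproduces model.predict exactly, the threshold is a tunable knob: lowering it flags more positives, trading precision for recall. A sketch with an illustrative 0.3 cutoff (the value is an assumption, not from the original):

# Lower threshold: more samples classified as 1 (illustrative cutoff)
y_pred3 = (p1 > 0.3).astype(int)
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))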

1-1. Logistic Regression (Multiclass)

In [144]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/iris.csv'
data = pd.read_csv(path)
In [145]:
data.tail()
Out[145]:
     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica
In [146]:
from sklearn.model_selection import train_test_split

x = data.drop(columns = 'Species')
y = data['Species']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y,
                                                           test_size = 0.3,
                                                           random_state = 1)
In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.94      1.00      0.97        15
   virginica       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

In [148]:
# LR coefficients
print(model.coef_.round(3))    # one row of coefficients per class: 3 rows
print()
print(model.intercept_.round(3))   # 3 intercepts, one per class
[[-0.481  0.779 -2.291 -0.922]
 [ 0.152 -0.217 -0.078 -0.694]
 [ 0.329 -0.562  2.37   1.615]]

[  9.792   3.112 -12.904]
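With three classes, coef_ has one row of four coefficients per class and intercept_ one entry per class, so each class gets its own linear score. As in the binary case, the decision-function values below can be reproduced by hand; a minimal sketch:

# One linear score per class: Z = X @ W.T + b, shape (n_samples, 3)
z_manual = x_test.values @ model.coef_.T + model.intercept_
print(np.allclose(z_manual, model.decision_function(x_test)))    # expected: True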
In [149]:
# z values (decision function)
z = model.decision_function(x_test)
z[:10]
Out[149]:
array([[-5.0357003 ,  1.29709011,  3.7386102 ],
       [ 5.9680559 ,  2.67427064, -8.64232653],
       [ 6.74302654,  2.7542294 , -9.49725593],
       [-2.49603916,  1.78172178,  0.71431738],
       [-2.41853989,  2.1470536 ,  0.27148629],
       [-1.32507382,  2.21522906, -0.89015524],
       [-6.43274013,  1.88387669,  4.54886344],
       [-3.17046422,  2.18284448,  0.98761974],
       [-5.07915288,  1.7646631 ,  3.31448978],
       [ 6.88465471,  2.70554244, -9.59019715]])
In [150]:
# Softmax function (multiclass classification)
from scipy.special import softmax
print(softmax(z, axis = 1)[:10].round(3))
print(y_pred[:10])
[[0.    0.08  0.92 ]
 [0.964 0.036 0.   ]
 [0.982 0.018 0.   ]
 [0.01  0.736 0.253]
 [0.009 0.859 0.132]
 [0.027 0.931 0.042]
 [0.    0.065 0.935]
 [0.004 0.765 0.231]
 [0.    0.175 0.825]
 [0.985 0.015 0.   ]]
['virginica' 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor'
 'virginica' 'versicolor' 'virginica' 'setosa']
In [151]:
# Check predicted probabilities
model.predict_proba(x_test)[:10].round(3)
Out[151]:
array([[0.   , 0.08 , 0.92 ],
       [0.964, 0.036, 0.   ],
       [0.982, 0.018, 0.   ],
       [0.01 , 0.736, 0.253],
       [0.009, 0.859, 0.132],
       [0.027, 0.931, 0.042],
       [0.   , 0.065, 0.935],
       [0.004, 0.765, 0.231],
       [0.   , 0.175, 0.825],
       [0.985, 0.015, 0.   ]])
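So predict_proba is simply the softmax of the decision function, and predict takes the argmax over classes; a quick consistency check, reusing z and y_pred from above:

# predict_proba == softmax(decision_function); predict == argmax over classes
p = model.predict_proba(x_test)
print(np.allclose(p, softmax(z, axis=1)))                        # expected: True
print((model.classes_[np.argmax(p, axis=1)] == y_pred).all())    # expected: True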

2. K-Fold Cross Validation

In [153]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/diabetes.csv'
data = pd.read_csv(path)
In [154]:
data.tail()
Out[154]:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
763           10      101             76             48      180  32.9                     0.171   63        0
764            2      122             70             27        0  36.8                     0.340   27        0
765            5      121             72             23      112  26.2                     0.245   30        0
766            1      126             60              0        0  30.1                     0.349   47        1
767            1       93             70             31        0  30.4                     0.315   23        0
In [155]:
from sklearn.model_selection import train_test_split

x = data.drop(columns='Outcome')
y = data.loc[:, 'Outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size=0.3, random_state=1)
In [156]:
# Normalization (min-max scaling)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
In [192]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Declare the model
model = DecisionTreeClassifier(max_depth = 5, random_state = 1)

# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 5, scoring = 'r2')

# Check
print(cv_score)
print('mean:', cv_score.mean())
print('std:', cv_score.std())

# Store
result = dict()
result['Decision Tree'] = cv_score.mean()
[-0.33984962  0.06616541 -0.1980695  -0.03281853 -0.36332046]
mean: -0.17357854094696193
std: 0.16824887563654517
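The negative scores are a symptom of the metric, not the model: 'r2' is a regression score, so it penalizes a classifier's 0/1 predictions heavily. For classifiers, 'accuracy' (the cross_val_score default) is the usual choice; a sketch of the same validation with it:

# Same model, scored with accuracy instead of r2
cv_acc = cross_val_score(model, x_train, y_train, cv = 5, scoring = 'accuracy')
print(cv_acc)
print('mean:', cv_acc.mean())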
In [198]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Declare the model
model = KNeighborsClassifier()

# Validate (cv defaults to 5 folds)
cv_score = cross_val_score(model, x_train, y_train, scoring = 'r2')

# Store
result['KNN'] = cv_score.mean()
In [204]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Declare the model
model = LogisticRegression()

# Validate
cv_score = cross_val_score(model, x_train, y_train, scoring = 'r2')

# Store
result['Logistic Regression'] = cv_score.mean()
In [206]:
result
Out[206]:
{'Decision Tree': -0.17357854094696193,
 'KNN': -0.11673643568380405,
 'Logistic Regression': 0.056492582808372306}
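Since every model was validated with the same scorer, the dictionary can be reduced to a winner in one line; a minimal sketch:

# Pick the model with the highest mean CV score
best = max(result, key=result.get)
print(best, result[best])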

2-1. K-Fold Cross Validation

In [211]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
In [213]:
data.tail()
Out[213]:
          id  COLLEGE  INCOME  OVERAGE  LEFTOVER   HOUSE  HANDSET_PRICE  OVER_15MINS_CALLS_PER_MONTH  AVERAGE_CALL_DURATION REPORTED_SATISFACTION REPORTED_USAGE_LEVEL CONSIDERING_CHANGE_OF_PLAN  CHURN
19995  19996        0  153252        0        23  368403            597                            1                      6              very_sat               little   actively_looking_into_it   STAY
19996  19997        1  107126       71        82  237397            609                            5                      2              very_sat          very_little                         no   STAY
19997  19998        0   78529        0        66  172589            275                            0                      2                 unsat            very_high                considering  LEAVE
19998  19999        0   78674       47        41  572406            288                            4                      2            very_unsat               little                considering  LEAVE
19999  20000        0  124697        0         0  845575            808                           24                     14                 unsat                 high   actively_looking_into_it  LEAVE
In [ ]:
data.drop(columns = 'id', inplace = True)

from sklearn.model_selection import train_test_split
x = data.drop(columns = 'CHURN')
y = data['CHURN']

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3, random_state = 1)
In [228]:
from sklearn.preprocessing import MinMaxScaler

# Normalization (min-max scaling)
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
In [242]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Declare the model
model = KNeighborsClassifier()

# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 10)

print(cv_score)
[0.81481481 0.74074074 0.77777778 0.81481481 0.64814815 0.74074074
 0.66666667 0.83018868 0.77358491 0.73584906]
In [243]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Declare the model
model = DecisionTreeClassifier()

# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 10)

print(cv_score)
[0.66666667 0.83333333 0.72222222 0.81481481 0.83333333 0.64814815
 0.72222222 0.79245283 0.77358491 0.69811321]
In [245]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Declare the model
model = LogisticRegression()

# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 10)

print(cv_score)
[0.72222222 0.83333333 0.77777778 0.77777778 0.81481481 0.72222222
 0.75925926 0.75471698 0.71698113 0.81132075]
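When comparing the three classifiers, the mean and spread of the fold scores matter more than the raw lists. A minimal sketch for the Logistic Regression scores above (the same pattern applies to the KNN and Decision Tree results):

# Summarize the 10 fold scores
print('mean:', cv_score.mean().round(4))
print('std :', cv_score.std().round(4))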

3. Decision Tree with Grid Search

In [262]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/boston.csv'
data = pd.read_csv(path)
In [264]:
data.tail()
Out[264]:
        crim   zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio   black  lstat  medv
501  0.06263  0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273     21.0  391.99   9.67  22.4
502  0.04527  0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273     21.0  396.90   9.08  20.6
503  0.06076  0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273     21.0  396.90   5.64  23.9
504  0.10959  0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273     21.0  393.45   6.48  22.0
505  0.04741  0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273     21.0  396.90   7.88  11.9
In [266]:
x = data.drop(columns='medv')
y = data['medv']

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [274]:
# Declare the parameter grid
param = {'max_depth' : range(1, 51)}
In [288]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

# Declare the base model
model_dt = DecisionTreeRegressor(random_state = 1)

# Wrap it in a grid search over the parameter grid
model = GridSearchCV(model_dt,
                     param,
                     cv = 5,
                     scoring = 'r2')    # R2 is also the default score for regressors

# Fit: runs the search, then refits the best model on the full training set
model.fit(x_train, y_train)
Out[288]:
GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=1),
             param_grid={'max_depth': range(1, 51)}, scoring='r2')
In [291]:
# Check the key results of the search
print('=' * 80)
print(model.cv_results_['mean_test_score'])
print('-' * 80)
print('best params:', model.best_params_)
print('-' * 80)
print('best score:', model.best_score_)
print('=' * 80)
================================================================================
[0.37077174 0.57894062 0.67646772 0.72240391 0.7383174  0.74748839
 0.70622958 0.70782307 0.70520623 0.70563236 0.71278394 0.7003743
 0.71332444 0.70905766 0.70526236 0.71250015 0.70928798 0.71090334
 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
 0.71284838 0.71284838]
--------------------------------------------------------------------------------
best params: {'max_depth': 6}
--------------------------------------------------------------------------------
best score: 0.7474883885080482
================================================================================
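The scores are identical from depth 19 onward, presumably because the tree is fully grown by then and larger max_depth values no longer change it. Plotting mean score against depth makes the peak at 6 easy to see; a sketch:

# Mean CV R2 as a function of max_depth
plt.figure(figsize=(8, 4))
plt.plot(param['max_depth'], model.cv_results_['mean_test_score'], marker='o')
plt.axvline(model.best_params_['max_depth'], color='r', linestyle='--')
plt.xlabel('max_depth')
plt.ylabel('mean CV R2')
plt.show()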
In [294]:
# Feature importances of the best tree
plt.figure(figsize=(5, 5))
plt.barh(y=list(x), width=model.best_estimator_.feature_importances_)
plt.show()
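After the search, GridSearchCV refits the best estimator on the full training set (refit=True by default), so model can score held-out data directly; a minimal sketch of a final test-set check:

# Evaluate the refit best estimator on the test set
y_pred = model.predict(x_test)
print('test R2:', r2_score(y_test, y_pred))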
