241017
In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format='retina'
1. Logistic Regression
In [131]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/diabetes.csv'
data = pd.read_csv(path)
In [132]:
data.tail()
Out[132]:
    | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
In [133]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [134]:
data.corr(numeric_only = True)
Out[134]:
                         | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
Pregnancies              | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | 0.221898 |
Glucose                  | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.466581 |
BloodPressure            | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.065068 |
SkinThickness            | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 0.074752 |
Insulin                  | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.130548 |
BMI                      | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.292695 |
DiabetesPedigreeFunction | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.173844 |
Age                      | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | 0.238356 |
Outcome                  | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 1.000000 |
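Since seaborn is already imported at the top, the same correlation matrix can also be viewed as a heatmap; a minimal sketch, not part of the original cells:
In [ ]:
# Sketch: visualize the correlation matrix with the already-imported seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(data.corr(numeric_only=True), annot=True, fmt='.2f', cmap='Blues')
plt.show()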
In [135]:
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'Outcome')
y = data['Outcome']
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y,
train_size = 0.7,
random_state = 1)
In [136]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[132 18]
[ 36 45]]
precision recall f1-score support
0 0.79 0.88 0.83 150
1 0.71 0.56 0.62 81
accuracy 0.77 231
macro avg 0.75 0.72 0.73 231
weighted avg 0.76 0.77 0.76 231
In [137]:
# Regression coefficients
print(model.coef_)      # coefficients
print(model.intercept_) # intercept
[[ 1.50114669e-01 3.31625358e-02 -1.64284589e-02 -1.29767759e-03
1.26907089e-04 9.45353144e-02 9.45280772e-01 1.66363363e-02]]
[-8.3819741]
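As a side note (not in the original notebook), exponentiating each coefficient gives an odds ratio: the multiplicative change in the odds of Outcome=1 for a one-unit increase in that feature. A minimal sketch using the fitted model above:
In [ ]:
# Sketch: convert log-odds coefficients to odds ratios
odds_ratios = pd.Series(np.exp(model.coef_.ravel()), index=x.columns)
print(odds_ratios.round(3))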
In [138]:
# z values (decision function)
z = model.decision_function(x_test)
z[:10]
Out[138]:
array([-1.38003159, 1.38522403, -0.74469448, -3.1947857 , -1.1868971 ,
-1.16430342, 2.76414362, -0.31485228, -0.72954185, -2.02736112])
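decision_function is just the linear part of the model, x·coef + intercept. A quick sketch (added here, not in the original) that reproduces z from the coefficients printed above:
In [ ]:
# Sketch: z equals the linear combination of features, coefficients, and intercept
z_manual = x_test.values @ model.coef_.ravel() + model.intercept_[0]
print(np.allclose(z_manual, z))  # expected: True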
In [139]:
# Apply the logistic (sigmoid) function: maps z to a probability in (0, 1)
from scipy.special import expit
expit(z)[:10].round(3)
Out[139]:
array([0.201, 0.8 , 0.322, 0.039, 0.234, 0.238, 0.941, 0.422, 0.325,
0.116])
In [140]:
y_pred[:10]
Out[140]:
array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)
In [141]:
# Check predicted probabilities
p = model.predict_proba(x_test)
p1 = p[:, 1] # probability of class 1
print(p1[:10].round(3))
[0.201 0.8 0.322 0.039 0.234 0.238 0.941 0.422 0.325 0.116]
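expit is the logistic sigmoid 1 / (1 + e^(-z)), and for a binary LogisticRegression it reproduces the class-1 column of predict_proba. A small verification sketch, not in the original:
In [ ]:
# Sketch: sigmoid(z) == expit(z) == predict_proba[:, 1]
sigmoid_manual = 1 / (1 + np.exp(-z))
print(np.allclose(sigmoid_manual, expit(z)))  # expected: True
print(np.allclose(expit(z), p1))              # expected: True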
In [142]:
# New predictions using a 0.5 threshold
y_pred2 = [1 if p > 0.5 else 0 for p in p1]
print(y_pred[:10])  # predictions from logistic regression .predict()
print(y_pred2[:10]) # predictions from the 0.5 threshold
[0 1 0 0 0 0 1 0 0 0]
[0, 1, 0, 0, 0, 0, 1, 0, 0, 0]
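Because these predictions come from thresholding p1, the cutoff can be moved away from 0.5 to trade precision against recall on class 1. A sketch with an arbitrary 0.3 threshold, not part of the original cells:
In [ ]:
# Sketch: a lower threshold flags more positives (higher recall, lower precision)
from sklearn.metrics import classification_report
y_pred_03 = (p1 > 0.3).astype(int)
print(classification_report(y_test, y_pred_03))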
1-1. Logistic Regression
In [144]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/iris.csv'
data = pd.read_csv(path)
In [145]:
data.tail()
Out[145]:
    | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
In [146]:
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'Species')
y = data['Species']
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y,
test_size = 0.3,
random_state = 1)
In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[15 0 0]
[ 0 15 0]
[ 0 1 14]]
precision recall f1-score support
setosa 1.00 1.00 1.00 15
versicolor 0.94 1.00 0.97 15
virginica 1.00 0.93 0.97 15
accuracy 0.98 45
macro avg 0.98 0.98 0.98 45
weighted avg 0.98 0.98 0.98 45
In [148]:
# Logistic regression coefficients
print(model.coef_.round(3)) # one row of coefficients per species: 3 rows
print()
print(model.intercept_.round(3)) # 3 intercepts, one per class
[[-0.481 0.779 -2.291 -0.922]
[ 0.152 -0.217 -0.078 -0.694]
[ 0.329 -0.562 2.37 1.615]]
[ 9.792 3.112 -12.904]
In [149]:
# z values (decision function)
z = model.decision_function(x_test)
z[:10]
Out[149]:
array([[-5.0357003 , 1.29709011, 3.7386102 ],
[ 5.9680559 , 2.67427064, -8.64232653],
[ 6.74302654, 2.7542294 , -9.49725593],
[-2.49603916, 1.78172178, 0.71431738],
[-2.41853989, 2.1470536 , 0.27148629],
[-1.32507382, 2.21522906, -0.89015524],
[-6.43274013, 1.88387669, 4.54886344],
[-3.17046422, 2.18284448, 0.98761974],
[-5.07915288, 1.7646631 , 3.31448978],
[ 6.88465471, 2.70554244, -9.59019715]])
In [150]:
# Softmax function (multi-class classification)
from scipy.special import softmax
print(softmax(z, axis = 1)[:10].round(3))
print(y_pred[:10])
[[0. 0.08 0.92 ]
[0.964 0.036 0. ]
[0.982 0.018 0. ]
[0.01 0.736 0.253]
[0.009 0.859 0.132]
[0.027 0.931 0.042]
[0. 0.065 0.935]
[0.004 0.765 0.231]
[0. 0.175 0.825]
[0.985 0.015 0. ]]
['virginica' 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor'
'virginica' 'versicolor' 'virginica' 'setosa']
In [151]:
# Check predicted probabilities
model.predict_proba(x_test)[:10].round(3)
Out[151]:
array([[0. , 0.08 , 0.92 ],
[0.964, 0.036, 0. ],
[0.982, 0.018, 0. ],
[0.01 , 0.736, 0.253],
[0.009, 0.859, 0.132],
[0.027, 0.931, 0.042],
[0. , 0.065, 0.935],
[0.004, 0.765, 0.231],
[0. , 0.175, 0.825],
[0.985, 0.015, 0. ]])
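For the multi-class model, predict_proba is the softmax of decision_function, and predict simply picks the class with the highest probability in each row. A verification sketch, added here and not in the original:
In [ ]:
# Sketch: proba == softmax(z) and predict == argmax of proba
proba = model.predict_proba(x_test)
print(np.allclose(proba, softmax(z, axis=1)))                  # expected: True
print((model.classes_[proba.argmax(axis=1)] == y_pred).all())  # expected: True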
In [ ]:
2. K-Fold Cross Validation
In [153]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/diabetes.csv'
data = pd.read_csv(path)
In [154]:
data.tail()
Out[154]:
    | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
In [155]:
from sklearn.model_selection import train_test_split
x = data.drop(columns='Outcome')
y = data.loc[:, 'Outcome']
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size=0.3, random_state=1)
In [156]:
# Normalization (min-max scaling)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
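The cross-validation cells below use the raw x_train rather than x_train_s. If scaled features are wanted inside cross-validation, a Pipeline keeps the scaler fitted per fold and avoids leaking validation-fold statistics; a minimal sketch (assuming KNN as the estimator, not part of the original cells):
In [ ]:
# Sketch: scale inside each CV fold via a pipeline to avoid data leakage
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
print(cross_val_score(pipe, x_train, y_train, cv=5).mean())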
In [192]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Declare the model
model = DecisionTreeClassifier(max_depth = 5, random_state = 1)
# Validate (note: 'r2' is a regression metric; accuracy is more typical for classifiers)
cv_score = cross_val_score(model, x_train, y_train, cv = 5, scoring = 'r2')
# Check
print(cv_score)
print('Mean:', cv_score.mean())
print('Std:', cv_score.std())
# Store
result = dict()
result['Decision Tree'] = cv_score.mean()
[-0.33984962 0.06616541 -0.1980695 -0.03281853 -0.36332046]
Mean: -0.17357854094696193
Std: 0.16824887563654517
In [198]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Declare the model
model = KNeighborsClassifier()
# Validate
cv_score = cross_val_score(model, x_train, y_train, scoring = 'r2')
# Store
result['KNN'] = cv_score.mean()
In [204]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Declare the model
model = LogisticRegression()
# Validate
cv_score = cross_val_score(model, x_train, y_train, scoring = 'r2')
# Store
result['Logistic Regression'] = cv_score.mean()
In [206]:
result
Out[206]:
{'Decision Tree': -0.17357854094696193,
'KNN': -0.11673643568380405,
'Logistic Regression': 0.056492582808372306}
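The values above come from applying the r2 regression metric to classifiers, which is why they can be negative. For classification, accuracy (the default scoring) is the usual metric; a sketch of the same pattern, not in the original (max_iter raised only to avoid convergence warnings):
In [ ]:
# Sketch: cross-validate a classifier with accuracy instead of r2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

model = LogisticRegression(max_iter=1000)
cv_acc = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
print(cv_acc.mean().round(3), cv_acc.std().round(3))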
2-1. K-Fold Cross Validation
In [211]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/mobile_cust_churn.csv'
data = pd.read_csv(path)
In [213]:
data.tail()
Out[213]:
      | id | COLLEGE | INCOME | OVERAGE | LEFTOVER | HOUSE | HANDSET_PRICE | OVER_15MINS_CALLS_PER_MONTH | AVERAGE_CALL_DURATION | REPORTED_SATISFACTION | REPORTED_USAGE_LEVEL | CONSIDERING_CHANGE_OF_PLAN | CHURN |
19995 | 19996 | 0 | 153252 | 0 | 23 | 368403 | 597 | 1 | 6 | very_sat | little | actively_looking_into_it | STAY |
19996 | 19997 | 1 | 107126 | 71 | 82 | 237397 | 609 | 5 | 2 | very_sat | very_little | no | STAY |
19997 | 19998 | 0 | 78529 | 0 | 66 | 172589 | 275 | 0 | 2 | unsat | very_high | considering | LEAVE |
19998 | 19999 | 0 | 78674 | 47 | 41 | 572406 | 288 | 4 | 2 | very_unsat | little | considering | LEAVE |
19999 | 20000 | 0 | 124697 | 0 | 0 | 845575 | 808 | 24 | 14 | unsat | high | actively_looking_into_it | LEAVE |
In [ ]:
data.drop(columns = 'id', inplace = True)
from sklearn.model_selection import train_test_split
x = data.drop(columns = 'CHURN')
y = data['CHURN']
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3, random_state = 1)
In [228]:
from sklearn.preprocessing import MinMaxScaler
# Normalization (min-max scaling)
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)
In [242]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Declare the model
model = KNeighborsClassifier()
# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 10)
print(cv_score)
[0.81481481 0.74074074 0.77777778 0.81481481 0.64814815 0.74074074
0.66666667 0.83018868 0.77358491 0.73584906]
In [243]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Declare the model
model = DecisionTreeClassifier()
# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 10)
print(cv_score)
[0.66666667 0.83333333 0.72222222 0.81481481 0.83333333 0.64814815
0.72222222 0.79245283 0.77358491 0.69811321]
In [245]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Declare the model
model = LogisticRegression()
# Validate
cv_score = cross_val_score(model, x_train, y_train, cv = 10)
print(cv_score)
[0.72222222 0.83333333 0.77777778 0.77777778 0.81481481 0.72222222
0.75925926 0.75471698 0.71698113 0.81132075]
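To compare the three algorithms at a glance, the same 10-fold cross-validation can be run in one loop over the same x_train and y_train used in the cells above; a sketch, not part of the original:
In [ ]:
# Sketch: compare mean CV scores of the three algorithms side by side
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

models = {'KNN': KNeighborsClassifier(),
          'Decision Tree': DecisionTreeClassifier(random_state=1),
          'Logistic Regression': LogisticRegression(max_iter=1000)}
for name, m in models.items():
    scores = cross_val_score(m, x_train, y_train, cv=10)
    print(f'{name}: mean={scores.mean():.3f}, std={scores.std():.3f}')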
3. Decision Tree Model with Grid Search
In [262]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/boston.csv'
data = pd.read_csv(path)
In [264]:
data.tail()
Out[264]:
    | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
501 | 0.06263 | 0.0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273 | 21.0 | 391.99 | 9.67 | 22.4 |
502 | 0.04527 | 0.0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273 | 21.0 | 396.90 | 9.08 | 20.6 |
503 | 0.06076 | 0.0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273 | 21.0 | 396.90 | 5.64 | 23.9 |
504 | 0.10959 | 0.0 | 11.93 | 0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273 | 21.0 | 393.45 | 6.48 | 22.0 |
505 | 0.04741 | 0.0 | 11.93 | 0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273 | 21.0 | 396.90 | 7.88 | 11.9 |
In [266]:
x = data.drop(columns='medv')
y = data['medv']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
In [274]:
# Declare the parameter grid
param = {'max_depth' : range(1, 51)}
In [288]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import *
# Declare the base model
model_dt = DecisionTreeRegressor(random_state = 1)
# Grid search for performance estimation
model = GridSearchCV(model_dt,
param,
cv = 5,
scoring = 'r2') # r2 matches a regressor's default .score
# Train
model.fit(x_train, y_train)
In [291]:
# Check key results
print('=' * 80)
print(model.cv_results_['mean_test_score'])
print('-' * 80)
print('Best parameters:', model.best_params_)
print('-' * 80)
print('Best score:', model.best_score_)
print('=' * 80)
================================================================================
[0.37077174 0.57894062 0.67646772 0.72240391 0.7383174 0.74748839
0.70622958 0.70782307 0.70520623 0.70563236 0.71278394 0.7003743
0.71332444 0.70905766 0.70526236 0.71250015 0.70928798 0.71090334
0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
0.71284838 0.71284838 0.71284838 0.71284838 0.71284838 0.71284838
0.71284838 0.71284838]
--------------------------------------------------------------------------------
Best parameters: {'max_depth': 6}
--------------------------------------------------------------------------------
Best score: 0.7474883885080482
================================================================================
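With refit=True (the default), the fitted GridSearchCV predicts with its best estimator, so the tuned tree can be scored on the held-out test set directly; a sketch, not in the original:
In [ ]:
# Sketch: evaluate the refit best estimator on the test set
from sklearn.metrics import r2_score, mean_absolute_error

y_pred = model.predict(x_test)
print('R2 :', r2_score(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))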
In [294]:
plt.figure(figsize=(5, 5))
plt.barh(y=list(x), width=model.best_estimator_.feature_importances_)
plt.show()
[Figure: horizontal bar chart of feature importances from the best Decision Tree]
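The bars are easier to read when sorted; a small variation on the plot above, not in the original:
In [ ]:
# Sketch: sorted feature importances from the best estimator
imp = pd.Series(model.best_estimator_.feature_importances_, index=list(x))
imp.sort_values().plot(kind='barh', figsize=(5, 5))
plt.show()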
In [ ]: