KT AIVLE/Daily Review

241029

bestone888 2024. 10. 30. 00:36

241029

In [8]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

import joblib

from sklearn.ensemble import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import *

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

1. 지도학습

In [69]:

data01_train = pd.read_csv('data01_train.csv')
data01_train.drop(columns = 'subject', inplace = True)

In [70]:

# LGBM을 위한 열 이름 변경
data01_train.columns = data01_train.columns.str.replace(r'[^\w]', '_', regex=True)

In [71]:

# 데이터 분리
x = data01_train.drop(columns = 'Activity')
y = data01_train['Activity']
x_train, x_val, y_train, y_val = train_test_split(x, y, stratify = y, test_size = 0.3,random_state = 1)

In [75]:

# 정규화
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

In [ ]:

# 모델1: Decision Tree
# 모델2: KNN
# 모델3: Random Forest
# 모델4: Logistic Regression
# 모델5: LightGBM
# 모델6: XGBoost

In [38]:

# 모델1: Decision Tree
params = {'max_depth': range(3,30,5)}
model1 = GridSearchCV(DecisionTreeClassifier(random_state = 1),
                                                    params, 
                                                    cv = 5,
                                                    scoring = 'accuracy')

model1.fit(x_train, y_train)
print(model1.best_params_)
print(model1.best_score_)

{'max_depth': 8}
0.930756231641284

In [40]:

# 모델1 저장
joblib.dump(model1.best_estimator_, 'model1.pkl')

Out[40]:

['model1.pkl']

In [44]:

# 모델2: KNN
params = {'n_neighbors': range(10, 301, 10)}
model2 = GridSearchCV(KNeighborsClassifier(),
                     params,
                     cv = 5,
                     scoring = 'accuracy')
model2.fit(x_train, y_train)
print(model2.best_params_)
print(model2.best_score_)

{'n_neighbors': 10}
0.9494673760454884

In [46]:

# 모델2 저장
joblib.dump(model2.best_estimator_, 'model2.pkl')

Out[46]:

['model2.pkl']

In [48]:

# 모델3: Random Forest
params = {'max_depth': range(10, 200, 20)}
model3 = GridSearchCV(RandomForestClassifier(random_state = 1),
                     params,
                     cv = 5, 
                     scoring = 'accuracy')

model3.fit(x_train, y_train)
print(model3.best_estimator_)
print(model3.best_score_)

RandomForestClassifier(max_depth=30, random_state=1)
0.9727901709351296

In [50]:

# 모델3 저장
joblib.dump(model3.best_estimator_, 'model3.pkl')

Out[50]:

['model3.pkl']

In [77]:

# 모델4: Logistic Regression
model4 = LogisticRegression()
model4.fit(x_train, y_train)
y_pred = model4.predict(x_val)

print(accuracy_score(y_pred, y_val))
print(confusion_matrix(y_pred, y_val))
print(classification_report(y_pred, y_val))

0.9790368271954675
[[335   2   0   0   0   0]
 [  0 292  15   0   0   0]
 [  0  15 311   0   0   0]
 [  0   0   0 298   0   1]
 [  0   0   0   1 237   1]
 [  0   1   0   1   0 255]]
                    precision    recall  f1-score   support

            LAYING       1.00      0.99      1.00       337
           SITTING       0.94      0.95      0.95       307
          STANDING       0.95      0.95      0.95       326
           WALKING       0.99      1.00      0.99       299
WALKING_DOWNSTAIRS       1.00      0.99      1.00       239
  WALKING_UPSTAIRS       0.99      0.99      0.99       257

          accuracy                           0.98      1765
         macro avg       0.98      0.98      0.98      1765
      weighted avg       0.98      0.98      0.98      1765

In [83]:

# 모델4 저장
joblib.dump(model4, 'model4.pkl')

Out[83]:

['model4.pkl']

In [79]:

# 모델5: LightGBM
params = {'n_estimators': range(50, 201, 50)}
model5 = GridSearchCV(LGBMClassifier(verbose=-1), params, cv = 5)

model5.fit(x_train, y_train)
print(model5.best_params_)
print(model5.best_score_)

{'n_estimators': 150}
0.9866375089950337

In [85]:

# 모델5 저장
joblib.dump(model5.best_estimator_, 'model5.pkl')

Out[85]:

['model5.pkl']

In [93]:

# 모델6: XGBoost

# XGBoost는 숫자형 데이터만 처리 가능
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_label = le.fit_transform(y_train)
y_val_label = le.transform(y_val)


params = {'n_estimators': range(50, 201, 50)}
model6 = GridSearchCV(XGBClassifier(verbose = -1),
                     params,
                     cv = 5,
                     scoring = 'accuracy')

model6.fit(x_train, y_train_label)
print(model6.best_params_)
print(model6.best_score_)

{'n_estimators': 100}
0.9832365015512746

In [95]:

# 모델6 저장
joblib.dump(model6.best_estimator_, 'model6.pkl')

Out[95]:

['model6.pkl']

In [ ]:

# 정리
# 모델1: Decision Tree
# 모델2: KNN
# 모델3: Random Forest
# 모델4: Logistic Regression
# 모델5: LightGBM
# 모델6: XGBoost

# 모델 1~5 사용해서 soft voting

In [ ]:

# 앙상블 파이프라인

In [106]:

def pipeline1(filename, target):
    df = pd.read_csv(filename)
    # LGBM을 위한 열 이름 전처리 (json 지원하지 않는 특수 문자 변경)
    df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)

    # x,y 분리
    x = df.drop(columns = target)
    y = df[target]
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, test_size = 0.3,random_state = 1)
    
    # 정규화
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    model1 = joblib.load('model1.pkl')
    model2 = joblib.load('model2.pkl')
    model3 = joblib.load('model3.pkl')
    model4 = joblib.load('model4.pkl')
    model5 = joblib.load('model5.pkl')

    # 보팅 모델 선언
    estimators = [('DT', model1), 
                  ('KNN', model2), 
                  ('RF', model3), 
                  ('LR', model4), 
                  ('LGBM', model5)]
    
    model_voting = VotingClassifier(estimators=estimators, voting = 'soft')
    
    # 학습하기
    model_voting.fit(x_train, y_train)
    
    # 예측하기
    y_pred = model_voting.predict(x_test)

    acc_score = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    joblib.dump(model_voting,'model_pipeline1.pkl')
    
    return acc_score, conf_matrix, class_report

In [112]:

accuracy, confusion, report = pipeline1('data01_test.csv', 'Activity')
print(accuracy)
print(confusion)
print(report)

0.9570135746606335
[[88  0  0  0  0  0]
 [ 3 62 11  0  0  0]
 [ 0  0 86  0  0  0]
 [ 0  0  0 67  1  0]
 [ 0  0  0  2 57  0]
 [ 0  0  0  0  2 63]]
                    precision    recall  f1-score   support

            LAYING       0.97      1.00      0.98        88
           SITTING       1.00      0.82      0.90        76
          STANDING       0.89      1.00      0.94        86
           WALKING       0.97      0.99      0.98        68
WALKING_DOWNSTAIRS       0.95      0.97      0.96        59
  WALKING_UPSTAIRS       1.00      0.97      0.98        65

          accuracy                           0.96       442
         macro avg       0.96      0.96      0.96       442
      weighted avg       0.96      0.96      0.96       442

In [ ]:

2. 비지도 학습

In [17]:

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

In [18]:

# standard scaling 된 데이터 data_sc.csv 불러오기
data = pd.read_csv('data_sc.csv')
data.tail()

Out[18]:

AGE고용상태Willingness to pay/Stay상품타입교육수준소득월 납입액타 상품 보유 현황총지불금액거주지사이즈자동차1199511996119971199811999

-1.853401	0.772120	2.224522	-0.313685	-0.366062	1.071545	-0.708232	-0.247307	-0.809917	-0.340235	-0.318628
-0.070427	0.772120	-0.703830	-0.313685	-0.366062	-0.547511	-0.472671	-1.078127	-0.188373	-0.340235	-0.318628
-0.070427	-1.295136	0.025735	-0.313685	-0.366062	-1.242413	-0.237111	-0.247307	1.230306	-0.340235	-0.318628
0.821059	0.772120	-0.066542	-0.313685	-0.366062	-0.536698	-0.001551	0.583512	0.887482	2.939142	-0.318628
-0.070427	-1.295136	-0.774479	-0.313685	-0.366062	-1.242413	-0.472671	-1.078127	-0.221820	-0.340235	-0.318628

In [26]:

# 몇개로 군집화 할지 시각화로 판단
kvalues = range(3, 20)
inertias = list()

for k in kvalues:
    model = KMeans(n_clusters = k, n_init = 'auto', random_state =1)
    model.fit(data)
    inertias.append(model.inertia_)

plt.plot(kvalues, inertias, marker = 'o')
plt.show()

In [38]:

# Elbow Method 활용 k 결정
model = KMeans(random_state = 1, n_init = 'auto')
Elbow_M = KElbowVisualizer(model,k=(2,21))
Elbow_M.fit(data)
Elbow_M.show()

Out[38]:

<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

In [41]:

#???

In [51]:

# 데이터 불러오기
data0 =pd.read_csv('customers_seg.csv')
data0.drop(columns = ['CID'], inplace  =True)

In [53]:

# 6개의 cluster
model_B =KMeans(n_clusters = 6, n_init = 'auto')

model_B.fit(data)

cluster = model_B.predict(data)
cluster

Out[53]:

array([1, 2, 1, ..., 1, 4, 1])

In [59]:

# cluster를 데이터프레임으로 만들고 붙이기
# result에는 cluster가 포함
cluster = pd.DataFrame(cluster, columns = ['cluster'])
result = pd.concat([data0, cluster], axis = 1)
result['cluster'] = pd.Categorical(result['cluster'] )

result.to_csv('result_my.csv', index = False)

In [67]:

# AGE 열 파악하기
feature = 'AGE'

df = pd.crosstab([result[feature]], result['cluster'], margins=True)

# pro_df 확인
display(df)

# 시각화
from statsmodels.graphics.mosaicplot import mosaic
mosaic(result.sort_values('cluster'), ['cluster', feature])
plt.show()

cluster012345AllAGE123456All

43	0	823	87	461	40	1454
38	621	421	106	273	79	1538
154	2623	524	330	753	391	4775
206	355	773	212	1314	264	3124
71	0	177	169	447	193	1057
2	0	1	11	11	27	52
514	3599	2719	915	3259	994	12000

In [ ]:

'KT AIVLE > Daily Review' 카테고리의 다른 글

241031 (0)	2024.10.31
241030 (0)	2024.10.30
241021 ~ 241022 (0)	2024.10.28
241018 (0)	2024.10.20
241017 (0)	2024.10.17

현재글241029

bestone888 님의 블로그

Herzlich willkommen

Today :
Yesterday :

bestone888 님의 블로그

241029

241029

1. 지도학습

2. 비지도 학습

'KT AIVLE > Daily Review' 카테고리의 다른 글

'KT AIVLE/Daily Review'의 다른글

티스토리툴바

« 2025/05 »
일	월	화	수	목	금	토
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

241029

241029

1. 지도학습

2. 비지도 학습

'KT AIVLE > Daily Review' 카테고리의 다른 글

'KT AIVLE/Daily Review'의 다른글

관련글

티스토리툴바