bestone888 2024. 9. 29. 21:17

240927

복습

In [4]:
# 보스턴 집값 데이터
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as spst
In [6]:
boston = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/boston.csv')
boston.head()
Out[6]:
crimzninduschasnoxrmagedisradtaxptratiolstatmedv01234
0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
In [11]:
# crim(범죄율) -> medv(집값) 시각화, 수치화

sns.scatterplot(x = 'crim', y = 'medv', data = boston)
plt.grid()
plt.show()
In [13]:
temp = boston.loc[boston['crim'].notnull()]

spst.pearsonr(temp['crim'], temp['medv'])
Out[13]:
PearsonRResult(statistic=-0.3883046085868116, pvalue=1.1739870821943826e-19)
In [17]:
# 음의 상관관계
# mdev 50 에 몰린 이유는 설문조사의 최댓값이 50이기 때문일 것이다
In [21]:
# tax(제산세율) -> medv(집값) 시각화, 수치화
sns.scatterplot(x = 'tax', y = 'medv', data = boston)
plt.grid()
plt.show()
In [25]:
temp = boston.loc[boston['tax'].notnull()]
spst.pearsonr(temp['tax'], temp['medv'])
Out[25]:
PearsonRResult(statistic=-0.4685359335677671, pvalue=5.637733627690444e-29)
In [28]:
# 음의 상관관계
In [33]:
# tax(제산세율) -> medv(집값) 시각화, 수치화
# tax의 500 이상인 값은 제거하고 다시 수행

temp = boston.loc[boston['tax']<500]
spst.pearsonr(temp['tax'], temp['medv'])
Out[33]:
PearsonRResult(statistic=-0.2923180757786092, pvalue=1.0549678915090099e-08)
In [35]:
# tax 500 이상인 값이 상관계수 -0.46에 영향을 크게 미침
In [39]:
# lstat(하위계층비율) -> medv(집값) 시각화, 수치화
sns.scatterplot(x = 'lstat', y = 'medv', data= boston)
plt.grid()
plt.show()
In [41]:
temp = boston.loc[boston['lstat'].notnull()]
spst.pearsonr(temp['lstat'], temp['medv'])
Out[41]:
PearsonRResult(statistic=-0.7376627261740148, pvalue=5.081103394387554e-88)
In [43]:
# 강한 음의 상관관계
In [51]:
# ptratio(교사 1명당 학생 수) -> medv(집값) 시각화, 수치화
sns.scatterplot(x ='ptratio', y = 'medv', data = boston)
plt.grid()
plt.show()
In [55]:
# 범주 내에선 규칙이 없어 보이지만, 범주 간에는 있어 보임
In [57]:
# 전체 변수간 상관관계
boston.corr()
Out[57]:
crimzninduschasnoxrmagedisradtaxptratiolstatmedvcrimzninduschasnoxrmagedisradtaxptratiolstatmedv
1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 0.455621 -0.388305
-0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 -0.412995 0.360445
0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 0.603800 -0.483725
-0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 -0.053929 0.175260
0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 0.590879 -0.427321
-0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 -0.613808 0.695360
0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 0.602339 -0.376955
-0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 -0.496996 0.249929
0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 0.488676 -0.381626
0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 0.543993 -0.468536
0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 0.374044 -0.507787
0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 1.000000 -0.737663
-0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 -0.737663 1.000000
In [ ]:
 

표준오차

  • 표본평균과 모평균은 일치하지 않을 수 있음

95% 신뢰구간

  • 표본평균을 100개 추출했을 때 95개는 모평균을 포함함
In [92]:
# 임의의 모집단 만들기
import random as rd

# 평균 172, 표준편차 7인 정규분포 내에서 랜덤변수 생성
pop = [round(rd.normalvariate(172, 7),1) for i in range(800000)]

sns.histplot(pop, bins = 100)
plt.axvline(np.mean(pop), color = 'r')

# 텍스트 입력
# plt.text(x좌표, y좌표, 쓸 내용)
plt.text(np.mean(pop)+1, 20000,'pop mean: {}'.format(np.mean(pop)), color = 'r')

plt.show()
In [94]:
# 표본크기 50인 표본 100개뽑기
x_mean = []
for i in range(100):
    s1 = rd.sample(pop, 50)
    s1 = pd.Series(s1)
    x_mean.append(round(s1.mean(), 3))

x_mean
Out[94]:
[172.032,
 172.288,
 172.162,
 171.622,
 172.16,
 172.78,
 170.654,
 171.838,
 172.938,
 170.974,
 171.596,
 172.774,
 171.802,
 171.482,
 172.32,
 172.7,
 172.232,
 172.932,
 171.266,
 172.512,
 171.294,
 172.092,
 172.092,
 171.78,
 170.836,
 171.962,
 172.732,
 172.332,
 172.18,
 172.424,
 171.15,
 171.926,
 173.336,
 172.594,
 171.72,
 174.144,
 172.196,
 171.596,
 172.046,
 171.422,
 171.622,
 171.822,
 171.428,
 173.708,
 171.014,
 170.964,
 171.62,
 172.34,
 172.092,
 171.642,
 171.67,
 171.666,
 172.046,
 172.132,
 170.514,
 172.032,
 172.644,
 171.748,
 172.918,
 171.634,
 172.72,
 171.85,
 172.33,
 172.212,
 174.018,
 171.48,
 173.02,
 172.518,
 170.944,
 171.506,
 172.828,
 171.578,
 171.078,
 171.902,
 170.608,
 173.654,
 172.608,
 173.046,
 171.49,
 171.378,
 171.572,
 170.89,
 171.626,
 172.324,
 172.716,
 171.122,
 170.758,
 170.196,
 171.326,
 174.248,
 170.052,
 174.096,
 172.128,
 172.362,
 171.728,
 172.494,
 172.52,
 173.172,
 172.536,
 172.086]
In [109]:
sns.kdeplot(x_mean)
plt.axvline(np.mean(x_mean), color = 'r')
plt.text(np.mean(x_mean)+1, 0.1, 'pop mean {}'.format(round(np.mean(x_mean)),3), color = 'r')
plt.show()
In [113]:
# 모평균 != 표본평균의 평균
# but 유사함

범주 vs 숫자

In [127]:
# 타이타닉 데이터
titanic = pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.0.csv')
titanic.head()
Out[127]:
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked01234
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [131]:
sns.barplot(x = 'Survived', y = 'Age', data = titanic)
plt.grid()
plt.show()
In [135]:
sns.boxplot(x = 'Survived', y = 'Age', data= titanic)
plt.grid()
plt.show()

수치화

  • t test : spst.ttest_ind()
  • ANOVA : spst.f_oneway()

t-test : 2 범주

  • 두 평균 차이를 표준오차로 나눈 값
  • feature, target 사이의 관계 파악
  • df.notnull()로 NaN 제거
  • t값이 2보다 크거나 -2보다 작으면 차이가 있다
In [155]:
# Survived(범주) -> Age(숫자)

# NaN 값 제거
temp = titanic.loc[titanic['Age'].notnull()]

# 생존자의 Age
survived = temp.loc[temp['Survived'] == 1, 'Age']

# 사망자의 Age
died = temp.loc[temp['Survived'] == 0, 'Age']
In [157]:
# t-test
spst.ttest_ind(survived, died)
Out[157]:
TtestResult(statistic=-2.06668694625381, pvalue=0.03912465401348249, df=712.0)
In [159]:
# Sex(범주) -> Fare(숫자)

# 남성의 Fare
m = titanic.loc[titanic['Sex'] == 'male', 'Fare']

# 여성의 Fare
f = titanic.loc[titanic['Sex'] == 'female', 'Fare']

# t-test
spst.ttest_ind(m, f)
Out[159]:
TtestResult(statistic=-5.529140269385719, pvalue=4.2308678700429995e-08, df=889.0)

ANOVA : 3 범주 이상

In [164]:
# Pclass(범주) -> Age(숫자)

# NaN 제거
temp = titanic.loc[titanic['Age'].notnull()]

# 클래스1
p1 = temp.loc[temp['Pclass'] == 1, 'Age']
p2 = temp.loc[temp['Pclass'] == 2, 'Age']
p3 = temp.loc[temp['Pclass'] == 3, 'Age']

# ANOVA
spst.f_oneway(p1, p2, p3)
Out[164]:
F_onewayResult(statistic=57.443484340676214, pvalue=7.487984171959904e-24)
In [ ]: