bestone888 2024. 9. 26. 23:24

240926

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [10]:
# Load the simplified Titanic passenger data from GitHub and preview the first five rows.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic_simple.csv'
titanic = pd.read_csv(filepath_or_buffer = path)
titanic.head(n = 5)
Out[10]:
| | PassengerId | Survived | Pclass | Name | Sex | Age | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 7.2500 | Southampton |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 71.2833 | Cherbourg |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 7.9250 | Southampton |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 53.1000 | Southampton |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 8.0500 | Southampton |
In [13]:
# Load the air-quality data (air2.csv) from GitHub and preview the first five rows.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/air2.csv'
air = pd.read_csv(filepath_or_buffer = path)
air.head(n = 5)
Out[13]:
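| | Ozone | Solar.R | Wind | Temp | Date |
|---|---|---|---|---|---|
| 0 | 41 | 190.0 | 7.4 | 67 | 1973-05-01 |
| 1 | 36 | 118.0 | 8.0 | 72 | 1973-05-02 |
| 2 | 12 | 149.0 | 12.6 | 74 | 1973-05-03 |
| 3 | 18 | 313.0 | 11.5 | 62 | 1973-05-04 |
| 4 | 19 | NaN | 14.3 | 56 | 1973-05-05 |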

단변량 범주형 변수

1. 수치화: 기초통계량

  • value_counts()
  • value_counts(normalize = True) #비율
In [18]:
# Absolute frequency of each passenger class, most frequent first.
titanic.loc[:, 'Pclass'].value_counts(normalize = False)
Out[18]:
Pclass
3    491
1    216
2    184
Name: count, dtype: int64
In [20]:
# Relative frequency of each passenger class (fractions that sum to 1).
titanic.loc[:, 'Pclass'].value_counts(normalize = True)
Out[20]:
Pclass
3    0.551066
1    0.242424
2    0.206510
Name: proportion, dtype: float64

2. 시각화

  • bar chart
  • pie chart
In [33]:
# sns.countplot(): bar chart of how often each category occurs

# Horizontal bars: map the categorical column to the y-axis.
sns.countplot(data = titanic, y = 'Pclass')
plt.show()
In [35]:
# Vertical bars: map the categorical column to the x-axis.
sns.countplot(data = titanic, x = 'Pclass')
plt.show()
In [45]:
# plt.pie(): pie chart of the passenger-class shares

# value_counts() supplies both the wedge sizes (.values) and the wedge labels (.index).
temp = titanic['Pclass'].value_counts()
plt.pie(
    x = temp.values,
    labels = temp.index,
    autopct = '%.2f%%',   # print each wedge's percentage with two decimals
)
plt.show()
 
In [ ]:
 

이변량 숫자 vs 숫자

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [61]:
# Rebind `titanic` to the titanic.0.csv version of the data and preview the first five rows.
titanic = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.0.csv'
)
titanic.head(n = 5)
Out[61]:
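| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |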
In [75]:
# Reload the air-quality data, parse 'Date' into datetimes, and derive
# separate Year / Month / Day columns from it.
air = (
    pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/air2.csv')
    .assign(Date = lambda frame: pd.to_datetime(frame['Date']))
    .assign(
        Year = lambda frame: frame['Date'].dt.year,
        Month = lambda frame: frame['Date'].dt.month,
        Day = lambda frame: frame['Date'].dt.day,
    )
)
In [91]:
# Keep only the measurement columns used in the pairwise analysis below.
air2 = air.loc[:, ['Ozone', 'Solar.R', 'Wind', 'Temp']]
air2.head(n = 5)
Out[91]:
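| | Ozone | Solar.R | Wind | Temp |
|---|---|---|---|---|
| 0 | 41 | 190.0 | 7.4 | 67 |
| 1 | 36 | 118.0 | 8.0 | 72 |
| 2 | 12 | 149.0 | 12.6 | 74 |
| 3 | 18 | 313.0 | 11.5 | 62 |
| 4 | 19 | NaN | 14.3 | 56 |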

1. 시각화: 산점도

  • plt.scatter(x = x축값, y= y축값)
  • sns.pairplot(df, kind = 'reg') : 한 번에 그리기
In [99]:
# plt.scatter(x, y): Temp on the horizontal axis, Ozone on the vertical axis
plt.scatter(air['Temp'], air['Ozone'])
plt.show()
In [105]:
# Visualize how Temp, Wind, and Solar.R each relate to Ozone, one panel per feature.
# The third panel previously re-plotted Wind; it now shows Solar.R as the heading promises.
plt.figure(figsize = (14,5))
for position, feature in enumerate(['Temp', 'Wind', 'Solar.R'], start = 1):
    plt.subplot(1, 3, position)
    plt.scatter(x = air[feature], y = air['Ozone'])
    # Label both axes so the panels can be told apart.
    plt.xlabel(feature)
    plt.ylabel('Ozone')

plt.show()
In [109]:
# Draw every pairwise scatter plot of air2's columns in a single sns.pairplot() call.
sns.pairplot(data = air2)
plt.show()
In [111]:
# kind = 'reg' overlays a fitted regression line on each pairwise scatter plot.
# (The argument was missing, so this cell merely repeated the plain pairplot.)
sns.pairplot(air2, kind = 'reg')
plt.show()

2. 수치화: 상관분석

  • 상관계수
  • p-value
In [115]:
import scipy.stats as spst
In [119]:
# spst.pearsonr() returns the Pearson correlation coefficient and its p-value.
# It must not be given NaN values, so rows where either column is missing are
# dropped first; when nothing is missing the result is unchanged.
temp_ozone = air[['Temp', 'Ozone']].dropna()
spst.pearsonr(temp_ozone['Temp'], temp_ozone['Ozone'])
Out[119]:
PearsonRResult(statistic=0.6833717861490114, pvalue=2.197769800200284e-22)
In [124]:
# Pairwise Pearson correlation coefficients between the columns of the DataFrame.
air2.corr(method = 'pearson')
Out[124]:
| | Ozone | Solar.R | Wind | Temp |
|---|---|---|---|---|
| Ozone | 1.000000 | 0.280068 | -0.605478 | 0.683372 |
| Solar.R | 0.280068 | 1.000000 | -0.056792 | 0.275840 |
| Wind | -0.605478 | -0.056792 | 1.000000 | -0.457988 |
| Temp | 0.683372 | 0.275840 | -0.457988 | 1.000000 |
In [133]:
## Heatmap of the correlation matrix
sns.heatmap(
    data = air2.corr(),
    vmin = -1, vmax = 1,     # pin the color scale to the full correlation range
    cmap = 'RdYlBu_r',       # color map
    annot = True,            # write the correlation coefficient in each cell
    fmt = '.3f',             # number format: three decimal places
)
plt.show()