KT AIVLE/Daily Review
240926
bestone888
2024. 9. 26. 23:24
240926
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [10]:
# Read the simplified Titanic passenger table straight from its GitHub URL.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic_simple.csv'
titanic = pd.read_csv(filepath_or_buffer = path)
# Preview the first five rows.
titanic.head(n = 5)
Out[10]:
 | PassengerId | Survived | Pclass | Name | Sex | Age | Fare | Embarked |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 7.2500 | Southampton |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 71.2833 | Cherbourg |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 7.9250 | Southampton |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 53.1000 | Southampton |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 8.0500 | Southampton |
In [13]:
# Read the air-quality table straight from its GitHub URL.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/air2.csv'
air = pd.read_csv(filepath_or_buffer = path)
# Preview the first five rows.
air.head(n = 5)
Out[13]:
 | Ozone | Solar.R | Wind | Temp | Date |
0 | 41 | 190.0 | 7.4 | 67 | 1973-05-01 |
1 | 36 | 118.0 | 8.0 | 72 | 1973-05-02 |
2 | 12 | 149.0 | 12.6 | 74 | 1973-05-03 |
3 | 18 | 313.0 | 11.5 | 62 | 1973-05-04 |
4 | 19 | NaN | 14.3 | 56 | 1973-05-05 |
단변량 범주형 변수
1. 수치화: 기초통계량
- value_counts()
- value_counts(normalize = True) #비율
In [18]:
# Absolute number of passengers in each class.
titanic.loc[:, 'Pclass'].value_counts(normalize = False)
Out[18]:
Pclass
3 491
1 216
2 184
Name: count, dtype: int64
In [20]:
# Share of passengers in each class (the proportions sum to 1).
titanic.loc[:, 'Pclass'].value_counts(normalize = True)
Out[20]:
Pclass
3 0.551066
1 0.242424
2 0.206510
Name: proportion, dtype: float64
2. 시각화
- bar chart
- pie chart
In [33]:
# sns.countplot()
# Horizontal bars: hand the categorical column to `y`.
sns.countplot(data = titanic, y = 'Pclass')
plt.show()

In [35]:
# Vertical bars: hand the categorical column to `x`.
sns.countplot(data = titanic, x = 'Pclass')
plt.show()

In [45]:
# plt.pie()
# Per-class counts: the values size the wedges, the index labels them.
temp = titanic.loc[:, 'Pclass'].value_counts()
plt.pie(
    x = temp.values,
    labels = temp.index,
    autopct = '%.2f%%',   # print each wedge's share with two decimals
)
plt.show()
In [ ]:
이변량 숫자 vs 숫자
In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [61]:
# Rebind `titanic` to the full passenger table (adds SibSp, Parch, Ticket, Cabin).
titanic = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.0.csv'
)
# Preview the first five rows.
titanic.head(n = 5)
Out[61]:
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [75]:
# Reload the air-quality table, parse `Date` into datetimes, then derive
# calendar columns from it. Later keywords in `assign` can see earlier ones,
# so Year/Month/Day read the already-converted `Date`.
air = (
    pd.read_csv('https://raw.githubusercontent.com/DA4BAM/dataset/master/air2.csv')
    .assign(
        Date = lambda df: pd.to_datetime(df['Date']),
        Year = lambda df: df['Date'].dt.year,
        Month = lambda df: df['Date'].dt.month,
        Day = lambda df: df['Date'].dt.day,
    )
)
In [91]:
# Keep only the four measurement columns; the date-derived columns are left out.
air2 = air.loc[:, ['Ozone', 'Solar.R', 'Wind', 'Temp']]
# Preview the first five rows.
air2.head(n = 5)
Out[91]:
 | Ozone | Solar.R | Wind | Temp |
0 | 41 | 190.0 | 7.4 | 67 |
1 | 36 | 118.0 | 8.0 | 72 |
2 | 12 | 149.0 | 12.6 | 74 |
3 | 18 | 313.0 | 11.5 | 62 |
4 | 19 | NaN | 14.3 | 56 |
1. 시각화: 산점도
- plt.scatter(x = x축값, y= y축값)
- sns.pairplot(df, kind = 'reg') : 한 번에 그리기
In [99]:
# plt.scatter(x, y): one point per row, Temp on the x-axis and Ozone on the y-axis.
plt.scatter(air['Temp'], air['Ozone'])
plt.show()

In [105]:
# Visualise how Temp, Wind and Solar.R each relate to Ozone, one panel per predictor.
plt.figure(figsize = (14,5))

plt.subplot(1,3,1)
plt.scatter(x = air['Temp'], y = air['Ozone'])
plt.xlabel('Temp')
plt.ylabel('Ozone')

plt.subplot(1,3,2)
plt.scatter(x = air['Wind'], y = air['Ozone'])
plt.xlabel('Wind')

plt.subplot(1,3,3)
# Third predictor is Solar.R (this panel must not repeat Wind).
# Column names plus `data=` is an alternative to passing the Series directly.
plt.scatter(x = 'Solar.R', y = 'Ozone', data = air)
plt.xlabel('Solar.R')

plt.show()

In [109]:
# sns.pairplot(): draw every pairwise scatter plot of the frame in a single call.
sns.pairplot(data = air2)
plt.show()

In [111]:
# kind = 'reg' overlays a fitted regression line on every pairwise scatter plot.
sns.pairplot(air2, kind = 'reg')
plt.show()

2. 수치화: 상관분석
- 상관계수
- p-value
In [115]:
import scipy.stats as spst
In [119]:
# spst.pearsonr(): returns the Pearson correlation coefficient and its p-value.
# pearsonr does not skip NaN on its own, so keep only the rows where both
# columns are present before calling it.
temp_ozone = air[['Temp', 'Ozone']].dropna()
spst.pearsonr(temp_ozone['Temp'], temp_ozone['Ozone'])
Out[119]:
PearsonRResult(statistic=0.6833717861490114, pvalue=2.197769800200284e-22)
In [124]:
# Pairwise Pearson correlation coefficients between the columns of the frame.
air2.corr(method = 'pearson')
Out[124]:
 | Ozone | Solar.R | Wind | Temp |
Ozone | 1.000000 | 0.280068 | -0.605478 | 0.683372 |
Solar.R | 0.280068 | 1.000000 | -0.056792 | 0.275840 |
Wind | -0.605478 | -0.056792 | 1.000000 | -0.457988 |
Temp | 0.683372 | 0.275840 | -0.457988 | 1.000000 |
In [133]:
## Draw the correlation matrix as an annotated heatmap
sns.heatmap(
    data = air2.corr(),
    vmin = -1, vmax = 1,      # pin the colour scale to the full correlation range
    cmap = 'RdYlBu_r',        # colormap
    annot = True,             # write the coefficient inside each cell
    fmt = '.3f',              # annotation format: three decimal places
)
plt.show()
