KT AIVLE/Daily Review
240912
bestone888
2024. 9. 13. 01:29
240912
numpy
- axis 0 : 행
- axis 1 : 열
- rank : 축 개수
- shape : 배열 크기
- ndim : 배열 차원 확인
- dtype : 요소들의 자료형 확인, 배열은 한가지 자료형만 가질 수 있음
In [110]:
import numpy as np
a1 = list(range(6))
b1 = np.array(a1)
print(b1)
[0 1 2 3 4 5]
In [117]:
a2 = [[1.5, 2.5, 3.2],
[4.2, 5.7, 6.4]]
print(a2)
b2 = np.array(a2)
print(b2)
[[1.5, 2.5, 3.2], [4.2, 5.7, 6.4]]
[[1.5 2.5 3.2]
[4.2 5.7 6.4]]
In [130]:
a3 = [[[1, 3, 1],
[4, 7, 6],
[8, 3, 4]],
[[6, 2, 4],
[8, 1, 5],
[3, 5, 9]]]
b3 = np.array(a3)
print(b3)
[[[1 3 1]
[4 7 6]
[8 3 4]]
[[6 2 4]
[8 1 5]
[3 5 9]]]
In [144]:
print(np.shape(b3))
print(np.ndim(b3))
print()
print(b3.shape)
print(b3.ndim)
print(b3.dtype)
(2, 3, 3)
3
(2, 3, 3)
3
int32
In [158]:
nlist = list(range(10))
print(nlist)
narray = np.array(nlist)
np.array(x for x in narray if x%2 ==0) #짝수만 표시하려는 의도였으나, 제너레이터는 펼쳐지지 않고 결과도 저장되지 않아 narray는 그대로임 (짝수 조회: narray[narray%2 == 0])
print(narray)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0 1 2 3 4 5 6 7 8 9]
In [229]:
# 조건 조회
nlist = list(range(10,20))
print(nlist)
narray = np.array(nlist)
odd_array = narray[narray%2 == 1] #홀수만 표시
print(odd_array)
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[11 13 15 17 19]
reshape() 메소드
In [173]:
a = np.array([[1,2,3],[4,5,6]])
print(a)
print('shape :', a.shape,'\n')
reshape_a = a.reshape((3,2))
print(reshape_a)
[[1 2 3]
[4 5 6]]
shape : (2, 3)
[[1 2]
[3 4]
[5 6]]
In [180]:
a = list(range(6))
b = np.array(a)
print(b)
print(b.shape, '\n')
reshape_b = np.reshape(a, (3,2))
print(reshape_b)
print(reshape_b.shape)
[0 1 2 3 4 5]
(6,)
[[0 1]
[2 3]
[4 5]]
(3, 2)
array 인덱싱
In [194]:
a = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
print(a.ndim)
print(a.shape)
print(a.dtype)
print()
print(a)
2
(3, 3)
int32
[[1 2 3]
[4 5 6]
[7 8 9]]
In [196]:
print(a[0,1]) # 1행 2열
print(a[1,2]) # 2행 3열
2
6
In [216]:
print(a[[0],:], '\n') # 1행
print(a[[0,1],[1,2]],'\n') # 1행 2열, 2행 3열
print(a[:,[1,2]]) # 2, 3열
[[1 2 3]]
[2 6]
[[2 3]
[5 6]
[8 9]]
array 슬라이싱
In [224]:
a = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
print(a)
[[1 2 3]
[4 5 6]
[7 8 9]]
In [226]:
print(a[0:2, :],'\n') # 1~2행, 모든 열
print(a[0:1, 0:2]) # 1행, 1~2 열
[[1 2 3]
[4 5 6]]
[[1 2]]
array 조건 조회
In [232]:
score= np.array([[78, 91, 84, 89, 93, 65],
[82, 87, 96, 79, 91, 73]])
print(score)
[[78 91 84 89 93 65]
[82 87 96 79 91 73]]
In [240]:
even_score = score[score % 2 ==0] # 짝수만 새롭게 저장
print(even_score)
[78 84 82 96]
In [253]:
print(score)
#80 이상 95 이하 조회
new_score = score[(score>=80) & (score<=95)]
print(new_score)
# () 넣어야 함, & | 만 사용 가능, and or 사용 불가
[[78 91 84 89 93 65]
[82 87 96 79 91 73]]
[91 84 89 93 82 87 91]
배열 연산
- np.add()
- np.subtract()
- np.multiply()
- np.divide()
- np.power()
- np.dot() : 행렬 곱
In [ ]:
데이터프레임
In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [279]:
# 경로로 불러오기
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/weather_simple.csv'
df_prac =pd.read_csv(path)
df_prac.head()
Out[279]:
|   | month | temp | max_temp | ext_max_temp | min_temp | ext_min_temp |
| 0 | 2005-01 | -2.5 | 1.5 | 7.3 | -6.2 | -11.0 |
| 1 | 2005-02 | -1.9 | 2.2 | 10.5 | -5.5 | -13.1 |
| 2 | 2005-03 | 4.1 | 8.8 | 17.6 | 0.2 | -6.5 |
| 3 | 2005-04 | 13.2 | 18.1 | 29.8 | 8.9 | 4.1 |
| 4 | 2005-05 | 17.7 | 22.9 | 30.2 | 13.0 | 6.3 |
In [280]:
# 경로로 불러오기, index 설정
df_prac2 = pd.read_csv(path, index_col = 'month')
df_prac2.tail()
Out[280]:
| month | temp | max_temp | ext_max_temp | min_temp | ext_min_temp |
| 2021-08 | 25.9 | 29.7 | 33.7 | 22.8 | 18.1 |
| 2021-09 | 22.6 | 26.9 | 30.3 | 18.8 | 16.0 |
| 2021-10 | 15.6 | 20.5 | 28.8 | 11.6 | 1.3 |
| 2021-11 | 8.2 | 13.1 | 21.3 | 4.0 | -3.7 |
| 2021-12 | 0.6 | 5.1 | 13.3 | -3.5 | -15.5 |
In [281]:
# 불러온 데이터프레임에 index 설정
df_prac.set_index('month', inplace = True)
df_prac.head()
Out[281]:
| month | temp | max_temp | ext_max_temp | min_temp | ext_min_temp |
| 2005-01 | -2.5 | 1.5 | 7.3 | -6.2 | -11.0 |
| 2005-02 | -1.9 | 2.2 | 10.5 | -5.5 | -13.1 |
| 2005-03 | 4.1 | 8.8 | 17.6 | 0.2 | -6.5 |
| 2005-04 | 13.2 | 18.1 | 29.8 | 8.9 | 4.1 |
| 2005-05 | 17.7 | 22.9 | 30.2 | 13.0 | 6.3 |
In [282]:
# index 이름 삭제
df_prac.index.name = None
df_prac.head()
Out[282]:
|   | temp | max_temp | ext_max_temp | min_temp | ext_min_temp |
| 2005-01 | -2.5 | 1.5 | 7.3 | -6.2 | -11.0 |
| 2005-02 | -1.9 | 2.2 | 10.5 | -5.5 | -13.1 |
| 2005-03 | 4.1 | 8.8 | 17.6 | 0.2 | -6.5 |
| 2005-04 | 13.2 | 18.1 | 29.8 | 8.9 | 4.1 |
| 2005-05 | 17.7 | 22.9 | 30.2 | 13.0 | 6.3 |
In [284]:
# index를 다시 일반 열로 가져옴
# drop = True 하면 index 삭제 후 일반 열로 가져오지 않음
df_prac.reset_index(inplace = True)
df_prac.head()
Out[284]:
|   | index | temp | max_temp | ext_max_temp | min_temp | ext_min_temp |
| 0 | 2005-01 | -2.5 | 1.5 | 7.3 | -6.2 | -11.0 |
| 1 | 2005-02 | -1.9 | 2.2 | 10.5 | -5.5 | -13.1 |
| 2 | 2005-03 | 4.1 | 8.8 | 17.6 | 0.2 | -6.5 |
| 3 | 2005-04 | 13.2 | 18.1 | 29.8 | 8.9 | 4.1 |
| 4 | 2005-05 | 17.7 | 22.9 | 30.2 | 13.0 | 6.3 |
In [289]:
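# index에서 돌아온 일반 열에 다시 이름 설정 (주의: columns= 를 지정하지 않으면 행 index 라벨에 적용되어 아래 출력처럼 열 이름이 그대로 'index'로 남음 → rename(columns={'index': 'month'}) 사용)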
df_prac.rename({'index': 'month'}, inplace = True)
df_prac.head()
Out[289]:
|   | index | temp | max_temp | ext_max_temp | min_temp | ext_min_temp |
| 0 | 2005-01 | -2.5 | 1.5 | 7.3 | -6.2 | -11.0 |
| 1 | 2005-02 | -1.9 | 2.2 | 10.5 | -5.5 | -13.1 |
| 2 | 2005-03 | 4.1 | 8.8 | 17.6 | 0.2 | -6.5 |
| 3 | 2005-04 | 13.2 | 18.1 | 29.8 | 8.9 | 4.1 |
| 4 | 2005-05 | 17.7 | 22.9 | 30.2 | 13.0 | 6.3 |
In [ ]:
데이터프레임 탐색
- head() : 상위 n개 데이터 확인
- tail() : 하위 n개 데이터 확인
- shape : 데이터프레임 크기(행, 열)
- index : 인덱스 정보
- values : 값 정보
- dtypes : 열 자료형
- info() : 열 상세정보
- describe() : 기술통계정보 확인
In [294]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [298]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/tips.csv'
tips = pd.read_csv(path)
tips.head()
Out[298]:
|   | total_bill | tip | sex | smoker | day | time | size |
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
In [339]:
print(tips.shape)
(244, 7)
In [341]:
print(tips.index) #0 부터 243까지 1씩 증가하는 index를 가짐 (stop=244는 포함되지 않음)
print(tips.columns)
RangeIndex(start=0, stop=244, step=1)
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
In [343]:
print(tips.values)
[[16.99 1.01 'Female' ... 'Sun' 'Dinner' 2]
[10.34 1.66 'Male' ... 'Sun' 'Dinner' 3]
[21.01 3.5 'Male' ... 'Sun' 'Dinner' 3]
...
[22.67 2.0 'Male' ... 'Sat' 'Dinner' 2]
[17.82 1.75 'Male' ... 'Sat' 'Dinner' 2]
[18.78 3.0 'Female' ... 'Thur' 'Dinner' 2]]
In [345]:
print(tips.dtypes)
total_bill float64
tip float64
sex object
smoker object
day object
time object
size int64
dtype: object
In [347]:
print(tips.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 total_bill 244 non-null float64
1 tip 244 non-null float64
2 sex 244 non-null object
3 smoker 244 non-null object
4 day 244 non-null object
5 time 244 non-null object
6 size 244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None
In [355]:
print(tips.describe()) # 숫자만 표시
total_bill tip size
count 244.000000 244.000000 244.000000
mean 19.785943 2.998279 2.569672
std 8.902412 1.383638 0.951100
min 3.070000 1.000000 1.000000
25% 13.347500 2.000000 2.000000
50% 17.795000 2.900000 2.000000
75% 24.127500 3.562500 3.000000
max 50.810000 10.000000 6.000000
In [363]:
print(tips.describe(include = 'all')) # 전부 표시
total_bill tip sex smoker day time size
count 244.000000 244.000000 244 244 244 244 244.000000
unique NaN NaN 2 2 4 2 NaN
top NaN NaN Male No Sat Dinner NaN
freq NaN NaN 157 151 87 176 NaN
mean 19.785943 2.998279 NaN NaN NaN NaN 2.569672
std 8.902412 1.383638 NaN NaN NaN NaN 0.951100
min 3.070000 1.000000 NaN NaN NaN NaN 1.000000
25% 13.347500 2.000000 NaN NaN NaN NaN 2.000000
50% 17.795000 2.900000 NaN NaN NaN NaN 2.000000
75% 24.127500 3.562500 NaN NaN NaN NaN 3.000000
max 50.810000 10.000000 NaN NaN NaN NaN 6.000000
In [365]:
print(tips['tip'].describe()) # 원하는 column만 표시
count 244.000000
mean 2.998279
std 1.383638
min 1.000000
25% 2.000000
50% 2.900000
75% 3.562500
max 10.000000
Name: tip, dtype: float64
In [367]:
print(tips.describe().T) # transpose
count mean std min 25% 50% 75% max
total_bill 244.0 19.785943 8.902412 3.07 13.3475 17.795 24.1275 50.81
tip 244.0 2.998279 1.383638 1.00 2.0000 2.900 3.5625 10.00
size 244.0 2.569672 0.951100 1.00 2.0000 2.000 3.0000 6.00
데이터 프레임 정렬
- sort_values()
In [371]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [374]:
tips
Out[374]:
|   | total_bill | tip | sex | smoker | day | time | size |
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
In [378]:
tips.sort_values('total_bill').head() #total_bill 기준 오름차순
Out[378]:
|   | total_bill | tip | sex | smoker | day | time | size |
| 67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
| 92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
| 111 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
| 172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
| 149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
In [394]:
# total_bill로 오름차순 정렬, 같으면 tip 내림차순 정렬
new_tips = tips.sort_values(['total_bill', 'tip'], ascending = [True, False])
new_tips.head()
Out[394]:
|   | total_bill | tip | sex | smoker | day | time | size |
| 67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
| 92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
| 172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
| 111 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
| 149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
In [402]:
# 기존 index 삭제
new_tips.reset_index(drop = True, inplace = True)
new_tips.head()
Out[402]:
|   | total_bill | tip | sex | smoker | day | time | size |
| 0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
| 1 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
| 2 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
| 3 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
| 4 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
기본 집계
- unique() : 고윳값 확인
- value_counts() : 해당 열에서 고윳값별 데이터 개수
- sum()
- max(), min()
- mode() : 최빈값
- median()
In [413]:
tips['tip'].unique()
Out[413]:
array([ 1.01, 1.66, 3.5 , 3.31, 3.61, 4.71, 2. , 3.12, 1.96,
3.23, 1.71, 5. , 1.57, 3. , 3.02, 3.92, 1.67, 3.71,
3.35, 4.08, 2.75, 2.23, 7.58, 3.18, 2.34, 4.3 , 1.45,
2.5 , 2.45, 3.27, 3.6 , 3.07, 2.31, 2.24, 2.54, 3.06,
1.32, 5.6 , 6. , 2.05, 2.6 , 5.2 , 1.56, 4.34, 3.51,
1.5 , 1.76, 6.73, 3.21, 1.98, 3.76, 2.64, 3.15, 2.47,
1. , 2.01, 2.09, 1.97, 3.14, 2.2 , 1.25, 3.08, 4. ,
2.71, 3.4 , 1.83, 2.03, 5.17, 5.85, 3.25, 4.73, 3.48,
1.64, 4.06, 4.29, 2.55, 5.07, 1.8 , 2.92, 1.68, 2.52,
4.2 , 1.48, 2.18, 2.83, 6.7 , 2.3 , 1.36, 1.63, 1.73,
2.74, 5.14, 3.75, 2.61, 4.5 , 1.61, 10. , 3.16, 5.15,
3.11, 3.55, 3.68, 5.65, 6.5 , 4.19, 2.56, 2.02, 1.44,
3.41, 5.16, 9. , 1.1 , 3.09, 1.92, 1.58, 2.72, 2.88,
3.39, 1.47, 1.17, 4.67, 5.92, 1.75])
In [425]:
tips['day'].unique()
Out[425]:
array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)
In [427]:
tips['day'].value_counts()
Out[427]:
day
Sat 87
Sun 76
Thur 62
Fri 19
Name: count, dtype: int64
In [429]:
tips['smoker'].unique()
Out[429]:
array(['No', 'Yes'], dtype=object)
In [435]:
tips['smoker'].value_counts(normalize = True) # normalize: 개수 대신 비율로 변경
Out[435]:
smoker
No 0.618852
Yes 0.381148
Name: proportion, dtype: float64
In [453]:
print(tips['tip'].mode())
print()
print(tips['tip'].mode()[0])
0 2.0
Name: tip, dtype: float64
2.0
In [456]:
stock = [[94500, 92100, 92200, 92300],
[96500, 93200, 95900, 94300],
[93400, 91900, 93400, 92100],
[94200, 92100, 94100, 92400],
[94500, 92500, 94300, 92600]]
dates = ['2019-02-15', '2019-02-16', '2019-02-17', '2019-02-18', '2019-02-19']
names = ['High', 'Low', 'Open', 'Close']
df = pd.DataFrame(stock, index = dates, columns = names)
df.head()
Out[456]:
|   | High | Low | Open | Close |
| 2019-02-15 | 94500 | 92100 | 92200 | 92300 |
| 2019-02-16 | 96500 | 93200 | 95900 | 94300 |
| 2019-02-17 | 93400 | 91900 | 93400 | 92100 |
| 2019-02-18 | 94200 | 92100 | 94100 | 92400 |
| 2019-02-19 | 94500 | 92500 | 94300 | 92600 |
In [458]:
df.sum()
Out[458]:
High 473100
Low 461800
Open 469900
Close 463700
dtype: int64