KT AIVLE/Daily Review

240912

bestone888 2024. 9. 13. 01:29

240912

numpy

axis 0 : 행
axis 1 : 열
rank : 축 개수
shape : 배열 크기
ndim : 배열 차원 확인
dtype : 요소들의 자료형 확인, 배열은 한가지 자료형만 가질 수 있음

In [110]:

import numpy as np
a1 = list(range(6))
b1 = np.array(a1)

print(b1)

[0 1 2 3 4 5]

In [117]:

a2 = [[1.5, 2.5, 3.2],
      [4.2, 5.7, 6.4]]
print(a2)

b2 = np.array(a2)
print(b2)

[[1.5, 2.5, 3.2], [4.2, 5.7, 6.4]]
[[1.5 2.5 3.2]
 [4.2 5.7 6.4]]

In [130]:

a3 = [[[1, 3, 1],
       [4, 7, 6],
       [8, 3, 4]],
      [[6, 2, 4],
       [8, 1, 5],
       [3, 5, 9]]]
b3 = np.array(a3)

print(b3)

[[[1 3 1]
  [4 7 6]
  [8 3 4]]

 [[6 2 4]
  [8 1 5]
  [3 5 9]]]

In [144]:

print(np.shape(b3))
print(np.ndim(b3))
print()

print(b3.shape)
print(b3.ndim)
print(b3.dtype)

(2, 3, 3)
3

(2, 3, 3)
3
int32

In [158]:

nlist = list(range(10))
print(nlist)

narray = np.array(nlist)

np.array(x for x in narray if x%2 ==0)   #짝수만 표시
print(narray)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0 1 2 3 4 5 6 7 8 9]

In [229]:

# 조건 조회

nlist = list(range(10,20))
print(nlist)

narray = np.array(nlist)
odd_array = narray[narray%2 == 1]  #홀수만 표시
print(odd_array)

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[11 13 15 17 19]

reshape() 메소드

In [173]:

a = np.array([[1,2,3],[4,5,6]])
print(a)
print('shape :', a.shape,'\n')


reshape_a = a.reshape((3,2))
print(reshape_a)

[[1 2 3]
 [4 5 6]]
shape : (2, 3) 

[[1 2]
 [3 4]
 [5 6]]

In [180]:

a = list(range(6))
b = np.array(a)
print(b)
print(b.shape, '\n')


reshape_b = np.reshape(a, (3,2))
print(reshape_b)
print(reshape_b.shape)

[0 1 2 3 4 5]
(6,) 

[[0 1]
 [2 3]
 [4 5]]
(3, 2)

arrray 인덱싱

In [194]:

a = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

print(a.ndim)
print(a.shape)
print(a.dtype)

print()
print(a)

2
(3, 3)
int32

[[1 2 3]
 [4 5 6]
 [7 8 9]]

In [196]:

print(a[0,1])  # 1행 2열
print(a[1,2])  # 2행 3열

2
6

In [216]:

print(a[[0],:], '\n')  # 1행
print(a[[0,1],[1,2]],'\n')    # 1행 2열, 2행 3열
print(a[:,[1,2]])    # 2, 3열

[[1 2 3]] 

[2 6] 

[[2 3]
 [5 6]
 [8 9]]

array 슬라이싱

In [224]:

a = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

print(a)

[[1 2 3]
 [4 5 6]
 [7 8 9]]

In [226]:

print(a[0:2, :],'\n')    # 1~2행, 모든 열
print(a[0:1, 0:2])    # 1행, 1~2 열

[[1 2 3]
 [4 5 6]] 

[[1 2]]

array 조건 조회

In [232]:

score= np.array([[78, 91, 84, 89, 93, 65],
                 [82, 87, 96, 79, 91, 73]])

print(score)

[[78 91 84 89 93 65]
 [82 87 96 79 91 73]]

In [240]:

even_score = score[score % 2 ==0]    # 짝수만 새롭게 저장
print(even_score)

[78 84 82 96]

In [253]:

print(score)

#85 이상 95 이하 조회
new_score = score[(score>=80) & (score<=95)]
print(new_score)

# () 넣어야 함, & | 만 사용 가능, and or 사용 불가

[[78 91 84 89 93 65]
 [82 87 96 79 91 73]]
[91 84 89 93 82 87 91]

배열 연산

np.add()
np.substrate()
np.multiple()
np.divide()
np.power()
np.dot() : 행렬 곱

In [ ]:

데이터프레임

In [58]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [279]:

# 경로로 불러오기

path = 'https://raw.githubusercontent.com/Jangrae/csv/master/weather_simple.csv'
df_prac =pd.read_csv(path)

df_prac.head()

Out[279]:

monthtempmax_tempext_max_tempmin_tempext_min_temp01234

2005-01	-2.5	1.5	7.3	-6.2	-11.0
2005-02	-1.9	2.2	10.5	-5.5	-13.1
2005-03	4.1	8.8	17.6	0.2	-6.5
2005-04	13.2	18.1	29.8	8.9	4.1
2005-05	17.7	22.9	30.2	13.0	6.3

In [280]:

# 경로로 불러오기, index 설정

df_prac2 = pd.read_csv(path, index_col = 'month')
df_prac2.tail()

Out[280]:

tempmax_tempext_max_tempmin_tempext_min_tempmonth2021-082021-092021-102021-112021-12

25.9	29.7	33.7	22.8	18.1
22.6	26.9	30.3	18.8	16.0
15.6	20.5	28.8	11.6	1.3
8.2	13.1	21.3	4.0	-3.7
0.6	5.1	13.3	-3.5	-15.5

In [281]:

# 불러온 데이터프레임에 index 설정

df_prac.set_index('month', inplace = True)
df_prac.head()

Out[281]:

tempmax_tempext_max_tempmin_tempext_min_tempmonth2005-012005-022005-032005-042005-05

-2.5	1.5	7.3	-6.2	-11.0
-1.9	2.2	10.5	-5.5	-13.1
4.1	8.8	17.6	0.2	-6.5
13.2	18.1	29.8	8.9	4.1
17.7	22.9	30.2	13.0	6.3

In [282]:

# index 이름 삭제

df_prac.index.name = None
df_prac.head()

Out[282]:

tempmax_tempext_max_tempmin_tempext_min_temp2005-012005-022005-032005-042005-05

-2.5	1.5	7.3	-6.2	-11.0
-1.9	2.2	10.5	-5.5	-13.1
4.1	8.8	17.6	0.2	-6.5
13.2	18.1	29.8	8.9	4.1
17.7	22.9	30.2	13.0	6.3

In [284]:

# index를 다시 일발 열로 가져옴
# drop = True 하면 index 삭제 후 일반 열로 가져오지 않음

df_prac.reset_index(inplace = True)
df_prac.head()

Out[284]:

indextempmax_tempext_max_tempmin_tempext_min_temp01234

2005-01	-2.5	1.5	7.3	-6.2	-11.0
2005-02	-1.9	2.2	10.5	-5.5	-13.1
2005-03	4.1	8.8	17.6	0.2	-6.5
2005-04	13.2	18.1	29.8	8.9	4.1
2005-05	17.7	22.9	30.2	13.0	6.3

In [289]:

# index에서 돌아온 일반 열에 다시 이름 설정

df_prac.rename({'index': 'month'}, inplace = True)
df_prac.head()

Out[289]:

indextempmax_tempext_max_tempmin_tempext_min_temp01234

2005-01	-2.5	1.5	7.3	-6.2	-11.0
2005-02	-1.9	2.2	10.5	-5.5	-13.1
2005-03	4.1	8.8	17.6	0.2	-6.5
2005-04	13.2	18.1	29.8	8.9	4.1
2005-05	17.7	22.9	30.2	13.0	6.3

In [ ]:

데이터프레임 탐색

head() : 상위 n개 데이터 확인
tail() : 하위 n개 데이터 확인
shape : 데이터프레임 크기(행, 열)
index : 인덱스 정보
values : 값 정보
dtypes : 열 자료형
info() : 열 상세정보
describe() : 기술통계정보 확인

In [294]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [298]:

path = 'https://raw.githubusercontent.com/Jangrae/csv/master/tips.csv'
tips = pd.read_csv(path)

tips.head()

Out[298]:

total_billtipsexsmokerdaytimesize01234

16.99	1.01	Female	No	Sun	Dinner	2
10.34	1.66	Male	No	Sun	Dinner	3
21.01	3.50	Male	No	Sun	Dinner	3
23.68	3.31	Male	No	Sun	Dinner	2
24.59	3.61	Female	No	Sun	Dinner	4

In [339]:

print(tips.shape)

(244, 7)

In [341]:

print(tips.index)  #0 부터 244까지 1씩 증가하는 index를 가짐
print(tips.columns)

RangeIndex(start=0, stop=244, step=1)
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [343]:

print(tips.values)

[[16.99 1.01 'Female' ... 'Sun' 'Dinner' 2]
 [10.34 1.66 'Male' ... 'Sun' 'Dinner' 3]
 [21.01 3.5 'Male' ... 'Sun' 'Dinner' 3]
 ...
 [22.67 2.0 'Male' ... 'Sat' 'Dinner' 2]
 [17.82 1.75 'Male' ... 'Sat' 'Dinner' 2]
 [18.78 3.0 'Female' ... 'Thur' 'Dinner' 2]]

In [345]:

print(tips.dtypes)

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [347]:

print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None

In [355]:

print(tips.describe())    # 숫자만 표시

       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
max     50.810000   10.000000    6.000000

In [363]:

print(tips.describe(include = 'all'))    # 전부 표시

        total_bill         tip   sex smoker  day    time        size
count   244.000000  244.000000   244    244  244     244  244.000000
unique         NaN         NaN     2      2    4       2         NaN
top            NaN         NaN  Male     No  Sat  Dinner         NaN
freq           NaN         NaN   157    151   87     176         NaN
mean     19.785943    2.998279   NaN    NaN  NaN     NaN    2.569672
std       8.902412    1.383638   NaN    NaN  NaN     NaN    0.951100
min       3.070000    1.000000   NaN    NaN  NaN     NaN    1.000000
25%      13.347500    2.000000   NaN    NaN  NaN     NaN    2.000000
50%      17.795000    2.900000   NaN    NaN  NaN     NaN    2.000000
75%      24.127500    3.562500   NaN    NaN  NaN     NaN    3.000000
max      50.810000   10.000000   NaN    NaN  NaN     NaN    6.000000

In [365]:

print(tips['tip'].describe())    # 원하는 column만 표시

count    244.000000
mean       2.998279
std        1.383638
min        1.000000
25%        2.000000
50%        2.900000
75%        3.562500
max       10.000000
Name: tip, dtype: float64

In [367]:

print(tips.describe().T)    # transpose

            count       mean       std   min      25%     50%      75%    max
total_bill  244.0  19.785943  8.902412  3.07  13.3475  17.795  24.1275  50.81
tip         244.0   2.998279  1.383638  1.00   2.0000   2.900   3.5625  10.00
size        244.0   2.569672  0.951100  1.00   2.0000   2.000   3.0000   6.00

데이터 프레임 정렬

sort_values()

In [371]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [374]:

tips

Out[374]:

total_billtipsexsmokerdaytimesize01234...239240241242243

16.99	1.01	Female	No	Sun	Dinner	2
10.34	1.66	Male	No	Sun	Dinner	3
21.01	3.50	Male	No	Sun	Dinner	3
23.68	3.31	Male	No	Sun	Dinner	2
24.59	3.61	Female	No	Sun	Dinner	4
...	...	...	...	...	...	...
29.03	5.92	Male	No	Sat	Dinner	3
27.18	2.00	Female	Yes	Sat	Dinner	2
22.67	2.00	Male	Yes	Sat	Dinner	2
17.82	1.75	Male	No	Sat	Dinner	2
18.78	3.00	Female	No	Thur	Dinner	2

244 rows × 7 columns

In [378]:

tips.sort_values('total_bill').head()    #total_bill 기준 오름차순

Out[378]:

total_billtipsexsmokerdaytimesize6792111172149

3.07	1.00	Female	Yes	Sat	Dinner	1
5.75	1.00	Female	Yes	Fri	Dinner	2
7.25	1.00	Female	No	Sat	Dinner	1
7.25	5.15	Male	Yes	Sun	Dinner	2
7.51	2.00	Male	No	Thur	Lunch	2

In [394]:

# total_bill로 오름차순 정렬, 같으면 tip 내림차순 정렬
new_tips = tips.sort_values(['total_bill', 'tip'], ascending = [True, False])
new_tips.head()

Out[394]:

total_billtipsexsmokerdaytimesize6792172111149

3.07	1.00	Female	Yes	Sat	Dinner	1
5.75	1.00	Female	Yes	Fri	Dinner	2
7.25	5.15	Male	Yes	Sun	Dinner	2
7.25	1.00	Female	No	Sat	Dinner	1
7.51	2.00	Male	No	Thur	Lunch	2

In [402]:

# 기존 index 삭제
new_tips.reset_index(drop = True, inplace = True)
new_tips.head()

Out[402]:

total_billtipsexsmokerdaytimesize01234

3.07	1.00	Female	Yes	Sat	Dinner	1
5.75	1.00	Female	Yes	Fri	Dinner	2
7.25	5.15	Male	Yes	Sun	Dinner	2
7.25	1.00	Female	No	Sat	Dinner	1
7.51	2.00	Male	No	Thur	Lunch	2

기본 집계

unique() : 고윳값 확인
value_counts() : 해당 열에서 고윳값의 개수
sum()
max(), min()
mode() : 최빈값
median()

In [413]:

tips['tip'].unique()

Out[413]:

array([ 1.01,  1.66,  3.5 ,  3.31,  3.61,  4.71,  2.  ,  3.12,  1.96,
        3.23,  1.71,  5.  ,  1.57,  3.  ,  3.02,  3.92,  1.67,  3.71,
        3.35,  4.08,  2.75,  2.23,  7.58,  3.18,  2.34,  4.3 ,  1.45,
        2.5 ,  2.45,  3.27,  3.6 ,  3.07,  2.31,  2.24,  2.54,  3.06,
        1.32,  5.6 ,  6.  ,  2.05,  2.6 ,  5.2 ,  1.56,  4.34,  3.51,
        1.5 ,  1.76,  6.73,  3.21,  1.98,  3.76,  2.64,  3.15,  2.47,
        1.  ,  2.01,  2.09,  1.97,  3.14,  2.2 ,  1.25,  3.08,  4.  ,
        2.71,  3.4 ,  1.83,  2.03,  5.17,  5.85,  3.25,  4.73,  3.48,
        1.64,  4.06,  4.29,  2.55,  5.07,  1.8 ,  2.92,  1.68,  2.52,
        4.2 ,  1.48,  2.18,  2.83,  6.7 ,  2.3 ,  1.36,  1.63,  1.73,
        2.74,  5.14,  3.75,  2.61,  4.5 ,  1.61, 10.  ,  3.16,  5.15,
        3.11,  3.55,  3.68,  5.65,  6.5 ,  4.19,  2.56,  2.02,  1.44,
        3.41,  5.16,  9.  ,  1.1 ,  3.09,  1.92,  1.58,  2.72,  2.88,
        3.39,  1.47,  1.17,  4.67,  5.92,  1.75])

In [425]:

tips['day'].unique()

Out[425]:

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [427]:

tips['day'].value_counts()

Out[427]:

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [429]:

tips['smoker'].unique()

Out[429]:

array(['No', 'Yes'], dtype=object)

In [435]:

tips['smoker'].value_counts(normalize = True)    # normalize: 확률로 변경

Out[435]:

smoker
No     0.618852
Yes    0.381148
Name: proportion, dtype: float64

In [453]:

print(tips['tip'].mode())
print()

print(tips['tip'].mode()[0])

0    2.0
Name: tip, dtype: float64

2.0

In [456]:

stock = [[94500, 92100, 92200, 92300],
         [96500, 93200, 95900, 94300],
         [93400, 91900, 93400, 92100],
         [94200, 92100, 94100, 92400],
         [94500, 92500, 94300, 92600]]
dates = ['2019-02-15', '2019-02-16', '2019-02-17', '2019-02-18', '2019-02-19']
names = ['High', 'Low', 'Open', 'Close']

df = pd.DataFrame(stock, index = dates, columns = names)
df.head()

Out[456]:

HighLowOpenClose2019-02-152019-02-162019-02-172019-02-182019-02-19

94500	92100	92200	92300
96500	93200	95900	94300
93400	91900	93400	92100
94200	92100	94100	92400
94500	92500	94300	92600

In [458]:

df.sum()

Out[458]:

High     473100
Low      461800
Open     469900
Close    463700
dtype: int64