bestone888 2024. 9. 13. 01:29

240912

numpy

  • axis 0 : 행
  • axis 1 : 열
  • rank : 축 개수
  • shape : 배열 크기
  • ndim : 배열 차원 확인
  • dtype : 요소들의 자료형 확인, 배열은 한가지 자료형만 가질 수 있음
In [110]:
import numpy as np
a1 = list(range(6))
b1 = np.array(a1)

print(b1)
[0 1 2 3 4 5]
In [117]:
a2 = [[1.5, 2.5, 3.2],
      [4.2, 5.7, 6.4]]
print(a2)

b2 = np.array(a2)
print(b2)
[[1.5, 2.5, 3.2], [4.2, 5.7, 6.4]]
[[1.5 2.5 3.2]
 [4.2 5.7 6.4]]
In [130]:
a3 = [[[1, 3, 1],
       [4, 7, 6],
       [8, 3, 4]],
      [[6, 2, 4],
       [8, 1, 5],
       [3, 5, 9]]]
b3 = np.array(a3)

print(b3)
[[[1 3 1]
  [4 7 6]
  [8 3 4]]

 [[6 2 4]
  [8 1 5]
  [3 5 9]]]
In [144]:
print(np.shape(b3))
print(np.ndim(b3))
print()

print(b3.shape)
print(b3.ndim)
print(b3.dtype)
(2, 3, 3)
3

(2, 3, 3)
3
int32
In [158]:
nlist = list(range(10))
print(nlist)

narray = np.array(nlist)

np.array(x for x in narray if x%2 ==0)   #짝수만 표시
print(narray)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0 1 2 3 4 5 6 7 8 9]
In [229]:
# 조건 조회

nlist = list(range(10,20))
print(nlist)

narray = np.array(nlist)
odd_array = narray[narray%2 == 1]  #홀수만 표시
print(odd_array)
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[11 13 15 17 19]

reshape() 메소드

In [173]:
a = np.array([[1,2,3],[4,5,6]])
print(a)
print('shape :', a.shape,'\n')


reshape_a = a.reshape((3,2))
print(reshape_a)
[[1 2 3]
 [4 5 6]]
shape : (2, 3) 

[[1 2]
 [3 4]
 [5 6]]
In [180]:
a = list(range(6))
b = np.array(a)
print(b)
print(b.shape, '\n')


reshape_b = np.reshape(a, (3,2))
print(reshape_b)
print(reshape_b.shape)
[0 1 2 3 4 5]
(6,) 

[[0 1]
 [2 3]
 [4 5]]
(3, 2)

arrray 인덱싱

In [194]:
a = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

print(a.ndim)
print(a.shape)
print(a.dtype)

print()
print(a)
2
(3, 3)
int32

[[1 2 3]
 [4 5 6]
 [7 8 9]]
In [196]:
print(a[0,1])  # 1행 2열
print(a[1,2])  # 2행 3열
2
6
In [216]:
print(a[[0],:], '\n')  # 1행
print(a[[0,1],[1,2]],'\n')    # 1행 2열, 2행 3열
print(a[:,[1,2]])    # 2, 3열
[[1 2 3]] 

[2 6] 

[[2 3]
 [5 6]
 [8 9]]

array 슬라이싱

In [224]:
a = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])

print(a)
[[1 2 3]
 [4 5 6]
 [7 8 9]]
In [226]:
print(a[0:2, :],'\n')    # 1~2행, 모든 열
print(a[0:1, 0:2])    # 1행, 1~2 열
[[1 2 3]
 [4 5 6]] 

[[1 2]]

array 조건 조회

In [232]:
score= np.array([[78, 91, 84, 89, 93, 65],
                 [82, 87, 96, 79, 91, 73]])

print(score)
[[78 91 84 89 93 65]
 [82 87 96 79 91 73]]
In [240]:
even_score = score[score % 2 ==0]    # 짝수만 새롭게 저장
print(even_score)
[78 84 82 96]
In [253]:
print(score)

#85 이상 95 이하 조회
new_score = score[(score>=80) & (score<=95)]
print(new_score)

# () 넣어야 함, & | 만 사용 가능, and or 사용 불가
[[78 91 84 89 93 65]
 [82 87 96 79 91 73]]
[91 84 89 93 82 87 91]

배열 연산

  • np.add()
  • np.substrate()
  • np.multiple()
  • np.divide()
  • np.power()
  • np.dot() : 행렬 곱
In [ ]:
 

데이터프레임

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [279]:
# 경로로 불러오기

path = 'https://raw.githubusercontent.com/Jangrae/csv/master/weather_simple.csv'
df_prac =pd.read_csv(path)

df_prac.head()
Out[279]:
monthtempmax_tempext_max_tempmin_tempext_min_temp01234
2005-01 -2.5 1.5 7.3 -6.2 -11.0
2005-02 -1.9 2.2 10.5 -5.5 -13.1
2005-03 4.1 8.8 17.6 0.2 -6.5
2005-04 13.2 18.1 29.8 8.9 4.1
2005-05 17.7 22.9 30.2 13.0 6.3
In [280]:
# 경로로 불러오기, index 설정

df_prac2 = pd.read_csv(path, index_col = 'month')
df_prac2.tail()
Out[280]:
tempmax_tempext_max_tempmin_tempext_min_tempmonth2021-082021-092021-102021-112021-12
25.9 29.7 33.7 22.8 18.1
22.6 26.9 30.3 18.8 16.0
15.6 20.5 28.8 11.6 1.3
8.2 13.1 21.3 4.0 -3.7
0.6 5.1 13.3 -3.5 -15.5
In [281]:
# 불러온 데이터프레임에 index 설정

df_prac.set_index('month', inplace = True)
df_prac.head()
Out[281]:
tempmax_tempext_max_tempmin_tempext_min_tempmonth2005-012005-022005-032005-042005-05
-2.5 1.5 7.3 -6.2 -11.0
-1.9 2.2 10.5 -5.5 -13.1
4.1 8.8 17.6 0.2 -6.5
13.2 18.1 29.8 8.9 4.1
17.7 22.9 30.2 13.0 6.3
In [282]:
# index 이름 삭제

df_prac.index.name = None
df_prac.head()
Out[282]:
tempmax_tempext_max_tempmin_tempext_min_temp2005-012005-022005-032005-042005-05
-2.5 1.5 7.3 -6.2 -11.0
-1.9 2.2 10.5 -5.5 -13.1
4.1 8.8 17.6 0.2 -6.5
13.2 18.1 29.8 8.9 4.1
17.7 22.9 30.2 13.0 6.3
In [284]:
# index를 다시 일발 열로 가져옴
# drop = True 하면 index 삭제 후 일반 열로 가져오지 않음

df_prac.reset_index(inplace = True)
df_prac.head()
Out[284]:
indextempmax_tempext_max_tempmin_tempext_min_temp01234
2005-01 -2.5 1.5 7.3 -6.2 -11.0
2005-02 -1.9 2.2 10.5 -5.5 -13.1
2005-03 4.1 8.8 17.6 0.2 -6.5
2005-04 13.2 18.1 29.8 8.9 4.1
2005-05 17.7 22.9 30.2 13.0 6.3
In [289]:
# index에서 돌아온 일반 열에 다시 이름 설정

df_prac.rename({'index': 'month'}, inplace = True)
df_prac.head()
Out[289]:
indextempmax_tempext_max_tempmin_tempext_min_temp01234
2005-01 -2.5 1.5 7.3 -6.2 -11.0
2005-02 -1.9 2.2 10.5 -5.5 -13.1
2005-03 4.1 8.8 17.6 0.2 -6.5
2005-04 13.2 18.1 29.8 8.9 4.1
2005-05 17.7 22.9 30.2 13.0 6.3
In [ ]:
 

데이터프레임 탐색

  • head() : 상위 n개 데이터 확인
  • tail() : 하위 n개 데이터 확인
  • shape : 데이터프레임 크기(행, 열)
  • index : 인덱스 정보
  • values : 값 정보
  • dtypes : 열 자료형
  • info() : 열 상세정보
  • describe() : 기술통계정보 확인
In [294]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [298]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/tips.csv'
tips = pd.read_csv(path)

tips.head()
Out[298]:
total_billtipsexsmokerdaytimesize01234
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
In [339]:
print(tips.shape)
(244, 7)
In [341]:
print(tips.index)  #0 부터 244까지 1씩 증가하는 index를 가짐
print(tips.columns)
RangeIndex(start=0, stop=244, step=1)
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')
In [343]:
print(tips.values)
[[16.99 1.01 'Female' ... 'Sun' 'Dinner' 2]
 [10.34 1.66 'Male' ... 'Sun' 'Dinner' 3]
 [21.01 3.5 'Male' ... 'Sun' 'Dinner' 3]
 ...
 [22.67 2.0 'Male' ... 'Sat' 'Dinner' 2]
 [17.82 1.75 'Male' ... 'Sat' 'Dinner' 2]
 [18.78 3.0 'Female' ... 'Thur' 'Dinner' 2]]
In [345]:
print(tips.dtypes)
total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object
In [347]:
print(tips.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None
In [355]:
print(tips.describe())    # 숫자만 표시
       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
max     50.810000   10.000000    6.000000
In [363]:
print(tips.describe(include = 'all'))    # 전부 표시
        total_bill         tip   sex smoker  day    time        size
count   244.000000  244.000000   244    244  244     244  244.000000
unique         NaN         NaN     2      2    4       2         NaN
top            NaN         NaN  Male     No  Sat  Dinner         NaN
freq           NaN         NaN   157    151   87     176         NaN
mean     19.785943    2.998279   NaN    NaN  NaN     NaN    2.569672
std       8.902412    1.383638   NaN    NaN  NaN     NaN    0.951100
min       3.070000    1.000000   NaN    NaN  NaN     NaN    1.000000
25%      13.347500    2.000000   NaN    NaN  NaN     NaN    2.000000
50%      17.795000    2.900000   NaN    NaN  NaN     NaN    2.000000
75%      24.127500    3.562500   NaN    NaN  NaN     NaN    3.000000
max      50.810000   10.000000   NaN    NaN  NaN     NaN    6.000000
In [365]:
print(tips['tip'].describe())    # 원하는 column만 표시
count    244.000000
mean       2.998279
std        1.383638
min        1.000000
25%        2.000000
50%        2.900000
75%        3.562500
max       10.000000
Name: tip, dtype: float64
In [367]:
print(tips.describe().T)    # transpose
            count       mean       std   min      25%     50%      75%    max
total_bill  244.0  19.785943  8.902412  3.07  13.3475  17.795  24.1275  50.81
tip         244.0   2.998279  1.383638  1.00   2.0000   2.900   3.5625  10.00
size        244.0   2.569672  0.951100  1.00   2.0000   2.000   3.0000   6.00

데이터 프레임 정렬

  • sort_values()
In [371]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [374]:
tips
Out[374]:
total_billtipsexsmokerdaytimesize01234...239240241242243
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ...
29.03 5.92 Male No Sat Dinner 3
27.18 2.00 Female Yes Sat Dinner 2
22.67 2.00 Male Yes Sat Dinner 2
17.82 1.75 Male No Sat Dinner 2
18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [378]:
tips.sort_values('total_bill').head()    #total_bill 기준 오름차순
Out[378]:
total_billtipsexsmokerdaytimesize6792111172149
3.07 1.00 Female Yes Sat Dinner 1
5.75 1.00 Female Yes Fri Dinner 2
7.25 1.00 Female No Sat Dinner 1
7.25 5.15 Male Yes Sun Dinner 2
7.51 2.00 Male No Thur Lunch 2
In [394]:
# total_bill로 오름차순 정렬, 같으면 tip 내림차순 정렬
new_tips = tips.sort_values(['total_bill', 'tip'], ascending = [True, False])
new_tips.head()
Out[394]:
total_billtipsexsmokerdaytimesize6792172111149
3.07 1.00 Female Yes Sat Dinner 1
5.75 1.00 Female Yes Fri Dinner 2
7.25 5.15 Male Yes Sun Dinner 2
7.25 1.00 Female No Sat Dinner 1
7.51 2.00 Male No Thur Lunch 2
In [402]:
# 기존 index 삭제
new_tips.reset_index(drop = True, inplace = True)
new_tips.head()
Out[402]:
total_billtipsexsmokerdaytimesize01234
3.07 1.00 Female Yes Sat Dinner 1
5.75 1.00 Female Yes Fri Dinner 2
7.25 5.15 Male Yes Sun Dinner 2
7.25 1.00 Female No Sat Dinner 1
7.51 2.00 Male No Thur Lunch 2

기본 집계

  • unique() : 고윳값 확인
  • value_counts() : 해당 열에서 고윳값의 개수
  • sum()
  • max(), min()
  • mode() : 최빈값
  • median()
In [413]:
tips['tip'].unique()
Out[413]:
array([ 1.01,  1.66,  3.5 ,  3.31,  3.61,  4.71,  2.  ,  3.12,  1.96,
        3.23,  1.71,  5.  ,  1.57,  3.  ,  3.02,  3.92,  1.67,  3.71,
        3.35,  4.08,  2.75,  2.23,  7.58,  3.18,  2.34,  4.3 ,  1.45,
        2.5 ,  2.45,  3.27,  3.6 ,  3.07,  2.31,  2.24,  2.54,  3.06,
        1.32,  5.6 ,  6.  ,  2.05,  2.6 ,  5.2 ,  1.56,  4.34,  3.51,
        1.5 ,  1.76,  6.73,  3.21,  1.98,  3.76,  2.64,  3.15,  2.47,
        1.  ,  2.01,  2.09,  1.97,  3.14,  2.2 ,  1.25,  3.08,  4.  ,
        2.71,  3.4 ,  1.83,  2.03,  5.17,  5.85,  3.25,  4.73,  3.48,
        1.64,  4.06,  4.29,  2.55,  5.07,  1.8 ,  2.92,  1.68,  2.52,
        4.2 ,  1.48,  2.18,  2.83,  6.7 ,  2.3 ,  1.36,  1.63,  1.73,
        2.74,  5.14,  3.75,  2.61,  4.5 ,  1.61, 10.  ,  3.16,  5.15,
        3.11,  3.55,  3.68,  5.65,  6.5 ,  4.19,  2.56,  2.02,  1.44,
        3.41,  5.16,  9.  ,  1.1 ,  3.09,  1.92,  1.58,  2.72,  2.88,
        3.39,  1.47,  1.17,  4.67,  5.92,  1.75])
In [425]:
tips['day'].unique()
Out[425]:
array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)
In [427]:
tips['day'].value_counts()
Out[427]:
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
In [429]:
tips['smoker'].unique()
Out[429]:
array(['No', 'Yes'], dtype=object)
In [435]:
tips['smoker'].value_counts(normalize = True)    # normalize: 확률로 변경
Out[435]:
smoker
No     0.618852
Yes    0.381148
Name: proportion, dtype: float64
In [453]:
print(tips['tip'].mode())
print()

print(tips['tip'].mode()[0])
0    2.0
Name: tip, dtype: float64

2.0
In [456]:
stock = [[94500, 92100, 92200, 92300],
         [96500, 93200, 95900, 94300],
         [93400, 91900, 93400, 92100],
         [94200, 92100, 94100, 92400],
         [94500, 92500, 94300, 92600]]
dates = ['2019-02-15', '2019-02-16', '2019-02-17', '2019-02-18', '2019-02-19']
names = ['High', 'Low', 'Open', 'Close']

df = pd.DataFrame(stock, index = dates, columns = names)
df.head()
Out[456]:
HighLowOpenClose2019-02-152019-02-162019-02-172019-02-182019-02-19
94500 92100 92200 92300
96500 93200 95900 94300
93400 91900 93400 92100
94200 92100 94100 92400
94500 92500 94300 92600
In [458]:
df.sum()
Out[458]:
High     473100
Low      461800
Open     469900
Close    463700
dtype: int64