bestone888 2024. 9. 14. 19:50

240913

데이터 프레임 조회

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [16]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/tips.csv'
tip = pd.read_csv(path)

tip
Out[16]:
total_billtipsexsmokerdaytimesize01234...239240241242243
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ...
29.03 5.92 Male No Sat Dinner 3
27.18 2.00 Female Yes Sat Dinner 2
22.67 2.00 Male Yes Sat Dinner 2
17.82 1.75 Male No Sat Dinner 2
18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [22]:
print(tip.shape)    # n 행 m 열
print(tip.size)     # n*m
print(tip.info())    # 열 이름, 자료 개수, 자료형 
print(tip.describe())    #통계
(244, 7)
1708
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None
       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
max     50.810000   10.000000    6.000000

loc() : 행 열 입력해 조회

In [34]:
# total_bill 조회
tip.loc[:, ['total_bill']]
Out[34]:
total_bill01234...239240241242243
16.99
10.34
21.01
23.68
24.59
...
29.03
27.18
22.67
17.82
18.78

244 rows × 1 columns

In [38]:
tip[['total_bill']]
Out[38]:
total_bill01234...239240241242243
16.99
10.34
21.01
23.68
24.59
...
29.03
27.18
22.67
17.82
18.78

244 rows × 1 columns

In [42]:
# tip, total_bill 조회
tip[['tip', 'total_bill']]
Out[42]:
tiptotal_bill01234...239240241242243
1.01 16.99
1.66 10.34
3.50 21.01
3.31 23.68
3.61 24.59
... ...
5.92 29.03
2.00 27.18
2.00 22.67
1.75 17.82
3.00 18.78

244 rows × 2 columns

In [44]:
# sex 부터 size 까지 조회
tip.loc[:, 'sex': 'size']
Out[44]:
sexsmokerdaytimesize01234...239240241242243
Female No Sun Dinner 2
Male No Sun Dinner 3
Male No Sun Dinner 3
Male No Sun Dinner 2
Female No Sun Dinner 4
... ... ... ... ...
Male No Sat Dinner 3
Female Yes Sat Dinner 2
Male Yes Sat Dinner 2
Male No Sat Dinner 2
Female No Thur Dinner 2

244 rows × 5 columns

sort_values() : 원하는 열 기준으로 정렬

In [57]:
# tip, day, time 열을 tip 기준으로 내림차순 정렬
tip_tip = tip.loc[:, ['tip', 'day', 'time']].sort_values(['tip'], ascending = False)
tip_tip
Out[57]:
tipdaytime1702122359141...02361116792
10.00 Sat Dinner
9.00 Sat Dinner
7.58 Sat Dinner
6.73 Sat Dinner
6.70 Thur Lunch
... ... ...
1.01 Sun Dinner
1.00 Sat Dinner
1.00 Sat Dinner
1.00 Sat Dinner
1.00 Fri Dinner

244 rows × 3 columns

iloc() != loc()

In [74]:
tip.iloc[:, 0:2]  # index 0~1 열
Out[74]:
total_billtip01234...239240241242243
16.99 1.01
10.34 1.66
21.01 3.50
23.68 3.31
24.59 3.61
... ...
29.03 5.92
27.18 2.00
22.67 2.00
17.82 1.75
18.78 3.00

244 rows × 2 columns

In [76]:
tip.iloc[5:15, :]    # index 5행 ~14행
Out[76]:
total_billtipsexsmokerdaytimesize567891011121314
25.29 4.71 Male No Sun Dinner 4
8.77 2.00 Male No Sun Dinner 2
26.88 3.12 Male No Sun Dinner 4
15.04 1.96 Male No Sun Dinner 2
14.78 3.23 Male No Sun Dinner 2
10.27 1.71 Male No Sun Dinner 2
35.26 5.00 Female No Sun Dinner 4
15.42 1.57 Male No Sun Dinner 2
18.43 3.00 Male No Sun Dinner 4
14.83 3.02 Female No Sun Dinner 2
In [88]:
# df.loc
tip.loc[1:3, 'total_bill': 'day']
Out[88]:
total_billtipsexsmokerday123
10.34 1.66 Male No Sun
21.01 3.50 Male No Sun
23.68 3.31 Male No Sun
In [90]:
# df.iloc
tip.iloc[1:3, 1:4]
Out[90]:
tipsexsmoker12
1.66 Male No
3.50 Male No

데이터프레임 조건으로 조회

In [99]:
# tip이 6 이상인 행 조회
tip6 = tip.loc[tip['tip']>6.0]
tip6
Out[99]:
total_billtipsexsmokerdaytimesize2359141170183212214
39.42 7.58 Male No Sat Dinner 4
48.27 6.73 Male No Sat Dinner 4
34.30 6.70 Male No Thur Lunch 6
50.81 10.00 Male Yes Sat Dinner 3
23.17 6.50 Male Yes Sun Dinner 4
48.33 9.00 Male No Sat Dinner 4
28.17 6.50 Female Yes Sat Dinner 3
In [121]:
# 그 중 total_bill 40 이상인 행 조회
ans = tip6[tip6['total_bill']>=40]
ans

# tip 내림차순으로 정렬, index 제거
ans = ans.sort_values(['tip'], ascending = False)
ans.reset_index(inplace = True, drop = True)
ans
Out[121]:
total_billtipsexsmokerdaytimesize012
50.81 10.00 Male Yes Sat Dinner 3
48.33 9.00 Male No Sat Dinner 4
48.27 6.73 Male No Sat Dinner 4

isin(), between()

In [124]:
# day가 Sat 또는 Sun
tip.loc[tip['day'].isin(['Sat', 'Sun'])]
Out[124]:
total_billtipsexsmokerdaytimesize01234...238239240241242
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ...
35.83 4.67 Female No Sat Dinner 3
29.03 5.92 Male No Sat Dinner 3
27.18 2.00 Female Yes Sat Dinner 2
22.67 2.00 Male Yes Sat Dinner 2
17.82 1.75 Male No Sat Dinner 2

163 rows × 7 columns

In [126]:
tip.loc[(tip['day'] =='Sat') | (tip['day'] == 'Sun')]
Out[126]:
total_billtipsexsmokerdaytimesize01234...238239240241242
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ...
35.83 4.67 Female No Sat Dinner 3
29.03 5.92 Male No Sat Dinner 3
27.18 2.00 Female Yes Sat Dinner 2
22.67 2.00 Male Yes Sat Dinner 2
17.82 1.75 Male No Sat Dinner 2

163 rows × 7 columns

In [134]:
# 범위 지정 between
tip.loc[tip['size'].between(1,3)]
tip.sort_values('size', ascending = True)   # size 오름차순 정렬
Out[134]:
total_billtipsexsmokerdaytimesize67822221110...155143156141125
3.07 1.00 Female Yes Sat Dinner 1
10.07 1.83 Female No Thur Lunch 1
8.58 1.92 Male Yes Fri Lunch 1
7.25 1.00 Female No Sat Dinner 1
16.99 1.01 Female No Sun Dinner 2
... ... ... ... ... ... ...
29.85 5.14 Female No Sun Dinner 5
27.05 5.00 Female No Thur Lunch 6
48.17 5.00 Male No Sun Dinner 6
34.30 6.70 Male No Thur Lunch 6
29.80 4.20 Female No Thur Lunch 6

244 rows × 7 columns

In [138]:
tip.loc[(tip['size']>=1) & (tip['size']<=3)]
Out[138]:
total_billtipsexsmokerdaytimesize01236...239240241242243
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
8.77 2.00 Male No Sun Dinner 2
... ... ... ... ... ... ...
29.03 5.92 Male No Sat Dinner 3
27.18 2.00 Female Yes Sat Dinner 2
22.67 2.00 Male Yes Sat Dinner 2
17.82 1.75 Male No Sat Dinner 2
18.78 3.00 Female No Thur Dinner 2

198 rows × 7 columns

In [148]:
# 조넉네 맞는 하나의 열 조회
tip.loc[tip['size'] >= 5, ['size']]
Out[148]:
size125141142143155156185187216
6
6
5
6
5
6
5
5
5
In [160]:
tip5 = tip.loc[tip['size']>= 5, ['total_bill', 'tip', 'size']]

# total_bill 오름차순 정렬, 같으면 tip 내림차순 정렬
tip5.sort_values(['total_bill', 'tip'], ascending = [True, False])
Out[160]:
total_billtipsize185143216125155187141142156
20.69 5.00 5
27.05 5.00 6
28.15 3.00 5
29.80 4.20 6
29.85 5.14 5
30.46 2.00 5
34.30 6.70 6
41.19 5.00 5
48.17 5.00 6

reset_index() 인덱스 제거

  • drop = True
  • inplace = True
In [167]:
# tip이 6 이상
good = tip.loc[tip['tip']>=6]
good.reset_index(drop = True, inplace = True)

good
# index 제거
Out[167]:
total_billtipsexsmokerdaytimesize01234567
39.42 7.58 Male No Sat Dinner 4
32.40 6.00 Male No Sun Dinner 4
48.27 6.73 Male No Sat Dinner 4
34.30 6.70 Male No Thur Lunch 6
50.81 10.00 Male Yes Sat Dinner 3
23.17 6.50 Male Yes Sun Dinner 4
48.33 9.00 Male No Sat Dinner 4
28.17 6.50 Female Yes Sat Dinner 3

데이터 프레임 집계

In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [178]:
path = 'https://raw.githubusercontent.com/Jangrae/csv/master/tips.csv'
df = pd.read_csv(path)

df
Out[178]:
total_billtipsexsmokerdaytimesize01234...239240241242243
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ...
29.03 5.92 Male No Sat Dinner 3
27.18 2.00 Female Yes Sat Dinner 2
22.67 2.00 Male Yes Sat Dinner 2
17.82 1.75 Male No Sat Dinner 2
18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [180]:
# total_bill, tip :  연속값
# sex, smoker, day, time, size : 범주값
  • sum()
  • max()
  • min()
In [189]:
# sum() 기본적으로 axis = 0 기준
print(tip['total_bill'].sum())
print(tip['total_bill'].sum(axis = 0))
4827.77
4827.77

groupby()

In [200]:
# as_index = True
tip.groupby('day', as_index = True)[['tip']].sum()
Out[200]:
tipdayFriSatSunThur
51.96
260.40
247.39
171.83
In [202]:
# as_index = False
tip.groupby('day', as_index = False)[['tip']].sum()
Out[202]:
daytip0123
Fri 51.96
Sat 260.40
Sun 247.39
Thur 171.83

여러 열 집계

In [277]:
tip_sum = tip.groupby('day', as_index = False)[['total_bill', 'tip']].sum()

tip_sum
Out[277]:
daytotal_billtip0123
Fri 325.88 51.96
Sat 1778.40 260.40
Sun 1627.16 247.39
Thur 1096.33 171.83
In [282]:
tip_sum = tip.groupby(['day', 'smoker'], as_index = False)[['total_bill', 'tip']].sum()
tip_sum
Out[282]:
daysmokertotal_billtip01234567
Fri No 73.68 11.25
Fri Yes 252.20 40.71
Sat No 884.78 139.63
Sat Yes 893.62 120.77
Sun No 1168.88 180.57
Sun Yes 458.28 66.82
Thur No 770.09 120.32
Thur Yes 326.24 51.51

시각화

In [210]:
# 요일 별 팁
tip_sum = tip.groupby('day', as_index =False)[['tip']].sum()
tip_sum
Out[210]:
daytip0123
Fri 51.96
Sat 260.40
Sun 247.39
Thur 171.83
In [220]:
import matplotlib.pyplot as plt

plt.figure(figsize = (5,3))
plt.plot(tip_sum['day'], tip_sum['tip'])
plt.title('TIP BY DAY')
plt.xlabel('DAY')
plt.ylabel('TIP')
plt.show()
In [242]:
tip.loc[:, ['tip', 'total_bill']]
Out[242]:
tiptotal_bill01234...239240241242243
1.01 16.99
1.66 10.34
3.50 21.01
3.31 23.68
3.61 24.59
... ...
5.92 29.03
2.00 27.18
2.00 22.67
1.75 17.82
3.00 18.78

244 rows × 2 columns

In [285]:
plt.figure(figsize = (5,3))
plt.plot(tip[['tip', 'total_bill']], label = ['TIP', 'TOTAL_BILL'])
plt.legend()
plt.title('tip & total_bill')
plt.show()
In [286]:
plt.figure(figsize = (5,3))
plt.bar(tip_sum['day'],tip_sum['tip'])
plt.show()
In [289]:
plt.figure(figsize = (5,3))
tip_mean = tip['tip'].mean()
plt.hist(tip['tip'])
plt.axvline(tip_mean, color = 'r')
plt.show()
In [291]:
plt.figure(figsize = (5,3))
plt.scatter(tip['tip'], tip['total_bill'])
plt.xlabel('TIP')
plt.ylabel('TOTAL_BILL')
plt.show()
In [ ]: