02_pandas_basics.py

Download
python 388 lines 11.6 KB
  1"""
Pandas Basics
Fundamental Pandas Operations

Pandas is a core library for data analysis.
  6"""
  7
  8import pandas as pd
  9import numpy as np
 10
 11
# =============================================================================
# 1. Creating DataFrames and Series
# =============================================================================
 15def create_dataframe():
 16    """DataFrame๊ณผ Series ์ƒ์„ฑ"""
 17    print("\n[1] DataFrame๊ณผ Series ์ƒ์„ฑ")
 18    print("=" * 50)
 19
 20    # Series ์ƒ์„ฑ
 21    s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
 22    print(f"Series:\n{s}\n")
 23
 24    # ๋”•์…”๋„ˆ๋ฆฌ๋กœ DataFrame ์ƒ์„ฑ
 25    data = {
 26        '์ด๋ฆ„': ['๊น€์ฒ ์ˆ˜', '์ด์˜ํฌ', '๋ฐ•๋ฏผ์ˆ˜', '์ •์ˆ˜์ง„', '์ตœ๋™์šฑ'],
 27        '๋‚˜์ด': [25, 30, 35, 28, 32],
 28        '๋„์‹œ': ['์„œ์šธ', '๋ถ€์‚ฐ', '๋Œ€๊ตฌ', '์„œ์šธ', '์ธ์ฒœ'],
 29        '์ ์ˆ˜': [85, 92, 78, 95, 88]
 30    }
 31    df = pd.DataFrame(data)
 32    print(f"DataFrame:\n{df}\n")
 33
 34    # ๋ฆฌ์ŠคํŠธ๋กœ ์ƒ์„ฑ
 35    df2 = pd.DataFrame(
 36        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
 37        columns=['A', 'B', 'C'],
 38        index=['row1', 'row2', 'row3']
 39    )
 40    print(f"๋ฆฌ์ŠคํŠธ๋กœ ์ƒ์„ฑ:\n{df2}")
 41
 42    return df
 43
 44
# =============================================================================
# 2. Inspecting Data
# =============================================================================
 48def inspect_data(df):
 49    """๋ฐ์ดํ„ฐ ํ™•์ธ ๋ฉ”์„œ๋“œ"""
 50    print("\n[2] ๋ฐ์ดํ„ฐ ํ™•์ธ")
 51    print("=" * 50)
 52
 53    print(f"์ฒ˜์Œ 2ํ–‰:\n{df.head(2)}\n")
 54    print(f"๋งˆ์ง€๋ง‰ 2ํ–‰:\n{df.tail(2)}\n")
 55    print(f"ํ˜•ํƒœ: {df.shape}")
 56    print(f"์ปฌ๋Ÿผ: {df.columns.tolist()}")
 57    print(f"์ธ๋ฑ์Šค: {df.index.tolist()}")
 58    print(f"\n๋ฐ์ดํ„ฐ ํƒ€์ž…:\n{df.dtypes}\n")
 59    print(f"๊ธฐ๋ณธ ํ†ต๊ณ„:\n{df.describe()}\n")
 60    print(f"์ •๋ณด:")
 61    df.info()
 62
 63
# =============================================================================
# 3. Indexing and Selection
# =============================================================================
 67def indexing_selection(df):
 68    """์ธ๋ฑ์‹ฑ๊ณผ ์„ ํƒ"""
 69    print("\n[3] ์ธ๋ฑ์‹ฑ๊ณผ ์„ ํƒ")
 70    print("=" * 50)
 71
 72    print(f"์›๋ณธ DataFrame:\n{df}\n")
 73
 74    # ์ปฌ๋Ÿผ ์„ ํƒ
 75    print(f"df['์ด๋ฆ„']:\n{df['์ด๋ฆ„']}\n")
 76    print(f"df[['์ด๋ฆ„', '๋‚˜์ด']]:\n{df[['์ด๋ฆ„', '๋‚˜์ด']]}\n")
 77
 78    # ํ–‰ ์„ ํƒ (loc: ๋ผ๋ฒจ ๊ธฐ๋ฐ˜, iloc: ์œ„์น˜ ๊ธฐ๋ฐ˜)
 79    print(f"df.loc[0]:\n{df.loc[0]}\n")  # ์ฒซ ๋ฒˆ์งธ ํ–‰
 80    print(f"df.iloc[0:2]:\n{df.iloc[0:2]}\n")  # ์ฒ˜์Œ 2ํ–‰
 81    print(f"df.loc[0, '์ด๋ฆ„'] = {df.loc[0, '์ด๋ฆ„']}")  # ํŠน์ • ๊ฐ’
 82    print(f"df.iloc[0, 1] = {df.iloc[0, 1]}")  # ์œ„์น˜๋กœ ์ ‘๊ทผ
 83
 84    # ์กฐ๊ฑด๋ถ€ ์„ ํƒ
 85    print(f"\ndf[df['๋‚˜์ด'] > 28]:\n{df[df['๋‚˜์ด'] > 28]}")
 86    print(f"\ndf[(df['๋‚˜์ด'] > 25) & (df['๋„์‹œ'] == '์„œ์šธ')]:\n{df[(df['๋‚˜์ด'] > 25) & (df['๋„์‹œ'] == '์„œ์šธ')]}")
 87
 88    # ์ฟผ๋ฆฌ ๋ฉ”์„œ๋“œ
 89    result = df.query("๋‚˜์ด > 28 and ์ ์ˆ˜ >= 90")
 90    print(f"\ndf.query(\"๋‚˜์ด > 28 and ์ ์ˆ˜ >= 90\"):\n{result}")
 91
 92
# =============================================================================
# 4. Modifying Data
# =============================================================================
 96def modify_data():
 97    """๋ฐ์ดํ„ฐ ์ˆ˜์ •"""
 98    print("\n[4] ๋ฐ์ดํ„ฐ ์ˆ˜์ •")
 99    print("=" * 50)
100
101    df = pd.DataFrame({
102        'A': [1, 2, 3],
103        'B': [4, 5, 6],
104        'C': [7, 8, 9]
105    })
106    print(f"์›๋ณธ:\n{df}\n")
107
108    # ์ปฌ๋Ÿผ ์ถ”๊ฐ€
109    df['D'] = df['A'] + df['B']
110    print(f"์ปฌ๋Ÿผ ์ถ”๊ฐ€ (D = A + B):\n{df}\n")
111
112    # ์ปฌ๋Ÿผ ์‚ญ์ œ
113    df_dropped = df.drop('D', axis=1)
114    print(f"์ปฌ๋Ÿผ ์‚ญ์ œ:\n{df_dropped}\n")
115
116    # ๊ฐ’ ๋ณ€๊ฒฝ
117    df.loc[0, 'A'] = 100
118    print(f"๊ฐ’ ๋ณ€๊ฒฝ (df.loc[0, 'A'] = 100):\n{df}\n")
119
120    # ์กฐ๊ฑด๋ถ€ ๋ณ€๊ฒฝ
121    df.loc[df['B'] > 4, 'C'] = 0
122    print(f"์กฐ๊ฑด๋ถ€ ๋ณ€๊ฒฝ:\n{df}\n")
123
124    # ์ปฌ๋Ÿผ ์ด๋ฆ„ ๋ณ€๊ฒฝ
125    df_renamed = df.rename(columns={'A': 'Alpha', 'B': 'Beta'})
126    print(f"์ปฌ๋Ÿผ ์ด๋ฆ„ ๋ณ€๊ฒฝ:\n{df_renamed}")
127
128
# =============================================================================
# 5. Handling Missing Values
# =============================================================================
def handle_missing():
    """Print examples of detecting, dropping and filling missing values.

    Works on a small frame created inside the function in which columns A
    and B contain NaN and column C is complete.
    """
    print("\n[5] ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ")
    print("=" * 50)

    df = pd.DataFrame({
        'A': [1, 2, np.nan, 4],
        'B': [5, np.nan, np.nan, 8],
        'C': [9, 10, 11, 12]
    })
    print(f"์›๋ณธ (NaN ํฌํ•จ):\n{df}\n")

    # Detect missing values: a boolean mask, then a per-column count.
    print(f"๊ฒฐ์ธก์น˜ ํ™•์ธ:\n{df.isnull()}\n")
    print(f"์ปฌ๋Ÿผ๋ณ„ ๊ฒฐ์ธก์น˜ ์ˆ˜:\n{df.isnull().sum()}\n")

    # Drop every row that contains at least one NaN.
    df_dropna = df.dropna()
    print(f"dropna() - ํ–‰ ์ œ๊ฑฐ:\n{df_dropna}\n")

    # Fill every NaN with a constant.
    df_filled = df.fillna(0)
    print(f"fillna(0):\n{df_filled}\n")

    # Forward fill: carry the previous valid value in each column downwards.
    # DataFrame.ffill() replaces fillna(method='ffill'), which pandas has
    # deprecated and which newer releases no longer accept.
    df_ffill = df.ffill()
    print(f"ffill() - ์•ž์˜ ๊ฐ’์œผ๋กœ:\n{df_ffill}\n")

    # Fill each column's NaN with that column's mean.
    df_mean = df.fillna(df.mean())
    print(f"fillna(df.mean()) - ํ‰๊ท ์œผ๋กœ:\n{df_mean}")
161
162
# =============================================================================
# 6. Grouping and Aggregation
# =============================================================================
def groupby_aggregation():
    """Print examples of single-key, multi-function and multi-key grouping.

    Works on a six-row frame created inside the function: two
    string-valued columns and two integer columns, with a third integer
    column added before the final two-key grouping.
    """
    print("\n[6] ๊ทธ๋ฃนํ™”์™€ ์ง‘๊ณ„")
    print("=" * 50)

    records = pd.DataFrame({
        '๋ถ€์„œ': ['์˜์—…', '๊ฐœ๋ฐœ', '์˜์—…', '๊ฐœ๋ฐœ', '์˜์—…', '๊ฐœ๋ฐœ'],
        '์ด๋ฆ„': ['๊น€์ฒ ์ˆ˜', '์ด์˜ํฌ', '๋ฐ•๋ฏผ์ˆ˜', '์ •์ˆ˜์ง„', '์ตœ๋™์šฑ', '๊ฐ•๋ฏธ์˜'],
        '๋งค์ถœ': [100, 80, 120, 90, 110, 85],
        '๊ฒฝ๋ ฅ': [3, 5, 7, 4, 6, 2],
    })
    print(f"์›๋ณธ:\n{records}\n")

    # Group on the first column, then reduce one numeric column per group.
    by_key = records.groupby('๋ถ€์„œ')
    grouped_values = by_key['๋งค์ถœ']
    print(f"๋ถ€์„œ๋ณ„ ๋งค์ถœ ํ•ฉ๊ณ„:\n{grouped_values.sum()}\n")
    print(f"๋ถ€์„œ๋ณ„ ๋งค์ถœ ํ‰๊ท :\n{grouped_values.mean()}\n")

    # A dict maps each column to the list of aggregations applied to it.
    aggregations = {
        '๋งค์ถœ': ['sum', 'mean', 'max'],
        '๊ฒฝ๋ ฅ': ['mean', 'min', 'max'],
    }
    summary = by_key.agg(aggregations)
    print(f"๋‹ค์ค‘ ์ง‘๊ณ„:\n{summary}\n")

    # Add a second key column and group on the pair of keys.
    records['์—ฐ๋„'] = [2023, 2023, 2024, 2024, 2023, 2024]
    by_two_keys = records.groupby(['๋ถ€์„œ', '์—ฐ๋„'])
    totals = by_two_keys['๋งค์ถœ'].sum()
    print(f"๋ถ€์„œ, ์—ฐ๋„๋ณ„ ๋งค์ถœ:\n{totals}")
195
196
# =============================================================================
# 7. Sorting and Ranking
# =============================================================================
def sorting_ranking():
    """Print examples of sorting by values, sorting by index, and ranking.

    Works on a five-row frame created inside the function. The shuffle
    step uses ``DataFrame.sample`` without a seed, so the printed shuffled
    order can differ between runs.
    """
    print("\n[7] ์ •๋ ฌ๊ณผ ์ˆœ์œ„")
    print("=" * 50)

    table = pd.DataFrame({
        '์ด๋ฆ„': ['A', 'B', 'C', 'D', 'E'],
        '์ ์ˆ˜': [85, 92, 78, 95, 88],
        '๋‚˜์ด': [25, 30, 25, 35, 28],
    })
    print(f"์›๋ณธ:\n{table}\n")

    # Sort by one column, largest value first.
    descending = table.sort_values(by='์ ์ˆ˜', ascending=False)
    print(f"์ ์ˆ˜ ๋‚ด๋ฆผ์ฐจ์ˆœ:\n{descending}\n")

    # Sort by two columns: first ascending, ties broken by second descending.
    sort_keys = ['๋‚˜์ด', '์ ์ˆ˜']
    two_level = table.sort_values(by=sort_keys, ascending=[True, False])
    print(f"๋‚˜์ด ์˜ค๋ฆ„์ฐจ์ˆœ, ์ ์ˆ˜ ๋‚ด๋ฆผ์ฐจ์ˆœ:\n{two_level}\n")

    # Shuffle all rows, then restore the original order via the index.
    shuffled = table.sample(frac=1)
    print(f"์…”ํ”Œ๋œ ๋ฐ์ดํ„ฐ:\n{shuffled}")
    restored = shuffled.sort_index()
    print(f"์ธ๋ฑ์Šค ์ •๋ ฌ:\n{restored}\n")

    # Rank the values of one column so that the largest value gets rank 1.
    table['์ˆœ์œ„'] = table['์ ์ˆ˜'].rank(ascending=False)
    print(f"์ˆœ์œ„ ์ถ”๊ฐ€:\n{table}")
228
229
# =============================================================================
# 8. Merging Data
# =============================================================================
def merge_data():
    """Print examples of key-based joins and of concatenation.

    Two frames that share one key column (key values 1-4 and 2-5) are
    joined with inner, left and outer semantics. Two further frames with
    identical columns are concatenated along rows and along columns.
    """
    print("\n[8] ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ")
    print("=" * 50)

    # Two frames that share a key column but cover different key values.
    left_frame = pd.DataFrame({
        '์‚ฌ์›ID': [1, 2, 3, 4],
        '์ด๋ฆ„': ['๊น€์ฒ ์ˆ˜', '์ด์˜ํฌ', '๋ฐ•๋ฏผ์ˆ˜', '์ •์ˆ˜์ง„'],
    })
    right_frame = pd.DataFrame({
        '์‚ฌ์›ID': [2, 3, 4, 5],
        '๋ถ€์„œ': ['๊ฐœ๋ฐœ', '์˜์—…', '๋งˆ์ผ€ํŒ…', 'HR'],
    })

    print(f"df1:\n{left_frame}\n")
    print(f"df2:\n{right_frame}\n")

    # Inner join keeps only keys present in both frames.
    inner_joined = left_frame.merge(right_frame, on='์‚ฌ์›ID', how='inner')
    print(f"Inner Join:\n{inner_joined}\n")

    # Left join keeps every key of the left frame.
    left_joined = left_frame.merge(right_frame, on='์‚ฌ์›ID', how='left')
    print(f"Left Join:\n{left_joined}\n")

    # Outer join keeps the union of keys from both frames.
    outer_joined = left_frame.merge(right_frame, on='์‚ฌ์›ID', how='outer')
    print(f"Outer Join:\n{outer_joined}\n")

    # Concatenation stacks frames without matching on a key.
    top = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    bottom = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
    pieces = [top, bottom]

    stacked = pd.concat(pieces, ignore_index=True)
    print(f"์„ธ๋กœ ์—ฐ๊ฒฐ (concat):\n{stacked}\n")

    side_by_side = pd.concat(pieces, axis=1)
    print(f"๊ฐ€๋กœ ์—ฐ๊ฒฐ (concat, axis=1):\n{side_by_side}")
273
274
# =============================================================================
# 9. Pivot Tables
# =============================================================================
def pivot_tables():
    """Print a simple and a multi-level pivot table built from one frame.

    Works on an eight-row frame created inside the function: three
    string-valued columns and one integer column that is aggregated.
    """
    print("\n[9] ํ”ผ๋ฒ— ํ…Œ์ด๋ธ”")
    print("=" * 50)

    long_form = pd.DataFrame({
        '๋‚ ์งœ': ['2024-01', '2024-01', '2024-02', '2024-02'] * 2,
        '์ง€์—ญ': ['์„œ์šธ', '๋ถ€์‚ฐ'] * 4,
        '์ œํ’ˆ': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
        '๋งค์ถœ': [100, 80, 120, 90, 60, 70, 80, 50],
    })
    print(f"์›๋ณธ:\n{long_form}\n")

    # One row key and one column key; cells hold the sum of the value column.
    simple = pd.pivot_table(
        long_form,
        values='๋งค์ถœ',
        index='์ง€์—ญ',
        columns='์ œํ’ˆ',
        aggfunc='sum',
    )
    print(f"ํ”ผ๋ฒ— ํ…Œ์ด๋ธ” (์ง€์—ญ x ์ œํ’ˆ):\n{simple}\n")

    # Two row keys and two aggregation functions in one table.
    row_keys = ['๋‚ ์งœ', '์ง€์—ญ']
    multi_level = pd.pivot_table(
        long_form,
        values='๋งค์ถœ',
        index=row_keys,
        columns='์ œํ’ˆ',
        aggfunc=['sum', 'mean'],
    )
    print(f"๋ณตํ•ฉ ํ”ผ๋ฒ— ํ…Œ์ด๋ธ”:\n{multi_level}")
308
309
# =============================================================================
# 10. Time Series Data
# =============================================================================
def time_series():
    """Print examples of date ranges, resampling and a moving average.

    Builds a ten-day cumulative random walk indexed by date, resamples a
    separate 100-day random series to monthly means, and adds a
    three-point rolling mean to the ten-day series. The values come from
    ``np.random.randn`` without a seed, so the printed numbers differ
    between runs.
    """
    print("\n[10] ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ")
    print("=" * 50)

    # Ten consecutive daily timestamps.
    dates = pd.date_range('2024-01-01', periods=10, freq='D')
    print(f"๋‚ ์งœ ๋ฒ”์œ„:\n{dates}\n")

    # A frame with a date column that is then promoted to the index.
    df = pd.DataFrame({
        '๋‚ ์งœ': dates,
        '๊ฐ’': np.random.randn(10).cumsum()
    })
    df['๋‚ ์งœ'] = pd.to_datetime(df['๋‚ ์งœ'])
    df.set_index('๋‚ ์งœ', inplace=True)
    print(f"์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ:\n{df}\n")

    # Resampling: 100 daily values reduced to one mean per calendar month.
    df_monthly = pd.DataFrame({
        '๊ฐ’': np.random.randn(100)
    }, index=pd.date_range('2024-01-01', periods=100, freq='D'))

    # 'ME' is the current month-end alias; the older 'M' spelling is
    # deprecated in recent pandas. Older releases reject 'ME' with a
    # ValueError, so fall back to 'M' there; both give month-end labels.
    try:
        monthly_mean = df_monthly.resample('ME').mean()
    except ValueError:
        monthly_mean = df_monthly.resample('M').mean()
    print(f"์›”๋ณ„ ํ‰๊ท :\n{monthly_mean}\n")

    # Moving average: the first window - 1 rows have no full window -> NaN.
    df['์ด๋™ํ‰๊ท '] = df['๊ฐ’'].rolling(window=3).mean()
    print(f"์ด๋™ ํ‰๊ท  (window=3):\n{df}")
342
343
# =============================================================================
# Main
# =============================================================================
def main():
    """Run every demo section in order, then print a summary cheat sheet.

    The frame returned by ``create_dataframe`` is passed to the two
    sections that take a DataFrame; the remaining sections build their own
    data and take no arguments.
    """
    banner = "=" * 60
    print(banner)
    print("Pandas ๊ธฐ์ดˆ ์˜ˆ์ œ")
    print(banner)

    # Sections 1-3 share the frame built by the first section.
    sample = create_dataframe()
    inspect_data(sample)
    indexing_selection(sample)

    # Sections 4-10 take no arguments and are run in sequence.
    standalone_sections = (
        modify_data,
        handle_missing,
        groupby_aggregation,
        sorting_ranking,
        merge_data,
        pivot_tables,
        time_series,
    )
    for run_section in standalone_sections:
        run_section()

    print("\n" + banner)
    print("Pandas ํ•ต์‹ฌ ์ •๋ฆฌ")
    print(banner)
    print("""
    ํ•ต์‹ฌ ์ž๋ฃŒ๊ตฌ์กฐ:
    - Series: 1์ฐจ์› (๋ผ๋ฒจ์ด ๋ถ™์€ ๋ฐฐ์—ด)
    - DataFrame: 2์ฐจ์› (ํ‘œ ํ˜•์‹)

    ์ž์ฃผ ์‚ฌ์šฉํ•˜๋Š” ๋ฉ”์„œ๋“œ:
    - ํ™•์ธ: head, tail, info, describe, shape
    - ์„ ํƒ: loc (๋ผ๋ฒจ), iloc (์œ„์น˜), query
    - ์ˆ˜์ •: drop, rename, fillna
    - ์ง‘๊ณ„: groupby, agg, pivot_table
    - ๋ณ‘ํ•ฉ: merge, concat, join

    ํŒ:
    - ์ฒด์ด๋‹: df.dropna().groupby('col').mean()
    - ๋ณต์‚ฌ: df.copy() vs ๋ทฐ (์Šฌ๋ผ์ด์‹ฑ)
    - ๋ฉ”๋ชจ๋ฆฌ: category ํƒ€์ž…์œผ๋กœ ๋ฌธ์ž์—ด ์ ˆ์•ฝ
    - ์„ฑ๋Šฅ: apply ๋Œ€์‹  ๋ฒกํ„ฐํ™” ์—ฐ์‚ฐ ์‚ฌ์šฉ
    """)
384
385
# Run the full demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()