1"""
Pandas Basics
Fundamental Pandas Operations

Pandas is a core library for data analysis.
6"""
7
8import pandas as pd
9import numpy as np
10
11
# =============================================================================
# 1. Creating DataFrames and Series
# =============================================================================
15def create_dataframe():
16 """DataFrame๊ณผ Series ์์ฑ"""
17 print("\n[1] DataFrame๊ณผ Series ์์ฑ")
18 print("=" * 50)
19
20 # Series ์์ฑ
21 s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
22 print(f"Series:\n{s}\n")
23
24 # ๋์
๋๋ฆฌ๋ก DataFrame ์์ฑ
25 data = {
26 '์ด๋ฆ': ['๊น์ฒ ์', '์ด์ํฌ', '๋ฐ๋ฏผ์', '์ ์์ง', '์ต๋์ฑ'],
27 '๋์ด': [25, 30, 35, 28, 32],
28 '๋์': ['์์ธ', '๋ถ์ฐ', '๋๊ตฌ', '์์ธ', '์ธ์ฒ'],
29 '์ ์': [85, 92, 78, 95, 88]
30 }
31 df = pd.DataFrame(data)
32 print(f"DataFrame:\n{df}\n")
33
34 # ๋ฆฌ์คํธ๋ก ์์ฑ
35 df2 = pd.DataFrame(
36 [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
37 columns=['A', 'B', 'C'],
38 index=['row1', 'row2', 'row3']
39 )
40 print(f"๋ฆฌ์คํธ๋ก ์์ฑ:\n{df2}")
41
42 return df
43
44
45# =============================================================================
46# 2. ๋ฐ์ดํฐ ํ์ธ
47# =============================================================================
48def inspect_data(df):
49 """๋ฐ์ดํฐ ํ์ธ ๋ฉ์๋"""
50 print("\n[2] ๋ฐ์ดํฐ ํ์ธ")
51 print("=" * 50)
52
53 print(f"์ฒ์ 2ํ:\n{df.head(2)}\n")
54 print(f"๋ง์ง๋ง 2ํ:\n{df.tail(2)}\n")
55 print(f"ํํ: {df.shape}")
56 print(f"์ปฌ๋ผ: {df.columns.tolist()}")
57 print(f"์ธ๋ฑ์ค: {df.index.tolist()}")
58 print(f"\n๋ฐ์ดํฐ ํ์
:\n{df.dtypes}\n")
59 print(f"๊ธฐ๋ณธ ํต๊ณ:\n{df.describe()}\n")
60 print(f"์ ๋ณด:")
61 df.info()
62
63
64# =============================================================================
65# 3. ์ธ๋ฑ์ฑ๊ณผ ์ ํ
66# =============================================================================
67def indexing_selection(df):
68 """์ธ๋ฑ์ฑ๊ณผ ์ ํ"""
69 print("\n[3] ์ธ๋ฑ์ฑ๊ณผ ์ ํ")
70 print("=" * 50)
71
72 print(f"์๋ณธ DataFrame:\n{df}\n")
73
74 # ์ปฌ๋ผ ์ ํ
75 print(f"df['์ด๋ฆ']:\n{df['์ด๋ฆ']}\n")
76 print(f"df[['์ด๋ฆ', '๋์ด']]:\n{df[['์ด๋ฆ', '๋์ด']]}\n")
77
78 # ํ ์ ํ (loc: ๋ผ๋ฒจ ๊ธฐ๋ฐ, iloc: ์์น ๊ธฐ๋ฐ)
79 print(f"df.loc[0]:\n{df.loc[0]}\n") # ์ฒซ ๋ฒ์งธ ํ
80 print(f"df.iloc[0:2]:\n{df.iloc[0:2]}\n") # ์ฒ์ 2ํ
81 print(f"df.loc[0, '์ด๋ฆ'] = {df.loc[0, '์ด๋ฆ']}") # ํน์ ๊ฐ
82 print(f"df.iloc[0, 1] = {df.iloc[0, 1]}") # ์์น๋ก ์ ๊ทผ
83
84 # ์กฐ๊ฑด๋ถ ์ ํ
85 print(f"\ndf[df['๋์ด'] > 28]:\n{df[df['๋์ด'] > 28]}")
86 print(f"\ndf[(df['๋์ด'] > 25) & (df['๋์'] == '์์ธ')]:\n{df[(df['๋์ด'] > 25) & (df['๋์'] == '์์ธ')]}")
87
88 # ์ฟผ๋ฆฌ ๋ฉ์๋
89 result = df.query("๋์ด > 28 and ์ ์ >= 90")
90 print(f"\ndf.query(\"๋์ด > 28 and ์ ์ >= 90\"):\n{result}")
91
92
# =============================================================================
# 4. Modifying Data
# =============================================================================
def modify_data() -> None:
    """Demonstrate adding, dropping, updating and renaming columns.

    Works on a small local 3x3 DataFrame and prints it after each step.
    """
    print("\n[4] 데이터 수정")
    print("=" * 50)

    df = pd.DataFrame({
        'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9],
    })
    print(f"원본:\n{df}\n")

    # Add a column computed from existing ones.
    df['D'] = df['A'] + df['B']
    print(f"컬럼 추가 (D = A + B):\n{df}\n")

    # drop() returns a new frame; df itself still has column D.
    df_dropped = df.drop('D', axis=1)
    print(f"컬럼 삭제:\n{df_dropped}\n")

    # Overwrite a single cell by label.
    df.loc[0, 'A'] = 100
    print(f"값 변경 (df.loc[0, 'A'] = 100):\n{df}\n")

    # Conditional assignment: set C to 0 wherever B > 4.
    df.loc[df['B'] > 4, 'C'] = 0
    print(f"조건부 변경:\n{df}\n")

    # rename() also returns a new frame.
    df_renamed = df.rename(columns={'A': 'Alpha', 'B': 'Beta'})
    print(f"컬럼 이름 변경:\n{df_renamed}")
127
128
# =============================================================================
# 5. Handling Missing Values
# =============================================================================
def handle_missing() -> None:
    """Demonstrate detecting, dropping and filling missing values.

    Builds a small DataFrame containing NaN and prints the result of
    ``isnull``, ``dropna``, constant fill, forward fill and mean fill.
    """
    print("\n[5] 결측치 처리")
    print("=" * 50)

    df = pd.DataFrame({
        'A': [1, 2, np.nan, 4],
        'B': [5, np.nan, np.nan, 8],
        'C': [9, 10, 11, 12],
    })
    print(f"원본 (NaN 포함):\n{df}\n")

    # Detect missing values: a boolean mask, then per-column counts.
    print(f"결측치 확인:\n{df.isnull()}\n")
    print(f"컬럼별 결측치 수:\n{df.isnull().sum()}\n")

    # Drop every row that contains at least one NaN.
    df_dropna = df.dropna()
    print(f"dropna() - 행 제거:\n{df_dropna}\n")

    # Fill with a constant.
    df_filled = df.fillna(0)
    print(f"fillna(0):\n{df_filled}\n")

    # Forward fill. fillna(method='ffill') is deprecated in recent pandas,
    # so the dedicated ffill() method is used instead.
    df_ffill = df.ffill()
    print(f"ffill() - 앞의 값으로:\n{df_ffill}\n")

    # Fill each column with its own mean.
    df_mean = df.fillna(df.mean())
    print(f"fillna(df.mean()) - 평균으로:\n{df_mean}")
161
162
# =============================================================================
# 6. Grouping and Aggregation
# =============================================================================
def groupby_aggregation() -> None:
    """Demonstrate ``groupby`` with single, multiple and multi-key aggregation.

    Uses a local six-row DataFrame with department ('부서'), name ('이름'),
    sales ('매출') and years of experience ('경력') columns.
    """
    print("\n[6] 그룹화와 집계")
    print("=" * 50)

    df = pd.DataFrame({
        '부서': ['영업', '개발', '영업', '개발', '영업', '개발'],
        '이름': ['김철수', '이영희', '박민수', '정수진', '최동욱', '강미영'],
        '매출': [100, 80, 120, 90, 110, 85],
        '경력': [3, 5, 7, 4, 6, 2],
    })
    print(f"원본:\n{df}\n")

    # Basic grouping: one key, one aggregated column.
    grouped = df.groupby('부서')
    print(f"부서별 매출 합계:\n{grouped['매출'].sum()}\n")
    print(f"부서별 매출 평균:\n{grouped['매출'].mean()}\n")

    # Several aggregation functions per column via a dict.
    agg_result = grouped.agg({
        '매출': ['sum', 'mean', 'max'],
        '경력': ['mean', 'min', 'max'],
    })
    print(f"다중 집계:\n{agg_result}\n")

    # Grouping by two keys: add a year column, then group on both.
    df['연도'] = [2023, 2023, 2024, 2024, 2023, 2024]
    multi_group = df.groupby(['부서', '연도'])['매출'].sum()
    print(f"부서, 연도별 매출:\n{multi_group}")
195
196
# =============================================================================
# 7. Sorting and Ranking
# =============================================================================
def sorting_ranking() -> None:
    """Demonstrate sorting by values, sorting by index and ranking.

    Uses a local five-row DataFrame with name ('이름'), score ('점수') and
    age ('나이') columns. The shuffle step is random, so that part of the
    output differs between runs.
    """
    print("\n[7] 정렬과 순위")
    print("=" * 50)

    df = pd.DataFrame({
        '이름': ['A', 'B', 'C', 'D', 'E'],
        '점수': [85, 92, 78, 95, 88],
        '나이': [25, 30, 25, 35, 28],
    })
    print(f"원본:\n{df}\n")

    # Sort by a single column, highest score first.
    sorted_df = df.sort_values('점수', ascending=False)
    print(f"점수 내림차순:\n{sorted_df}\n")

    # Sort by several columns: age ascending, then score descending.
    sorted_df2 = df.sort_values(['나이', '점수'], ascending=[True, False])
    print(f"나이 오름차순, 점수 내림차순:\n{sorted_df2}\n")

    # Shuffle all rows, then restore the original order via the index.
    df_shuffled = df.sample(frac=1)
    print(f"셔플된 데이터:\n{df_shuffled}")
    print(f"인덱스 정렬:\n{df_shuffled.sort_index()}\n")

    # Rank by score; rank 1.0 is the highest score.
    df['순위'] = df['점수'].rank(ascending=False)
    print(f"순위 추가:\n{df}")
228
229
# =============================================================================
# 8. Merging Data
# =============================================================================
233def merge_data():
234 """๋ฐ์ดํฐ ๋ณํฉ"""
235 print("\n[8] ๋ฐ์ดํฐ ๋ณํฉ")
236 print("=" * 50)
237
238 # ๋ DataFrame ์ค๋น
239 df1 = pd.DataFrame({
240 '์ฌ์ID': [1, 2, 3, 4],
241 '์ด๋ฆ': ['๊น์ฒ ์', '์ด์ํฌ', '๋ฐ๋ฏผ์', '์ ์์ง']
242 })
243
244 df2 = pd.DataFrame({
245 '์ฌ์ID': [2, 3, 4, 5],
246 '๋ถ์': ['๊ฐ๋ฐ', '์์
', '๋ง์ผํ
', 'HR']
247 })
248
249 print(f"df1:\n{df1}\n")
250 print(f"df2:\n{df2}\n")
251
252 # Inner Join
253 inner = pd.merge(df1, df2, on='์ฌ์ID', how='inner')
254 print(f"Inner Join:\n{inner}\n")
255
256 # Left Join
257 left = pd.merge(df1, df2, on='์ฌ์ID', how='left')
258 print(f"Left Join:\n{left}\n")
259
260 # Outer Join
261 outer = pd.merge(df1, df2, on='์ฌ์ID', how='outer')
262 print(f"Outer Join:\n{outer}\n")
263
264 # Concat
265 df_a = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
266 df_b = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
267
268 concat_rows = pd.concat([df_a, df_b], ignore_index=True)
269 print(f"์ธ๋ก ์ฐ๊ฒฐ (concat):\n{concat_rows}\n")
270
271 concat_cols = pd.concat([df_a, df_b], axis=1)
272 print(f"๊ฐ๋ก ์ฐ๊ฒฐ (concat, axis=1):\n{concat_cols}")
273
274
# =============================================================================
# 9. Pivot Tables
# =============================================================================
def pivot_tables() -> None:
    """Demonstrate ``pivot_table`` with single and multi-level indexes.

    Uses a local eight-row DataFrame with date ('날짜'), region ('지역'),
    product ('제품') and sales ('매출') columns.
    """
    print("\n[9] 피벗 테이블")
    print("=" * 50)

    df = pd.DataFrame({
        '날짜': ['2024-01', '2024-01', '2024-02', '2024-02'] * 2,
        '지역': ['서울', '부산'] * 4,
        '제품': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
        '매출': [100, 80, 120, 90, 60, 70, 80, 50],
    })
    print(f"원본:\n{df}\n")

    # Region x product table of summed sales.
    pivot = df.pivot_table(
        values='매출',
        index='지역',
        columns='제품',
        aggfunc='sum',
    )
    print(f"피벗 테이블 (지역 x 제품):\n{pivot}\n")

    # Two index levels and two aggregation functions at once.
    pivot2 = df.pivot_table(
        values='매출',
        index=['날짜', '지역'],
        columns='제품',
        aggfunc=['sum', 'mean'],
    )
    print(f"복합 피벗 테이블:\n{pivot2}")
308
309
# =============================================================================
# 10. Time Series Data
# =============================================================================
def time_series() -> None:
    """Demonstrate date ranges, a datetime index, resampling and rolling means.

    The values come from the unseeded NumPy random generator, so the
    printed numbers differ between runs.
    """
    print("\n[10] 시계열 데이터")
    print("=" * 50)

    # Ten consecutive days starting 2024-01-01.
    dates = pd.date_range('2024-01-01', periods=10, freq='D')
    print(f"날짜 범위:\n{dates}\n")

    # Time-series frame: a random walk indexed by date.
    df = pd.DataFrame({
        '날짜': dates,
        '값': np.random.randn(10).cumsum(),
    })
    # The column is already datetime; to_datetime is kept to show the
    # conversion that is normally needed for string dates.
    df['날짜'] = pd.to_datetime(df['날짜'])
    df.set_index('날짜', inplace=True)
    print(f"시계열 데이터:\n{df}\n")

    # Resampling: 100 daily values aggregated to month-end means.
    df_monthly = pd.DataFrame({
        '값': np.random.randn(100),
    }, index=pd.date_range('2024-01-01', periods=100, freq='D'))

    # Newer pandas spells the month-end alias 'ME' and deprecates 'M';
    # older releases reject 'ME' with ValueError, so fall back to 'M' there.
    try:
        monthly_mean = df_monthly.resample('ME').mean()
    except ValueError:
        monthly_mean = df_monthly.resample('M').mean()
    print(f"월별 평균:\n{monthly_mean}\n")

    # Rolling mean over 3 rows; the first two rows are NaN.
    df['이동평균'] = df['값'].rolling(window=3).mean()
    print(f"이동 평균 (window=3):\n{df}")
342
343
# =============================================================================
# Main
# =============================================================================
def main() -> None:
    """Run every demo section in order, then print a summary cheat sheet."""
    print("=" * 60)
    print("Pandas 기초 예제")
    print("=" * 60)

    # The DataFrame built in section 1 is reused by sections 2 and 3;
    # the remaining sections build their own data.
    df = create_dataframe()
    inspect_data(df)
    indexing_selection(df)
    modify_data()
    handle_missing()
    groupby_aggregation()
    sorting_ranking()
    merge_data()
    pivot_tables()
    time_series()

    print("\n" + "=" * 60)
    print("Pandas 핵심 정리")
    print("=" * 60)
    print("""
    핵심 자료구조:
    - Series: 1차원 (라벨이 붙은 배열)
    - DataFrame: 2차원 (표 형식)

    자주 사용하는 메서드:
    - 확인: head, tail, info, describe, shape
    - 선택: loc (라벨), iloc (위치), query
    - 수정: drop, rename, fillna
    - 집계: groupby, agg, pivot_table
    - 병합: merge, concat, join

    팁:
    - 체이닝: df.dropna().groupby('col').mean()
    - 복사: df.copy() vs 뷰 (슬라이싱)
    - 메모리: category 타입으로 문자열 절약
    - 성능: apply 대신 벡터화 연산 사용
    """)


if __name__ == "__main__":
    main()