3. Pandas ๊ธฐ์ด
3. Pandas ๊ธฐ์ด¶
์ด์ : NumPy ๊ณ ๊ธ | ๋ค์: Pandas ๋ฐ์ดํฐ ์กฐ์
๊ฐ์¶
Pandas๋ Python์์ ๋ฐ์ดํฐ ๋ถ์์ ์ํ ํต์ฌ ๋ผ์ด๋ธ๋ฌ๋ฆฌ์ ๋๋ค. ํ ์ด๋ธ ํํ์ ๋ฐ์ดํฐ๋ฅผ ํจ์จ์ ์ผ๋ก ๋ค๋ฃจ๊ธฐ ์ํ DataFrame๊ณผ Series ์๋ฃ๊ตฌ์กฐ๋ฅผ ์ ๊ณตํฉ๋๋ค.
1. Pandas ์๋ฃ๊ตฌ์กฐ¶
1.1 Series¶
Series๋ 1์ฐจ์ ๋ ์ด๋ธ์ด ์๋ ๋ฐฐ์ด์ ๋๋ค.
import pandas as pd
import numpy as np
# ๋ฆฌ์คํธ๋ก๋ถํฐ Series ์์ฑ
s = pd.Series([10, 20, 30, 40, 50])
print(s)
# 0 10
# 1 20
# 2 30
# 3 40
# 4 50
# dtype: int64
# ์ธ๋ฑ์ค ์ง์
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s)
# a 10
# b 20
# c 30
# ๋์
๋๋ฆฌ๋ก๋ถํฐ ์์ฑ
d = {'apple': 100, 'banana': 200, 'cherry': 150}
s = pd.Series(d)
print(s)
# Series ์์ฑ
print(s.values) # ๊ฐ ๋ฐฐ์ด
print(s.index) # ์ธ๋ฑ์ค
print(s.dtype) # ๋ฐ์ดํฐ ํ์
print(s.name) # Series ์ด๋ฆ
1.2 DataFrame¶
DataFrame์ 2์ฐจ์ ํ ์ด๋ธ ํํ์ ์๋ฃ๊ตฌ์กฐ์ ๋๋ค.
# ๋์
๋๋ฆฌ๋ก๋ถํฐ DataFrame ์์ฑ
data = {
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['Seoul', 'Busan', 'Incheon']
}
df = pd.DataFrame(data)
print(df)
# name age city
# 0 Alice 25 Seoul
# 1 Bob 30 Busan
# 2 Charlie 35 Incheon
# ๋ฆฌ์คํธ์ ๋ฆฌ์คํธ๋ก ์์ฑ
data = [
['Alice', 25, 'Seoul'],
['Bob', 30, 'Busan'],
['Charlie', 35, 'Incheon']
]
df = pd.DataFrame(data, columns=['name', 'age', 'city'])
# NumPy ๋ฐฐ์ด๋ก ์์ฑ
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
# ์ธ๋ฑ์ค ์ง์
df = pd.DataFrame(data,
columns=['name', 'age', 'city'],
index=['p1', 'p2', 'p3'])
1.3 DataFrame ์์ฑ¶
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'salary': [50000, 60000, 70000]
})
# ๊ธฐ๋ณธ ์์ฑ
print(df.shape) # (3, 3)
print(df.columns) # Index(['name', 'age', 'salary'], dtype='object')
print(df.index) # RangeIndex(start=0, stop=3, step=1)
print(df.dtypes) # ๊ฐ ์ด์ ๋ฐ์ดํฐ ํ์
print(df.values) # NumPy ๋ฐฐ์ด
print(df.size) # 9 (์ ์ฒด ์์ ์)
print(len(df)) # 3 (ํ ์)
# ๋ฉ๋ชจ๋ฆฌ ์ฌ์ฉ๋
print(df.memory_usage())
# ๋ฐ์ดํฐ ์์ฝ
print(df.info())
print(df.describe()) # ์์นํ ์ด์ ํต๊ณ ์์ฝ
2. ๋ฐ์ดํฐ ๋ก๋ฉ¶
2.1 CSV ํ์ผ¶
# CSV ์ฝ๊ธฐ
df = pd.read_csv('data.csv')
# ์ต์
์ง์
df = pd.read_csv('data.csv',
sep=',', # ๊ตฌ๋ถ์
header=0, # ํค๋ ํ (None์ด๋ฉด ์์)
index_col=0, # ์ธ๋ฑ์ค๋ก ์ฌ์ฉํ ์ด
usecols=['A', 'B'], # ์ฝ์ ์ด ์ง์
dtype={'A': int}, # ๋ฐ์ดํฐ ํ์
์ง์
na_values=['NA', 'N/A'], # ๊ฒฐ์ธก๊ฐ์ผ๋ก ์ฒ๋ฆฌํ ๊ฐ
encoding='utf-8', # ์ธ์ฝ๋ฉ
nrows=100) # ์ฝ์ ํ ์
# ๋์ฉ๋ ํ์ผ ์ฒญํฌ๋ก ์ฝ๊ธฐ
chunks = pd.read_csv('large_data.csv', chunksize=10000)
for chunk in chunks:
process(chunk)
# CSV ์ ์ฅ
df.to_csv('output.csv', index=False)
2.2 Excel ํ์ผ¶
# Excel ์ฝ๊ธฐ (openpyxl ๋๋ xlrd ํ์)
df = pd.read_excel('data.xlsx')
# ์ํธ ์ง์
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# ์ฌ๋ฌ ์ํธ ์ฝ๊ธฐ
sheets = pd.read_excel('data.xlsx', sheet_name=None) # ๋์
๋๋ฆฌ ๋ฐํ
# Excel ์ ์ฅ
df.to_excel('output.xlsx', index=False, sheet_name='Data')
# ์ฌ๋ฌ ์ํธ ์ ์ฅ
with pd.ExcelWriter('output.xlsx') as writer:
df1.to_excel(writer, sheet_name='Sheet1')
df2.to_excel(writer, sheet_name='Sheet2')
2.3 JSON ํ์ผ¶
# JSON ์ฝ๊ธฐ
df = pd.read_json('data.json')
# JSON ํ์ ์ง์
df = pd.read_json('data.json', orient='records')
# orient: 'split', 'records', 'index', 'columns', 'values'
# JSON ์ ์ฅ
df.to_json('output.json', orient='records')
# ์ค๋ฐ๊ฟ์ผ๋ก ๊ตฌ๋ถ๋ JSON (JSON Lines)
df = pd.read_json('data.jsonl', lines=True)
df.to_json('output.jsonl', orient='records', lines=True)
2.4 SQL ๋ฐ์ดํฐ๋ฒ ์ด์ค¶
import sqlite3
from sqlalchemy import create_engine
# SQLite ์ฐ๊ฒฐ
conn = sqlite3.connect('database.db')
df = pd.read_sql('SELECT * FROM users', conn)
conn.close()
# SQLAlchemy ์์ง ์ฌ์ฉ
engine = create_engine('postgresql://user:pass@host:5432/db')
df = pd.read_sql('SELECT * FROM users', engine)
# ํ
์ด๋ธ ์ฝ๊ธฐ
df = pd.read_sql_table('users', engine)
# ์ฟผ๋ฆฌ ์คํ
df = pd.read_sql_query('SELECT * FROM users WHERE age > 30', engine)
# DataFrame์ SQL๋ก ์ ์ฅ
df.to_sql('users', engine, if_exists='replace', index=False)
# if_exists: 'fail', 'replace', 'append'
2.5 ๊ธฐํ ํ์¶
# HTML ํ
์ด๋ธ
dfs = pd.read_html('https://example.com/table.html')
df = dfs[0] # ์ฒซ ๋ฒ์งธ ํ
์ด๋ธ
# ํด๋ฆฝ๋ณด๋
df = pd.read_clipboard()
# Parquet (pyarrow ํ์)
df = pd.read_parquet('data.parquet')
df.to_parquet('output.parquet')
# Pickle
df = pd.read_pickle('data.pkl')
df.to_pickle('output.pkl')
# HDF5 (tables ํ์)
df = pd.read_hdf('data.h5', key='df')
df.to_hdf('output.h5', key='df')
3. ๋ฐ์ดํฐ ์ ํ๊ณผ ์ ๊ทผ¶
3.1 ์ด ์ ํ¶
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['Seoul', 'Busan', 'Incheon']
})
# ๋จ์ผ ์ด ์ ํ (Series ๋ฐํ)
print(df['name'])
print(df.name) # ์์ฑ ์ ๊ทผ (์ด ์ด๋ฆ์ด ํ์ด์ฌ ์๋ณ์์ผ ๋)
# ์ฌ๋ฌ ์ด ์ ํ (DataFrame ๋ฐํ)
print(df[['name', 'age']])
3.2 ํ ์ ํ¶
# ์ฌ๋ผ์ด์ฑ
print(df[0:2]) # ์ฒ์ 2ํ
# ์กฐ๊ฑด ํํฐ๋ง
print(df[df['age'] > 25])
print(df[df['city'].isin(['Seoul', 'Busan'])])
3.3 loc - ๋ ์ด๋ธ ๊ธฐ๋ฐ ์ ํ¶
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['Seoul', 'Busan', 'Incheon']
}, index=['a', 'b', 'c'])
# ๋จ์ผ ํ
print(df.loc['a'])
# ์ฌ๋ฌ ํ
print(df.loc[['a', 'c']])
# ํ๊ณผ ์ด
print(df.loc['a', 'name']) # ๋จ์ผ ๊ฐ
print(df.loc['a':'b', 'name':'age']) # ๋ฒ์ ์ฌ๋ผ์ด์ฑ
# ์กฐ๊ฑด๊ณผ ํจ๊ป
print(df.loc[df['age'] > 25, ['name', 'city']])
3.4 iloc - ์ ์ ๊ธฐ๋ฐ ์ ํ¶
# ๋จ์ผ ํ
print(df.iloc[0])
# ์ฌ๋ฌ ํ
print(df.iloc[[0, 2]])
# ํ๊ณผ ์ด
print(df.iloc[0, 1]) # ๋จ์ผ ๊ฐ
print(df.iloc[0:2, 0:2]) # ๋ฒ์ ์ฌ๋ผ์ด์ฑ
print(df.iloc[[0, 2], [0, 2]]) # ํน์ ์์น
# ์์ ์ธ๋ฑ์ค
print(df.iloc[-1]) # ๋ง์ง๋ง ํ
3.5 at๊ณผ iat - ๋จ์ผ ๊ฐ ์ ๊ทผ¶
# at: ๋ ์ด๋ธ ๊ธฐ๋ฐ ๋จ์ผ ๊ฐ
print(df.at['a', 'name'])
# iat: ์ ์ ๊ธฐ๋ฐ ๋จ์ผ ๊ฐ
print(df.iat[0, 0])
# ๊ฐ ์์
df.at['a', 'age'] = 26
df.iat[0, 1] = 27
4. ๋ฐ์ดํฐ ํ์ธ๊ณผ ํ์¶
4.1 ๋ฐ์ดํฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ¶
df = pd.DataFrame({
'A': range(100),
'B': range(100, 200),
'C': range(200, 300)
})
# ์ฒ์/๋ ํ์ธ
print(df.head()) # ์ฒ์ 5ํ
print(df.head(10)) # ์ฒ์ 10ํ
print(df.tail()) # ๋ง์ง๋ง 5ํ
print(df.tail(3)) # ๋ง์ง๋ง 3ํ
# ๋๋ค ์ํ
print(df.sample(5)) # ๋๋ค 5ํ
print(df.sample(frac=0.1)) # 10% ์ํ
4.2 ๋ฐ์ดํฐ ์ ๋ณด¶
df = pd.DataFrame({
'name': ['Alice', 'Bob', None, 'Diana'],
'age': [25, 30, 35, None],
'salary': [50000.0, 60000.0, 70000.0, 80000.0]
})
# ๊ธฐ๋ณธ ์ ๋ณด
print(df.info())
# ์ถ๋ ฅ ์์:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 name 3 non-null object
# 1 age 3 non-null float64
# 2 salary 4 non-null float64
# dtypes: float64(2), object(1)
# memory usage: 224.0+ bytes
# ํต๊ณ ์์ฝ
print(df.describe())
print(df.describe(include='all')) # ๋ชจ๋ ์ด ํฌํจ
4.3 ๊ณ ์ ๊ฐ๊ณผ ๋น๋¶
df = pd.DataFrame({
'category': ['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C'],
'value': [10, 20, 30, 40, 50, 60, 70, 80]
})
# ๊ณ ์ ๊ฐ
print(df['category'].unique()) # ['A' 'B' 'C']
print(df['category'].nunique()) # 3
# ๋น๋
print(df['category'].value_counts())
# A 4
# B 2
# C 2
# ์ ๊ทํ๋ ๋น๋
print(df['category'].value_counts(normalize=True))
4.4 ๊ฒฐ์ธก๊ฐ ํ์ธ¶
df = pd.DataFrame({
'A': [1, 2, None, 4],
'B': [None, 2, 3, 4],
'C': [1, 2, 3, None]
})
# ๊ฒฐ์ธก๊ฐ ํ์ธ
print(df.isna()) # ๋ถ๋ฆฌ์ธ DataFrame
print(df.isnull()) # isna์ ๋์ผ
# ๊ฒฐ์ธก๊ฐ ๊ฐ์
print(df.isna().sum()) # ์ด๋ณ ๊ฒฐ์ธก๊ฐ ์
print(df.isna().sum().sum()) # ์ ์ฒด ๊ฒฐ์ธก๊ฐ ์
# ๊ฒฐ์ธก๊ฐ์ด ์๋ ํ/์ด
print(df[df.isna().any(axis=1)]) # ๊ฒฐ์ธก๊ฐ์ด ์๋ ํ
5. ๊ธฐ๋ณธ ์ฐ์ฐ¶
5.1 ์ฐ์ ์ฐ์ฐ¶
df = pd.DataFrame({
'A': [1, 2, 3, 4],
'B': [10, 20, 30, 40]
})
# ์ค์นผ๋ผ ์ฐ์ฐ
print(df + 10)
print(df * 2)
print(df ** 2)
# ์ด ๊ฐ ์ฐ์ฐ
df['C'] = df['A'] + df['B']
df['D'] = df['B'] / df['A']
# DataFrame ๊ฐ ์ฐ์ฐ (์ธ๋ฑ์ค ์ ๋ ฌ)
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=[0, 1])
df2 = pd.DataFrame({'A': [10, 20], 'B': [30, 40]}, index=[1, 2])
print(df1 + df2) # ์ธ๋ฑ์ค๊ฐ ์ผ์นํ๋ ๋ถ๋ถ๋ง ์ฐ์ฐ
# ๊ฒฐ์ธก๊ฐ ์ฒ๋ฆฌํ๋ฉฐ ์ฐ์ฐ
print(df1.add(df2, fill_value=0))
5.2 ์ง๊ณ ํจ์¶
df = pd.DataFrame({
'A': [1, 2, 3, 4, 5],
'B': [10, 20, 30, 40, 50]
})
# ๊ธฐ๋ณธ ์ง๊ณ
print(df.sum()) # ์ด๋ณ ํฉ๊ณ
print(df.mean()) # ์ด๋ณ ํ๊ท
print(df.median()) # ์ค์๊ฐ
print(df.std()) # ํ์คํธ์ฐจ
print(df.var()) # ๋ถ์ฐ
print(df.min()) # ์ต์๊ฐ
print(df.max()) # ์ต๋๊ฐ
print(df.count()) # ๋น๊ฒฐ์ธก๊ฐ ๊ฐ์
# ์ถ ์ง์
print(df.sum(axis=0)) # ์ด๋ณ (๊ธฐ๋ณธ๊ฐ)
print(df.sum(axis=1)) # ํ๋ณ
# ๋์ ํจ์
print(df.cumsum()) # ๋์ ํฉ
print(df.cumprod()) # ๋์ ๊ณฑ
print(df.cummax()) # ๋์ ์ต๋
print(df.cummin()) # ๋์ ์ต์
5.3 ์ ๋ ฌ¶
df = pd.DataFrame({
'name': ['Charlie', 'Alice', 'Bob'],
'age': [35, 25, 30],
'score': [85, 95, 75]
})
# ๊ฐ ๊ธฐ์ค ์ ๋ ฌ
print(df.sort_values('age'))
print(df.sort_values('age', ascending=False))
# ์ฌ๋ฌ ์ด ๊ธฐ์ค
print(df.sort_values(['age', 'score']))
print(df.sort_values(['age', 'score'], ascending=[True, False]))
# ์ธ๋ฑ์ค ์ ๋ ฌ
df = df.set_index('name')
print(df.sort_index())
print(df.sort_index(ascending=False))
# ์ ๋ ฌ ์์
print(df.rank()) # ์์
6. ๋ฐ์ดํฐ ์์ ¶
6.1 ์ด ์ถ๊ฐ/์์ ¶
df = pd.DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]
})
# ์ ์ด ์ถ๊ฐ
df['C'] = [7, 8, 9]
df['D'] = df['A'] + df['B']
df['E'] = 10 # ์ค์นผ๋ผ ๊ฐ
# assign ๋ฉ์๋ (์๋ณธ ์ ์ง)
df2 = df.assign(F=lambda x: x['A'] * 2,
G=[10, 20, 30])
# insert (ํน์ ์์น์ ์ฝ์
)
df.insert(1, 'new_col', [100, 200, 300])
6.2 ์ด ์ญ์ ¶
# drop ๋ฉ์๋
df = df.drop('C', axis=1)
df = df.drop(['D', 'E'], axis=1)
# del ํค์๋
del df['B']
# pop ๋ฉ์๋ (์ญ์ ํ๊ณ ๋ฐํ)
col = df.pop('A')
6.3 ํ ์ถ๊ฐ/์์ ¶
df = pd.DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]
})
# ํ ์ถ๊ฐ (concat ์ฌ์ฉ)
new_row = pd.DataFrame({'A': [4], 'B': [7]})
df = pd.concat([df, new_row], ignore_index=True)
# loc์ผ๋ก ์ถ๊ฐ
df.loc[len(df)] = [5, 8]
# ํ ์ญ์
df = df.drop(0) # ์ธ๋ฑ์ค 0 ์ญ์
df = df.drop([1, 2]) # ์ฌ๋ฌ ํ ์ญ์
6.4 ๊ฐ ์์ ¶
df = pd.DataFrame({
'A': [1, 2, 3],
'B': [4, 5, 6]
})
# ์กฐ๊ฑด์ ๋ฐ๋ฅธ ์์
df.loc[df['A'] > 1, 'B'] = 0
# replace
df['A'] = df['A'].replace(1, 100)
df = df.replace({2: 200, 3: 300})
# where (์กฐ๊ฑด์ด False์ธ ๊ณณ์ ์์ )
df['A'] = df['A'].where(df['A'] > 100, 0)
# mask (์กฐ๊ฑด์ด True์ธ ๊ณณ์ ์์ )
df['B'] = df['B'].mask(df['B'] < 5, -1)
7. ๋ฌธ์์ด ์ฒ๋ฆฌ¶
Pandas๋ .str ์ ๊ทผ์๋ฅผ ํตํด ๋ฌธ์์ด ๋ฉ์๋๋ฅผ ์ ๊ณตํฉ๋๋ค.
df = pd.DataFrame({
'name': [' Alice ', 'BOB', 'charlie'],
'email': ['alice@test.com', 'bob@example.com', 'charlie@test.com']
})
# ๋์๋ฌธ์
print(df['name'].str.lower())
print(df['name'].str.upper())
print(df['name'].str.title())
print(df['name'].str.capitalize())
# ๊ณต๋ฐฑ ์ ๊ฑฐ
print(df['name'].str.strip())
print(df['name'].str.lstrip())
print(df['name'].str.rstrip())
# ๋ฌธ์์ด ๊ธธ์ด
print(df['name'].str.len())
# ๋ฌธ์์ด ํฌํจ ์ฌ๋ถ
print(df['email'].str.contains('test'))
print(df['name'].str.startswith('A'))
print(df['name'].str.endswith('e'))
# ๋ฌธ์์ด ๋ถ๋ฆฌ
print(df['email'].str.split('@'))
print(df['email'].str.split('@').str[0]) # ์ฒซ ๋ฒ์งธ ์์
# ๋ฌธ์์ด ๊ต์ฒด
print(df['email'].str.replace('test', 'example'))
# ์ ๊ท ํํ์
print(df['email'].str.extract(r'@(.+)\.com'))
print(df['email'].str.findall(r'\w+'))
์ฐ์ต ๋ฌธ์ ¶
๋ฌธ์ 1: ๋ฐ์ดํฐ ๋ก๋ฉ๊ณผ ํ์¶
๋ค์ ๋ฐ์ดํฐ๋ฅผ DataFrame์ผ๋ก ์์ฑํ๊ณ ๊ธฐ๋ณธ ์ ๋ณด๋ฅผ ํ์ธํ์ธ์.
data = {
'product': ['Apple', 'Banana', 'Cherry', 'Date'],
'price': [1000, 500, 2000, 1500],
'quantity': [50, 100, 30, 45]
}
# ํ์ด
df = pd.DataFrame(data)
print(df.info())
print(df.describe())
print(df['price'].mean()) # ํ๊ท ๊ฐ๊ฒฉ
๋ฌธ์ 2: ๋ฐ์ดํฐ ์ ํ¶
price๊ฐ 1000 ์ด์์ธ ์ ํ์ ์ด๋ฆ๊ณผ ์๋๋ง ์ ํํ์ธ์.
# ํ์ด
result = df.loc[df['price'] >= 1000, ['product', 'quantity']]
print(result)
๋ฌธ์ 3: ์ด ์ถ๊ฐ¶
์ด ๊ธ์ก(price * quantity) ์ด์ ์ถ๊ฐํ์ธ์.
# ํ์ด
df['total'] = df['price'] * df['quantity']
print(df)
์์ฝ¶
| ๊ธฐ๋ฅ | ํจ์/๋ฉ์๋ |
|---|---|
| ๋ฐ์ดํฐ ๋ก๋ฉ | pd.read_csv(), pd.read_excel(), pd.read_json(), pd.read_sql() |
| ๋ฐ์ดํฐ ์ ์ฅ | to_csv(), to_excel(), to_json(), to_sql() |
| ์ด ์ ํ | df['col'], df[['col1', 'col2']] |
| ํ ์ ํ | df.loc[], df.iloc[], df[condition] |
| ๋ฐ์ดํฐ ํ์ธ | head(), tail(), info(), describe() |
| ์ง๊ณ | sum(), mean(), count(), min(), max() |
| ์ ๋ ฌ | sort_values(), sort_index() |
| ๋ฌธ์์ด | df['col'].str.method() |