3. Pandas 기초

3. Pandas 기초

이전: NumPy 고급 | 다음: Pandas 데이터 조작

๊ฐœ์š”

Pandas๋Š” Python์—์„œ ๋ฐ์ดํ„ฐ ๋ถ„์„์„ ์œ„ํ•œ ํ•ต์‹ฌ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ์ž…๋‹ˆ๋‹ค. ํ…Œ์ด๋ธ” ํ˜•ํƒœ์˜ ๋ฐ์ดํ„ฐ๋ฅผ ํšจ์œจ์ ์œผ๋กœ ๋‹ค๋ฃจ๊ธฐ ์œ„ํ•œ DataFrame๊ณผ Series ์ž๋ฃŒ๊ตฌ์กฐ๋ฅผ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค.


1. Pandas ์ž๋ฃŒ๊ตฌ์กฐ

1.1 Series

Series๋Š” 1์ฐจ์› ๋ ˆ์ด๋ธ”์ด ์žˆ๋Š” ๋ฐฐ์—ด์ž…๋‹ˆ๋‹ค.

import pandas as pd
import numpy as np

# Create a Series from a list
s = pd.Series([10, 20, 30, 40, 50])
print(s)
# 0    10
# 1    20
# 2    30
# 3    40
# 4    50
# dtype: int64

# Specify the index
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s)
# a    10
# b    20
# c    30

# Create from a dictionary
d = {'apple': 100, 'banana': 200, 'cherry': 150}
s = pd.Series(d)
print(s)

# Series attributes
print(s.values)  # array of values
print(s.index)   # index
print(s.dtype)   # data type
print(s.name)    # name of the Series

1.2 DataFrame

DataFrame์€ 2์ฐจ์› ํ…Œ์ด๋ธ” ํ˜•ํƒœ์˜ ์ž๋ฃŒ๊ตฌ์กฐ์ž…๋‹ˆ๋‹ค.

# Create a DataFrame from a dictionary
data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['Seoul', 'Busan', 'Incheon']
}
df = pd.DataFrame(data)
print(df)
#       name  age     city
# 0    Alice   25    Seoul
# 1      Bob   30    Busan
# 2  Charlie   35  Incheon

# Create from a list of lists
data = [
    ['Alice', 25, 'Seoul'],
    ['Bob', 30, 'Busan'],
    ['Charlie', 35, 'Incheon']
]
df = pd.DataFrame(data, columns=['name', 'age', 'city'])

# Create from a NumPy array
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])

# Specify the index
df = pd.DataFrame(data,
                  columns=['name', 'age', 'city'],
                  index=['p1', 'p2', 'p3'])

1.3 DataFrame 속성

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'salary': [50000, 60000, 70000]
})

# Basic attributes
print(df.shape)      # (3, 3)
print(df.columns)    # Index(['name', 'age', 'salary'], dtype='object')
print(df.index)      # RangeIndex(start=0, stop=3, step=1)
print(df.dtypes)     # data type of each column
print(df.values)     # NumPy array
print(df.size)       # 9 (total number of elements)
print(len(df))       # 3 (number of rows)

# Memory usage
print(df.memory_usage())

# Data summary
# info() writes its report to stdout itself and returns None, so it is called
# bare; wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())  # statistical summary of the numeric columns

2. 데이터 로딩

2.1 CSV 파일

# Read a CSV file
df = pd.read_csv('data.csv')

# With options
df = pd.read_csv('data.csv',
                 sep=',',           # delimiter
                 header=0,          # header row (None means no header)
                 index_col=0,       # column to use as the index
                 usecols=['A', 'B'], # columns to read
                 dtype={'A': int},   # data types
                 na_values=['NA', 'N/A'],  # values to treat as missing
                 encoding='utf-8',   # encoding
                 nrows=100)          # number of rows to read

# Read a large file in chunks
# NOTE(review): `process` is not defined in this snippet — presumably a placeholder for user code
chunks = pd.read_csv('large_data.csv', chunksize=10000)
for chunk in chunks:
    process(chunk)

# Save to CSV
df.to_csv('output.csv', index=False)

2.2 Excel 파일

# Read Excel (requires openpyxl or xlrd)
df = pd.read_excel('data.xlsx')

# Select a sheet
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Read multiple sheets
sheets = pd.read_excel('data.xlsx', sheet_name=None)  # returns a dictionary

# Save to Excel
df.to_excel('output.xlsx', index=False, sheet_name='Data')

# Save multiple sheets
# NOTE(review): df1 and df2 are not defined before this point in the file — presumably placeholder frames
with pd.ExcelWriter('output.xlsx') as writer:
    df1.to_excel(writer, sheet_name='Sheet1')
    df2.to_excel(writer, sheet_name='Sheet2')

2.3 JSON 파일

# Read JSON
df = pd.read_json('data.json')

# Specify the JSON layout
df = pd.read_json('data.json', orient='records')
# orient: 'split', 'records', 'index', 'columns', 'values'

# Save to JSON
df.to_json('output.json', orient='records')

# Newline-delimited JSON (JSON Lines)
df = pd.read_json('data.jsonl', lines=True)
df.to_json('output.jsonl', orient='records', lines=True)

2.4 SQL 데이터베이스

import sqlite3
from sqlalchemy import create_engine

# SQLite connection
# try/finally guarantees the connection is closed even if the query raises.
conn = sqlite3.connect('database.db')
try:
    df = pd.read_sql('SELECT * FROM users', conn)
finally:
    conn.close()

# Using a SQLAlchemy engine
engine = create_engine('postgresql://user:pass@host:5432/db')
df = pd.read_sql('SELECT * FROM users', engine)

# Read a whole table
df = pd.read_sql_table('users', engine)

# Run a query
df = pd.read_sql_query('SELECT * FROM users WHERE age > 30', engine)

# Save a DataFrame to SQL
df.to_sql('users', engine, if_exists='replace', index=False)
# if_exists: 'fail', 'replace', 'append'

2.5 기타 형식

# HTML tables
dfs = pd.read_html('https://example.com/table.html')
df = dfs[0]  # first table

# Clipboard
df = pd.read_clipboard()

# Parquet (requires pyarrow)
df = pd.read_parquet('data.parquet')
df.to_parquet('output.parquet')

# Pickle
# NOTE: only unpickle files from a trusted source — loading a pickle can execute arbitrary code
df = pd.read_pickle('data.pkl')
df.to_pickle('output.pkl')

# HDF5 (requires tables)
df = pd.read_hdf('data.h5', key='df')
df.to_hdf('output.h5', key='df')

3. 데이터 선택과 접근

3.1 열 선택

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['Seoul', 'Busan', 'Incheon']
})

# Select a single column (returns a Series)
print(df['name'])
print(df.name)  # attribute access (when the column name is a valid Python identifier)

# Select multiple columns (returns a DataFrame)
print(df[['name', 'age']])

3.2 행 선택

# Slicing
print(df[0:2])  # first 2 rows

# Conditional filtering
print(df[df['age'] > 25])
print(df[df['city'].isin(['Seoul', 'Busan'])])

3.3 loc - 레이블 기반 선택

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['Seoul', 'Busan', 'Incheon']
}, index=['a', 'b', 'c'])

# Single row
print(df.loc['a'])

# Multiple rows
print(df.loc[['a', 'c']])

# Rows and columns
print(df.loc['a', 'name'])        # single value
print(df.loc['a':'b', 'name':'age'])  # range slicing

# Combined with a condition
print(df.loc[df['age'] > 25, ['name', 'city']])

3.4 iloc - 정수 기반 선택

# Single row
print(df.iloc[0])

# Multiple rows
print(df.iloc[[0, 2]])

# Rows and columns
print(df.iloc[0, 1])        # single value
print(df.iloc[0:2, 0:2])    # range slicing
print(df.iloc[[0, 2], [0, 2]])  # specific positions

# Negative index
print(df.iloc[-1])  # last row

3.5 at๊ณผ iat - ๋‹จ์ผ ๊ฐ’ ์ ‘๊ทผ

# at: label-based single value
print(df.at['a', 'name'])

# iat: integer-based single value
print(df.iat[0, 0])

# Modify values
df.at['a', 'age'] = 26
df.iat[0, 1] = 27

4. 데이터 확인과 탐색

4.1 데이터 미리보기

df = pd.DataFrame({
    'A': range(100),
    'B': range(100, 200),
    'C': range(200, 300)
})

# Inspect the beginning / end
print(df.head())     # first 5 rows
print(df.head(10))   # first 10 rows
print(df.tail())     # last 5 rows
print(df.tail(3))    # last 3 rows

# Random sample
print(df.sample(5))  # 5 random rows
print(df.sample(frac=0.1))  # 10% sample

4.2 데이터 정보

df = pd.DataFrame({
    'name': ['Alice', 'Bob', None, 'Diana'],
    'age': [25, 30, 35, None],
    'salary': [50000.0, 60000.0, 70000.0, 80000.0]
})

# Basic information
# info() prints its report directly and returns None, so it is called bare;
# print(df.info()) would add a stray "None" line after the report.
df.info()

# Example output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 3 columns):
#  #   Column  Non-Null Count  Dtype
# ---  ------  --------------  -----
#  0   name    3 non-null      object
#  1   age     3 non-null      float64
#  2   salary  4 non-null      float64
# dtypes: float64(2), object(1)
# memory usage: 224.0+ bytes

# Statistical summary
print(df.describe())
print(df.describe(include='all'))  # include every column

4.3 고유값과 빈도

df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C'],
    'value': [10, 20, 30, 40, 50, 60, 70, 80]
})

# Unique values
print(df['category'].unique())    # ['A' 'B' 'C']
print(df['category'].nunique())   # 3

# Frequencies
print(df['category'].value_counts())
# A    4
# B    2
# C    2

# Normalized frequencies
print(df['category'].value_counts(normalize=True))

4.4 결측값 확인

df = pd.DataFrame({
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4],
    'C': [1, 2, 3, None]
})

# Check for missing values
print(df.isna())      # boolean DataFrame
print(df.isnull())    # same as isna

# Count missing values
print(df.isna().sum())        # missing values per column
print(df.isna().sum().sum())  # total missing values

# Rows/columns that contain missing values
print(df[df.isna().any(axis=1)])  # rows with a missing value

5. 기본 연산

5.1 산술 연산

df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [10, 20, 30, 40]
})

# Scalar operations
print(df + 10)
print(df * 2)
print(df ** 2)

# Operations between columns
df['C'] = df['A'] + df['B']
df['D'] = df['B'] / df['A']

# Operations between DataFrames (aligned on the index)
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=[0, 1])
df2 = pd.DataFrame({'A': [10, 20], 'B': [30, 40]}, index=[1, 2])
print(df1 + df2)  # only the parts where the index matches are computed

# Operate while filling missing values
print(df1.add(df2, fill_value=0))

5.2 집계 함수

df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# Basic aggregations
print(df.sum())      # sum per column
print(df.mean())     # mean per column
print(df.median())   # median
print(df.std())      # standard deviation
print(df.var())      # variance
print(df.min())      # minimum
print(df.max())      # maximum
print(df.count())    # number of non-missing values

# Specify the axis
print(df.sum(axis=0))  # per column (default)
print(df.sum(axis=1))  # per row

# Cumulative functions
print(df.cumsum())   # cumulative sum
print(df.cumprod())  # cumulative product
print(df.cummax())   # cumulative maximum
print(df.cummin())   # cumulative minimum

5.3 정렬

df = pd.DataFrame({
    'name': ['Charlie', 'Alice', 'Bob'],
    'age': [35, 25, 30],
    'score': [85, 95, 75]
})

# Sort by value
print(df.sort_values('age'))
print(df.sort_values('age', ascending=False))

# Sort by multiple columns
print(df.sort_values(['age', 'score']))
print(df.sort_values(['age', 'score'], ascending=[True, False]))

# Sort by index
df = df.set_index('name')
print(df.sort_index())
print(df.sort_index(ascending=False))

# Sort order
print(df.rank())  # rank

6. 데이터 수정

6.1 열 추가/수정

df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

# Add new columns
df['C'] = [7, 8, 9]
df['D'] = df['A'] + df['B']
df['E'] = 10  # scalar value

# assign method (keeps the original)
df2 = df.assign(F=lambda x: x['A'] * 2,
                G=[10, 20, 30])

# insert (insert at a specific position)
df.insert(1, 'new_col', [100, 200, 300])

6.2 열 삭제

# drop method
df = df.drop('C', axis=1)
df = df.drop(['D', 'E'], axis=1)

# del keyword
del df['B']

# pop method (removes and returns)
col = df.pop('A')

6.3 행 추가/수정

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Append a row by concatenating a one-row frame
new_row = pd.DataFrame({'A': [4], 'B': [7]})
df = pd.concat((df, new_row), ignore_index=True)

# Append via loc, using the current row count as the new label
df.loc[df.shape[0]] = [5, 8]

# Delete rows by index label
df = df.drop(index=0)       # remove the row labelled 0
df = df.drop(index=[1, 2])  # remove several rows at once

6.4 값 수정

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Conditional assignment: zero out B wherever A exceeds 1
df.loc[df['A'].gt(1), 'B'] = 0

# replace: first a single value in one column, then a mapping over the whole frame
df['A'] = df['A'].replace(1, 100)
df = df.replace({2: 200, 3: 300})

# where: keeps values where the condition holds, substitutes elsewhere
df['A'] = df['A'].where(df['A'].gt(100), 0)

# mask: substitutes values where the condition holds
df['B'] = df['B'].mask(df['B'].lt(5), -1)

7. ๋ฌธ์ž์—ด ์ฒ˜๋ฆฌ

Pandas๋Š” .str ์ ‘๊ทผ์ž๋ฅผ ํ†ตํ•ด ๋ฌธ์ž์—ด ๋ฉ”์„œ๋“œ๋ฅผ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค.

df = pd.DataFrame({
    'name': ['  Alice  ', 'BOB', 'charlie'],
    'email': ['alice@test.com', 'bob@example.com', 'charlie@test.com']
})

# Letter case
print(df['name'].str.lower())
print(df['name'].str.upper())
print(df['name'].str.title())
print(df['name'].str.capitalize())

# Strip whitespace
print(df['name'].str.strip())
print(df['name'].str.lstrip())
print(df['name'].str.rstrip())

# String length
print(df['name'].str.len())

# Substring / prefix / suffix tests
print(df['email'].str.contains('test'))
print(df['name'].str.startswith('A'))
print(df['name'].str.endswith('e'))

# Split strings
print(df['email'].str.split('@'))
print(df['email'].str.split('@').str[0])  # first element

# Replace within strings
print(df['email'].str.replace('test', 'example'))

# Regular expressions
print(df['email'].str.extract(r'@(.+)\.com'))
print(df['email'].str.findall(r'\w+'))

์—ฐ์Šต ๋ฌธ์ œ

๋ฌธ์ œ 1: ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ๊ณผ ํƒ์ƒ‰

๋‹ค์Œ ๋ฐ์ดํ„ฐ๋ฅผ DataFrame์œผ๋กœ ์ƒ์„ฑํ•˜๊ณ  ๊ธฐ๋ณธ ์ •๋ณด๋ฅผ ํ™•์ธํ•˜์„ธ์š”.

data = {
    'product': ['Apple', 'Banana', 'Cherry', 'Date'],
    'price': [1000, 500, 2000, 1500],
    'quantity': [50, 100, 30, 45]
}

# Solution
df = pd.DataFrame(data)
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would output an extra "None" line).
df.info()
print(df.describe())
print(df['price'].mean())  # average price

๋ฌธ์ œ 2: ๋ฐ์ดํ„ฐ ์„ ํƒ

price๊ฐ€ 1000 ์ด์ƒ์ธ ์ œํ’ˆ์˜ ์ด๋ฆ„๊ณผ ์ˆ˜๋Ÿ‰๋งŒ ์„ ํƒํ•˜์„ธ์š”.

# Solution
result = df.loc[df['price'] >= 1000, ['product', 'quantity']]
print(result)

문제 3: 열 추가

총 금액(price * quantity) 열을 추가하세요.

# Solution
df['total'] = df['price'] * df['quantity']
print(df)

요약

| 기능 | 함수/메서드 |
|------|-------------|
| 데이터 로딩 | pd.read_csv(), pd.read_excel(), pd.read_json(), pd.read_sql() |
| 데이터 저장 | to_csv(), to_excel(), to_json(), to_sql() |
| 열 선택 | df['col'], df[['col1', 'col2']] |
| 행 선택 | df.loc[], df.iloc[], df[condition] |
| 데이터 확인 | head(), tail(), info(), describe() |
| 집계 | sum(), mean(), count(), min(), max() |
| 정렬 | sort_values(), sort_index() |
| 문자열 | df['col'].str.method() |
to navigate between lessons