4. Pandas Data Manipulation

4. Pandas Data Manipulation

Previous: Pandas Basics | Next: Pandas Advanced

Overview

This covers core Pandas data manipulation techniques including filtering, sorting, grouping, and merging. These techniques are most commonly used in practical data analysis.


1. Data Filtering

1.1 Conditional Filtering

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'department': ['Sales', 'IT', 'IT', 'HR', 'Sales'],
    'salary': [50000, 60000, 70000, 55000, 65000]
})

# Single condition
print(df[df['age'] > 30])
print(df[df['department'] == 'IT'])

# Compound conditions (AND: &, OR: |) (AND: &, OR: |)
print(df[(df['age'] > 25) & (df['salary'] >= 60000)])
print(df[(df['department'] == 'IT') | (df['department'] == 'Sales')])

# NOT condition
print(df[~(df['department'] == 'HR')])

# Range condition
print(df[df['age'].between(25, 30)])  # 25 <= age <= 30

1.2 Filtering with isin

# Match any of multiple values
departments = ['IT', 'Sales']
print(df[df['department'].isin(departments)])

# Not matching
print(df[~df['department'].isin(departments)])

1.3 String Conditions

# Contains string
print(df[df['name'].str.contains('a', case=False)])

# Start/end strings
print(df[df['name'].str.startswith('A')])
print(df[df['name'].str.endswith('e')])

# Regular expression
print(df[df['name'].str.match(r'^[A-C]')])  # A, B, C둜 μ‹œμž‘

1.4 query Method

# SQL-style query
print(df.query('age > 30'))
print(df.query('department == "IT"'))
print(df.query('age > 25 and salary >= 60000'))

# Use variables
min_age = 30
print(df.query('age >= @min_age'))

# Index reference
df_indexed = df.set_index('name')
print(df_indexed.query('index == "Alice"'))

1.5 Filtering Missing Values

df_with_na = pd.DataFrame({
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4],
    'C': [1, 2, 3, None]
})

# Rows with missing values
print(df_with_na[df_with_na['A'].isna()])

# Rows without missing values
print(df_with_na[df_with_na['A'].notna()])

# Rows with no missing in any column
print(df_with_na.dropna())

# Rows with no missing in specific columns
print(df_with_na.dropna(subset=['A', 'B']))

2. Sorting

2.1 Sort by Values

df = pd.DataFrame({
    'name': ['Charlie', 'Alice', 'Bob', 'Diana'],
    'age': [35, 25, 30, 25],
    'score': [85, 95, 75, 90]
})

# Single column sort
print(df.sort_values('age'))
print(df.sort_values('age', ascending=False))

# Multiple column sort
print(df.sort_values(['age', 'score']))
print(df.sort_values(['age', 'score'], ascending=[True, False]))

# Missing value position
df_na = df.copy()
df_na.loc[0, 'age'] = None
print(df_na.sort_values('age', na_position='first'))  # κ²°μΈ‘κ°’ 맨 μ•ž
print(df_na.sort_values('age', na_position='last'))   # κ²°μΈ‘κ°’ 맨 λ’€

# In-place sorting
df.sort_values('age', inplace=True)

2.2 Sort by Index

df = pd.DataFrame({
    'value': [10, 20, 30, 40]
}, index=['d', 'b', 'c', 'a'])

# Index ascending
print(df.sort_index())

# Index descending
print(df.sort_index(ascending=False))

# Sort column index
df_wide = pd.DataFrame({
    'C': [1, 2],
    'A': [3, 4],
    'B': [5, 6]
})
print(df_wide.sort_index(axis=1))

2.3 Ranking

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'score': [85, 95, 85, 90]
})

# Default ranking (average for ties)
df['rank'] = df['score'].rank(ascending=False)
print(df)

# Ranking methods
# method='average': 평균 μˆœμœ„ (κΈ°λ³Έκ°’)
# method='min': μ΅œμ†Œ μˆœμœ„
# method='max': μ΅œλŒ€ μˆœμœ„
# method='first': λ¨Όμ € λ‚˜μ˜¨ μˆœμ„œλŒ€λ‘œ
# method='dense': λ°€μ§‘ μˆœμœ„ (간격 μ—†μŒ)

df['rank_min'] = df['score'].rank(ascending=False, method='min')
df['rank_dense'] = df['score'].rank(ascending=False, method='dense')
print(df)

3. Grouping (GroupBy)

3.1 Basic Grouping

df = pd.DataFrame({
    'department': ['Sales', 'IT', 'IT', 'HR', 'Sales', 'HR'],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank'],
    'salary': [50000, 60000, 70000, 55000, 65000, 52000],
    'bonus': [5000, 8000, 10000, 6000, 7000, 5500]
})

# Grouping
grouped = df.groupby('department')

# Check groups
print(grouped.groups)
print(grouped.ngroups)  # 그룹 수

# Get specific group
print(grouped.get_group('IT'))

3.2 Aggregation Functions

# Single aggregation
print(df.groupby('department')['salary'].mean())
print(df.groupby('department')['salary'].sum())

# Multiple column aggregation
print(df.groupby('department')[['salary', 'bonus']].mean())

# Multiple aggregation functions
print(df.groupby('department')['salary'].agg(['mean', 'sum', 'count']))

# Custom function
print(df.groupby('department')['salary'].agg(lambda x: x.max() - x.min()))

3.3 agg Method

# Different aggregations per column
agg_result = df.groupby('department').agg({
    'salary': ['mean', 'max'],
    'bonus': 'sum',
    'name': 'count'
})
print(agg_result)

# Named aggregations
agg_result = df.groupby('department').agg(
    avg_salary=('salary', 'mean'),
    max_salary=('salary', 'max'),
    total_bonus=('bonus', 'sum'),
    employee_count=('name', 'count')
)
print(agg_result)

3.4 transform and apply

# transform: result with same size as original
df['dept_avg_salary'] = df.groupby('department')['salary'].transform('mean')
print(df)

# Normalize within group
df['salary_normalized'] = df.groupby('department')['salary'].transform(
    lambda x: (x - x.mean()) / x.std()
)

# apply: flexible group operations
def top_n(group, n=2, column='salary'):
    return group.nlargest(n, column)

print(df.groupby('department').apply(top_n, n=1))

3.5 Group by Multiple Columns

df = pd.DataFrame({
    'year': [2022, 2022, 2023, 2023, 2022, 2023],
    'quarter': ['Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q1'],
    'sales': [100, 150, 120, 180, 110, 130]
})

# Multiple column grouping
print(df.groupby(['year', 'quarter'])['sales'].sum())

# Result as DataFrame
print(df.groupby(['year', 'quarter'])['sales'].sum().reset_index())

# Unstack
print(df.groupby(['year', 'quarter'])['sales'].sum().unstack())

3.6 Filtering

df = pd.DataFrame({
    'department': ['Sales', 'IT', 'IT', 'HR', 'Sales'],
    'salary': [50000, 60000, 70000, 55000, 65000]
})

# Filter groups satisfying condition
# 평균 κΈ‰μ—¬κ°€ 55000 이상인 λΆ€μ„œ
result = df.groupby('department').filter(lambda x: x['salary'].mean() >= 55000)
print(result)

4. Data Merging

4.1 merge (SQL-style Join)

# Example data
employees = pd.DataFrame({
    'emp_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'dept_id': [10, 20, 10, 30]
})

departments = pd.DataFrame({
    'dept_id': [10, 20, 40],
    'dept_name': ['Sales', 'IT', 'Marketing']
})

# Inner join (default)
result = pd.merge(employees, departments, on='dept_id')
print(result)

# Left join
result = pd.merge(employees, departments, on='dept_id', how='left')
print(result)

# Right join
result = pd.merge(employees, departments, on='dept_id', how='right')
print(result)

# Outer join
result = pd.merge(employees, departments, on='dept_id', how='outer')
print(result)

4.2 Join with Different Column Names

employees = pd.DataFrame({
    'emp_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'department': [10, 20, 10]
})

departments = pd.DataFrame({
    'id': [10, 20],
    'dept_name': ['Sales', 'IT']
})

# Join with different column names
result = pd.merge(employees, departments,
                  left_on='department', right_on='id')
print(result)

4.3 Index-based Join

employees = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'salary': [50000, 60000, 70000]
}, index=[1, 2, 3])

bonuses = pd.DataFrame({
    'bonus': [5000, 8000, 10000]
}, index=[1, 2, 4])

# Join by index
result = pd.merge(employees, bonuses, left_index=True, right_index=True, how='outer')
print(result)

# join λ©”μ„œλ“œ (인덱슀 기반)
result = employees.join(bonuses, how='outer')
print(result)

4.4 concat (Concatenation)

df1 = pd.DataFrame({
    'A': [1, 2],
    'B': [3, 4]
})

df2 = pd.DataFrame({
    'A': [5, 6],
    'B': [7, 8]
})

# Vertical concatenation (row-wise)
result = pd.concat([df1, df2])
print(result)

# Reset index
result = pd.concat([df1, df2], ignore_index=True)
print(result)

# Horizontal concatenation (column-wise)
result = pd.concat([df1, df2], axis=1)
print(result)

# Add keys
result = pd.concat([df1, df2], keys=['first', 'second'])
print(result)

4.5 append (ν–‰ μΆ”κ°€) - deprecated in pandas 2.0

# concat μ‚¬μš© ꢌμž₯
new_row = pd.DataFrame({'A': [9], 'B': [10]})
result = pd.concat([df1, new_row], ignore_index=True)

5. Pivot and Melt

5.1 pivot

df = pd.DataFrame({
    'date': ['2023-01', '2023-01', '2023-02', '2023-02'],
    'city': ['Seoul', 'Busan', 'Seoul', 'Busan'],
    'sales': [100, 80, 120, 90]
})

# Pivot table
pivot = df.pivot(index='date', columns='city', values='sales')
print(pivot)
#          Busan  Seoul
# date
# 2023-01     80    100
# 2023-02     90    120

5.2 pivot_table

df = pd.DataFrame({
    'date': ['2023-01', '2023-01', '2023-01', '2023-02'],
    'city': ['Seoul', 'Seoul', 'Busan', 'Seoul'],
    'category': ['A', 'B', 'A', 'A'],
    'sales': [100, 150, 80, 120]
})

# Apply aggregation function
pivot = pd.pivot_table(df, values='sales', index='date',
                       columns='city', aggfunc='sum')
print(pivot)

# Multiple aggregation functions
pivot = pd.pivot_table(df, values='sales', index='date',
                       columns='city', aggfunc=['sum', 'mean'])
print(pivot)

# Multiple indices
pivot = pd.pivot_table(df, values='sales',
                       index=['date', 'category'],
                       columns='city',
                       aggfunc='sum',
                       fill_value=0)
print(pivot)

# Add margins
pivot = pd.pivot_table(df, values='sales', index='date',
                       columns='city', aggfunc='sum', margins=True)
print(pivot)

5.3 melt (Unpivot)

df_wide = pd.DataFrame({
    'name': ['Alice', 'Bob'],
    'math': [90, 85],
    'english': [80, 95],
    'science': [85, 90]
})

# Wide β†’ Long λ³€ν™˜
df_long = pd.melt(df_wide,
                  id_vars=['name'],
                  value_vars=['math', 'english', 'science'],
                  var_name='subject',
                  value_name='score')
print(df_long)
#     name  subject  score
# 0  Alice     math     90
# 1    Bob     math     85
# 2  Alice  english     80
# 3    Bob  english     95
# 4  Alice  science     85
# 5    Bob  science     90

5.4 stack and unstack

df = pd.DataFrame({
    'A': [1, 2],
    'B': [3, 4]
}, index=['x', 'y'])

# stack: columns to rows
stacked = df.stack()
print(stacked)
# x  A    1
#    B    3
# y  A    2
#    B    4

# unstack: rows to columns
unstacked = stacked.unstack()
print(unstacked)

6. Handling Duplicates

df = pd.DataFrame({
    'A': [1, 1, 2, 2, 3],
    'B': ['a', 'a', 'b', 'c', 'c'],
    'C': [10, 10, 20, 30, 40]
})

# Check duplicates
print(df.duplicated())
print(df.duplicated(subset=['A', 'B']))
print(df[df.duplicated(keep=False)])  # λͺ¨λ“  Duplicates ν–‰

# Count duplicates
print(df.duplicated().sum())

# Remove duplicates
print(df.drop_duplicates())
print(df.drop_duplicates(subset=['A']))
print(df.drop_duplicates(subset=['A'], keep='last'))  # λ§ˆμ§€λ§‰ μœ μ§€
print(df.drop_duplicates(subset=['A'], keep=False))   # λͺ¨λ“  Duplicates 제거

7. Cross Tabulation

df = pd.DataFrame({
    'gender': ['M', 'F', 'M', 'F', 'M', 'F'],
    'department': ['Sales', 'IT', 'IT', 'Sales', 'HR', 'IT'],
    'salary': [50000, 60000, 55000, 58000, 52000, 62000]
})

# Frequency cross table
ct = pd.crosstab(df['gender'], df['department'])
print(ct)

# Add margins
ct = pd.crosstab(df['gender'], df['department'], margins=True)
print(ct)

# Normalize
ct = pd.crosstab(df['gender'], df['department'], normalize=True)
print(ct)

# Apply aggregation function
ct = pd.crosstab(df['gender'], df['department'],
                 values=df['salary'], aggfunc='mean')
print(ct)

Practice Problems

Problem 1: Group Statistics

Calculate average salary and employee count by department.

df = pd.DataFrame({
    'department': ['Sales', 'IT', 'IT', 'HR', 'Sales'],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'salary': [50000, 60000, 70000, 55000, 65000]
})

# Solution
result = df.groupby('department').agg(
    avg_salary=('salary', 'mean'),
    count=('name', 'count')
)
print(result)

Problem 2: Data Merging

Join two DataFrames to include department names for employees.

employees = pd.DataFrame({
    'emp_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'dept_id': [10, 20, 10]
})

departments = pd.DataFrame({
    'dept_id': [10, 20],
    'dept_name': ['Sales', 'IT']
})

# Solution
result = pd.merge(employees, departments, on='dept_id')
print(result)

Problem 3: Pivot Table

Create a pivot table with sales totals by month and category.

sales = pd.DataFrame({
    'month': ['Jan', 'Jan', 'Feb', 'Feb', 'Jan', 'Feb'],
    'category': ['A', 'B', 'A', 'B', 'A', 'A'],
    'amount': [100, 150, 120, 180, 110, 130]
})

# Solution
pivot = pd.pivot_table(sales, values='amount',
                       index='month', columns='category',
                       aggfunc='sum')
print(pivot)

Summary

Feature Functions/Methods
Filtering df[condition], query(), isin()
Sorting sort_values(), sort_index(), rank()
Grouping groupby(), agg(), transform(), apply()
Merging merge(), join(), concat()
Pivot pivot(), pivot_table(), melt(), stack(), unstack()
Duplicates duplicated(), drop_duplicates()
Cross Tab crosstab()
to navigate between lessons