9. Data Visualization Advanced (Seaborn)
9. Data Visualization Advanced (Seaborn)¶
Previous: Data Visualization Basics | Next: From EDA to Inference
Overview¶
Seaborn is a statistical data visualization library built on Matplotlib. It provides more beautiful default styles and makes it easy to create statistical graphs.
1. Seaborn Basics¶
1.1 Basic Setup¶
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Style settings
sns.set_theme()  # Apply the default seaborn theme
# Optional alternatives (uncomment one to try it):
# sns.set_style("whitegrid")   # background style
# sns.set_palette("husl")      # color palette
# sns.set_context("notebook")  # element-size context

# Load the example datasets used throughout this chapter
tips = sns.load_dataset('tips')
iris = sns.load_dataset('iris')
titanic = sns.load_dataset('titanic')
print(tips.head())
1.2 Styles and Palettes¶
# Available axes styles
styles = ['darkgrid', 'whitegrid', 'dark', 'white', 'ticks']
fig = plt.figure(figsize=(20, 4))
for position, style in enumerate(styles, start=1):
    # axes_style() only affects axes created while the context is active,
    # so each subplot is added (and drawn) inside the `with` block instead
    # of being pre-created with plt.subplots().
    with sns.axes_style(style):
        ax = fig.add_subplot(1, len(styles), position)
        sns.lineplot(x=[1, 2, 3], y=[1, 4, 2], ax=ax)
        ax.set_title(style)
plt.tight_layout()
plt.show()

# Color palettes
palettes = ['deep', 'muted', 'pastel', 'bright', 'dark', 'colorblind']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, palette in zip(axes.flat, palettes):
    # sns.palplot() always opens its own figure and accepts no `ax`
    # argument, so the swatches are drawn directly as a 1 x n RGB image.
    colors = sns.color_palette(palette)
    ax.imshow([colors], interpolation='nearest', aspect='auto')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(palette)
plt.tight_layout()
plt.show()

# Custom palette: 8 colors from the "husl" space, set as the default palette
custom_palette = sns.color_palette("husl", 8)
sns.set_palette(custom_palette)
2. Distribution Visualization¶
2.1 Histograms and KDE¶
tips = sns.load_dataset('tips')
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_hist, ax_kde), (ax_hist_kde, ax_stacked) = axes

# histplot: histogram with an explicit bin count
sns.histplot(data=tips, x='total_bill', bins=30, ax=ax_hist)
ax_hist.set_title('Histogram')

# Kernel density estimate, filled under the curve
sns.kdeplot(data=tips, x='total_bill', fill=True, ax=ax_kde)
ax_kde.set_title('KDE Plot')

# Histogram with a KDE curve overlaid
sns.histplot(data=tips, x='total_bill', kde=True, ax=ax_hist_kde)
ax_hist_kde.set_title('Histogram with KDE')

# Distribution by group: lunch and dinner bars stacked
sns.histplot(data=tips, x='total_bill', hue='time', multiple='stack',
             ax=ax_stacked)
ax_stacked.set_title('Stacked Histogram by Time')

plt.tight_layout()
plt.show()
2.2 displot (Distribution Plot)¶
# Figure-level (FacetGrid-based) distribution plot
g = sns.displot(data=tips, x='total_bill', hue='time', kind='kde',
                fill=True, height=5, aspect=1.5)
# `FacetGrid.fig` is deprecated; `figure` is the supported accessor.
g.figure.suptitle('Distribution by Time', y=1.02)
plt.show()

# Multiple plots: one histogram per (smoker, time) facet
g = sns.displot(data=tips, x='total_bill', col='time', row='smoker',
                bins=20, height=4)
plt.show()
2.3 ECDF Plot¶
# Empirical cumulative distribution function, one curve per `time` level
fig, ax = plt.subplots(figsize=(10, 6))
ecdf_options = {'data': tips, 'x': 'total_bill', 'hue': 'time', 'ax': ax}
sns.ecdfplot(**ecdf_options)
ax.set_title('Empirical Cumulative Distribution Function')
plt.show()
2.4 Rug Plot¶
fig, ax = plt.subplots(figsize=(10, 6))
# Both layers read the same column and draw on the same axes
bill_on_ax = {'data': tips, 'x': 'total_bill', 'ax': ax}
sns.kdeplot(fill=True, **bill_on_ax)
sns.rugplot(alpha=0.5, **bill_on_ax)
ax.set_title('KDE with Rug Plot')
plt.show()
3. Categorical Data Visualization¶
3.1 Count Plot¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
plain_ax, grouped_ax = axes

# Basic count plot: number of rows per day
sns.countplot(data=tips, x='day', ax=plain_ax)
plain_ax.set_title('Count by Day')

# By group: counts per day, split by time
sns.countplot(data=tips, x='day', hue='time', ax=grouped_ax)
grouped_ax.set_title('Count by Day and Time')

plt.tight_layout()
plt.show()
3.2 Bar Plot (Statistic-Based)¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# (hue column, panel title): mean with confidence interval, then by group
bar_panels = [
    (None, 'Mean Total Bill by Day (with CI)'),
    ('sex', 'Mean Total Bill by Day and Sex'),
]
for ax, (hue_column, title) in zip(axes, bar_panels):
    sns.barplot(data=tips, x='day', y='total_bill', hue=hue_column, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()
3.3 Box Plot¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
basic_ax, smoker_ax = axes
by_day = {'data': tips, 'x': 'day', 'y': 'total_bill'}

# Basic box plot
sns.boxplot(ax=basic_ax, **by_day)
basic_ax.set_title('Box Plot')

# By group: boxes split by smoker status
sns.boxplot(hue='smoker', ax=smoker_ax, **by_day)
smoker_ax.set_title('Box Plot by Smoker Status')

plt.tight_layout()
plt.show()
3.4 Violin Plot¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# (extra keyword arguments, panel title) for each violin panel
violin_panels = [
    ({}, 'Violin Plot'),                                  # plain violin plot
    ({'hue': 'sex', 'split': True}, 'Split Violin Plot'),  # split option
]
for ax, (extra, title) in zip(axes, violin_panels):
    sns.violinplot(data=tips, x='day', y='total_bill', ax=ax, **extra)
    ax.set_title(title)
plt.tight_layout()
plt.show()
3.5 Strip Plot and Swarm Plot¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
strip_ax, swarm_ax = axes
bill_by_day = {'data': tips, 'x': 'day', 'y': 'total_bill'}

# Strip plot (points are allowed to overlap)
sns.stripplot(ax=strip_ax, alpha=0.6, **bill_by_day)
strip_ax.set_title('Strip Plot')

# Swarm plot (points are arranged to avoid overlap)
sns.swarmplot(ax=swarm_ax, **bill_by_day)
swarm_ax.set_title('Swarm Plot')

plt.tight_layout()
plt.show()

# Combined with a box plot: raw points drawn over the boxes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(ax=ax, **bill_by_day)
sns.stripplot(ax=ax, color='black', alpha=0.3, size=3, **bill_by_day)
ax.set_title('Box Plot with Strip Plot Overlay')
plt.show()
3.6 Point Plot¶
fig, ax = plt.subplots(figsize=(10, 6))
# Pass `ax` explicitly, as every other example here does, instead of relying
# on pyplot's implicit "current axes".
sns.pointplot(data=tips, x='day', y='total_bill', hue='sex',
              dodge=True, markers=['o', 's'], linestyles=['-', '--'],
              ax=ax)
ax.set_title('Point Plot')
plt.show()
3.7 catplot (Unified Categorical Plots)¶
# Figure-level (FacetGrid-based) categorical plot
g = sns.catplot(data=tips, x='day', y='total_bill', hue='sex',
                col='time', kind='box', height=5, aspect=1)
# `FacetGrid.fig` is deprecated; `figure` is the supported accessor.
g.figure.suptitle('Box Plots by Time', y=1.02)
plt.show()
# kind: 'strip', 'swarm', 'box', 'violin', 'boxen', 'point', 'bar', 'count'
4. Relationship Visualization¶
4.1 Scatter Plot¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
basic_ax, styled_ax = axes
bill_vs_tip = {'data': tips, 'x': 'total_bill', 'y': 'tip'}

# Basic scatter plot
sns.scatterplot(ax=basic_ax, **bill_vs_tip)
basic_ax.set_title('Basic Scatter Plot')

# Adding style: color by time, marker size by party size, marker by smoker
sns.scatterplot(hue='time', size='size', style='smoker',
                ax=styled_ax, **bill_vs_tip)
styled_ax.set_title('Scatter Plot with Style')

plt.tight_layout()
plt.show()
4.2 Regression Plot¶
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
linear_ax, poly_ax = axes

# Linear regression
sns.regplot(x='total_bill', y='tip', data=tips, ax=linear_ax)
linear_ax.set_title('Linear Regression')

# Polynomial regression (second-order fit)
sns.regplot(x='total_bill', y='tip', data=tips, order=2, ax=poly_ax)
poly_ax.set_title('Polynomial Regression (order=2)')

plt.tight_layout()
plt.show()
4.3 lmplot (FacetGrid-Based Regression)¶
# Regression fits per smoker group, one facet per time
g = sns.lmplot(data=tips, x='total_bill', y='tip', hue='smoker',
               col='time', height=5, aspect=1)
# `FacetGrid.fig` is deprecated; `figure` is the supported accessor.
g.figure.suptitle('Linear Regression by Time and Smoker', y=1.02)
plt.show()
4.4 jointplot (Joint Distribution)¶
# Each entry is (kind, extra keyword arguments); one figure is shown per kind
joint_variants = [
    ('scatter', {}),          # scatter plot + histograms
    ('kde', {'fill': True}),  # KDE
    ('hex', {}),              # hex
    ('reg', {}),              # regression
]
for joint_kind, extra in joint_variants:
    g = sns.jointplot(data=tips, x='total_bill', y='tip',
                      kind=joint_kind, **extra)
    plt.show()
4.5 pairplot (Pair Plot)¶
# Relationships between every pair of variables, KDE on the diagonal
g = sns.pairplot(data=iris, hue='species', diag_kind='kde')
plt.show()

# Selected variables only, histograms on the diagonal
selected_columns = ['total_bill', 'tip', 'size']
g = sns.pairplot(data=tips, vars=selected_columns,
                 hue='time', diag_kind='hist')
plt.show()
5. Heatmaps and Clustermaps¶
5.1 Heatmap¶
# Correlation-matrix heatmap
correlation = tips[['total_bill', 'tip', 'size']].corr()
fig, ax = plt.subplots(figsize=(8, 6))
# center=0 with vmin/vmax=-1/1 keeps the diverging colormap symmetric
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0,
            vmin=-1, vmax=1, fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Pivot-table heatmap
# observed=False pins the current behaviour for the grouping columns and
# silences pandas' FutureWarning about the changing default for categoricals.
pivot = tips.pivot_table(values='tip', index='day', columns='time',
                         aggfunc='mean', observed=False)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, annot=True, cmap='YlOrRd', fmt='.2f', ax=ax)
ax.set_title('Average Tip by Day and Time')
plt.show()
5.2 Clustermap¶
# Hierarchically clustered heatmap
iris_numeric = iris.drop('species', axis=1)
# Seed the 50-row sample so the figure is reproducible between runs.
g = sns.clustermap(iris_numeric.sample(50, random_state=42), cmap='viridis',
                   standard_scale=1, figsize=(10, 10))
# `fig` is a deprecated alias on seaborn grids; `figure` is preferred.
g.figure.suptitle('Clustered Heatmap', y=1.02)
plt.show()
6. Multi-Plot Grids¶
6.1 FacetGrid¶
# Custom FacetGrid: one histogram per (smoker, time) facet
g = sns.FacetGrid(tips, col='time', row='smoker', height=4, aspect=1.2)
g.map(sns.histplot, 'total_bill', bins=20)
# NOTE(review): no hue is mapped on this grid, so this legend is presumably
# empty — confirm whether the call is needed.
g.add_legend()
plt.show()

# A more complex example: scatter plots per day, wrapped into two columns
g = sns.FacetGrid(tips, col='day', col_wrap=2, height=4)
g.map_dataframe(sns.scatterplot, x='total_bill', y='tip', hue='time')
g.add_legend()
plt.show()
6.2 PairGrid¶
g = sns.PairGrid(iris, hue='species')
# (grid method, plotting function, extra keyword arguments) per region
layers = (
    (g.map_upper, sns.scatterplot, {}),
    (g.map_lower, sns.kdeplot, {'fill': True}),
    (g.map_diag, sns.histplot, {'kde': True}),
)
for draw_region, plot_func, options in layers:
    draw_region(plot_func, **options)
g.add_legend()
plt.show()
7. Statistical Visualization¶
7.1 Error Bars¶
fig, ax = plt.subplots(figsize=(10, 6))
# Bar plot with error bars (standard deviation)
sns.barplot(data=tips, x='day', y='total_bill', errorbar='sd', ax=ax)
ax.set_title('Bar Plot with Standard Deviation')
plt.show()
# errorbar options: 'ci' (95% confidence interval), 'pi' (percentile interval),
# 'se' (standard error), 'sd' (standard deviation)
7.2 Bootstrap Confidence Intervals¶
fig, ax = plt.subplots(figsize=(10, 6))
# Bootstrap-based confidence interval around the tip-by-size line
ci_95 = ('ci', 95)
sns.lineplot(x='size', y='tip', data=tips, errorbar=ci_95, ax=ax)
ax.set_title('Line Plot with 95% Confidence Interval')
plt.show()
8. Advanced Customization¶
8.1 Color Settings¶
# Continuous colors: the same scatter plot under three colormaps
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
continuous_palettes = [
    ('viridis', 'Viridis Palette'),
    ('coolwarm', 'Coolwarm Palette'),
    ('YlOrRd', 'YlOrRd Palette'),
]
for ax, (palette_name, title) in zip(axes, continuous_palettes):
    sns.scatterplot(data=tips, x='total_bill', y='tip', hue='size',
                    palette=palette_name, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

# Categorical colors: explicit color for each `time` level
custom_palette = {'Lunch': 'blue', 'Dinner': 'red'}
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='day', y='total_bill', hue='time', data=tips,
            palette=custom_palette, ax=ax)
plt.show()
8.2 Axes and Labels¶
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='day', y='total_bill', data=tips, ax=ax)

# Customize the axis labels and title
label_font = {'fontsize': 14, 'fontweight': 'bold'}
ax.set_xlabel('Day of Week', **label_font)
ax.set_ylabel('Total Bill ($)', **label_font)
ax.set_title('Distribution of Total Bill by Day',
             fontsize=16, fontweight='bold')

# Rotate the x-axis tick labels
plt.xticks(rotation=45, ha='right')

# Set the y-axis range
ax.set_ylim(0, 60)

plt.tight_layout()
plt.show()
8.3 Adding Annotations¶
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='total_bill', y='tip', data=tips, ax=ax)

# Add an annotation: red text with an arrow to the point (50, 10)
arrow_style = dict(arrowstyle='->', color='red')
ax.annotate('High tipper', xy=(50, 10), xytext=(40, 8),
            arrowprops=arrow_style, fontsize=12, color='red')

# Horizontal / vertical reference lines at the column means
mean_tip = tips['tip'].mean()
mean_bill = tips['total_bill'].mean()
ax.axhline(y=mean_tip, color='green', linestyle='--',
           label=f'Mean tip: ${mean_tip:.2f}')
ax.axvline(x=mean_bill, color='blue', linestyle='--',
           label=f'Mean bill: ${mean_bill:.2f}')

ax.legend()
ax.set_title('Scatter Plot with Annotations')
plt.show()
9. Dashboard-Style Layout¶
from matplotlib.gridspec import GridSpec

fig = plt.figure(figsize=(16, 12))
# Use GridSpec: a 3x3 grid whose cells are combined into panels below
gs = GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)

# Large plot: top row, first two columns
ax1 = fig.add_subplot(gs[0, 0:2])
sns.histplot(x='total_bill', data=tips, kde=True, ax=ax1)
ax1.set_title('Distribution of Total Bill')

# Small plots
ax2 = fig.add_subplot(gs[0, 2])
sns.boxplot(y='total_bill', data=tips, ax=ax2)
ax2.set_title('Box Plot')

ax3 = fig.add_subplot(gs[1, 0])
sns.countplot(x='day', data=tips, ax=ax3)
ax3.set_title('Count by Day')

ax4 = fig.add_subplot(gs[1, 1])
sns.barplot(x='day', y='tip', data=tips, ax=ax4)
ax4.set_title('Average Tip by Day')

ax5 = fig.add_subplot(gs[1, 2])
time_counts = tips['time'].value_counts()
time_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax5)
ax5.set_title('Time Distribution')

# Bottom row: scatter plot spanning the full width
ax6 = fig.add_subplot(gs[2, :])
sns.scatterplot(x='total_bill', y='tip', hue='time', size='size',
                data=tips, ax=ax6)
ax6.set_title('Total Bill vs Tip')

plt.suptitle('Restaurant Tips Dashboard', fontsize=20, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
10. Saving and Exporting¶
# High-resolution export
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='day', y='total_bill', data=tips, ax=ax)

tight_box = {'bbox_inches': 'tight'}

# PNG: 300 dpi raster with a white background
fig.savefig('boxplot.png', dpi=300, facecolor='white', edgecolor='none',
            **tight_box)

# PDF, then SVG (vector formats)
for vector_file in ('boxplot.pdf', 'boxplot.svg'):
    fig.savefig(vector_file, **tight_box)

plt.close()
Summary¶
| Plot type | Seaborn functions | Purpose |
|---|---|---|
| Distribution | histplot(), kdeplot(), displot() | Distribution of a single variable |
| Categorical | countplot(), barplot(), boxplot(), violinplot() | Comparison across categories |
| Relational | scatterplot(), regplot(), lmplot() | Relationships between variables |
| Joint | jointplot(), pairplot() | Multivariate analysis |
| Heatmap | heatmap(), clustermap() | Matrix data |
| Multi-plot | FacetGrid, PairGrid, catplot() | Conditional subplots |