9. Data Visualization Advanced (Seaborn)

9. Data Visualization Advanced (Seaborn)

Previous: Data Visualization Basics | Next: From EDA to Inference

Overview

Seaborn is a statistical data visualization library built on Matplotlib. It provides more beautiful default styles and makes it easy to create statistical graphs.


1. Seaborn Basics

1.1 Basic Setup

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ์Šคํƒ€์ผ ์„ค์ •
sns.set_theme()  # Basic seaborn ํ…Œ๋งˆ
# sns.set_style("whitegrid")  # ๋ฐฐ๊ฒฝ ์Šคํƒ€์ผ
# sns.set_palette("husl")     # ์ƒ‰์ƒ ํŒ”๋ ˆํŠธ
# sns.set_context("notebook") # ํฌ๊ธฐ ์ปจํ…์ŠคํŠธ

# Example ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ
tips = sns.load_dataset('tips')
iris = sns.load_dataset('iris')
titanic = sns.load_dataset('titanic')

print(tips.head())

1.2 ์Šคํƒ€์ผ๊ณผ ํŒ”๋ ˆํŠธ

# Available styles
styles = ['darkgrid', 'whitegrid', 'dark', 'white', 'ticks']

# sns.axes_style() only affects axes that are created while the context is
# active. Creating all axes up front with plt.subplots() would therefore give
# five identical panels, so each subplot is added inside its own `with` block.
fig = plt.figure(figsize=(20, 4))
for position, style in enumerate(styles, start=1):
    with sns.axes_style(style):
        ax = fig.add_subplot(1, len(styles), position)
        sns.lineplot(x=[1, 2, 3], y=[1, 4, 2], ax=ax)
        ax.set_title(style)
plt.tight_layout()
plt.show()

# Color palettes
palettes = ['deep', 'muted', 'pastel', 'bright', 'dark', 'colorblind']

# sns.palplot() always draws into a new figure of its own and has no `ax`
# parameter, so passing ax= raises TypeError. Render each palette as a
# one-row RGB image on the prepared subplot instead.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, palette in zip(axes.flat, palettes):
    ax.imshow([sns.color_palette(palette)], aspect='auto')
    # Swatches carry no numeric meaning, so hide the tick marks
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(palette)
plt.tight_layout()
plt.show()

# Custom palette: 8 colors from the "husl" color space, set as the default
custom_palette = sns.color_palette("husl", 8)
sns.set_palette(custom_palette)

2. ๋ถ„ํฌ ์‹œ๊ฐํ™”

2.1 Histograms and KDE

tips = sns.load_dataset('tips')

# 2x2 grid: one panel per way of looking at the total_bill distribution
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_hist, ax_kde), (ax_hist_kde, ax_stacked) = axes

# histplot: plain histogram
sns.histplot(data=tips, x='total_bill', bins=30, ax=ax_hist)
ax_hist.set(title='Histogram')

# KDE plot with the area under the curve filled
sns.kdeplot(data=tips, x='total_bill', fill=True, ax=ax_kde)
ax_kde.set(title='KDE Plot')

# Histogram with a KDE curve overlaid
sns.histplot(data=tips, x='total_bill', kde=True, ax=ax_hist_kde)
ax_hist_kde.set(title='Histogram with KDE')

# Distribution by group: bars for each `time` level are stacked
sns.histplot(data=tips, x='total_bill', hue='time', multiple='stack',
             ax=ax_stacked)
ax_stacked.set(title='Stacked Histogram by Time')

fig.tight_layout()
plt.show()

2.2 displot (Distribution Plot)

# FacetGrid-based distribution plot
g = sns.displot(data=tips, x='total_bill', hue='time', kind='kde',
                fill=True, height=5, aspect=1.5)
# `FacetGrid.fig` is deprecated; `.figure` is the supported accessor.
# y > 1 lifts the title above the top row of axes.
g.figure.suptitle('Distribution by Time', y=1.02)
plt.show()

# Multiple plots: one facet per (smoker, time) combination
g = sns.displot(data=tips, x='total_bill', col='time', row='smoker',
                bins=20, height=4)
plt.show()

2.3 ECDF Plot

# Empirical cumulative distribution function, one curve per `time` level
fig, ax = plt.subplots(figsize=(10, 6))
ecdf_options = dict(data=tips, x='total_bill', hue='time')
sns.ecdfplot(ax=ax, **ecdf_options)
ax.set(title='Empirical Cumulative Distribution Function')
plt.show()

2.4 Rug Plot

# KDE curve with a rug of tick marks for the individual observations
fig, ax = plt.subplots(figsize=(10, 6))
shared = dict(data=tips, x='total_bill', ax=ax)
sns.kdeplot(fill=True, **shared)
sns.rugplot(alpha=0.5, **shared)
ax.set(title='KDE with Rug Plot')
plt.show()

3. Categorical Data Visualization

3.1 Count Plot

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_plain, ax_grouped = axes

# Basic count plot: number of rows per day
sns.countplot(data=tips, x='day', ax=ax_plain)
ax_plain.set(title='Count by Day')

# Grouped: the same counts split by `time`
sns.countplot(data=tips, x='day', hue='time', ax=ax_grouped)
ax_grouped.set(title='Count by Day and Time')

fig.tight_layout()
plt.show()

3.2 Bar Plot (Statistical)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: mean with confidence interval; right panel: grouped by sex.
# hue=None is barplot's default, so the first panel is drawn ungrouped.
panels = [
    (None, 'Mean Total Bill by Day (with CI)'),
    ('sex', 'Mean Total Bill by Day and Sex'),
]
for ax, (hue, title) in zip(axes, panels):
    sns.barplot(data=tips, x='day', y='total_bill', hue=hue, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

3.3 Box Plot

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Basic box plot on the left, grouped by smoker status on the right.
# hue=None is boxplot's default, i.e. no grouping.
panels = [
    (None, 'Box Plot'),
    ('smoker', 'Box Plot by Smoker Status'),
]
for ax, (hue, title) in zip(axes, panels):
    sns.boxplot(data=tips, x='day', y='total_bill', hue=hue, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

3.4 ๋ฐ”์ด์˜ฌ๋ฆฐ ํ”Œ๋กฏ

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plain violin plot on the left; on the right the `split` option draws the
# two `sex` levels as the two halves of each violin.
panels = [
    ({}, 'Violin Plot'),
    ({'hue': 'sex', 'split': True}, 'Split Violin Plot'),
]
for ax, (extra, title) in zip(axes, panels):
    sns.violinplot(data=tips, x='day', y='total_bill', ax=ax, **extra)
    ax.set_title(title)

fig.tight_layout()
plt.show()

3.5 Strip Plot and Swarm Plot

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Strip plot (points may overlap) next to a swarm plot (points are spread
# out so they do not overlap).
panels = [
    (sns.stripplot, {'alpha': 0.6}, 'Strip Plot'),
    (sns.swarmplot, {}, 'Swarm Plot'),
]
for ax, (plot_func, extra, title) in zip(axes, panels):
    plot_func(data=tips, x='day', y='total_bill', ax=ax, **extra)
    ax.set_title(title)

fig.tight_layout()
plt.show()

# Combine with a box plot: raw observations drawn on top of the boxes
fig, ax = plt.subplots(figsize=(10, 6))
shared = dict(data=tips, x='day', y='total_bill', ax=ax)
sns.boxplot(**shared)
# Small, semi-transparent black dots so the boxes stay visible underneath
sns.stripplot(color='black', alpha=0.3, size=3, **shared)
ax.set(title='Box Plot with Strip Plot Overlay')
plt.show()

3.6 ํฌ์ธํŠธ ํ”Œ๋กฏ

fig, ax = plt.subplots(figsize=(10, 6))

# Pass ax explicitly so the plot and the title below are guaranteed to target
# the same axes instead of relying on matplotlib's implicit "current axes".
# One marker and one line style is given per `sex` level.
sns.pointplot(data=tips, x='day', y='total_bill', hue='sex',
              dodge=True, markers=['o', 's'], linestyles=['-', '--'],
              ax=ax)
ax.set_title('Point Plot')

plt.show()

3.7 catplot (Unified Categorical Plots)

# FacetGrid-based categorical plot: one box-plot facet per `time` level
g = sns.catplot(data=tips, x='day', y='total_bill', hue='sex',
                col='time', kind='box', height=5, aspect=1)
# `FacetGrid.fig` is deprecated; `.figure` is the supported accessor.
g.figure.suptitle('Box Plots by Time', y=1.02)
plt.show()

# kind: 'strip', 'swarm', 'box', 'violin', 'boxen', 'point', 'bar', 'count'

4. Relationship Visualization

4.1 Scatter Plot

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_basic, ax_styled = axes
xy = dict(data=tips, x='total_bill', y='tip')

# Basic scatter plot
sns.scatterplot(ax=ax_basic, **xy)
ax_basic.set(title='Basic Scatter Plot')

# Added styling: color by time, marker size by party size, marker shape by
# smoker status
sns.scatterplot(hue='time', size='size', style='smoker', ax=ax_styled, **xy)
ax_styled.set(title='Scatter Plot with Style')

fig.tight_layout()
plt.show()

4.2 Regression Plot

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Linear regression on the left, polynomial (order=2) fit on the right
panels = [
    ({}, 'Linear Regression'),
    ({'order': 2}, 'Polynomial Regression (order=2)'),
]
for ax, (fit_options, title) in zip(axes, panels):
    sns.regplot(data=tips, x='total_bill', y='tip', ax=ax, **fit_options)
    ax.set_title(title)

fig.tight_layout()
plt.show()

4.3 lmplot (FacetGrid-Based Regression)

# Regression fits per smoker group, with one facet per `time` level
g = sns.lmplot(data=tips, x='total_bill', y='tip', hue='smoker',
               col='time', height=5, aspect=1)
# `FacetGrid.fig` is deprecated; `.figure` is the supported accessor.
g.figure.suptitle('Linear Regression by Time and Smoker', y=1.02)
plt.show()

4.4 jointplot (Joint Distribution)

# ์‚ฐ์ ๋„ + ํžˆ์Šคํ† ๊ทธ๋žจ
g = sns.jointplot(data=tips, x='total_bill', y='tip', kind='scatter')
plt.show()

# KDE
g = sns.jointplot(data=tips, x='total_bill', y='tip', kind='kde', fill=True)
plt.show()

# hex
g = sns.jointplot(data=tips, x='total_bill', y='tip', kind='hex')
plt.show()

# ํšŒ๊ท€
g = sns.jointplot(data=tips, x='total_bill', y='tip', kind='reg')
plt.show()

4.5 pairplot (Pair Plot)

# Relationships between all variable pairs, colored by species
g = sns.pairplot(data=iris, hue='species', diag_kind='kde')
plt.show()

# Specific variables only
selected_vars = ['total_bill', 'tip', 'size']
g = sns.pairplot(data=tips, vars=selected_vars, hue='time', diag_kind='hist')
plt.show()

5. Heatmaps and Clustermaps

5.1 Heatmap

# ์ƒ๊ด€ํ–‰๋ ฌ ํžˆํŠธ๋งต
correlation = tips[['total_bill', 'tip', 'size']].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0,
            vmin=-1, vmax=1, fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Pivot-table heatmap: mean tip for each (day, time) combination.
# `day` and `time` are categorical, and relying on pivot_table's implicit
# `observed` default is deprecated in recent pandas; state it explicitly to
# keep the current result and avoid the FutureWarning.
pivot = tips.pivot_table(values='tip', index='day', columns='time',
                         aggfunc='mean', observed=False)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot, annot=True, cmap='YlOrRd', fmt='.2f', ax=ax)
ax.set_title('Average Tip by Day and Time')
plt.show()

5.2 Clustermap

# Hierarchical-clustering heatmap
iris_numeric = iris.drop('species', axis=1)

# Seed the 50-row sample so the figure is reproducible between runs;
# standard_scale=1 rescales each column before clustering.
g = sns.clustermap(iris_numeric.sample(50, random_state=42), cmap='viridis',
                   standard_scale=1, figsize=(10, 10))
# The grid's `.fig` accessor is deprecated; `.figure` is the supported one.
g.figure.suptitle('Clustered Heatmap', y=1.02)
plt.show()

6. Multiple Plots

6.1 FacetGrid

# Custom FacetGrid: histogram of total_bill for each (smoker, time) facet
facet_layout = dict(col='time', row='smoker', height=4, aspect=1.2)
g = sns.FacetGrid(tips, **facet_layout)
g = g.map(sns.histplot, 'total_bill', bins=20).add_legend()
plt.show()

# A more complex example: one scatter-plot facet per day, wrapped after two
# columns, with points colored by `time`
g = sns.FacetGrid(tips, col='day', col_wrap=2, height=4)
g = g.map_dataframe(
    sns.scatterplot, x='total_bill', y='tip', hue='time'
).add_legend()
plt.show()

6.2 PairGrid

# PairGrid with a different plot type in each region of the grid:
# scatter plots above the diagonal, filled KDEs below it, and histograms
# with a KDE curve on the diagonal.
g = (
    sns.PairGrid(iris, hue='species')
    .map_upper(sns.scatterplot)
    .map_lower(sns.kdeplot, fill=True)
    .map_diag(sns.histplot, kde=True)
    .add_legend()
)
plt.show()

7. Statistical Visualization

7.1 Error Bars

fig, ax = plt.subplots(figsize=(10, 6))

# Bar plot with error bars showing the standard deviation
bar_options = dict(data=tips, x='day', y='total_bill', errorbar='sd')
sns.barplot(ax=ax, **bar_options)
ax.set(title='Bar Plot with Standard Deviation')
plt.show()

# errorbar options: 'ci' (95% confidence interval), 'pi' (percentile interval),
# 'se' (standard error), 'sd' (standard deviation)

7.2 Bootstrap Confidence Intervals

fig, ax = plt.subplots(figsize=(10, 6))

# Bootstrap-based confidence interval drawn around the line
confidence_band = ('ci', 95)
sns.lineplot(data=tips, x='size', y='tip', errorbar=confidence_band, ax=ax)
ax.set(title='Line Plot with 95% Confidence Interval')
plt.show()

8. Advanced Customization

8.1 Color Settings

# ์—ฐ์†ํ˜• ์ƒ‰์ƒ
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

sns.scatterplot(data=tips, x='total_bill', y='tip', hue='size',
                palette='viridis', ax=axes[0])
axes[0].set_title('Viridis Palette')

sns.scatterplot(data=tips, x='total_bill', y='tip', hue='size',
                palette='coolwarm', ax=axes[1])
axes[1].set_title('Coolwarm Palette')

sns.scatterplot(data=tips, x='total_bill', y='tip', hue='size',
                palette='YlOrRd', ax=axes[2])
axes[2].set_title('YlOrRd Palette')

plt.tight_layout()
plt.show()

# Categorical colors: map each `time` level to an explicit color
custom_palette = dict(Lunch='blue', Dinner='red')
fig, ax = plt.subplots(figsize=(10, 6))
box_options = dict(data=tips, x='day', y='total_bill', hue='time')
sns.boxplot(palette=custom_palette, ax=ax, **box_options)
plt.show()

8.2 Axes and Labels

fig, ax = plt.subplots(figsize=(10, 6))

sns.boxplot(data=tips, x='day', y='total_bill', ax=ax)

# Customize the axis labels and title (bold; title slightly larger)
label_font = dict(fontsize=14, fontweight='bold')
title_font = dict(fontsize=16, fontweight='bold')
ax.set_xlabel('Day of Week', **label_font)
ax.set_ylabel('Total Bill ($)', **label_font)
ax.set_title('Distribution of Total Bill by Day', **title_font)

# Rotate the x-axis tick labels
plt.xticks(rotation=45, ha='right')

# y-axis range
ax.set_ylim(bottom=0, top=60)

fig.tight_layout()
plt.show()

8.3 Adding Annotations

fig, ax = plt.subplots(figsize=(10, 6))

sns.scatterplot(data=tips, x='total_bill', y='tip', ax=ax)

# Add an annotation: red text at (40, 8) with an arrow pointing to (50, 10)
arrow_style = dict(arrowstyle='->', color='red')
ax.annotate('High tipper', xy=(50, 10), xytext=(40, 8),
            arrowprops=arrow_style, fontsize=12, color='red')

# Horizontal / vertical reference lines at the column means
mean_tip = tips['tip'].mean()
mean_bill = tips['total_bill'].mean()
ax.axhline(y=mean_tip, color='green', linestyle='--',
           label=f'Mean tip: ${mean_tip:.2f}')
ax.axvline(x=mean_bill, color='blue', linestyle='--',
           label=f'Mean bill: ${mean_bill:.2f}')

ax.legend()
ax.set(title='Scatter Plot with Annotations')
plt.show()

9. Dashboard-Style Layout

from matplotlib.gridspec import GridSpec

# Use a 3x3 GridSpec so panels can span several cells
fig = plt.figure(figsize=(16, 12))
gs = GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)

# Large plot: top row, first two columns
ax1 = fig.add_subplot(gs[0, :2])
sns.histplot(data=tips, x='total_bill', kde=True, ax=ax1)
ax1.set(title='Distribution of Total Bill')

# Small plots
ax2 = fig.add_subplot(gs[0, 2])
sns.boxplot(data=tips, y='total_bill', ax=ax2)
ax2.set(title='Box Plot')

ax3 = fig.add_subplot(gs[1, 0])
sns.countplot(data=tips, x='day', ax=ax3)
ax3.set(title='Count by Day')

ax4 = fig.add_subplot(gs[1, 1])
sns.barplot(data=tips, x='day', y='tip', ax=ax4)
ax4.set(title='Average Tip by Day')

# Pie chart drawn through pandas' plotting interface
ax5 = fig.add_subplot(gs[1, 2])
time_counts = tips['time'].value_counts()
time_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax5)
ax5.set(title='Time Distribution')

# Full-width plot across the bottom row
ax6 = fig.add_subplot(gs[2, :])
sns.scatterplot(data=tips, x='total_bill', y='tip', hue='time',
                size='size', ax=ax6)
ax6.set(title='Total Bill vs Tip')

title_font = dict(fontsize=20, fontweight='bold')
plt.suptitle('Restaurant Tips Dashboard', y=1.02, **title_font)
plt.tight_layout()
plt.show()

10. Saving and Exporting

# ๊ณ ํ•ด์ƒ๋„ ์ €์žฅ
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=tips, x='day', y='total_bill', ax=ax)

# PNG
fig.savefig('boxplot.png', dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')

# PDF (๋ฒกํ„ฐ ํ˜•์‹)
fig.savefig('boxplot.pdf', bbox_inches='tight')

# SVG (๋ฒกํ„ฐ ํ˜•์‹)
fig.savefig('boxplot.svg', bbox_inches='tight')

plt.close()

Summary

Plot Type | Seaborn Functions | Purpose
Distribution | histplot(), kdeplot(), displot() | Distribution of a single variable
Categorical | countplot(), barplot(), boxplot(), violinplot() | Comparison across categories
Relationship | scatterplot(), regplot(), lmplot() | Relationships between variables
Joint | jointplot(), pairplot() | Multivariate analysis
Heatmap | heatmap(), clustermap() | Matrix data
Multiple plots | FacetGrid, PairGrid, catplot() | Conditional subplots
Use the Previous and Next links at the top of the page to navigate between lessons.