์ฐจ์ ์ถ์ (Dimensionality Reduction)¶
๊ฐ์¶
์ฐจ์ ์ถ์๋ ๊ณ ์ฐจ์ ๋ฐ์ดํฐ๋ฅผ ์ ์ฐจ์์ผ๋ก ๋ณํํ์ฌ ๊ณ์ฐ ํจ์จ์ฑ์ ๋์ด๊ณ ์๊ฐํ๋ฅผ ๊ฐ๋ฅํ๊ฒ ํฉ๋๋ค. ์ฃผ์ ๋ฐฉ๋ฒ์ผ๋ก PCA, t-SNE, ํน์ฑ ์ ํ ๋ฑ์ด ์์ต๋๋ค.
1. ์ฐจ์ ์ถ์์ ํ์์ฑ¶
1.1 ์ฐจ์์ ์ ์ฃผ (Curse of Dimensionality)¶
"""
์ฐจ์์ ์ ์ฃผ:
1. ๊ณ ์ฐจ์์์ ๋ฐ์ดํฐ ํฌ์ธํธ ๊ฐ ๊ฑฐ๋ฆฌ๊ฐ ๋น์ทํด์ง
2. ๋ฐ์ดํฐ๊ฐ ํฌ์ํด์ง (sparse)
3. ๋ชจ๋ธ ํ์ต์ ๋ ๋ง์ ๋ฐ์ดํฐ ํ์
4. ๊ณผ์ ํฉ ์ํ ์ฆ๊ฐ
5. ๊ณ์ฐ ๋น์ฉ ์ฆ๊ฐ
์ฐจ์ ์ถ์์ ๋ชฉ์ :
1. ์๊ฐํ (2D/3D)
2. ๋
ธ์ด์ฆ ์ ๊ฑฐ
3. ๊ณ์ฐ ํจ์จ์ฑ
4. ๋ค์ค๊ณต์ ์ฑ ์ ๊ฑฐ
5. ํน์ฑ ์ถ์ถ
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits, load_iris, fetch_olivetti_faces
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# ์ฐจ์์ ์ ์ฃผ ๋ฐ๋ชจ: ๊ณ ์ฐจ์์์ ๊ฑฐ๋ฆฌ ๋ถํฌ
np.random.seed(42)
def distance_distribution(n_dims, n_points=1000, n_pairs=500):
    """Sample pairwise Euclidean distances between random points.

    Draws ``n_points`` uniform points in the ``n_dims``-dimensional unit
    hypercube, samples ``n_pairs`` disjoint index pairs, and returns the
    distance for each pair.  Used to illustrate the curse of
    dimensionality: as ``n_dims`` grows the distances concentrate.

    Parameters
    ----------
    n_dims : int
        Dimensionality of the space.
    n_points : int, default 1000
        Number of random points to generate.
    n_pairs : int, default 500
        Number of index pairs to sample; requires ``2 * n_pairs <= n_points``
        because pairs are drawn without replacement.

    Returns
    -------
    list of float
        Euclidean distances for the sampled pairs.
    """
    if 2 * n_pairs > n_points:
        raise ValueError("n_pairs must satisfy 2 * n_pairs <= n_points")
    points = np.random.rand(n_points, n_dims)
    # replace=False over the flattened (n_pairs, 2) draw guarantees all
    # indices are distinct, so no pair ever compares a point with itself.
    idx = np.random.choice(n_points, size=(n_pairs, 2), replace=False)
    # Vectorized norm over all pairs instead of a Python-level loop.
    diffs = points[idx[:, 0]] - points[idx[:, 1]]
    return list(np.linalg.norm(diffs, axis=1))
# Plot the sampled distance distribution for increasing dimensionality.
# (Loop-body indentation was lost in the export; restored here.)
dims = [2, 10, 100, 1000]
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, d in zip(axes, dims):
    distances = distance_distribution(d)
    ax.hist(distances, bins=30, edgecolor='black')
    ax.set_title(f'Dim={d}\nMean={np.mean(distances):.2f}, Std={np.std(distances):.2f}')
    ax.set_xlabel('Distance')
plt.tight_layout()
plt.show()
print("์ฐจ์์ด ์ฆ๊ฐํ ์๋ก ๊ฑฐ๋ฆฌ ๋ถํฌ๊ฐ ์ข์์ง โ ํฌ์ธํธ๋ค์ด ๋น์ทํ ๊ฑฐ๋ฆฌ์ ์์น")
2. ์ฃผ์ฑ๋ถ ๋ถ์ (PCA)¶
2.1 PCA์ ์๋ฆฌ¶
"""
PCA (Principal Component Analysis):
- ๋ฐ์ดํฐ์ ๋ถ์ฐ์ ์ต๋ํํ๋ ์ถ(์ฃผ์ฑ๋ถ)์ ์ฐพ์
- ๊ณ ์ฐจ์ โ ์ ์ฐจ์ ํฌ์
- ์ ํ ๋ณํ
์ํ์ ์๋ฆฌ:
1. ๋ฐ์ดํฐ ์ค์ฌํ (ํ๊ท 0)
2. ๊ณต๋ถ์ฐ ํ๋ ฌ ๊ณ์ฐ
3. ๊ณ ์ ๊ฐ ๋ถํด (eigendecomposition)
4. ๊ณ ์ ๊ฐ์ด ํฐ ์์๋ก ๊ณ ์ ๋ฒกํฐ(์ฃผ์ฑ๋ถ) ์ ํ
5. ์ ํ๋ ์ฃผ์ฑ๋ถ์ผ๋ก ๋ฐ์ดํฐ ํฌ์
์ฃผ์ฑ๋ถ:
- ์ฒซ ๋ฒ์งธ ์ฃผ์ฑ๋ถ: ๋ถ์ฐ์ด ๊ฐ์ฅ ํฐ ๋ฐฉํฅ
- ๋ ๋ฒ์งธ ์ฃผ์ฑ๋ถ: ์ฒซ ๋ฒ์งธ์ ์ง๊ตํ๋ฉด์ ๋ถ์ฐ์ด ํฐ ๋ฐฉํฅ
- n๋ฒ์งธ ์ฃผ์ฑ๋ถ: ์ด์ ์ฃผ์ฑ๋ถ๋ค๊ณผ ์ง๊ต
"""
from sklearn.decomposition import PCA

# 2-D toy example to visualize what the principal components are.
np.random.seed(42)
X_2d = np.dot(np.random.randn(200, 2), [[2, 1], [1, 2]])

# Fit PCA on the raw 2-D point cloud.
pca = PCA(n_components=2)
pca.fit(X_2d)

# Scatter plot with the principal axes drawn as arrows from the mean.
plt.figure(figsize=(10, 8))
plt.scatter(X_2d[:, 0], X_2d[:, 1], alpha=0.5)
mean = pca.mean_
for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
    # Arrow length proportional to the std. deviation along the component.
    end = mean + comp * np.sqrt(var) * 3
    plt.arrow(mean[0], mean[1], end[0]-mean[0], end[1]-mean[1],
              head_width=0.3, head_length=0.2, fc=f'C{i}', ec=f'C{i}',
              linewidth=2, label=f'PC{i+1} (Var: {var:.2f})')
plt.xlabel('X1')
plt.ylabel('X2')
plt.title('PCA: Principal Components')
plt.legend()
plt.axis('equal')
plt.grid(True, alpha=0.3)
plt.show()
print(f"์ฃผ์ฑ๋ถ:\n{pca.components_}")
# NOTE(review): the two prints below were split mid-string by the export
# (a syntax error); rejoined without altering the remaining characters.
print(f"์ค๋ช๋ ๋ถ์ฐ: {pca.explained_variance_}")
print(f"์ค๋ช๋ ๋ถ์ฐ ๋น์จ: {pca.explained_variance_ratio_}")
2.2 sklearn PCA ์ฌ์ฉ๋ฒ¶
from sklearn.decomposition import PCA

# Iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# Standardize first: PCA is variance-based, so features must share a scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print(f"์๋ณธ ํ์: {X.shape}")
print(f"PCA ํ ํ์: {X_pca.shape}")
# NOTE(review): the two prints below were split mid-string by the export
# (a syntax error); rejoined without altering the remaining characters.
print(f"์ค๋ช๋ ๋ถ์ฐ ๋น์จ: {pca.explained_variance_ratio_}")
print(f"๋์ ์ค๋ช ๋ถ์ฐ: {sum(pca.explained_variance_ratio_):.4f}")

# Scatter plot of the projection, one color per class.
plt.figure(figsize=(10, 8))
for i, target_name in enumerate(iris.target_names):
    mask = y == i
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], label=target_name, alpha=0.7)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
plt.title('PCA: Iris Dataset')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
2.3 ์ฃผ์ฑ๋ถ ์ ์ ํ¶
# Fit PCA with all components retained to inspect the variance spectrum.
pca_full = PCA()
pca_full.fit(X_scaled)

# Cumulative explained variance across components.
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Side-by-side: per-component vs cumulative explained variance.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].bar(range(1, len(pca_full.explained_variance_ratio_)+1),
            pca_full.explained_variance_ratio_, edgecolor='black')
axes[0].set_xlabel('Principal Component')
axes[0].set_ylabel('Explained Variance Ratio')
axes[0].set_title('Individual Explained Variance')
axes[1].plot(range(1, len(cumulative_variance)+1), cumulative_variance, 'o-')
axes[1].axhline(y=0.95, color='r', linestyle='--', label='95% variance')
axes[1].axhline(y=0.99, color='g', linestyle='--', label='99% variance')
axes[1].set_xlabel('Number of Components')
axes[1].set_ylabel('Cumulative Explained Variance')
axes[1].set_title('Cumulative Explained Variance')
axes[1].legend()
plt.tight_layout()
plt.show()

# Number of components needed to explain 95% of the variance:
# argmax returns the FIRST index where the cumulative sum crosses 0.95.
# NOTE(review): the print below was split mid-string by the export
# (a syntax error); rejoined without altering the remaining characters.
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1
print(f"95% ๋ถ์ฐ ์ค๋ช์ ํ์ํ ์ฃผ์ฑ๋ถ ์: {n_components_95}")
2.4 PCA๋ก ๋ถ์ฐ ๋น์จ ์ง์ ¶
# Let PCA choose the component count from a target variance ratio:
# a float n_components in (0, 1) means "keep enough components for
# this fraction of the total variance".
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(X_scaled)
print(f"95% ๋ถ์ฐ โ {pca_95.n_components_}๊ฐ ์ฃผ์ฑ๋ถ ์ ํ")
# NOTE(review): the print below was split mid-string by the export
# (a syntax error); rejoined without altering the remaining characters.
print(f"์ค์ ์ค๋ช๋ ๋ถ์ฐ: {sum(pca_95.explained_variance_ratio_):.4f}")

# Component counts required for several variance targets.
for var_ratio in [0.8, 0.9, 0.95, 0.99]:
    pca_temp = PCA(n_components=var_ratio)
    pca_temp.fit(X_scaled)
    print(f"{var_ratio*100:.0f}% ๋ถ์ฐ โ {pca_temp.n_components_}๊ฐ ์ฃผ์ฑ๋ถ")
2.5 PCA ํ์ฉ: ๋ ธ์ด์ฆ ์ ๊ฑฐ¶
# Handwritten digits dataset (8x8 grayscale images flattened to 64 features).
digits = load_digits()
X_digits = digits.data
y_digits = digits.target

# Add Gaussian noise to every pixel.
np.random.seed(42)
X_noisy = X_digits + np.random.normal(0, 4, X_digits.shape)

# Denoise by keeping only the leading principal components:
# project to 20 dimensions, then reconstruct back to pixel space.
# The discarded low-variance components carry mostly noise.
pca_denoise = PCA(n_components=20)
X_reduced = pca_denoise.fit_transform(X_noisy)
X_denoised = pca_denoise.inverse_transform(X_reduced)

# Show original / noisy / denoised versions of the first 10 digits.
fig, axes = plt.subplots(3, 10, figsize=(15, 5))
for i in range(10):
    # Row 0: original image.
    axes[0, i].imshow(X_digits[i].reshape(8, 8), cmap='gray')
    axes[0, i].axis('off')
    if i == 0:
        axes[0, i].set_title('Original')
    # Row 1: noisy image.
    axes[1, i].imshow(X_noisy[i].reshape(8, 8), cmap='gray')
    axes[1, i].axis('off')
    if i == 0:
        axes[1, i].set_title('Noisy')
    # Row 2: PCA reconstruction.
    axes[2, i].imshow(X_denoised[i].reshape(8, 8), cmap='gray')
    axes[2, i].axis('off')
    if i == 0:
        axes[2, i].set_title('Denoised')
plt.suptitle('PCA for Noise Reduction')
plt.tight_layout()
plt.show()
3. t-SNE¶
3.1 t-SNE ์๋ฆฌ¶
"""
t-SNE (t-distributed Stochastic Neighbor Embedding):
- ๋น์ ํ ์ฐจ์ ์ถ์
- ์๊ฐํ์ ์ฃผ๋ก ์ฌ์ฉ (2D/3D)
- ์ง์ญ ๊ตฌ์กฐ ๋ณด์กด์ ๋ฐ์ด๋จ
์๋ฆฌ:
1. ๊ณ ์ฐจ์์์ ์ ๋ค ๊ฐ ์ ์ฌ๋๋ฅผ ์กฐ๊ฑด๋ถ ํ๋ฅ ๋ก ๊ณ์ฐ
2. ์ ์ฐจ์์์ t-๋ถํฌ ๊ธฐ๋ฐ ์ ์ฌ๋ ์ ์
3. KL-divergence ์ต์ํ๋ก ์ ์ฐจ์ ์ขํ ํ์ต
ํน์ง:
- ๋น์ ํ ๊ด๊ณ ํฌ์ฐฉ
- ํด๋ฌ์คํฐ ๋ถ๋ฆฌ์ ํจ๊ณผ์
- ๊ณ์ฐ ๋น์ฉ ๋์
- ์ ๋ฐ์ดํฐ ๋ณํ ๋ถ๊ฐ (transform ์์)
- ๊ฒฐ๊ณผ ์ฌํ์ฑ ๋ฌธ์ (random_state ์ค์)
"""
from sklearn.manifold import TSNE
# Configure t-SNE for a 2-D embedding.
tsne = TSNE(
n_components=2,
perplexity=30, # effective local-neighborhood size (typical range 5-50)
learning_rate='auto', # learning-rate schedule
n_iter=1000, # optimization iterations; NOTE(review): renamed max_iter in sklearn>=1.5
random_state=42
)
# t-SNE is slow, so embed only the first 500 samples.
X_sample = X_digits[:500]
y_sample = y_digits[:500]
X_tsne = tsne.fit_transform(X_sample)
# Scatter plot of the embedding, colored by digit label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap='tab10', alpha=0.7)
plt.colorbar(scatter)
plt.title('t-SNE: Digits Dataset')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.show()
3.2 perplexity ํ๋ผ๋ฏธํฐ¶
# Effect of the perplexity hyperparameter on the embedding.
# (Loop-body indentation was lost in the export; restored here.)
perplexities = [5, 30, 50, 100]
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, perp in zip(axes, perplexities):
    tsne_temp = TSNE(n_components=2, perplexity=perp, random_state=42)
    X_temp = tsne_temp.fit_transform(X_sample)
    scatter = ax.scatter(X_temp[:, 0], X_temp[:, 1], c=y_sample, cmap='tab10', alpha=0.7)
    ax.set_title(f'perplexity={perp}')
    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')
plt.tight_layout()
plt.show()
print("perplexity ๊ฐ์ด๋:")
print(" - ์์ ๊ฐ (5-10): ์ง์ญ ๊ตฌ์กฐ์ ์ง์ค")
print(" - ํฐ ๊ฐ (30-50): ์ ์ญ ๊ตฌ์กฐ ๊ณ ๋ ค")
print(" - ๋ฐ์ดํฐ ํฌ๊ธฐ์ ๋ฐ๋ผ ์กฐ์ ํ์")
3.3 PCA vs t-SNE ๋น๊ต¶
# Standardize the 500-sample digit subset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sample)
# Linear projection: PCA.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Non-linear embedding: t-SNE.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
# Side-by-side comparison of the two embeddings.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y_sample, cmap='tab10', alpha=0.7)
axes[0].set_title('PCA')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')
scatter2 = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap='tab10', alpha=0.7)
axes[1].set_title('t-SNE')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
plt.tight_layout()
plt.show()
print("PCA: ๋ถ์ฐ ์ต๋ํ, ์ ํ, ๋น ๋ฆ, ์ ์ญ ๊ตฌ์กฐ")
print("t-SNE: ์ด์ ๋ณด์กด, ๋น์ ํ, ๋๋ฆผ, ์ง์ญ ๊ตฌ์กฐ")
4. UMAP¶
"""
UMAP (Uniform Manifold Approximation and Projection):
- t-SNE๋ณด๋ค ๋น ๋ฆ
- ์ ์ญ ๊ตฌ์กฐ ๋ ์ ๋ณด์กด
- ์ ๋ฐ์ดํฐ ๋ณํ ๊ฐ๋ฅ
# pip install umap-learn
"""
# import umap
# umap_reducer = umap.UMAP(
# n_neighbors=15, # ์ง์ญ ์ด์ ์
# min_dist=0.1, # ํฌ์ธํธ ๊ฐ ์ต์ ๊ฑฐ๋ฆฌ
# n_components=2,
# random_state=42
# )
# X_umap = umap_reducer.fit_transform(X_scaled)
# ์ค์น ์์ด ์ค๋ช
print("UMAP ํน์ง:")
print(" - t-SNE๋ณด๋ค ๋น ๋ฆ")
print(" - ์ ์ญ ๊ตฌ์กฐ ๋ ์ ๋ณด์กด")
print(" - transform() ์ง์ (์ ๋ฐ์ดํฐ ๋ณํ)")
print(" - ์ฃผ์ ํ๋ผ๋ฏธํฐ: n_neighbors, min_dist")
5. ํน์ฑ ์ ํ (Feature Selection)¶
5.1 ํํฐ ๋ฐฉ๋ฒ (Filter Methods)¶
from sklearn.feature_selection import (
SelectKBest, SelectPercentile,
f_classif, mutual_info_classif, chi2
)
"""
ํํฐ ๋ฐฉ๋ฒ:
- ๋ชจ๋ธ๊ณผ ๋
๋ฆฝ์ ์ผ๋ก ํน์ฑ ํ๊ฐ
- ๋น ๋ฆ, ๊ฐ๋จ
- ํต๊ณ์ ๊ฒ์ ๊ธฐ๋ฐ
๋ฐฉ๋ฒ:
1. ๋ถ์ฐ ๊ธฐ๋ฐ: VarianceThreshold
2. ์๊ด๊ด๊ณ ๊ธฐ๋ฐ: ํ๊ฒ๊ณผ์ ์๊ด๊ณ์
3. ํต๊ณ ๊ฒ์ : ANOVA F-value, ์นด์ด์ ๊ณฑ
4. ์ ๋ณด ์ด๋ก : ์ํธ ์ ๋ณด๋
"""
# Load iris as plain arrays.
X, y = load_iris(return_X_y=True)
# Keep the 2 features with the highest ANOVA F-scores.
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X, y)
print("ANOVA F-value ํน์ฑ ์ ํ:")
print(f"์๋ณธ ํน์ฑ ์: {X.shape[1]}")
print(f"์ ํ๋ ํน์ฑ ์: {X_selected.shape[1]}")
print(f"๊ฐ ํน์ฑ ์ ์: {selector.scores_}")
print(f"์ ํ๋ ํน์ฑ ์ธ๋ฑ์ค: {selector.get_support(indices=True)}")
# Same selection, scored by mutual information instead of the F-test.
selector_mi = SelectKBest(score_func=mutual_info_classif, k=2)
selector_mi.fit(X, y)
print(f"\n์ํธ ์ ๋ณด๋ ์ ์: {selector_mi.scores_}")
5.2 ๋ํผ ๋ฐฉ๋ฒ (Wrapper Methods)¶
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
"""
๋ํผ ๋ฐฉ๋ฒ:
- ๋ชจ๋ธ ์ฑ๋ฅ ๊ธฐ๋ฐ ํน์ฑ ์ ํ
- ์ ํํ์ง๋ง ๋๋ฆผ
- ๊ณผ์ ํฉ ์ํ
๋ฐฉ๋ฒ:
1. RFE (Recursive Feature Elimination)
2. ์ ์ง ์ ํ (Forward Selection)
3. ํ์ง ์ ๊ฑฐ (Backward Elimination)
"""
# Recursive feature elimination: refit, drop 1 feature per step,
# stop when 2 features remain.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=2, step=1)
rfe.fit(X, y)
print("RFE ํน์ฑ ์ ํ:")
print(f"์ ํ๋ ํน์ฑ: {rfe.get_support()}")
print(f"ํน์ฑ ์์: {rfe.ranking_}")
# RFECV chooses the optimal feature count by 5-fold cross-validation.
rfecv = RFECV(estimator=model, cv=5, scoring='accuracy')
rfecv.fit(X, y)
print(f"\nRFECV ์ต์ ํน์ฑ ์: {rfecv.n_features_}")
print(f"์ ํ๋ ํน์ฑ: {rfecv.get_support()}")
# Plot mean CV score as a function of the number of features kept.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score'])+1),
rfecv.cv_results_['mean_test_score'], 'o-')
plt.xlabel('Number of Features')
plt.ylabel('Cross-Validation Score')
plt.title('RFECV: Optimal Number of Features')
plt.grid(True, alpha=0.3)
plt.show()
5.3 ์๋ฒ ๋๋ ๋ฐฉ๋ฒ (Embedded Methods)¶
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
"""
์๋ฒ ๋๋ ๋ฐฉ๋ฒ:
- ๋ชจ๋ธ ํ์ต ๊ณผ์ ์์ ํน์ฑ ์ ํ
- ํํฐ์ ๋ํผ์ ์ค๊ฐ
- L1 ์ ๊ทํ, ํธ๋ฆฌ ๊ธฐ๋ฐ ๋ชจ๋ธ
๋ฐฉ๋ฒ:
1. L1 ์ ๊ทํ (Lasso)
2. ํธ๋ฆฌ ๊ธฐ๋ฐ ์ค์๋
"""
# Fit a random forest and use its feature importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
# Indices of features sorted by descending importance.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
# Bar chart of the sorted importances.
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), [f'Feature {i}' for i in indices])
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Random Forest Feature Importance')
plt.show()
# Keep only features whose importance exceeds the median importance.
selector = SelectFromModel(rf, threshold='median')
selector.fit(X, y)
X_selected = selector.transform(X)
print(f"Random Forest ๊ธฐ๋ฐ ์ ํ๋ ํน์ฑ ์: {X_selected.shape[1]}")
print(f"์ ํ๋ ํน์ฑ: {selector.get_support()}")
6. ๋ถ์ฐ ๊ธฐ๋ฐ ํน์ฑ ์ ํ¶
from sklearn.feature_selection import VarianceThreshold
# Toy data: two constant columns, one low-variance binary column,
# and one higher-variance column.
X_var = np.array([
[0, 0, 1, 100],
[0, 0, 0, 101],
[0, 0, 1, 99],
[0, 0, 0, 100],
[0, 0, 1, 102]
])
# Drop every feature whose variance is below 0.5.
selector = VarianceThreshold(threshold=0.5)
X_high_var = selector.fit_transform(X_var)
print("๋ถ์ฐ ๊ธฐ๋ฐ ํน์ฑ ์ ํ:")
print(f"๊ฐ ํน์ฑ ๋ถ์ฐ: {selector.variances_}")
print(f"์ ํ๋ ํน์ฑ: {selector.get_support()}")
print(f"์๋ณธ ํ์: {X_var.shape}")
print(f"์ ํ ํ ํ์: {X_high_var.shape}")
7. ์๊ด๊ด๊ณ ๊ธฐ๋ฐ ํน์ฑ ์ ๊ฑฐ¶
import pandas as pd

# Synthetic data: three independent features plus one engineered
# near-duplicate, to demonstrate correlation-based removal.
np.random.seed(42)
n_samples = 100
X_corr = np.column_stack([
    np.random.randn(n_samples),  # feature 0
    np.random.randn(n_samples),  # feature 1
    np.random.randn(n_samples),  # feature 2
])
# F3 = F0 + small noise, so corr(F0, F3) is close to 1.
X_corr = np.column_stack([X_corr, X_corr[:, 0] + np.random.randn(n_samples) * 0.1])
df = pd.DataFrame(X_corr, columns=['F0', 'F1', 'F2', 'F3'])

# Absolute correlation matrix.
corr_matrix = df.corr().abs()

# Heatmap with the correlation value annotated in each cell.
# (Nested-loop indentation was lost in the export; restored here.)
plt.figure(figsize=(8, 6))
plt.imshow(corr_matrix, cmap='coolwarm', vmin=0, vmax=1)
plt.colorbar(label='Correlation')
plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns)
plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
plt.title('Feature Correlation Matrix')
for i in range(len(corr_matrix)):
    for j in range(len(corr_matrix)):
        plt.text(j, i, f'{corr_matrix.iloc[i, j]:.2f}',
                 ha='center', va='center')
plt.show()
# ๋์ ์๊ด๊ด๊ณ ํน์ฑ ์ ๊ฑฐ ํจ์
def remove_highly_correlated(df, threshold=0.9):
    """Drop features whose absolute correlation with an earlier feature
    exceeds *threshold*.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so each pair is considered once and the first column of a
    correlated pair is always the one that survives.

    Returns
    -------
    (DataFrame, list)
        The reduced frame and the names of the dropped columns.
    """
    abs_corr = df.corr().abs()
    # Keep only entries strictly above the diagonal.
    triangle_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(triangle_mask)
    to_drop = []
    for col in upper.columns:
        if (upper[col] > threshold).any():
            to_drop.append(col)
    return df.drop(columns=to_drop), to_drop
# Apply the helper: F3 duplicates F0, so it should be dropped.
df_cleaned, dropped = remove_highly_correlated(df, threshold=0.9)
print(f"์ ๊ฑฐ๋ ํน์ฑ: {dropped}")
print(f"๋จ์ ํน์ฑ: {list(df_cleaned.columns)}")
8. ์ฐจ์ ์ถ์ ํ์ดํ๋ผ์ธ¶
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
# Digits dataset with a held-out test split.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale -> PCA(30) -> RBF SVM, composed as one estimator so the
# PCA is refit inside each CV fold (no leakage).
pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA(n_components=30)),
('svm', SVC(kernel='rbf', random_state=42))
])
# 5-fold cross-validation on the training split.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print(f"PCA (30) + SVM CV ์ ์: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
# Baseline: identical pipeline without the PCA step.
pipeline_full = Pipeline([
('scaler', StandardScaler()),
('svm', SVC(kernel='rbf', random_state=42))
])
cv_scores_full = cross_val_score(pipeline_full, X_train, y_train, cv=5)
print(f"์ ์ฒด ํน์ฑ + SVM CV ์ ์: {cv_scores_full.mean():.4f} (+/- {cv_scores_full.std():.4f})")
print(f"\nPCA๋ก {X.shape[1]} โ 30 ์ฐจ์ ์ถ์")
9. Incremental PCA (๋์ฉ๋ ๋ฐ์ดํฐ)¶
from sklearn.decomposition import IncrementalPCA
"""
Incremental PCA:
- ๋์ฉ๋ ๋ฐ์ดํฐ์ ์ ํฉ
- ๋ฏธ๋๋ฐฐ์น๋ก ์ฒ๋ฆฌ
- ๋ฉ๋ชจ๋ฆฌ ํจ์จ์
"""
# Simulated large dataset.
X_large = np.random.randn(10000, 100)

# Standard PCA fits all rows at once.
pca_regular = PCA(n_components=10)
pca_regular.fit(X_large)

# Incremental PCA processes the data in mini-batches internally.
ipca = IncrementalPCA(n_components=10, batch_size=500)
ipca.fit(X_large)
print("์ผ๋ฐ PCA vs Incremental PCA:")
# NOTE(review): the prints below were split mid-string by the export
# (a syntax error); rejoined without altering the remaining characters.
print(f"์ค๋ช๋ ๋ถ์ฐ ๋น์จ (์ผ๋ฐ): {sum(pca_regular.explained_variance_ratio_):.4f}")
print(f"์ค๋ช๋ ๋ถ์ฐ ๋น์จ (์ฆ๋ถ): {sum(ipca.explained_variance_ratio_):.4f}")

# Manual batching with partial_fit: memory-efficient streaming,
# only one 1000-row chunk needs to be resident at a time.
ipca_batch = IncrementalPCA(n_components=10)
for batch_start in range(0, len(X_large), 1000):
    batch = X_large[batch_start:batch_start+1000]
    ipca_batch.partial_fit(batch)
print(f"๋ฐฐ์น ์ฒ๋ฆฌ ์ค๋ช๋ ๋ถ์ฐ: {sum(ipca_batch.explained_variance_ratio_):.4f}")
10. ์ฐจ์ ์ถ์ ์๊ณ ๋ฆฌ์ฆ ๋น๊ต¶
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.manifold import TSNE, MDS, Isomap
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
"""
์ฐจ์ ์ถ์ ์๊ณ ๋ฆฌ์ฆ ๋น๊ต:
1. PCA: ์ ํ, ๋ถ์ฐ ์ต๋ํ, ๋น ๋ฆ
2. Kernel PCA: ๋น์ ํ PCA
3. LDA: ํด๋์ค ๋ถ๋ฆฌ ์ต๋ํ (์ง๋ ํ์ต)
4. t-SNE: ์๊ฐํ, ์ง์ญ ๊ตฌ์กฐ
5. UMAP: ์๊ฐํ, ์ ์ญ+์ง์ญ ๊ตฌ์กฐ
6. MDS: ๊ฑฐ๋ฆฌ ๋ณด์กด
7. Isomap: ์ธก์ง์ ๊ฑฐ๋ฆฌ ๋ณด์กด
"""
# Compare four algorithms on a small dataset.
algorithms = {
    'PCA': PCA(n_components=2),
    'Kernel PCA': KernelPCA(n_components=2, kernel='rbf'),
    'LDA': LDA(n_components=2),
    't-SNE': TSNE(n_components=2, random_state=42)
}

# Iris data, standardized.
X, y = load_iris(return_X_y=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One subplot per algorithm.
# (Loop/if indentation was lost in the export; restored here.)
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, (name, algo) in zip(axes, algorithms.items()):
    if name == 'LDA':
        # LDA is supervised: it needs the class labels to fit.
        X_reduced = algo.fit_transform(X_scaled, y)
    else:
        X_reduced = algo.fit_transform(X_scaled)
    scatter = ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='viridis', alpha=0.7)
    ax.set_title(name)
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
plt.tight_layout()
plt.show()
์ฐ์ต ๋ฌธ์ ¶
๋ฌธ์ 1: PCA ์ ์ฉ¶
Digits ๋ฐ์ดํฐ์ PCA๋ฅผ ์ ์ฉํ๊ณ 95% ๋ถ์ฐ์ ์ค๋ช ํ๋ ์ฃผ์ฑ๋ถ ์๋ฅผ ์ฐพ์ผ์ธ์.
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
digits = load_digits()
X = digits.data
# Solution: fit a full PCA, then find where the cumulative explained
# variance first crosses 95%.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA()
pca.fit(X_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
# argmax returns the first True index of the boolean comparison.
n_95 = np.argmax(cumsum >= 0.95) + 1
print(f"95% ๋ถ์ฐ์ ํ์ํ ์ฃผ์ฑ๋ถ ์: {n_95}")
print(f"์๋ณธ ์ฐจ์: {X.shape[1]}")
๋ฌธ์ 2: t-SNE ์๊ฐํ¶
Digits ๋ฐ์ดํฐ๋ฅผ t-SNE๋ก ์๊ฐํํ์ธ์.
from sklearn.manifold import TSNE
# Solution (first 500 samples only, to keep the runtime short).
X_sample = X[:500]
y_sample = digits.target[:500]
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_sample)
# Embedding colored by digit label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap='tab10')
plt.colorbar(scatter)
plt.title('t-SNE: Digits')
plt.show()
๋ฌธ์ 3: ํน์ฑ ์ ํ¶
Random Forest ์ค์๋ ๊ธฐ๋ฐ์ผ๋ก ์์ 20๊ฐ ํน์ฑ์ ์ ํํ์ธ์.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# Solution
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, digits.target)
# Top 20 features: threshold=-inf disables the importance cutoff so
# exactly max_features features are kept, ranked by importance.
selector = SelectFromModel(rf, max_features=20, threshold=-np.inf)
selector.fit(X, digits.target)
X_selected = selector.transform(X)
print(f"์ ํ๋ ํน์ฑ ์: {X_selected.shape[1]}")
print(f"์ ํ๋ ํน์ฑ ์ธ๋ฑ์ค: {np.where(selector.get_support())[0]}")
์์ฝ¶
| ๋ฐฉ๋ฒ | ์ ํ | ํน์ง | ์ฉ๋ |
|---|---|---|---|
| PCA | ์ ํ | ๋ถ์ฐ ์ต๋ํ | ์ผ๋ฐ์ ์ธ ์ฐจ์ ์ถ์ |
| Kernel PCA | ๋น์ ํ | ์ปค๋ ํธ๋ฆญ | ๋น์ ํ ํจํด |
| LDA | ์ง๋ ํ์ต | ํด๋์ค ๋ถ๋ฆฌ | ๋ถ๋ฅ ์ ์ฒ๋ฆฌ |
| t-SNE | ๋น์ ํ | ์ง์ญ ๊ตฌ์กฐ ๋ณด์กด | ์๊ฐํ |
| UMAP | ๋น์ ํ | ๋น ๋ฆ, ์ ์ญ ๊ตฌ์กฐ | ์๊ฐํ |
ํน์ฑ ์ ํ ๋ฐฉ๋ฒ ๋น๊ต¶
| ๋ฐฉ๋ฒ | ์ ํ | ์ฅ์ | ๋จ์ |
|---|---|---|---|
| Filter | ํต๊ณ ๊ธฐ๋ฐ | ๋น ๋ฆ | ํน์ฑ ๊ฐ ๊ด๊ณ ๋ฌด์ |
| Wrapper | ๋ชจ๋ธ ๊ธฐ๋ฐ | ์ ํ | ๋๋ฆผ, ๊ณผ์ ํฉ |
| Embedded | ํ์ต ์ค ์ ํ | ํจ์จ์ | ๋ชจ๋ธ ์์กด์ |
์ฐจ์ ์ถ์ ์ ํ ๊ฐ์ด๋¶
| ์ํฉ | ๊ถ์ฅ ๋ฐฉ๋ฒ |
|---|---|
| ๋ ธ์ด์ฆ ์ ๊ฑฐ, ์์ถ | PCA |
| ์๊ฐํ (2D/3D) | t-SNE, UMAP |
| ๋ถ๋ฅ ์ ์ฒ๋ฆฌ | LDA |
| ๋น์ ํ ํจํด | Kernel PCA, UMAP |
| ๋์ฉ๋ ๋ฐ์ดํฐ | Incremental PCA, TruncatedSVD |
| ํน์ฑ ํด์ ํ์ | ํน์ฑ ์ ํ (Filter/Embedded) |