1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# 12. ์ฐจ์ ์ถ์ (PCA, t-SNE)\n",
8 "\n",
9 "## ํ์ต ๋ชฉํ\n",
10 "- PCA(์ฃผ์ฑ๋ถ ๋ถ์) ์ดํด\n",
11 "- ์ฐจ์ ์ถ์์ ๋ชฉ์ ๊ณผ ํ์ฉ\n",
12 "- t-SNE๋ฅผ ์ด์ฉํ ์๊ฐํ"
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": null,
18 "metadata": {},
19 "outputs": [],
20 "source": [
21 "import numpy as np\n",
22 "import pandas as pd\n",
23 "import matplotlib.pyplot as plt\n",
24 "from sklearn.decomposition import PCA\n",
25 "from sklearn.manifold import TSNE\n",
26 "from sklearn.preprocessing import StandardScaler\n",
27 "from sklearn.datasets import load_iris, load_digits\n",
28 "import seaborn as sns\n",
29 "\n",
30 "plt.rcParams['font.family'] = 'DejaVu Sans'"
31 ]
32 },
33 {
34 "cell_type": "markdown",
35 "metadata": {},
36 "source": [
37 "## 1. PCA 기본 개념"
38 ]
39 },
40 {
41 "cell_type": "code",
42 "execution_count": null,
43 "metadata": {},
44 "outputs": [],
45 "source": [
46 "# Iris ๋ฐ์ดํฐ\n",
47 "iris = load_iris()\n",
48 "X = iris.data\n",
49 "y = iris.target\n",
50 "\n",
51 "print(f\"Original shape: {X.shape}\")\n",
52 "print(f\"Features: {iris.feature_names}\")"
53 ]
54 },
55 {
56 "cell_type": "code",
57 "execution_count": null,
58 "metadata": {},
59 "outputs": [],
60 "source": [
61 "# ๋ฐ์ดํฐ ์ค์ผ์ผ๋ง (PCA ์ ํ์)\n",
62 "scaler = StandardScaler()\n",
63 "X_scaled = scaler.fit_transform(X)\n",
64 "\n",
65 "# PCA ์ ์ฉ\n",
66 "pca = PCA(n_components=2)\n",
67 "X_pca = pca.fit_transform(X_scaled)\n",
68 "\n",
69 "print(f\"Reduced shape: {X_pca.shape}\")\n",
70 "print(f\"Explained variance ratio: {pca.explained_variance_ratio_}\")\n",
71 "print(f\"Total explained variance: {sum(pca.explained_variance_ratio_):.4f}\")"
72 ]
73 },
74 {
75 "cell_type": "code",
76 "execution_count": null,
77 "metadata": {},
78 "outputs": [],
79 "source": [
80 "# PCA ๊ฒฐ๊ณผ ์๊ฐํ\n",
81 "plt.figure(figsize=(10, 8))\n",
82 "colors = ['blue', 'green', 'red']\n",
83 "\n",
84 "for i, (color, name) in enumerate(zip(colors, iris.target_names)):\n",
85 " idx = y == i\n",
86 " plt.scatter(X_pca[idx, 0], X_pca[idx, 1], c=color, label=name, \n",
87 " alpha=0.7, edgecolors='black')\n",
88 "\n",
89 "plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')\n",
90 "plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')\n",
91 "plt.title('PCA - Iris Dataset')\n",
92 "plt.legend()\n",
93 "plt.grid(True, alpha=0.3)\n",
94 "plt.show()"
95 ]
96 },
97 {
98 "cell_type": "markdown",
99 "metadata": {},
100 "source": [
101 "## 2. 설명 분산 분석"
102 ]
103 },
104 {
105 "cell_type": "code",
106 "execution_count": null,
107 "metadata": {},
108 "outputs": [],
109 "source": [
110 "# ์ ์ฒด ์ฑ๋ถ PCA\n",
111 "pca_full = PCA()\n",
112 "pca_full.fit(X_scaled)\n",
113 "\n",
114 "# ๋์ ์ค๋ช
๋ถ์ฐ\n",
115 "cumsum = np.cumsum(pca_full.explained_variance_ratio_)\n",
116 "\n",
117 "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
118 "\n",
119 "# ๊ฐ๋ณ ์ค๋ช
๋ถ์ฐ\n",
120 "axes[0].bar(range(1, len(pca_full.explained_variance_ratio_)+1), \n",
121 " pca_full.explained_variance_ratio_)\n",
122 "axes[0].set_xlabel('Principal Component')\n",
123 "axes[0].set_ylabel('Explained Variance Ratio')\n",
124 "axes[0].set_title('Explained Variance by Component')\n",
125 "axes[0].grid(True, alpha=0.3)\n",
126 "\n",
127 "# ๋์ ์ค๋ช
๋ถ์ฐ\n",
128 "axes[1].plot(range(1, len(cumsum)+1), cumsum, 'b-o')\n",
129 "axes[1].axhline(y=0.95, color='r', linestyle='--', label='95% threshold')\n",
130 "axes[1].set_xlabel('Number of Components')\n",
131 "axes[1].set_ylabel('Cumulative Explained Variance')\n",
132 "axes[1].set_title('Cumulative Explained Variance')\n",
133 "axes[1].legend()\n",
134 "axes[1].grid(True, alpha=0.3)\n",
135 "\n",
136 "plt.tight_layout()\n",
137 "plt.show()"
138 ]
139 },
140 {
141 "cell_type": "markdown",
142 "metadata": {},
143 "source": [
144 "## 3. 주성분 해석"
145 ]
146 },
147 {
148 "cell_type": "code",
149 "execution_count": null,
150 "metadata": {},
151 "outputs": [],
152 "source": [
153 "# ์ฃผ์ฑ๋ถ ๋ก๋ฉ(๊ฐ์ค์น)\n",
154 "loadings = pd.DataFrame(\n",
155 " pca_full.components_.T,\n",
156 " columns=[f'PC{i+1}' for i in range(len(iris.feature_names))],\n",
157 " index=iris.feature_names\n",
158 ")\n",
159 "\n",
160 "print(\"=== Principal Component Loadings ===\")\n",
161 "print(loadings.round(4))"
162 ]
163 },
164 {
165 "cell_type": "code",
166 "execution_count": null,
167 "metadata": {},
168 "outputs": [],
169 "source": [
170 "# ๋ก๋ฉ ํํธ๋งต\n",
171 "plt.figure(figsize=(10, 6))\n",
172 "sns.heatmap(loadings.iloc[:, :2], annot=True, cmap='coolwarm', center=0)\n",
173 "plt.title('PCA Component Loadings')\n",
174 "plt.tight_layout()\n",
175 "plt.show()"
176 ]
177 },
178 {
179 "cell_type": "markdown",
180 "metadata": {},
181 "source": [
182 "## 4. t-SNE 시각화"
183 ]
184 },
185 {
186 "cell_type": "code",
187 "execution_count": null,
188 "metadata": {},
189 "outputs": [],
190 "source": [
191 "# Digits ๋ฐ์ดํฐ์
(๊ณ ์ฐจ์)\n",
192 "digits = load_digits()\n",
193 "X_digits = digits.data\n",
194 "y_digits = digits.target\n",
195 "\n",
196 "print(f\"Digits shape: {X_digits.shape}\")\n",
197 "print(f\"Number of classes: {len(np.unique(y_digits))}\")"
198 ]
199 },
200 {
201 "cell_type": "code",
202 "execution_count": null,
203 "metadata": {},
204 "outputs": [],
205 "source": [
206 "# PCA vs t-SNE ๋น๊ต\n",
207 "# PCA\n",
208 "pca_digits = PCA(n_components=2)\n",
209 "X_pca_digits = pca_digits.fit_transform(X_digits)\n",
210 "\n",
211 "# t-SNE (์๊ฐ์ด ๊ฑธ๋ฆด ์ ์์)\n",
212 "tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000)\n",
213 "X_tsne = tsne.fit_transform(X_digits)\n",
214 "\n",
215 "print(\"Transformation complete!\")"
216 ]
217 },
218 {
219 "cell_type": "code",
220 "execution_count": null,
221 "metadata": {},
222 "outputs": [],
223 "source": [
224 "# ๋น๊ต ์๊ฐํ\n",
225 "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
226 "\n",
227 "# PCA\n",
228 "scatter1 = axes[0].scatter(X_pca_digits[:, 0], X_pca_digits[:, 1], \n",
229 " c=y_digits, cmap='tab10', alpha=0.6, s=20)\n",
230 "axes[0].set_title('PCA - Digits Dataset')\n",
231 "axes[0].set_xlabel('PC1')\n",
232 "axes[0].set_ylabel('PC2')\n",
233 "plt.colorbar(scatter1, ax=axes[0], label='Digit')\n",
234 "\n",
235 "# t-SNE\n",
236 "scatter2 = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], \n",
237 " c=y_digits, cmap='tab10', alpha=0.6, s=20)\n",
238 "axes[1].set_title('t-SNE - Digits Dataset')\n",
239 "axes[1].set_xlabel('t-SNE 1')\n",
240 "axes[1].set_ylabel('t-SNE 2')\n",
241 "plt.colorbar(scatter2, ax=axes[1], label='Digit')\n",
242 "\n",
243 "plt.tight_layout()\n",
244 "plt.show()"
245 ]
246 },
247 {
248 "cell_type": "markdown",
249 "metadata": {},
250 "source": [
251 "## 5. PCA๋ฅผ ์ด์ฉํ ๋
ธ์ด์ฆ ์ ๊ฑฐ"
252 ]
253 },
254 {
255 "cell_type": "code",
256 "execution_count": null,
257 "metadata": {},
258 "outputs": [],
259 "source": [
260 "# ๋
ธ์ด์ฆ ์๋ Digits ์ด๋ฏธ์ง\n",
261 "np.random.seed(42)\n",
262 "noise = np.random.normal(0, 4, X_digits.shape)\n",
263 "X_noisy = X_digits + noise\n",
264 "\n",
265 "# PCA๋ก ์ฌ๊ตฌ์ฑ (์์ ์ฑ๋ถ๋ง ์ฌ์ฉ)\n",
266 "pca_denoise = PCA(n_components=20)\n",
267 "X_reconstructed = pca_denoise.inverse_transform(\n",
268 " pca_denoise.fit_transform(X_noisy)\n",
269 ")\n",
270 "\n",
271 "# ๊ฒฐ๊ณผ ์๊ฐํ\n",
272 "fig, axes = plt.subplots(3, 10, figsize=(15, 5))\n",
273 "\n",
274 "for i in range(10):\n",
275 " # ์๋ณธ\n",
276 " axes[0, i].imshow(X_digits[i].reshape(8, 8), cmap='gray')\n",
277 " axes[0, i].axis('off')\n",
278 " if i == 0:\n",
279 " axes[0, i].set_title('Original')\n",
280 " \n",
281 " # ๋
ธ์ด์ฆ\n",
282 " axes[1, i].imshow(X_noisy[i].reshape(8, 8), cmap='gray')\n",
283 " axes[1, i].axis('off')\n",
284 " if i == 0:\n",
285 " axes[1, i].set_title('Noisy')\n",
286 " \n",
287 " # ๋ณต์\n",
288 " axes[2, i].imshow(X_reconstructed[i].reshape(8, 8), cmap='gray')\n",
289 " axes[2, i].axis('off')\n",
290 " if i == 0:\n",
291 " axes[2, i].set_title('Denoised')\n",
292 "\n",
293 "plt.suptitle('PCA Denoising (20 components)', fontsize=14)\n",
294 "plt.tight_layout()\n",
295 "plt.show()"
296 ]
297 },
298 {
299 "cell_type": "markdown",
300 "metadata": {},
301 "source": [
302 "## 정리\n",
303 "\n",
304 "### PCA vs t-SNE\n",
305 "\n",
306 "| 특성 | PCA | t-SNE |\n",
307 "|------|-----|-------|\n",
308 "| 목적 | 차원 축소, 분산 최대화 | 시각화, 군집 구조 |\n",
309 "| 선형성 | 선형 변환 | 비선형 변환 |\n",
310 "| 속도 | 빠름 | 느림 |\n",
311 "| 재현성 | 결정적 | random_state 필요 |\n",
312 "| 해석 | 가능 (로딩) | 어려움 |\n",
313 "\n",
314 "### 활용\n",
315 "- **PCA**: 전처리, 노이즈 제거, 다중공선성 해소\n",
316 "- **t-SNE**: 고차원 데이터 시각화, 군집 탐색"
317 ]
318 }
319 ],
320 "metadata": {
321 "kernelspec": {
322 "display_name": "Python 3",
323 "language": "python",
324 "name": "python3"
325 },
326 "language_info": {
327 "name": "python",
328 "version": "3.10.0"
329 }
330 },
331 "nbformat": 4,
332 "nbformat_minor": 4
333}