12_pca.ipynb

Download
json 334 lines 9.0 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 12. 차원 축소 (PCA, t-SNE)\n",
  8    "\n",
  9    "## 학습 목표\n",
 10    "- PCA(주성분 분석) 이해\n",
 11    "- 차원 축소의 목적과 활용\n",
 12    "- t-SNE를 이용한 시각화"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "import numpy as np\n",
 22    "import pandas as pd\n",
 23    "import matplotlib.pyplot as plt\n",
 24    "from sklearn.decomposition import PCA\n",
 25    "from sklearn.manifold import TSNE\n",
 26    "from sklearn.preprocessing import StandardScaler\n",
 27    "from sklearn.datasets import load_iris, load_digits\n",
 28    "import seaborn as sns\n",
 29    "\n",
 30    "plt.rcParams['font.family'] = 'DejaVu Sans'"
 31   ]
 32  },
 33  {
 34   "cell_type": "markdown",
 35   "metadata": {},
 36   "source": [
 37    "## 1. PCA 기본 개념"
 38   ]
 39  },
 40  {
 41   "cell_type": "code",
 42   "execution_count": null,
 43   "metadata": {},
 44   "outputs": [],
 45   "source": [
 46    "# Iris ๋ฐ์ดํ„ฐ\n",
 47    "iris = load_iris()\n",
 48    "X = iris.data\n",
 49    "y = iris.target\n",
 50    "\n",
 51    "print(f\"Original shape: {X.shape}\")\n",
 52    "print(f\"Features: {iris.feature_names}\")"
 53   ]
 54  },
 55  {
 56   "cell_type": "code",
 57   "execution_count": null,
 58   "metadata": {},
 59   "outputs": [],
 60   "source": [
 61    "# ๋ฐ์ดํ„ฐ ์Šค์ผ€์ผ๋ง (PCA ์ „ ํ•„์ˆ˜)\n",
 62    "scaler = StandardScaler()\n",
 63    "X_scaled = scaler.fit_transform(X)\n",
 64    "\n",
 65    "# PCA ์ ์šฉ\n",
 66    "pca = PCA(n_components=2)\n",
 67    "X_pca = pca.fit_transform(X_scaled)\n",
 68    "\n",
 69    "print(f\"Reduced shape: {X_pca.shape}\")\n",
 70    "print(f\"Explained variance ratio: {pca.explained_variance_ratio_}\")\n",
 71    "print(f\"Total explained variance: {sum(pca.explained_variance_ratio_):.4f}\")"
 72   ]
 73  },
 74  {
 75   "cell_type": "code",
 76   "execution_count": null,
 77   "metadata": {},
 78   "outputs": [],
 79   "source": [
 80    "# PCA ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”\n",
 81    "plt.figure(figsize=(10, 8))\n",
 82    "colors = ['blue', 'green', 'red']\n",
 83    "\n",
 84    "for i, (color, name) in enumerate(zip(colors, iris.target_names)):\n",
 85    "    idx = y == i\n",
 86    "    plt.scatter(X_pca[idx, 0], X_pca[idx, 1], c=color, label=name, \n",
 87    "                alpha=0.7, edgecolors='black')\n",
 88    "\n",
 89    "plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')\n",
 90    "plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')\n",
 91    "plt.title('PCA - Iris Dataset')\n",
 92    "plt.legend()\n",
 93    "plt.grid(True, alpha=0.3)\n",
 94    "plt.show()"
 95   ]
 96  },
 97  {
 98   "cell_type": "markdown",
 99   "metadata": {},
100   "source": [
101    "## 2. 설명 분산 분석"
102   ]
103  },
104  {
105   "cell_type": "code",
106   "execution_count": null,
107   "metadata": {},
108   "outputs": [],
109   "source": [
110    "# ์ „์ฒด ์„ฑ๋ถ„ PCA\n",
111    "pca_full = PCA()\n",
112    "pca_full.fit(X_scaled)\n",
113    "\n",
114    "# ๋ˆ„์  ์„ค๋ช… ๋ถ„์‚ฐ\n",
115    "cumsum = np.cumsum(pca_full.explained_variance_ratio_)\n",
116    "\n",
117    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
118    "\n",
119    "# ๊ฐœ๋ณ„ ์„ค๋ช… ๋ถ„์‚ฐ\n",
120    "axes[0].bar(range(1, len(pca_full.explained_variance_ratio_)+1), \n",
121    "            pca_full.explained_variance_ratio_)\n",
122    "axes[0].set_xlabel('Principal Component')\n",
123    "axes[0].set_ylabel('Explained Variance Ratio')\n",
124    "axes[0].set_title('Explained Variance by Component')\n",
125    "axes[0].grid(True, alpha=0.3)\n",
126    "\n",
127    "# ๋ˆ„์  ์„ค๋ช… ๋ถ„์‚ฐ\n",
128    "axes[1].plot(range(1, len(cumsum)+1), cumsum, 'b-o')\n",
129    "axes[1].axhline(y=0.95, color='r', linestyle='--', label='95% threshold')\n",
130    "axes[1].set_xlabel('Number of Components')\n",
131    "axes[1].set_ylabel('Cumulative Explained Variance')\n",
132    "axes[1].set_title('Cumulative Explained Variance')\n",
133    "axes[1].legend()\n",
134    "axes[1].grid(True, alpha=0.3)\n",
135    "\n",
136    "plt.tight_layout()\n",
137    "plt.show()"
138   ]
139  },
140  {
141   "cell_type": "markdown",
142   "metadata": {},
143   "source": [
144    "## 3. 주성분 해석"
145   ]
146  },
147  {
148   "cell_type": "code",
149   "execution_count": null,
150   "metadata": {},
151   "outputs": [],
152   "source": [
153    "# ์ฃผ์„ฑ๋ถ„ ๋กœ๋”ฉ(๊ฐ€์ค‘์น˜)\n",
154    "loadings = pd.DataFrame(\n",
155    "    pca_full.components_.T,\n",
156    "    columns=[f'PC{i+1}' for i in range(len(iris.feature_names))],\n",
157    "    index=iris.feature_names\n",
158    ")\n",
159    "\n",
160    "print(\"=== Principal Component Loadings ===\")\n",
161    "print(loadings.round(4))"
162   ]
163  },
164  {
165   "cell_type": "code",
166   "execution_count": null,
167   "metadata": {},
168   "outputs": [],
169   "source": [
170    "# ๋กœ๋”ฉ ํžˆํŠธ๋งต\n",
171    "plt.figure(figsize=(10, 6))\n",
172    "sns.heatmap(loadings.iloc[:, :2], annot=True, cmap='coolwarm', center=0)\n",
173    "plt.title('PCA Component Loadings')\n",
174    "plt.tight_layout()\n",
175    "plt.show()"
176   ]
177  },
178  {
179   "cell_type": "markdown",
180   "metadata": {},
181   "source": [
182    "## 4. t-SNE 시각화"
183   ]
184  },
185  {
186   "cell_type": "code",
187   "execution_count": null,
188   "metadata": {},
189   "outputs": [],
190   "source": [
191    "# Digits ๋ฐ์ดํ„ฐ์…‹ (๊ณ ์ฐจ์›)\n",
192    "digits = load_digits()\n",
193    "X_digits = digits.data\n",
194    "y_digits = digits.target\n",
195    "\n",
196    "print(f\"Digits shape: {X_digits.shape}\")\n",
197    "print(f\"Number of classes: {len(np.unique(y_digits))}\")"
198   ]
199  },
200  {
201   "cell_type": "code",
202   "execution_count": null,
203   "metadata": {},
204   "outputs": [],
205   "source": [
206    "# PCA vs t-SNE ๋น„๊ต\n",
207    "# PCA\n",
208    "pca_digits = PCA(n_components=2)\n",
209    "X_pca_digits = pca_digits.fit_transform(X_digits)\n",
210    "\n",
211    "# t-SNE (์‹œ๊ฐ„์ด ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Œ)\n",
212    "tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=1000)\n",
213    "X_tsne = tsne.fit_transform(X_digits)\n",
214    "\n",
215    "print(\"Transformation complete!\")"
216   ]
217  },
218  {
219   "cell_type": "code",
220   "execution_count": null,
221   "metadata": {},
222   "outputs": [],
223   "source": [
224    "# ๋น„๊ต ์‹œ๊ฐํ™”\n",
225    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
226    "\n",
227    "# PCA\n",
228    "scatter1 = axes[0].scatter(X_pca_digits[:, 0], X_pca_digits[:, 1], \n",
229    "                           c=y_digits, cmap='tab10', alpha=0.6, s=20)\n",
230    "axes[0].set_title('PCA - Digits Dataset')\n",
231    "axes[0].set_xlabel('PC1')\n",
232    "axes[0].set_ylabel('PC2')\n",
233    "plt.colorbar(scatter1, ax=axes[0], label='Digit')\n",
234    "\n",
235    "# t-SNE\n",
236    "scatter2 = axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], \n",
237    "                           c=y_digits, cmap='tab10', alpha=0.6, s=20)\n",
238    "axes[1].set_title('t-SNE - Digits Dataset')\n",
239    "axes[1].set_xlabel('t-SNE 1')\n",
240    "axes[1].set_ylabel('t-SNE 2')\n",
241    "plt.colorbar(scatter2, ax=axes[1], label='Digit')\n",
242    "\n",
243    "plt.tight_layout()\n",
244    "plt.show()"
245   ]
246  },
247  {
248   "cell_type": "markdown",
249   "metadata": {},
250   "source": [
251    "## 5. PCA를 이용한 노이즈 제거"
252   ]
253  },
254  {
255   "cell_type": "code",
256   "execution_count": null,
257   "metadata": {},
258   "outputs": [],
259   "source": [
260    "# ๋…ธ์ด์ฆˆ ์žˆ๋Š” Digits ์ด๋ฏธ์ง€\n",
261    "np.random.seed(42)\n",
262    "noise = np.random.normal(0, 4, X_digits.shape)\n",
263    "X_noisy = X_digits + noise\n",
264    "\n",
265    "# PCA๋กœ ์žฌ๊ตฌ์„ฑ (์ƒ์œ„ ์„ฑ๋ถ„๋งŒ ์‚ฌ์šฉ)\n",
266    "pca_denoise = PCA(n_components=20)\n",
267    "X_reconstructed = pca_denoise.inverse_transform(\n",
268    "    pca_denoise.fit_transform(X_noisy)\n",
269    ")\n",
270    "\n",
271    "# ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”\n",
272    "fig, axes = plt.subplots(3, 10, figsize=(15, 5))\n",
273    "\n",
274    "for i in range(10):\n",
275    "    # ์›๋ณธ\n",
276    "    axes[0, i].imshow(X_digits[i].reshape(8, 8), cmap='gray')\n",
277    "    axes[0, i].axis('off')\n",
278    "    if i == 0:\n",
279    "        axes[0, i].set_title('Original')\n",
280    "    \n",
281    "    # ๋…ธ์ด์ฆˆ\n",
282    "    axes[1, i].imshow(X_noisy[i].reshape(8, 8), cmap='gray')\n",
283    "    axes[1, i].axis('off')\n",
284    "    if i == 0:\n",
285    "        axes[1, i].set_title('Noisy')\n",
286    "    \n",
287    "    # ๋ณต์›\n",
288    "    axes[2, i].imshow(X_reconstructed[i].reshape(8, 8), cmap='gray')\n",
289    "    axes[2, i].axis('off')\n",
290    "    if i == 0:\n",
291    "        axes[2, i].set_title('Denoised')\n",
292    "\n",
293    "plt.suptitle('PCA Denoising (20 components)', fontsize=14)\n",
294    "plt.tight_layout()\n",
295    "plt.show()"
296   ]
297  },
298  {
299   "cell_type": "markdown",
300   "metadata": {},
301   "source": [
302    "## 정리\n",
303    "\n",
304    "### PCA vs t-SNE\n",
305    "\n",
306    "| 특성 | PCA | t-SNE |\n",
307    "|------|-----|-------|\n",
308    "| 목적 | 차원 축소, 분산 최대화 | 시각화, 군집 구조 |\n",
309    "| 선형성 | 선형 변환 | 비선형 변환 |\n",
310    "| 속도 | 빠름 | 느림 |\n",
311    "| 재현성 | 결정적 | random_state 필요 |\n",
312    "| 해석 | 가능 (로딩) | 어려움 |\n",
313    "\n",
314    "### 활용\n",
315    "- **PCA**: 전처리, 노이즈 제거, 다중공선성 해소\n",
316    "- **t-SNE**: 고차원 데이터 시각화, 군집 탐색"
317   ]
318  }
319 ],
320 "metadata": {
321  "kernelspec": {
322   "display_name": "Python 3",
323   "language": "python",
324   "name": "python3"
325  },
326  "language_info": {
327   "name": "python",
328   "version": "3.10.0"
329  }
330 },
331 "nbformat": 4,
332 "nbformat_minor": 4
333}