11_clustering.ipynb

Download
json 312 lines 9.1 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 11. 클러스터링 (Clustering)\n",
  8    "\n",
  9    "## 학습 목표\n",
 10    "- K-Means 클러스터링 이해\n",
 11    "- DBSCAN 알고리즘\n",
 12    "- 클러스터 평가 지표"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "import numpy as np\n",
 22    "import pandas as pd\n",
 23    "import matplotlib.pyplot as plt\n",
 24    "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n",
 25    "from sklearn.preprocessing import StandardScaler\n",
 26    "from sklearn.metrics import silhouette_score, calinski_harabasz_score\n",
 27    "from sklearn.datasets import make_blobs, make_moons\n",
 28    "import seaborn as sns\n",
 29    "\n",
 30    "plt.rcParams['font.family'] = 'DejaVu Sans'"
 31   ]
 32  },
 33  {
 34   "cell_type": "markdown",
 35   "metadata": {},
 36   "source": [
 37    "## 1. K-Means 클러스터링"
 38   ]
 39  },
 40  {
 41   "cell_type": "code",
 42   "execution_count": null,
 43   "metadata": {},
 44   "outputs": [],
 45   "source": [
 46    "# Build a toy dataset: 300 samples drawn around 4 blob centers\n",
 47    "X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)\n",
 48    "\n",
 49    "fig, ax = plt.subplots(figsize=(10, 6))\n",
 50    "ax.scatter(X[:, 0], X[:, 1], alpha=0.6, edgecolors='black')\n",
 51    "ax.set_title('Original Data (Unlabeled)')\n",
 52    "ax.set_xlabel('Feature 1')\n",
 53    "ax.set_ylabel('Feature 2')\n",
 54    "ax.grid(True, alpha=0.3)\n",
 55    "plt.show()"
 56   ]
 57  },
 58  {
 59   "cell_type": "code",
 60   "execution_count": null,
 61   "metadata": {},
 62   "outputs": [],
 63   "source": [
 64    "# Fit K-Means with K=4 and assign every sample to a cluster\n",
 65    "kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)\n",
 66    "y_kmeans = kmeans.fit_predict(X)\n",
 67    "centers = kmeans.cluster_centers_\n",
 68    "\n",
 69    "# Plot the cluster assignments together with the fitted centroids\n",
 70    "fig, ax = plt.subplots(figsize=(10, 6))\n",
 71    "scatter = ax.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis',\n",
 72    "                     alpha=0.6, edgecolors='black')\n",
 73    "ax.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Centroids')\n",
 74    "ax.set_title('K-Means Clustering (K=4)')\n",
 75    "ax.set_xlabel('Feature 1')\n",
 76    "ax.set_ylabel('Feature 2')\n",
 77    "fig.colorbar(scatter, ax=ax, label='Cluster')\n",
 78    "ax.legend()\n",
 79    "ax.grid(True, alpha=0.3)\n",
 80    "plt.show()\n",
 81    "\n",
 82    "print(f\"Inertia: {kmeans.inertia_:.2f}\")\n",
 83    "print(f\"Silhouette Score: {silhouette_score(X, y_kmeans):.4f}\")"
 84   ]
 85  },
 86  {
 87   "cell_type": "markdown",
 88   "metadata": {},
 89   "source": [
 90    "## 2. 최적의 K 찾기 (Elbow Method)"
 91   ]
 92  },
 93  {
 94   "cell_type": "code",
 95   "execution_count": null,
 96   "metadata": {},
 97   "outputs": [],
 98   "source": [
 99    "# Elbow Method: fit K-Means for K = 2..10 and record two diagnostics per K:\n",
100    "#   - inertia, plotted against K to look for the elbow\n",
101    "#   - silhouette score, as a complementary way to compare values of K\n",
102    "K_range = range(2, 11)\n",
103    "inertias, silhouettes = [], []\n",
104    "\n",
105    "# random_state is fixed so the sweep gives the same curves on every run\n",
106    "for k in K_range:\n",
107    "    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)\n",
108    "    inertias.append(km.inertia_)\n",
109    "    silhouettes.append(silhouette_score(X, km.labels_))\n",
110    "\n",
111    "# One panel per metric: (values, line format, y-label, title)\n",
112    "panels = [\n",
113    "    (inertias, 'b-o', 'Inertia', 'Elbow Method'),\n",
114    "    (silhouettes, 'g-o', 'Silhouette Score', 'Silhouette Score vs K'),\n",
115    "]\n",
116    "\n",
117    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
118    "for ax, (values, fmt, ylabel, title) in zip(axes, panels):\n",
119    "    ax.plot(K_range, values, fmt)\n",
120    "    ax.set_xlabel('Number of Clusters (K)')\n",
121    "    ax.set_ylabel(ylabel)\n",
122    "    ax.set_title(title)\n",
123    "    ax.grid(True, alpha=0.3)\n",
124    "\n",
125    "# Keep the two panels from overlapping\n",
126    "plt.tight_layout()\n",
127    "plt.show()"
128   ]
129  },
130  {
131   "cell_type": "markdown",
132   "metadata": {},
133   "source": [
134    "## 3. DBSCAN"
135   ]
136  },
137  {
138   "cell_type": "code",
139   "execution_count": null,
140   "metadata": {},
141   "outputs": [],
142   "source": [
143    "# Generate non-spherical data (two interleaving half-moons)\n",
144    "X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)\n",
145    "\n",
146    "# Compare K-Means and DBSCAN side by side (the first panel shows the raw data)\n",
147    "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
148    "axes[0].scatter(X_moons[:, 0], X_moons[:, 1], alpha=0.6, edgecolors='black')\n",
149    "axes[0].set_title('Original Data (Moons)')\n",
150    "\n",
151    "# K-Means\n",
152    "km_moons = KMeans(n_clusters=2, random_state=42, n_init=10)\n",
153    "y_km_moons = km_moons.fit_predict(X_moons)\n",
154    "axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=y_km_moons, cmap='viridis', alpha=0.6, edgecolors='black')\n",
155    "axes[1].set_title('K-Means (K=2)')\n",
156    "\n",
157    "# DBSCAN (label -1 marks noise points rather than a cluster)\n",
158    "dbscan = DBSCAN(eps=0.2, min_samples=5)\n",
159    "y_dbscan = dbscan.fit_predict(X_moons)\n",
160    "axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c=y_dbscan, cmap='viridis', alpha=0.6, edgecolors='black')\n",
161    "axes[2].set_title('DBSCAN (eps=0.2, min_samples=5)')\n",
162    "\n",
163    "for ax in axes:\n",
164    "    ax.grid(True, alpha=0.3)\n",
165    "plt.tight_layout()\n",
166    "plt.show()\n",
167    "\n",
168    "# Noise (-1) is not a cluster: score clustered points only, and only when at least\n",
169    "# 2 clusters exist (silhouette_score raises ValueError for fewer than 2 labels)\n",
170    "clustered = y_dbscan != -1\n",
171    "n_db_clusters = len(set(y_dbscan[clustered]))\n",
172    "db_sil = silhouette_score(X_moons[clustered], y_dbscan[clustered]) if n_db_clusters >= 2 else float('nan')\n",
173    "print(f\"K-Means Silhouette: {silhouette_score(X_moons, y_km_moons):.4f}\")\n",
174    "print(f\"DBSCAN Silhouette: {db_sil:.4f}\")\n",
175    "print(f\"DBSCAN found {n_db_clusters} clusters\")\n",
176    "print(f\"DBSCAN noise points: {int(np.sum(~clustered))}\")"
177   ]
178  },
179  {
180   "cell_type": "markdown",
181   "metadata": {},
182   "source": [
183    "## 4. 계층적 클러스터링"
184   ]
185  },
186  {
187   "cell_type": "code",
188   "execution_count": null,
189   "metadata": {},
190   "outputs": [],
191   "source": [
192    "from scipy.cluster.hierarchy import dendrogram, linkage\n",
193    "\n",
194    "# Dendrogram built from the first 50 samples only, to keep the plot readable\n",
195    "X_small = X[:50]\n",
196    "linkage_matrix = linkage(X_small, method='ward')\n",
197    "\n",
198    "fig, ax = plt.subplots(figsize=(15, 7))\n",
199    "dendrogram(linkage_matrix, ax=ax)\n",
200    "ax.set_title('Hierarchical Clustering Dendrogram')\n",
201    "ax.set_xlabel('Sample Index')\n",
202    "ax.set_ylabel('Distance')\n",
203    "plt.show()"
204   ]
205  },
206  {
207   "cell_type": "code",
208   "execution_count": null,
209   "metadata": {},
210   "outputs": [],
211   "source": [
212    "# Agglomerative clustering into 4 groups (linkage left at the estimator default)\n",
213    "agg = AgglomerativeClustering(n_clusters=4)\n",
214    "y_agg = agg.fit_predict(X)\n",
215    "\n",
216    "fig, ax = plt.subplots(figsize=(10, 6))\n",
217    "ax.scatter(X[:, 0], X[:, 1], c=y_agg, cmap='viridis', alpha=0.6, edgecolors='black')\n",
218    "ax.set_title('Agglomerative Clustering')\n",
219    "ax.set_xlabel('Feature 1')\n",
220    "ax.set_ylabel('Feature 2')\n",
221    "ax.grid(True, alpha=0.3)\n",
222    "plt.show()\n",
223    "\n",
224    "print(f\"Silhouette Score: {silhouette_score(X, y_agg):.4f}\")"
225   ]
226  },
227  {
228   "cell_type": "markdown",
229   "metadata": {},
230   "source": [
231    "## 5. 실제 데이터 예제"
232   ]
233  },
234  {
235   "cell_type": "code",
236   "execution_count": null,
237   "metadata": {},
238   "outputs": [],
239   "source": [
240    "from sklearn.datasets import load_iris\n",
241    "\n",
242    "# Cluster the Iris measurements (the species labels are used only for the comparison plot)\n",
243    "iris = load_iris()\n",
244    "X_iris = iris.data\n",
245    "\n",
246    "# Standardize the features before clustering\n",
247    "scaler = StandardScaler()\n",
248    "X_scaled = scaler.fit_transform(X_iris)\n",
249    "\n",
250    "# K-Means with K=3 on the standardized features\n",
251    "kmeans_iris = KMeans(n_clusters=3, random_state=42, n_init=10)\n",
252    "y_kmeans_iris = kmeans_iris.fit_predict(X_scaled)\n",
253    "\n",
254    "# Side-by-side comparison on the first two raw features:\n",
255    "# true labels on the left, K-Means assignments on the right.\n",
256    "# Cluster ids are arbitrary, so colours need not match between the panels.\n",
257    "panels = [\n",
258    "    (iris.target, 'True Labels'),\n",
259    "    (y_kmeans_iris, 'K-Means Clustering'),\n",
260    "]\n",
261    "\n",
262    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
263    "for ax, (labels, title) in zip(axes, panels):\n",
264    "    ax.scatter(X_iris[:, 0], X_iris[:, 1], c=labels, cmap='viridis',\n",
265    "               alpha=0.6, edgecolors='black')\n",
266    "    ax.set_title(title)\n",
267    "    ax.set_xlabel('Sepal Length')\n",
268    "    ax.set_ylabel('Sepal Width')\n",
269    "\n",
270    "plt.tight_layout()\n",
271    "plt.show()\n",
272    "\n",
273    "# The silhouette is computed in the standardized space the model was fitted on\n",
274    "print(f\"Silhouette Score: {silhouette_score(X_scaled, y_kmeans_iris):.4f}\")"
275   ]
276  },
277  {
278   "cell_type": "markdown",
279   "metadata": {},
280   "source": [
281    "## 정리\n",
282    "\n",
283    "### 알고리즘 비교\n",
284    "\n",
285    "| 알고리즘 | 장점 | 단점 |\n",
286    "|---------|------|------|\n",
287    "| K-Means | 빠름, 간단 | K 지정 필요, 구형 클러스터 가정 |\n",
288    "| DBSCAN | K 불필요, 비구형 가능, 노이즈 처리 | eps, min_samples 설정 |\n",
289    "| Hierarchical | 덴드로그램, 다양한 K | 대규모 데이터에 느림 |\n",
290    "\n",
291    "### 평가 지표\n",
292    "- **Silhouette Score**: -1~1, 높을수록 좋음\n",
293    "- **Inertia**: 클러스터 내 분산, 낮을수록 좋음\n",
294    "- **Calinski-Harabasz**: 클러스터 간/내 분산 비율"
295   ]
296  }
297 ],
298 "metadata": {
299  "kernelspec": {
300   "display_name": "Python 3",
301   "language": "python",
302   "name": "python3"
303  },
304  "language_info": {
305   "name": "python",
306   "version": "3.10.0"
307  }
308 },
309 "nbformat": 4,
310 "nbformat_minor": 4
311}