11_clustering.ipynb

  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 11. 클러스터링 (Clustering)\n",
  8    "\n",
  9    "## 학습 목표\n",
 10    "- K-Means 클러스터링 이해\n",
 11    "- DBSCAN 알고리즘과 계층적 클러스터링\n",
 12    "- 클러스터 평가 지표"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "import matplotlib.pyplot as plt\n",
 22    "import numpy as np\n",
 23    "import pandas as pd\n",
 24    "import seaborn as sns\n",
 25    "from scipy.cluster.hierarchy import dendrogram, linkage\n",
 26    "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n",
 27    "from sklearn.datasets import load_iris, make_blobs, make_moons\n",
 28    "from sklearn.metrics import silhouette_score, calinski_harabasz_score\n",
 29    "from sklearn.preprocessing import StandardScaler\n",
 30    "plt.rcParams['font.family'] = 'DejaVu Sans'"
 31   ]
 32  },
 33  {
 34   "cell_type": "markdown",
 35   "metadata": {},
 36   "source": [
 37    "## 1. K-Means 클러스터링"
 38   ]
 39  },
 40  {
 41   "cell_type": "code",
 42   "execution_count": null,
 43   "metadata": {},
 44   "outputs": [],
 45   "source": [
 46    "# 데이터 생성\n",
 47    "X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)\n",
 48    "\n",
 49    "plt.figure(figsize=(10, 6))\n",
 50    "plt.scatter(X[:, 0], X[:, 1], alpha=0.6, edgecolors='black')\n",
 51    "plt.title('Original Data (Unlabeled)')\n",
 52    "plt.xlabel('Feature 1')\n",
 53    "plt.ylabel('Feature 2')\n",
 54    "plt.grid(True, alpha=0.3)\n",
 55    "plt.show()"
 56   ]
 57  },
 58  {
 59   "cell_type": "code",
 60   "execution_count": null,
 61   "metadata": {},
 62   "outputs": [],
 63   "source": [
 64    "# K-Means 학습\n",
 65    "kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)\n",
 66    "y_kmeans = kmeans.fit_predict(X)\n",
 67    "\n",
 68    "# 결과 시각화\n",
 69    "plt.figure(figsize=(10, 6))\n",
 70    "scatter = plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', \n",
 71    "                      alpha=0.6, edgecolors='black')\n",
 72    "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],\n",
 73    "            c='red', marker='X', s=200, label='Centroids')\n",
 74    "plt.title('K-Means Clustering (K=4)')\n",
 75    "plt.xlabel('Feature 1')\n",
 76    "plt.ylabel('Feature 2')\n",
 77    "plt.colorbar(scatter, label='Cluster')\n",
 78    "plt.legend()\n",
 79    "plt.grid(True, alpha=0.3)\n",
 80    "plt.show()\n",
 81    "\n",
 82    "print(f\"Inertia: {kmeans.inertia_:.2f}\")\n",
 83    "print(f\"Silhouette Score: {silhouette_score(X, y_kmeans):.4f}\")"
 84   ]
 85  },
 86  {
 87   "cell_type": "markdown",
 88   "metadata": {},
 89   "source": [
 90    "## 2. 최적의 K 찾기 (Elbow Method & Silhouette Score)"
 91   ]
 92  },
 93  {
 94   "cell_type": "code",
 95   "execution_count": null,
 96   "metadata": {},
 97   "outputs": [],
 98   "source": [
 99    "# Elbow Method\n",
100    "inertias = []\n",
101    "silhouettes = []\n",
102    "K_range = range(2, 11)\n",
103    "\n",
104    "for k in K_range:\n",
105    "    km = KMeans(n_clusters=k, random_state=42, n_init=10)\n",
106    "    km.fit(X)\n",
107    "    inertias.append(km.inertia_)\n",
108    "    silhouettes.append(silhouette_score(X, km.labels_))\n",
109    "\n",
110    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
111    "\n",
112    "# Elbow Plot\n",
113    "axes[0].plot(K_range, inertias, 'b-o')\n",
114    "axes[0].set_xlabel('Number of Clusters (K)')\n",
115    "axes[0].set_ylabel('Inertia')\n",
116    "axes[0].set_title('Elbow Method')\n",
117    "axes[0].grid(True, alpha=0.3)\n",
118    "\n",
119    "# Silhouette Score\n",
120    "axes[1].plot(K_range, silhouettes, 'g-o')\n",
121    "axes[1].set_xlabel('Number of Clusters (K)')\n",
122    "axes[1].set_ylabel('Silhouette Score')\n",
123    "axes[1].set_title('Silhouette Score vs K')\n",
124    "axes[1].grid(True, alpha=0.3)\n",
125    "\n",
126    "plt.tight_layout()\n",
127    "plt.show()"
128   ]
129  },
130  {
131   "cell_type": "markdown",
132   "metadata": {},
133   "source": [
134    "## 3. DBSCAN"
135   ]
136  },
137  {
138   "cell_type": "code",
139   "execution_count": null,
140   "metadata": {},
141   "outputs": [],
142   "source": [
143    "# 비구형 데이터 생성\n",
144    "X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)\n",
145    "\n",
146    "# K-Means vs DBSCAN 비교\n",
147    "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
148    "\n",
149    "# 원본 데이터\n",
150    "axes[0].scatter(X_moons[:, 0], X_moons[:, 1], alpha=0.6, edgecolors='black')\n",
151    "axes[0].set_title('Original Data (Moons)')\n",
152    "axes[0].grid(True, alpha=0.3)\n",
153    "\n",
154    "# K-Means\n",
155    "km_moons = KMeans(n_clusters=2, random_state=42, n_init=10)\n",
156    "y_km_moons = km_moons.fit_predict(X_moons)\n",
157    "axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=y_km_moons, cmap='viridis', \n",
158    "                alpha=0.6, edgecolors='black')\n",
159    "axes[1].set_title('K-Means (K=2)')\n",
160    "axes[1].grid(True, alpha=0.3)\n",
161    "\n",
162    "# DBSCAN\n",
163    "dbscan = DBSCAN(eps=0.2, min_samples=5)\n",
164    "y_dbscan = dbscan.fit_predict(X_moons)\n",
165    "axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c=y_dbscan, cmap='viridis', \n",
166    "                alpha=0.6, edgecolors='black')\n",
167    "axes[2].set_title('DBSCAN (eps=0.2, min_samples=5)')\n",
168    "axes[2].grid(True, alpha=0.3)\n",
169    "\n",
170    "plt.tight_layout()\n",
171    "plt.show()\n",
172    "\n",
173    "print(f\"K-Means Silhouette: {silhouette_score(X_moons, y_km_moons):.4f}\")\n",
174    "print(f\"DBSCAN Silhouette: {silhouette_score(X_moons, y_dbscan):.4f}\")\n",
175    "print(f\"DBSCAN found {len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)} clusters\")\n",
176    "print(f\"DBSCAN noise points: {sum(y_dbscan == -1)}\")"
177   ]
178  },
179  {
180   "cell_type": "markdown",
181   "metadata": {},
182   "source": [
183    "## 4. 계층적 클러스터링"
184   ]
185  },
186  {
187   "cell_type": "code",
188   "execution_count": null,
189   "metadata": {},
190   "outputs": [],
191   "source": [
192    "from scipy.cluster.hierarchy import dendrogram, linkage\n",
193    "\n",
194    "# 덴드로그램\n",
195    "X_small = X[:50]  # 시각화를 위해 일부 데이터만 사용\n",
196    "linkage_matrix = linkage(X_small, method='ward')\n",
197    "\n",
198    "plt.figure(figsize=(15, 7))\n",
199    "dendrogram(linkage_matrix)\n",
200    "plt.title('Hierarchical Clustering Dendrogram')\n",
201    "plt.xlabel('Sample Index')\n",
202    "plt.ylabel('Distance')\n",
203    "plt.show()"
204   ]
205  },
206  {
207   "cell_type": "code",
208   "execution_count": null,
209   "metadata": {},
210   "outputs": [],
211   "source": [
212    "# Agglomerative Clustering\n",
213    "agg = AgglomerativeClustering(n_clusters=4)\n",
214    "y_agg = agg.fit_predict(X)\n",
215    "\n",
216    "plt.figure(figsize=(10, 6))\n",
217    "plt.scatter(X[:, 0], X[:, 1], c=y_agg, cmap='viridis', alpha=0.6, edgecolors='black')\n",
218    "plt.title('Agglomerative Clustering')\n",
219    "plt.xlabel('Feature 1')\n",
220    "plt.ylabel('Feature 2')\n",
221    "plt.grid(True, alpha=0.3)\n",
222    "plt.show()\n",
223    "\n",
224    "print(f\"Silhouette Score: {silhouette_score(X, y_agg):.4f}\")"
225   ]
226  },
227  {
228   "cell_type": "markdown",
229   "metadata": {},
230   "source": [
231    "## 5. 실제 데이터 예제"
232   ]
233  },
234  {
235   "cell_type": "code",
236   "execution_count": null,
237   "metadata": {},
238   "outputs": [],
239   "source": [
240    "from sklearn.datasets import load_iris\n",
241    "\n",
242    "# Iris 데이터 클러스터링\n",
243    "iris = load_iris()\n",
244    "X_iris = iris.data\n",
245    "\n",
246    "# 스케일링\n",
247    "scaler = StandardScaler()\n",
248    "X_scaled = scaler.fit_transform(X_iris)\n",
249    "\n",
250    "# K-Means\n",
251    "kmeans_iris = KMeans(n_clusters=3, random_state=42, n_init=10)\n",
252    "y_kmeans_iris = kmeans_iris.fit_predict(X_scaled)\n",
253    "\n",
254    "# 결과 비교\n",
255    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
256    "\n",
257    "# 실제 레이블\n",
258    "axes[0].scatter(X_iris[:, 0], X_iris[:, 1], c=iris.target, cmap='viridis', \n",
259    "                alpha=0.6, edgecolors='black')\n",
260    "axes[0].set_title('True Labels')\n",
261    "axes[0].set_xlabel('Sepal Length')\n",
262    "axes[0].set_ylabel('Sepal Width')\n",
263    "\n",
264    "# K-Means 결과\n",
265    "axes[1].scatter(X_iris[:, 0], X_iris[:, 1], c=y_kmeans_iris, cmap='viridis', \n",
266    "                alpha=0.6, edgecolors='black')\n",
267    "axes[1].set_title('K-Means Clustering')\n",
268    "axes[1].set_xlabel('Sepal Length')\n",
269    "axes[1].set_ylabel('Sepal Width')\n",
270    "\n",
271    "plt.tight_layout()\n",
272    "plt.show()\n",
273    "\n",
274    "print(f\"Silhouette Score: {silhouette_score(X_scaled, y_kmeans_iris):.4f}\")"
275   ]
276  },
277  {
278   "cell_type": "markdown",
279   "metadata": {},
280   "source": [
281    "## 정리\n",
282    "\n",
283    "### 알고리즘 비교\n",
284    "\n",
285    "| 알고리즘 | 장점 | 단점 |\n",
286    "|---------|------|------|\n",
287    "| K-Means | 빠름, 간단 | K 지정 필요, 구형 클러스터 가정 |\n",
288    "| DBSCAN | K 불필요, 비구형 가능, 노이즈 처리 | eps, min_samples 값에 민감하여 튜닝 필요 |\n",
289    "| Hierarchical | 덴드로그램, 다양한 K | 대규모 데이터에 느림 |\n",
290    "\n",
291    "### 평가 지표\n",
292    "- **Silhouette Score**: -1~1, 높을수록 좋음\n",
293    "- **Inertia**: 클러스터 내 제곱 거리 합(WCSS), 낮을수록 좋음 (단, K가 커지면 항상 감소하므로 Elbow 지점으로 판단)\n",
294    "- **Calinski-Harabasz**: 클러스터 간 분산 / 클러스터 내 분산 비율, 높을수록 좋음"
295   ]
296  }
297 ],
298 "metadata": {
299  "kernelspec": {
300   "display_name": "Python 3",
301   "language": "python",
302   "name": "python3"
303  },
304  "language_info": {
305   "name": "python",
306   "version": "3.10.0"
307  }
308 },
309 "nbformat": 4,
310 "nbformat_minor": 4
311}