1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# 11. 클러스터링 (Clustering)\n",
8 "\n",
9 "## 학습 목표\n",
10 "- K-Means 클러스터링 이해\n",
11 "- DBSCAN 알고리즘\n",
12 "- 클러스터 평가 지표"
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": null,
18 "metadata": {},
19 "outputs": [],
20 "source": [
21 "import numpy as np\n",
22 "import pandas as pd\n",
23 "import matplotlib.pyplot as plt\n",
24 "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n",
25 "from sklearn.preprocessing import StandardScaler\n",
26 "from sklearn.metrics import silhouette_score, calinski_harabasz_score\n",
27 "from sklearn.datasets import make_blobs, make_moons\n",
28 "import seaborn as sns\n",
29 "\n",
30 "plt.rcParams['font.family'] = 'DejaVu Sans'"
31 ]
32 },
33 {
34 "cell_type": "markdown",
35 "metadata": {},
36 "source": [
37 "## 1. K-Means 클러스터링"
38 ]
39 },
40 {
41 "cell_type": "code",
42 "execution_count": null,
43 "metadata": {},
44 "outputs": [],
45 "source": [
46 "# ๋ฐ์ดํฐ ์์ฑ\n",
47 "X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=42)\n",
48 "\n",
49 "plt.figure(figsize=(10, 6))\n",
50 "plt.scatter(X[:, 0], X[:, 1], alpha=0.6, edgecolors='black')\n",
51 "plt.title('Original Data (Unlabeled)')\n",
52 "plt.xlabel('Feature 1')\n",
53 "plt.ylabel('Feature 2')\n",
54 "plt.grid(True, alpha=0.3)\n",
55 "plt.show()"
56 ]
57 },
58 {
59 "cell_type": "code",
60 "execution_count": null,
61 "metadata": {},
62 "outputs": [],
63 "source": [
64 "# K-Means ํ์ต\n",
65 "kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)\n",
66 "y_kmeans = kmeans.fit_predict(X)\n",
67 "\n",
68 "# ๊ฒฐ๊ณผ ์๊ฐํ\n",
69 "plt.figure(figsize=(10, 6))\n",
70 "scatter = plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', \n",
71 " alpha=0.6, edgecolors='black')\n",
72 "plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],\n",
73 " c='red', marker='X', s=200, label='Centroids')\n",
74 "plt.title('K-Means Clustering (K=4)')\n",
75 "plt.xlabel('Feature 1')\n",
76 "plt.ylabel('Feature 2')\n",
77 "plt.colorbar(scatter, label='Cluster')\n",
78 "plt.legend()\n",
79 "plt.grid(True, alpha=0.3)\n",
80 "plt.show()\n",
81 "\n",
82 "print(f\"Inertia: {kmeans.inertia_:.2f}\")\n",
83 "print(f\"Silhouette Score: {silhouette_score(X, y_kmeans):.4f}\")"
84 ]
85 },
86 {
87 "cell_type": "markdown",
88 "metadata": {},
89 "source": [
90 "## 2. 최적의 K 찾기 (Elbow Method)"
91 ]
92 },
93 {
94 "cell_type": "code",
95 "execution_count": null,
96 "metadata": {},
97 "outputs": [],
98 "source": [
99 "# Elbow Method\n",
100 "inertias = []\n",
101 "silhouettes = []\n",
102 "K_range = range(2, 11)\n",
103 "\n",
104 "for k in K_range:\n",
105 " km = KMeans(n_clusters=k, random_state=42, n_init=10)\n",
106 " km.fit(X)\n",
107 " inertias.append(km.inertia_)\n",
108 " silhouettes.append(silhouette_score(X, km.labels_))\n",
109 "\n",
110 "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
111 "\n",
112 "# Elbow Plot\n",
113 "axes[0].plot(K_range, inertias, 'b-o')\n",
114 "axes[0].set_xlabel('Number of Clusters (K)')\n",
115 "axes[0].set_ylabel('Inertia')\n",
116 "axes[0].set_title('Elbow Method')\n",
117 "axes[0].grid(True, alpha=0.3)\n",
118 "\n",
119 "# Silhouette Score\n",
120 "axes[1].plot(K_range, silhouettes, 'g-o')\n",
121 "axes[1].set_xlabel('Number of Clusters (K)')\n",
122 "axes[1].set_ylabel('Silhouette Score')\n",
123 "axes[1].set_title('Silhouette Score vs K')\n",
124 "axes[1].grid(True, alpha=0.3)\n",
125 "\n",
126 "plt.tight_layout()\n",
127 "plt.show()"
128 ]
129 },
130 {
131 "cell_type": "markdown",
132 "metadata": {},
133 "source": [
134 "## 3. DBSCAN"
135 ]
136 },
137 {
138 "cell_type": "code",
139 "execution_count": null,
140 "metadata": {},
141 "outputs": [],
142 "source": [
143 "# ๋น๊ตฌํ ๋ฐ์ดํฐ ์์ฑ\n",
144 "X_moons, y_moons = make_moons(n_samples=300, noise=0.05, random_state=42)\n",
145 "\n",
146 "# K-Means vs DBSCAN ๋น๊ต\n",
147 "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
148 "\n",
149 "# ์๋ณธ ๋ฐ์ดํฐ\n",
150 "axes[0].scatter(X_moons[:, 0], X_moons[:, 1], alpha=0.6, edgecolors='black')\n",
151 "axes[0].set_title('Original Data (Moons)')\n",
152 "axes[0].grid(True, alpha=0.3)\n",
153 "\n",
154 "# K-Means\n",
155 "km_moons = KMeans(n_clusters=2, random_state=42, n_init=10)\n",
156 "y_km_moons = km_moons.fit_predict(X_moons)\n",
157 "axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=y_km_moons, cmap='viridis', \n",
158 " alpha=0.6, edgecolors='black')\n",
159 "axes[1].set_title('K-Means (K=2)')\n",
160 "axes[1].grid(True, alpha=0.3)\n",
161 "\n",
162 "# DBSCAN\n",
163 "dbscan = DBSCAN(eps=0.2, min_samples=5)\n",
164 "y_dbscan = dbscan.fit_predict(X_moons)\n",
165 "axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c=y_dbscan, cmap='viridis', \n",
166 " alpha=0.6, edgecolors='black')\n",
167 "axes[2].set_title('DBSCAN (eps=0.2, min_samples=5)')\n",
168 "axes[2].grid(True, alpha=0.3)\n",
169 "\n",
170 "plt.tight_layout()\n",
171 "plt.show()\n",
172 "\n",
173 "print(f\"K-Means Silhouette: {silhouette_score(X_moons, y_km_moons):.4f}\")\n",
174 "print(f\"DBSCAN Silhouette: {silhouette_score(X_moons, y_dbscan):.4f}\")\n",
175 "print(f\"DBSCAN found {len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)} clusters\")\n",
176 "print(f\"DBSCAN noise points: {sum(y_dbscan == -1)}\")"
177 ]
178 },
179 {
180 "cell_type": "markdown",
181 "metadata": {},
182 "source": [
183 "## 4. 계층적 클러스터링"
184 ]
185 },
186 {
187 "cell_type": "code",
188 "execution_count": null,
189 "metadata": {},
190 "outputs": [],
191 "source": [
192 "from scipy.cluster.hierarchy import dendrogram, linkage\n",
193 "\n",
194 "# ๋ด๋๋ก๊ทธ๋จ\n",
195 "X_small = X[:50] # ์๊ฐํ๋ฅผ ์ํด ์ผ๋ถ ๋ฐ์ดํฐ๋ง ์ฌ์ฉ\n",
196 "linkage_matrix = linkage(X_small, method='ward')\n",
197 "\n",
198 "plt.figure(figsize=(15, 7))\n",
199 "dendrogram(linkage_matrix)\n",
200 "plt.title('Hierarchical Clustering Dendrogram')\n",
201 "plt.xlabel('Sample Index')\n",
202 "plt.ylabel('Distance')\n",
203 "plt.show()"
204 ]
205 },
206 {
207 "cell_type": "code",
208 "execution_count": null,
209 "metadata": {},
210 "outputs": [],
211 "source": [
212 "# Agglomerative Clustering\n",
213 "agg = AgglomerativeClustering(n_clusters=4)\n",
214 "y_agg = agg.fit_predict(X)\n",
215 "\n",
216 "plt.figure(figsize=(10, 6))\n",
217 "plt.scatter(X[:, 0], X[:, 1], c=y_agg, cmap='viridis', alpha=0.6, edgecolors='black')\n",
218 "plt.title('Agglomerative Clustering')\n",
219 "plt.xlabel('Feature 1')\n",
220 "plt.ylabel('Feature 2')\n",
221 "plt.grid(True, alpha=0.3)\n",
222 "plt.show()\n",
223 "\n",
224 "print(f\"Silhouette Score: {silhouette_score(X, y_agg):.4f}\")"
225 ]
226 },
227 {
228 "cell_type": "markdown",
229 "metadata": {},
230 "source": [
231 "## 5. 실제 데이터 예제"
232 ]
233 },
234 {
235 "cell_type": "code",
236 "execution_count": null,
237 "metadata": {},
238 "outputs": [],
239 "source": [
240 "from sklearn.datasets import load_iris\n",
241 "\n",
242 "# Iris ๋ฐ์ดํฐ ํด๋ฌ์คํฐ๋ง\n",
243 "iris = load_iris()\n",
244 "X_iris = iris.data\n",
245 "\n",
246 "# ์ค์ผ์ผ๋ง\n",
247 "scaler = StandardScaler()\n",
248 "X_scaled = scaler.fit_transform(X_iris)\n",
249 "\n",
250 "# K-Means\n",
251 "kmeans_iris = KMeans(n_clusters=3, random_state=42, n_init=10)\n",
252 "y_kmeans_iris = kmeans_iris.fit_predict(X_scaled)\n",
253 "\n",
254 "# ๊ฒฐ๊ณผ ๋น๊ต\n",
255 "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
256 "\n",
257 "# ์ค์ ๋ ์ด๋ธ\n",
258 "axes[0].scatter(X_iris[:, 0], X_iris[:, 1], c=iris.target, cmap='viridis', \n",
259 " alpha=0.6, edgecolors='black')\n",
260 "axes[0].set_title('True Labels')\n",
261 "axes[0].set_xlabel('Sepal Length')\n",
262 "axes[0].set_ylabel('Sepal Width')\n",
263 "\n",
264 "# K-Means ๊ฒฐ๊ณผ\n",
265 "axes[1].scatter(X_iris[:, 0], X_iris[:, 1], c=y_kmeans_iris, cmap='viridis', \n",
266 " alpha=0.6, edgecolors='black')\n",
267 "axes[1].set_title('K-Means Clustering')\n",
268 "axes[1].set_xlabel('Sepal Length')\n",
269 "axes[1].set_ylabel('Sepal Width')\n",
270 "\n",
271 "plt.tight_layout()\n",
272 "plt.show()\n",
273 "\n",
274 "print(f\"Silhouette Score: {silhouette_score(X_scaled, y_kmeans_iris):.4f}\")"
275 ]
276 },
277 {
278 "cell_type": "markdown",
279 "metadata": {},
280 "source": [
281 "## 정리\n",
282 "\n",
283 "### 알고리즘 비교\n",
284 "\n",
285 "| 알고리즘 | 장점 | 단점 |\n",
286 "|---------|------|------|\n",
287 "| K-Means | 빠름, 간단 | K 지정 필요, 구형 클러스터 가정 |\n",
288 "| DBSCAN | K 불필요, 비구형 가능, 노이즈 처리 | eps, min_samples 설정 |\n",
289 "| Hierarchical | 덴드로그램, 다양한 K | 대규모 데이터에 느림 |\n",
290 "\n",
291 "### 평가 지표\n",
292 "- **Silhouette Score**: -1~1, 높을수록 좋음\n",
293 "- **Inertia**: 클러스터 내 분산, 낮을수록 좋음\n",
294 "- **Calinski-Harabasz**: 클러스터 간/내 분산 비율"
295 ]
296 }
297 ],
298 "metadata": {
299 "kernelspec": {
300 "display_name": "Python 3",
301 "language": "python",
302 "name": "python3"
303 },
304 "language_info": {
305 "name": "python",
306 "version": "3.10.0"
307 }
308 },
309 "nbformat": 4,
310 "nbformat_minor": 4
311}