02_logistic_regression.ipynb

  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 02. 로지스틱 회귀 (Logistic Regression)\n",
  8    "\n",
  9    "## 학습 목표\n",
 10    "- 로지스틱 회귀의 원리 이해 (이진 분류 및 다중 클래스 분류)\n",
 11    "- 시그모이드 함수와 확률 해석\n",
 12    "- 분류 평가 지표 (정확도, 정밀도, 재현율, F1, ROC-AUC)"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "import numpy as np\n",
 22    "import pandas as pd\n",
 23    "import matplotlib.pyplot as plt\n",
 24    "from sklearn.linear_model import LogisticRegression\n",
 25    "from sklearn.model_selection import train_test_split\n",
 26    "from sklearn.metrics import (accuracy_score, precision_score, recall_score, \n",
 27    "                             f1_score, confusion_matrix, classification_report,\n",
 28    "                             roc_curve, roc_auc_score)\n",
 29    "from sklearn.datasets import make_classification, load_iris\n",
 30    "import seaborn as sns\n",
 31    "\n",
 32    "plt.rc('font', family='DejaVu Sans')  # render all figure text in DejaVu Sans"
 33   ]
 34  },
 35  {
 36   "cell_type": "markdown",
 37   "metadata": {},
 38   "source": [
 39    "## 1. 시그모이드 함수 이해"
 40   ]
 41  },
 42  {
 43   "cell_type": "code",
 44   "execution_count": null,
 45   "metadata": {},
 46   "outputs": [],
 47   "source": [
 48    "def sigmoid(z):\n",
 49    "    \"\"\"Return the logistic sigmoid 1 / (1 + e^(-z)) of a scalar or NumPy array z.\"\"\"\n",
 50    "    # Same value as 1 / (1 + np.exp(-z)), but logaddexp cannot overflow for very negative z\n",
 51    "    return np.exp(-np.logaddexp(0, -z))\n",
 52    "\n",
 53    "# Plot the sigmoid on [-10, 10]; dashed guides mark z = 0 and σ(z) = 0.5\n",
 54    "z = np.linspace(-10, 10, 100)\n",
 55    "fig, ax = plt.subplots(figsize=(10, 5))\n",
 56    "ax.plot(z, sigmoid(z), linewidth=2)\n",
 57    "ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.5)\n",
 58    "ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)\n",
 59    "ax.set(xlabel='z', ylabel='σ(z)', title='Sigmoid Function: σ(z) = 1 / (1 + e^(-z))')\n",
 60    "ax.grid(True, alpha=0.3)\n",
 61    "plt.show()"
 62   ]
 63  },
 64  {
 65   "cell_type": "markdown",
 66   "metadata": {},
 67   "source": [
 68    "## 2. 이진 분류 데이터 생성"
 69   ]
 70  },
 71  {
 72   "cell_type": "code",
 73   "execution_count": null,
 74   "metadata": {},
 75   "outputs": [],
 76   "source": [
 77    "# Synthetic binary dataset: two features, both informative, fixed seed\n",
 78    "X, y = make_classification(\n",
 79    "    n_samples=500,\n",
 80    "    n_features=2,\n",
 81    "    n_informative=2,\n",
 82    "    n_redundant=0,\n",
 83    "    n_clusters_per_class=1,\n",
 84    "    random_state=42,\n",
 85    ")\n",
 86    "\n",
 87    "print(f\"X shape: {X.shape}\")\n",
 88    "print(f\"y classes: {np.unique(y)}\")\n",
 89    "print(f\"Class distribution: {np.bincount(y)}\")"
 90   ]
 91  },
 92  {
 93   "cell_type": "code",
 94   "execution_count": null,
 95   "metadata": {},
 96   "outputs": [],
 97   "source": [
 98    "# Scatter the samples, one colour per class\n",
 99    "plt.figure(figsize=(10, 6))\n",
100    "for cls, colour in ((0, 'blue'), (1, 'red')):\n",
101    "    plt.scatter(X[y == cls, 0], X[y == cls, 1], c=colour, label=f'Class {cls}', alpha=0.6, edgecolors='black')\n",
102    "plt.xlabel('Feature 1')\n",
103    "plt.ylabel('Feature 2')\n",
104    "plt.title('Binary Classification Data')\n",
105    "plt.legend()\n",
106    "plt.grid(True, alpha=0.3)\n",
107    "plt.show()"
108   ]
109  },
110  {
111   "cell_type": "markdown",
112   "metadata": {},
113   "source": [
114    "## 3. 모델 학습"
115   ]
116  },
117  {
118   "cell_type": "code",
119   "execution_count": null,
120   "metadata": {},
121   "outputs": [],
122   "source": [
123    "# Hold out 20% of the samples as a test set;\n",
124    "# the fixed random_state keeps the split reproducible across runs\n",
125    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
126    "\n",
127    "# Logistic regression with scikit-learn's default hyperparameters.\n",
128    "# fit() returns the fitted estimator, so construction and training chain into one call\n",
129    "model = LogisticRegression(random_state=42).fit(X_train, y_train)\n",
130    "\n",
131    "# Learned feature weights and bias of the linear decision function\n",
132    "print(f\"Coefficients: {model.coef_}\")\n",
133    "print(f\"Intercept: {model.intercept_}\")"
134   ]
135  },
136  {
137   "cell_type": "markdown",
138   "metadata": {},
139   "source": [
140    "## 4. 예측 및 평가"
141   ]
142  },
143  {
144   "cell_type": "code",
145   "execution_count": null,
146   "metadata": {},
147   "outputs": [],
148   "source": [
149    "# Hard class labels, plus the predicted probability of class 1\n",
150    "y_pred = model.predict(X_test)\n",
151    "y_prob = model.predict_proba(X_test)[:, 1]\n",
152    "\n",
153    "# Label-based metrics share one call signature; ROC-AUC is scored on the probabilities\n",
154    "print(\"=== Classification Metrics ===\")\n",
155    "label_metrics = {'Accuracy': accuracy_score, 'Precision': precision_score,\n",
156    "                 'Recall': recall_score, 'F1 Score': f1_score}\n",
157    "for metric_name, metric_fn in label_metrics.items():\n",
158    "    print(f\"{metric_name}: {metric_fn(y_test, y_pred):.4f}\")\n",
159    "print(f\"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}\")"
160   ]
161  },
162  {
163   "cell_type": "code",
164   "execution_count": null,
165   "metadata": {},
166   "outputs": [],
167   "source": [
168    "# Text report of per-class metrics for the test-set predictions\n",
169    "report_text = classification_report(y_test, y_pred)\n",
170    "print(\"\\n=== Classification Report ===\", report_text, sep=\"\\n\")"
171   ]
172  },
173  {
174   "cell_type": "code",
175   "execution_count": null,
176   "metadata": {},
177   "outputs": [],
178   "source": [
179    "# Confusion matrix: rows are actual classes, columns are predicted classes\n",
180    "cm = confusion_matrix(y_test, y_pred)\n",
181    "tn, fp, fn, tp = cm.ravel()  # row-major unpacking of the 2x2 matrix\n",
182    "\n",
183    "plt.figure(figsize=(8, 6))\n",
184    "ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n",
185    "                 xticklabels=['Predicted 0', 'Predicted 1'],\n",
186    "                 yticklabels=['Actual 0', 'Actual 1'])\n",
187    "ax.set(title='Confusion Matrix', ylabel='Actual', xlabel='Predicted')\n",
188    "plt.show()\n",
189    "\n",
190    "# Report the four cells of the matrix by name\n",
191    "print(f\"\\nTrue Negatives: {tn}\")\n",
192    "print(f\"False Positives: {fp}\")\n",
193    "print(f\"False Negatives: {fn}\")\n",
194    "print(f\"True Positives: {tp}\")"
195   ]
196  },
197  {
198   "cell_type": "markdown",
199   "metadata": {},
200   "source": [
201    "## 5. ROC 곡선"
202   ]
203  },
204  {
205   "cell_type": "code",
206   "execution_count": null,
207   "metadata": {},
208   "outputs": [],
209   "source": [
210    "# ROC curve computed from the class-1 probabilities, with its AUC in the legend\n",
211    "fpr, tpr, thresholds = roc_curve(y_test, y_prob)\n",
212    "auc = roc_auc_score(y_test, y_prob)\n",
213    "\n",
214    "fig, ax = plt.subplots(figsize=(8, 6))\n",
215    "ax.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {auc:.3f})')\n",
216    "ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')  # diagonal reference line\n",
217    "ax.set_xlabel('False Positive Rate')\n",
218    "ax.set_ylabel('True Positive Rate')\n",
219    "ax.set_title('ROC Curve')\n",
220    "ax.legend()\n",
221    "ax.grid(True, alpha=0.3)\n",
222    "plt.show()"
223   ]
224  },
225  {
226   "cell_type": "markdown",
227   "metadata": {},
228   "source": [
229    "## 6. 결정 경계 시각화"
230   ]
231  },
232  {
233   "cell_type": "code",
234   "execution_count": null,
235   "metadata": {},
236   "outputs": [],
237   "source": [
238    "def plot_decision_boundary(model, X, y, title='Decision Boundary'):\n",
239    "    \"\"\"Shade the classes `model` predicts over a 2-D grid and overlay the samples in X.\"\"\"\n",
240    "    # 200 x 200 grid spanning each feature's range, padded by 1 on every side\n",
241    "    f1_axis = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200)\n",
242    "    f2_axis = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200)\n",
243    "    xx, yy = np.meshgrid(f1_axis, f2_axis)\n",
244    "    grid_points = np.column_stack([xx.ravel(), yy.ravel()])\n",
245    "    Z = model.predict(grid_points).reshape(xx.shape)\n",
246    "\n",
247    "    plt.figure(figsize=(10, 6))\n",
248    "    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')\n",
249    "    for cls, colour in ((0, 'blue'), (1, 'red')):\n",
250    "        plt.scatter(X[y == cls, 0], X[y == cls, 1], c=colour, label=f'Class {cls}', edgecolors='black')\n",
251    "    plt.xlabel('Feature 1')\n",
252    "    plt.ylabel('Feature 2')\n",
253    "    plt.title(title)\n",
254    "    plt.legend()\n",
255    "    plt.show()\n",
256    "\n",
257    "plot_decision_boundary(model, X_test, y_test, 'Logistic Regression Decision Boundary')"
258   ]
259  },
260  {
261   "cell_type": "markdown",
262   "metadata": {},
263   "source": [
264    "## 7. 다중 클래스 분류 (Iris Dataset)"
265   ]
266  },
267  {
268   "cell_type": "code",
269   "execution_count": null,
270   "metadata": {},
271   "outputs": [],
272   "source": [
273    "# Load the Iris dataset and pull out the feature matrix and target vector\n",
274    "iris = load_iris()\n",
275    "X_iris, y_iris = iris.data, iris.target\n",
276    "\n",
277    "# Feature names, class names, and the shape of the feature matrix\n",
278    "print(f\"Features: {iris.feature_names}\")\n",
279    "print(f\"Classes: {iris.target_names}\")\n",
280    "print(f\"Shape: {X_iris.shape}\")"
281   ]
282  },
283  {
284   "cell_type": "code",
285   "execution_count": null,
286   "metadata": {},
287   "outputs": [],
288   "source": [
289    "# Multiclass logistic regression on Iris: 70/30 split with a fixed seed\n",
290    "X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(\n",
291    "    X_iris, y_iris, test_size=0.3, random_state=42\n",
292    ")\n",
293    "\n",
294    "# `multi_class` is deprecated (scikit-learn 1.5+); default lbfgs is already multinomial for 3+ classes\n",
295    "model_multi = LogisticRegression(max_iter=200, random_state=42).fit(X_train_i, y_train_i)\n",
296    "\n",
297    "y_pred_i = model_multi.predict(X_test_i)\n",
298    "print(f\"Accuracy: {accuracy_score(y_test_i, y_pred_i):.4f}\")\n",
299    "print(\"\\nClassification Report:\")\n",
300    "print(classification_report(y_test_i, y_pred_i, target_names=iris.target_names))"
301   ]
302  },
303  {
304   "cell_type": "code",
305   "execution_count": null,
306   "metadata": {},
307   "outputs": [],
308   "source": [
309    "# Confusion matrix for the Iris predictions, ticks labelled with the class names\n",
310    "cm_multi = confusion_matrix(y_test_i, y_pred_i)\n",
311    "class_names = iris.target_names\n",
312    "\n",
313    "fig, ax = plt.subplots(figsize=(8, 6))\n",
314    "sns.heatmap(cm_multi, annot=True, fmt='d', cmap='Blues',\n",
315    "            xticklabels=class_names, yticklabels=class_names, ax=ax)\n",
316    "ax.set_title('Confusion Matrix - Iris Dataset')\n",
317    "ax.set_ylabel('Actual')\n",
318    "ax.set_xlabel('Predicted')\n",
319    "plt.show()"
320   ]
321  },
322  {
323   "cell_type": "markdown",
324   "metadata": {},
325   "source": [
326    "## 정리\n",
327    "\n",
328    "### 핵심 개념\n",
329    "- **로지스틱 회귀**: 시그모이드 함수로 확률 출력 (0~1)\n",
330    "- **결정 경계**: P(y=1) = 0.5인 지점\n",
331    "- **평가 지표**:\n",
332    "  - Accuracy: 전체 정확도\n",
333    "  - Precision: 양성 예측 중 실제 양성 비율\n",
334    "  - Recall: 실제 양성 중 예측 성공 비율\n",
335    "  - F1 Score: Precision과 Recall의 조화 평균\n",
336    "\n",
337    "### 다음 단계\n",
338    "- 규제 (L1/L2 Regularization)\n",
339    "- 임계값 조정\n",
340    "- 불균형 데이터 처리"
341   ]
342  }
343 ],
344 "metadata": {
345  "kernelspec": {
346   "display_name": "Python 3",
347   "language": "python",
348   "name": "python3"
349  },
350  "language_info": {
351   "name": "python",
352   "version": "3.10.0"
353  }
354 },
355 "nbformat": 4,
356 "nbformat_minor": 4
357}