01_linear_regression.ipynb

  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 01. 선형 회귀 (Linear Regression)\n",
  8    "\n",
  9    "## 학습 목표\n",
 10    "- 선형 회귀의 원리 이해\n",
 11    "- scikit-learn으로 모델 구현\n",
 12    "- 모델 평가 지표 (MSE, R²) 이해"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
 21    "# 라이브러리 임포트\n",
 22    "import numpy as np\n",
 23    "import pandas as pd\n",
 24    "import matplotlib.pyplot as plt\n",
 25    "from sklearn.linear_model import LinearRegression\n",
 26    "from sklearn.model_selection import train_test_split\n",
 27    "from sklearn.metrics import mean_squared_error, r2_score\n",
 28    "from sklearn.datasets import make_regression\n",
 29    "\n",
 30    "# 한글 폰트 설정 (선택)\n",
 31    "plt.rcParams['font.family'] = 'DejaVu Sans'\n",
 32    "plt.rcParams['axes.unicode_minus'] = False"
 33   ]
 34  },
 35  {
 36   "cell_type": "markdown",
 37   "metadata": {},
 38   "source": [
 39    "## 1. 데이터 생성"
 40   ]
 41  },
 42  {
 43   "cell_type": "code",
 44   "execution_count": null,
 45   "metadata": {},
 46   "outputs": [],
 47   "source": [
 48    "# 인공 데이터 생성\n",
 49    "np.random.seed(42)\n",
 50    "X, y = make_regression(n_samples=100, n_features=1, noise=15, random_state=42)\n",
 51    "\n",
 52    "print(f\"X shape: {X.shape}\")\n",
 53    "print(f\"y shape: {y.shape}\")"
 54   ]
 55  },
 56  {
 57   "cell_type": "code",
 58   "execution_count": null,
 59   "metadata": {},
 60   "outputs": [],
 61   "source": [
 62    "# 데이터 시각화\n",
 63    "plt.figure(figsize=(10, 6))\n",
 64    "plt.scatter(X, y, alpha=0.7, edgecolors='black')\n",
 65    "plt.xlabel('Feature (X)')\n",
 66    "plt.ylabel('Target (y)')\n",
 67    "plt.title('Linear Regression Data')\n",
 68    "plt.grid(True, alpha=0.3)\n",
 69    "plt.show()"
 70   ]
 71  },
 72  {
 73   "cell_type": "markdown",
 74   "metadata": {},
 75   "source": [
 76    "## 2. 데이터 분할"
 77   ]
 78  },
 79  {
 80   "cell_type": "code",
 81   "execution_count": null,
 82   "metadata": {},
 83   "outputs": [],
 84   "source": [
 85    "# 훈련/테스트 데이터 분할 (80:20)\n",
 86    "X_train, X_test, y_train, y_test = train_test_split(\n",
 87    "    X, y, test_size=0.2, random_state=42\n",
 88    ")\n",
 89    "\n",
 90    "print(f\"Train set: {X_train.shape[0]} samples\")\n",
 91    "print(f\"Test set: {X_test.shape[0]} samples\")"
 92   ]
 93  },
 94  {
 95   "cell_type": "markdown",
 96   "metadata": {},
 97   "source": [
 98    "## 3. 모델 학습"
 99   ]
100  },
101  {
102   "cell_type": "code",
103   "execution_count": null,
104   "metadata": {},
105   "outputs": [],
106   "source": [
107    "# 선형 회귀 모델 생성 및 학습\n",
108    "model = LinearRegression()\n",
109    "model.fit(X_train, y_train)\n",
110    "\n",
111    "# 모델 파라미터 확인\n",
112    "print(f\"기울기 (Coefficient): {model.coef_[0]:.4f}\")\n",
113    "print(f\"절편 (Intercept): {model.intercept_:.4f}\")\n",
114    "print(f\"\\n회귀식: y = {model.coef_[0]:.4f} * x + {model.intercept_:.4f}\")"
115   ]
116  },
117  {
118   "cell_type": "markdown",
119   "metadata": {},
120   "source": [
121    "## 4. 예측 및 평가"
122   ]
123  },
124  {
125   "cell_type": "code",
126   "execution_count": null,
127   "metadata": {},
128   "outputs": [],
129   "source": [
130    "# 예측\n",
131    "y_pred = model.predict(X_test)\n",
132    "\n",
133    "# 평가 지표\n",
134    "mse = mean_squared_error(y_test, y_pred)\n",
135    "rmse = np.sqrt(mse)\n",
136    "r2 = r2_score(y_test, y_pred)\n",
137    "\n",
138    "print(\"=== 모델 평가 ===\")\n",
139    "print(f\"MSE (Mean Squared Error): {mse:.4f}\")\n",
140    "print(f\"RMSE (Root MSE): {rmse:.4f}\")\n",
141    "print(f\"R² Score: {r2:.4f}\")"
142   ]
143  },
144  {
145   "cell_type": "code",
146   "execution_count": null,
147   "metadata": {},
148   "outputs": [],
149   "source": [
150    "# 결과 시각화\n",
151    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
152    "\n",
153    "# 회귀선\n",
154    "axes[0].scatter(X_test, y_test, alpha=0.7, label='Actual', edgecolors='black')\n",
155    "axes[0].plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')\n",
156    "axes[0].set_xlabel('Feature (X)')\n",
157    "axes[0].set_ylabel('Target (y)')\n",
158    "axes[0].set_title('Linear Regression - Test Data')\n",
159    "axes[0].legend()\n",
160    "axes[0].grid(True, alpha=0.3)\n",
161    "\n",
162    "# 잔차 플롯\n",
163    "residuals = y_test - y_pred\n",
164    "axes[1].scatter(y_pred, residuals, alpha=0.7, edgecolors='black')\n",
165    "axes[1].axhline(y=0, color='red', linestyle='--')\n",
166    "axes[1].set_xlabel('Predicted Values')\n",
167    "axes[1].set_ylabel('Residuals')\n",
168    "axes[1].set_title('Residual Plot')\n",
169    "axes[1].grid(True, alpha=0.3)\n",
170    "\n",
171    "plt.tight_layout()\n",
172    "plt.show()"
173   ]
174  },
175  {
176   "cell_type": "markdown",
177   "metadata": {},
178   "source": [
179    "## 5. 다중 선형 회귀"
180   ]
181  },
182  {
183   "cell_type": "code",
184   "execution_count": null,
185   "metadata": {},
186   "outputs": [],
187   "source": [
188    "# 다중 특성 데이터 생성\n",
189    "X_multi, y_multi = make_regression(\n",
190    "    n_samples=200, \n",
191    "    n_features=3, \n",
192    "    noise=10, \n",
193    "    random_state=42\n",
194    ")\n",
195    "\n",
196    "# DataFrame으로 변환\n",
197    "df = pd.DataFrame(X_multi, columns=['Feature_1', 'Feature_2', 'Feature_3'])\n",
198    "df['Target'] = y_multi\n",
199    "print(df.head())\n",
200    "print(f\"\\nShape: {df.shape}\")"
201   ]
202  },
203  {
204   "cell_type": "code",
205   "execution_count": null,
206   "metadata": {},
207   "outputs": [],
208   "source": [
209    "# 다중 회귀 모델 학습\n",
210    "X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(\n",
211    "    X_multi, y_multi, test_size=0.2, random_state=42\n",
212    ")\n",
213    "\n",
214    "model_multi = LinearRegression()\n",
215    "model_multi.fit(X_train_m, y_train_m)\n",
216    "\n",
217    "# 결과\n",
218    "print(\"=== 다중 선형 회귀 ===\")\n",
219    "print(f\"Coefficients: {model_multi.coef_}\")\n",
220    "print(f\"Intercept: {model_multi.intercept_:.4f}\")\n",
221    "\n",
222    "y_pred_m = model_multi.predict(X_test_m)\n",
223    "print(f\"\\nR² Score: {r2_score(y_test_m, y_pred_m):.4f}\")"
224   ]
225  },
226  {
227   "cell_type": "markdown",
228   "metadata": {},
229   "source": [
230    "## 6. 실제 데이터 예제 (Boston Housing 대체)\n",
231    "\n",
232    "scikit-learn의 Boston Housing 데이터셋(`load_boston`)은 1.0에서 deprecated되고 1.2에서 제거되었으므로 California Housing을 사용합니다."
233   ]
234  },
235  {
236   "cell_type": "code",
237   "execution_count": null,
238   "metadata": {},
239   "outputs": [],
240   "source": [
241    "from sklearn.datasets import fetch_california_housing\n",
242    "\n",
243    "# 데이터 로드\n",
244    "housing = fetch_california_housing()\n",
245    "X_housing = housing.data\n",
246    "y_housing = housing.target\n",
247    "\n",
248    "print(f\"Features: {housing.feature_names}\")\n",
249    "print(f\"Shape: {X_housing.shape}\")\n",
250    "print(f\"Target: Median house value (in $100,000s)\")"
251   ]
252  },
253  {
254   "cell_type": "code",
255   "execution_count": null,
256   "metadata": {},
257   "outputs": [],
258   "source": [
259    "# 데이터 분할 및 학습\n",
260    "X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(\n",
261    "    X_housing, y_housing, test_size=0.2, random_state=42\n",
262    ")\n",
263    "\n",
264    "model_housing = LinearRegression()\n",
265    "model_housing.fit(X_train_h, y_train_h)\n",
266    "\n",
267    "y_pred_h = model_housing.predict(X_test_h)\n",
268    "\n",
269    "print(\"=== California Housing 회귀 결과 ===\")\n",
270    "print(f\"R² Score: {r2_score(y_test_h, y_pred_h):.4f}\")\n",
271    "print(f\"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}\")"
272   ]
273  },
274  {
275   "cell_type": "code",
276   "execution_count": null,
277   "metadata": {},
278   "outputs": [],
279   "source": [
280    "# 특성 중요도 시각화\n",
281    "importance = pd.DataFrame({\n",
282    "    'Feature': housing.feature_names,\n",
283    "    'Coefficient': model_housing.coef_\n",
284    "}).sort_values('Coefficient', key=abs, ascending=True)\n",
285    "\n",
286    "plt.figure(figsize=(10, 6))\n",
287    "plt.barh(importance['Feature'], importance['Coefficient'])\n",
288    "plt.xlabel('Coefficient')\n",
289    "plt.title('Feature Coefficients - California Housing')\n",
290    "plt.grid(True, alpha=0.3)\n",
291    "plt.tight_layout()\n",
292    "plt.show()"
293   ]
294  },
295  {
296   "cell_type": "markdown",
297   "metadata": {},
298   "source": [
299    "## 정리\n",
300    "\n",
301    "### 핵심 개념\n",
302    "- **선형 회귀**: y = wx + b 형태의 선형 관계 학습\n",
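303    "- **최소제곱법 / 경사 하강법**: 손실 함수(MSE)를 최소화하는 파라미터를 찾는 방법. 이 노트북의 `LinearRegression`은 최소제곱 해를 직접 계산하며, 경사 하강법은 같은 손실을 반복적인 파라미터 업데이트로 최소화하는 대안 (예: `SGDRegressor`)\n",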
304    "- **R² Score**: 모델이 데이터의 분산을 얼마나 설명하는지 (최대 1, 높을수록 좋음; 평균으로 예측하는 것보다 나쁘면 음수도 가능)\n",
305    "\n",
306    "### 다음 단계\n",
307    "- 다항 회귀 (Polynomial Regression)\n",
308    "- 정규화 (Ridge, Lasso)\n",
309    "- 특성 스케일링의 중요성"
310   ]
311  }
312 ],
313 "metadata": {
314  "kernelspec": {
315   "display_name": "Python 3",
316   "language": "python",
317   "name": "python3"
318  },
319  "language_info": {
320   "name": "python",
321   "version": "3.10.0"
322  }
323 },
324 "nbformat": 4,
325 "nbformat_minor": 4
326}