01_linear_regression.ipynb

Download
json 327 lines 8.7 KB
  1{
  2 "cells": [
  3  {
  4   "cell_type": "markdown",
  5   "metadata": {},
  6   "source": [
  7    "# 01. ์„ ํ˜• ํšŒ๊ท€ (Linear Regression)\n",
  8    "\n",
  9    "## ํ•™์Šต ๋ชฉํ‘œ\n",
 10    "- ์„ ํ˜• ํšŒ๊ท€์˜ ์›๋ฆฌ ์ดํ•ด\n",
 11    "- scikit-learn์œผ๋กœ ๋ชจ๋ธ ๊ตฌํ˜„\n",
 12    "- ๋ชจ๋ธ ํ‰๊ฐ€ ์ง€ํ‘œ (MSE, Rยฒ) ์ดํ•ด"
 13   ]
 14  },
 15  {
 16   "cell_type": "code",
 17   "execution_count": null,
 18   "metadata": {},
 19   "outputs": [],
 20   "source": [
  21    "# 라이브러리 임포트\n",
 22    "import numpy as np\n",
 23    "import pandas as pd\n",
 24    "import matplotlib.pyplot as plt\n",
 25    "from sklearn.linear_model import LinearRegression\n",
 26    "from sklearn.model_selection import train_test_split\n",
 27    "from sklearn.metrics import mean_squared_error, r2_score\n",
 28    "from sklearn.datasets import make_regression\n",
 29    "\n",
 30    "# ํ•œ๊ธ€ ํฐํŠธ ์„ค์ • (์„ ํƒ)\n",
 31    "plt.rcParams['font.family'] = 'DejaVu Sans'\n",
 32    "plt.rcParams['axes.unicode_minus'] = False"
 33   ]
 34  },
 35  {
 36   "cell_type": "markdown",
 37   "metadata": {},
 38   "source": [
  39    "## 1. 데이터 생성"
 40   ]
 41  },
 42  {
 43   "cell_type": "code",
 44   "execution_count": null,
 45   "metadata": {},
 46   "outputs": [],
 47   "source": [
 48    "# ์ธ๊ณต ๋ฐ์ดํ„ฐ ์ƒ์„ฑ\n",
 49    "np.random.seed(42)\n",
 50    "X, y = make_regression(n_samples=100, n_features=1, noise=15, random_state=42)\n",
 51    "\n",
 52    "print(f\"X shape: {X.shape}\")\n",
 53    "print(f\"y shape: {y.shape}\")"
 54   ]
 55  },
 56  {
 57   "cell_type": "code",
 58   "execution_count": null,
 59   "metadata": {},
 60   "outputs": [],
 61   "source": [
 62    "# ๋ฐ์ดํ„ฐ ์‹œ๊ฐํ™”\n",
 63    "plt.figure(figsize=(10, 6))\n",
 64    "plt.scatter(X, y, alpha=0.7, edgecolors='black')\n",
 65    "plt.xlabel('Feature (X)')\n",
 66    "plt.ylabel('Target (y)')\n",
 67    "plt.title('Linear Regression Data')\n",
 68    "plt.grid(True, alpha=0.3)\n",
 69    "plt.show()"
 70   ]
 71  },
 72  {
 73   "cell_type": "markdown",
 74   "metadata": {},
 75   "source": [
  76    "## 2. 데이터 분할"
 77   ]
 78  },
 79  {
 80   "cell_type": "code",
 81   "execution_count": null,
 82   "metadata": {},
 83   "outputs": [],
 84   "source": [
 85    "# ํ›ˆ๋ จ/ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ ๋ถ„ํ•  (80:20)\n",
 86    "X_train, X_test, y_train, y_test = train_test_split(\n",
 87    "    X, y, test_size=0.2, random_state=42\n",
 88    ")\n",
 89    "\n",
 90    "print(f\"Train set: {X_train.shape[0]} samples\")\n",
 91    "print(f\"Test set: {X_test.shape[0]} samples\")"
 92   ]
 93  },
 94  {
 95   "cell_type": "markdown",
 96   "metadata": {},
 97   "source": [
  98    "## 3. 모델 학습"
 99   ]
100  },
101  {
102   "cell_type": "code",
103   "execution_count": null,
104   "metadata": {},
105   "outputs": [],
106   "source": [
107    "# ์„ ํ˜• ํšŒ๊ท€ ๋ชจ๋ธ ์ƒ์„ฑ ๋ฐ ํ•™์Šต\n",
108    "model = LinearRegression()\n",
109    "model.fit(X_train, y_train)\n",
110    "\n",
111    "# ๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ ํ™•์ธ\n",
112    "print(f\"๊ธฐ์šธ๊ธฐ (Coefficient): {model.coef_[0]:.4f}\")\n",
113    "print(f\"์ ˆํŽธ (Intercept): {model.intercept_:.4f}\")\n",
114    "print(f\"\\nํšŒ๊ท€์‹: y = {model.coef_[0]:.4f} * x + {model.intercept_:.4f}\")"
115   ]
116  },
117  {
118   "cell_type": "markdown",
119   "metadata": {},
120   "source": [
 121    "## 4. 예측 및 평가"
122   ]
123  },
124  {
125   "cell_type": "code",
126   "execution_count": null,
127   "metadata": {},
128   "outputs": [],
129   "source": [
130    "# ์˜ˆ์ธก\n",
131    "y_pred = model.predict(X_test)\n",
132    "\n",
133    "# ํ‰๊ฐ€ ์ง€ํ‘œ\n",
134    "mse = mean_squared_error(y_test, y_pred)\n",
135    "rmse = np.sqrt(mse)\n",
136    "r2 = r2_score(y_test, y_pred)\n",
137    "\n",
138    "print(\"=== ๋ชจ๋ธ ํ‰๊ฐ€ ===\")\n",
139    "print(f\"MSE (Mean Squared Error): {mse:.4f}\")\n",
140    "print(f\"RMSE (Root MSE): {rmse:.4f}\")\n",
141    "print(f\"Rยฒ Score: {r2:.4f}\")"
142   ]
143  },
144  {
145   "cell_type": "code",
146   "execution_count": null,
147   "metadata": {},
148   "outputs": [],
149   "source": [
150    "# ๊ฒฐ๊ณผ ์‹œ๊ฐํ™”\n",
151    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
152    "\n",
153    "# ํšŒ๊ท€์„ \n",
154    "axes[0].scatter(X_test, y_test, alpha=0.7, label='Actual', edgecolors='black')\n",
155    "axes[0].plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')\n",
156    "axes[0].set_xlabel('Feature (X)')\n",
157    "axes[0].set_ylabel('Target (y)')\n",
158    "axes[0].set_title('Linear Regression - Test Data')\n",
159    "axes[0].legend()\n",
160    "axes[0].grid(True, alpha=0.3)\n",
161    "\n",
162    "# ์ž”์ฐจ ํ”Œ๋กฏ\n",
163    "residuals = y_test - y_pred\n",
164    "axes[1].scatter(y_pred, residuals, alpha=0.7, edgecolors='black')\n",
165    "axes[1].axhline(y=0, color='red', linestyle='--')\n",
166    "axes[1].set_xlabel('Predicted Values')\n",
167    "axes[1].set_ylabel('Residuals')\n",
168    "axes[1].set_title('Residual Plot')\n",
169    "axes[1].grid(True, alpha=0.3)\n",
170    "\n",
171    "plt.tight_layout()\n",
172    "plt.show()"
173   ]
174  },
175  {
176   "cell_type": "markdown",
177   "metadata": {},
178   "source": [
 179    "## 5. 다중 선형 회귀"
180   ]
181  },
182  {
183   "cell_type": "code",
184   "execution_count": null,
185   "metadata": {},
186   "outputs": [],
187   "source": [
188    "# ๋‹ค์ค‘ ํŠน์„ฑ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ\n",
189    "X_multi, y_multi = make_regression(\n",
190    "    n_samples=200, \n",
191    "    n_features=3, \n",
192    "    noise=10, \n",
193    "    random_state=42\n",
194    ")\n",
195    "\n",
196    "# DataFrame์œผ๋กœ ๋ณ€ํ™˜\n",
197    "df = pd.DataFrame(X_multi, columns=['Feature_1', 'Feature_2', 'Feature_3'])\n",
198    "df['Target'] = y_multi\n",
199    "print(df.head())\n",
200    "print(f\"\\nShape: {df.shape}\")"
201   ]
202  },
203  {
204   "cell_type": "code",
205   "execution_count": null,
206   "metadata": {},
207   "outputs": [],
208   "source": [
209    "# ๋‹ค์ค‘ ํšŒ๊ท€ ๋ชจ๋ธ ํ•™์Šต\n",
210    "X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(\n",
211    "    X_multi, y_multi, test_size=0.2, random_state=42\n",
212    ")\n",
213    "\n",
214    "model_multi = LinearRegression()\n",
215    "model_multi.fit(X_train_m, y_train_m)\n",
216    "\n",
217    "# ๊ฒฐ๊ณผ\n",
218    "print(\"=== ๋‹ค์ค‘ ์„ ํ˜• ํšŒ๊ท€ ===\")\n",
219    "print(f\"Coefficients: {model_multi.coef_}\")\n",
220    "print(f\"Intercept: {model_multi.intercept_:.4f}\")\n",
221    "\n",
222    "y_pred_m = model_multi.predict(X_test_m)\n",
223    "print(f\"\\nRยฒ Score: {r2_score(y_test_m, y_pred_m):.4f}\")"
224   ]
225  },
226  {
227   "cell_type": "markdown",
228   "metadata": {},
229   "source": [
230    "## 6. ์‹ค์ œ ๋ฐ์ดํ„ฐ ์˜ˆ์ œ (Boston Housing ๋Œ€์ฒด)\n",
231    "\n",
232    "sklearn์˜ Boston Housing ๋ฐ์ดํ„ฐ์…‹์€ deprecated๋˜์—ˆ์œผ๋ฏ€๋กœ California Housing์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค."
233   ]
234  },
235  {
236   "cell_type": "code",
237   "execution_count": null,
238   "metadata": {},
239   "outputs": [],
240   "source": [
241    "from sklearn.datasets import fetch_california_housing\n",
242    "\n",
243    "# ๋ฐ์ดํ„ฐ ๋กœ๋“œ\n",
244    "housing = fetch_california_housing()\n",
245    "X_housing = housing.data\n",
246    "y_housing = housing.target\n",
247    "\n",
248    "print(f\"Features: {housing.feature_names}\")\n",
249    "print(f\"Shape: {X_housing.shape}\")\n",
250    "print(f\"Target: Median house value (in $100,000s)\")"
251   ]
252  },
253  {
254   "cell_type": "code",
255   "execution_count": null,
256   "metadata": {},
257   "outputs": [],
258   "source": [
259    "# ๋ฐ์ดํ„ฐ ๋ถ„ํ•  ๋ฐ ํ•™์Šต\n",
260    "X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(\n",
261    "    X_housing, y_housing, test_size=0.2, random_state=42\n",
262    ")\n",
263    "\n",
264    "model_housing = LinearRegression()\n",
265    "model_housing.fit(X_train_h, y_train_h)\n",
266    "\n",
267    "y_pred_h = model_housing.predict(X_test_h)\n",
268    "\n",
269    "print(\"=== California Housing ํšŒ๊ท€ ๊ฒฐ๊ณผ ===\")\n",
270    "print(f\"Rยฒ Score: {r2_score(y_test_h, y_pred_h):.4f}\")\n",
271    "print(f\"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}\")"
272   ]
273  },
274  {
275   "cell_type": "code",
276   "execution_count": null,
277   "metadata": {},
278   "outputs": [],
279   "source": [
280    "# ํŠน์„ฑ ์ค‘์š”๋„ ์‹œ๊ฐํ™”\n",
281    "importance = pd.DataFrame({\n",
282    "    'Feature': housing.feature_names,\n",
283    "    'Coefficient': model_housing.coef_\n",
284    "}).sort_values('Coefficient', key=abs, ascending=True)\n",
285    "\n",
286    "plt.figure(figsize=(10, 6))\n",
287    "plt.barh(importance['Feature'], importance['Coefficient'])\n",
288    "plt.xlabel('Coefficient')\n",
289    "plt.title('Feature Coefficients - California Housing')\n",
290    "plt.grid(True, alpha=0.3)\n",
291    "plt.tight_layout()\n",
292    "plt.show()"
293   ]
294  },
295  {
296   "cell_type": "markdown",
297   "metadata": {},
298   "source": [
 299    "## 정리\n",
 300    "\n",
 301    "### 핵심 개념\n",
 302    "- **선형 회귀**: y = wx + b 형태의 선형 관계 학습\n",
 303    "- **경사 하강법**: 손실 함수(MSE)를 최소화하는 방향으로 파라미터 업데이트 (참고: 이 노트북의 `LinearRegression`은 경사 하강법이 아니라 최소제곱법으로 해를 직접 구함)\n",
 304    "- **R² Score**: 모델이 데이터의 분산을 얼마나 설명하는지 (보통 0~1, 높을수록 좋음; 평균 예측보다 나쁘면 음수도 가능)\n",
305    "\n",
 306    "### 다음 단계\n",
 307    "- 다항 회귀 (Polynomial Regression)\n",
 308    "- 정규화 (Ridge, Lasso)\n",
 309    "- 특성 스케일링의 중요성"
310   ]
311  }
312 ],
313 "metadata": {
314  "kernelspec": {
315   "display_name": "Python 3",
316   "language": "python",
317   "name": "python3"
318  },
319  "language_info": {
320   "name": "python",
321   "version": "3.10.0"
322  }
323 },
324 "nbformat": 4,
325 "nbformat_minor": 4
326}