1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# 01. ์ ํ ํ๊ท (Linear Regression)\n",
8 "\n",
9 "## ํ์ต ๋ชฉํ\n",
10 "- ์ ํ ํ๊ท์ ์๋ฆฌ ์ดํด\n",
11 "- scikit-learn์ผ๋ก ๋ชจ๋ธ ๊ตฌํ\n",
12 "- ๋ชจ๋ธ ํ๊ฐ ์งํ (MSE, Rยฒ) ์ดํด"
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": null,
18 "metadata": {},
19 "outputs": [],
20 "source": [
21 "# Import libraries\n",
22 "import matplotlib.pyplot as plt\n",
23 "import numpy as np\n",
24 "import pandas as pd\n",
25 "from sklearn.datasets import fetch_california_housing, make_regression\n",
26 "from sklearn.linear_model import LinearRegression\n",
27 "from sklearn.metrics import mean_squared_error, r2_score\n",
28 "from sklearn.model_selection import train_test_split\n",
29 "\n",
30 "# Font setup (optional). NOTE(review): DejaVu Sans is assumed not to include\n",
31 "# Hangul glyphs; choose a Korean-capable font if Korean plot labels are added.\n",
32 "plt.rcParams.update({'font.family': 'DejaVu Sans', 'axes.unicode_minus': False})"
33 ]
34 },
35 {
36 "cell_type": "markdown",
37 "metadata": {},
38 "source": [
39 "## 1. 데이터 생성"
40 ]
41 },
42 {
43 "cell_type": "code",
44 "execution_count": null,
45 "metadata": {},
46 "outputs": [],
47 "source": [
48 "# Generate a reproducible synthetic dataset with one feature (global NumPy seed + random_state)\n",
49 "np.random.seed(42)\n",
50 "X, y = make_regression(\n",
51 "    n_samples=100, n_features=1, noise=15, random_state=42\n",
52 ")\n",
53 "print(f\"X shape: {X.shape}\", f\"y shape: {y.shape}\", sep=\"\\n\")"
54 ]
55 },
56 {
57 "cell_type": "code",
58 "execution_count": null,
59 "metadata": {},
60 "outputs": [],
61 "source": [
62 "# Scatter plot of the generated samples\n",
63 "fig, ax = plt.subplots(figsize=(10, 6))\n",
64 "ax.scatter(X, y, alpha=0.7, edgecolors='black')\n",
65 "ax.set_xlabel('Feature (X)')\n",
66 "ax.set_ylabel('Target (y)')\n",
67 "ax.set_title('Linear Regression Data')\n",
68 "ax.grid(True, alpha=0.3)\n",
69 "plt.show()"
70 ]
71 },
72 {
73 "cell_type": "markdown",
74 "metadata": {},
75 "source": [
76 "## 2. 데이터 분할"
77 ]
78 },
79 {
80 "cell_type": "code",
81 "execution_count": null,
82 "metadata": {},
83 "outputs": [],
84 "source": [
85 "# Hold out 20% of the samples as a test set (80:20 train/test split)\n",
86 "X_train, X_test, y_train, y_test = train_test_split(\n",
87 "    X, y, test_size=0.2, random_state=42\n",
88 ")\n",
89 "\n",
90 "print(f\"Train set: {len(X_train)} samples\")\n",
91 "print(f\"Test set: {len(X_test)} samples\")"
92 ]
93 },
94 {
95 "cell_type": "markdown",
96 "metadata": {},
97 "source": [
98 "## 3. 모델 학습"
99 ]
100 },
101 {
102 "cell_type": "code",
103 "execution_count": null,
104 "metadata": {},
105 "outputs": [],
106 "source": [
107 "# Create the linear regression model and fit it on the training split\n",
108 "model = LinearRegression()\n",
109 "model.fit(X_train, y_train)\n",
110 "\n",
111 "# Report the learned slope and intercept (coef_ holds one entry per feature)\n",
112 "print(f\"기울기 (Coefficient): {model.coef_[0]:.4f}\")\n",
113 "print(f\"절편 (Intercept): {model.intercept_:.4f}\")\n",
114 "print(f\"\\n회귀식: y = {model.coef_[0]:.4f} * x + {model.intercept_:.4f}\")"
115 ]
116 },
117 {
118 "cell_type": "markdown",
119 "metadata": {},
120 "source": [
121 "## 4. 예측 및 평가"
122 ]
123 },
124 {
125 "cell_type": "code",
126 "execution_count": null,
127 "metadata": {},
128 "outputs": [],
129 "source": [
130 "# Predict on the held-out test split\n",
131 "y_pred = model.predict(X_test)\n",
132 "\n",
133 "# Evaluation metrics: MSE, its square root (RMSE), and R²\n",
134 "mse = mean_squared_error(y_test, y_pred)\n",
135 "rmse = np.sqrt(mse)\n",
136 "r2 = r2_score(y_test, y_pred)\n",
137 "\n",
138 "print(\"=== 모델 평가 ===\")\n",
139 "print(f\"MSE (Mean Squared Error): {mse:.4f}\")\n",
140 "print(f\"RMSE (Root MSE): {rmse:.4f}\")\n",
141 "print(f\"R² Score: {r2:.4f}\")"
142 ]
143 },
144 {
145 "cell_type": "code",
146 "execution_count": null,
147 "metadata": {},
148 "outputs": [],
149 "source": [
150 "# Visualize the test-set fit: regression line (left) and residuals (right)\n",
151 "fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))\n",
152 "fit_ax, resid_ax = axes\n",
153 "\n",
154 "# Left panel: actual test points with the model's predictions drawn as a line\n",
155 "fit_ax.scatter(X_test, y_test, alpha=0.7, label='Actual', edgecolors='black')\n",
156 "fit_ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')\n",
157 "fit_ax.set(\n",
158 "    xlabel='Feature (X)', ylabel='Target (y)', title='Linear Regression - Test Data'\n",
159 ")\n",
160 "fit_ax.legend()\n",
161 "fit_ax.grid(True, alpha=0.3)\n",
162 "\n",
163 "# Right panel: residuals (actual minus predicted) against the predictions\n",
164 "residuals = y_test - y_pred\n",
165 "resid_ax.scatter(y_pred, residuals, alpha=0.7, edgecolors='black')\n",
166 "resid_ax.axhline(y=0, color='red', linestyle='--')\n",
167 "resid_ax.set(xlabel='Predicted Values', ylabel='Residuals', title='Residual Plot')\n",
168 "resid_ax.grid(True, alpha=0.3)\n",
169 "\n",
170 "fig.tight_layout()\n",
171 "plt.show()"
173 ]
174 },
175 {
176 "cell_type": "markdown",
177 "metadata": {},
178 "source": [
179 "## 5. 다중 선형 회귀"
180 ]
181 },
182 {
183 "cell_type": "code",
184 "execution_count": null,
185 "metadata": {},
186 "outputs": [],
187 "source": [
188 "# Generate a synthetic regression dataset with three features\n",
189 "X_multi, y_multi = make_regression(\n",
190 "    n_samples=200, n_features=3, noise=10, random_state=42\n",
191 ")\n",
192 "\n",
193 "# Wrap the features in a DataFrame and append the target as the last column\n",
194 "df = pd.DataFrame(\n",
195 "    X_multi, columns=[f'Feature_{i}' for i in (1, 2, 3)]\n",
196 ").assign(Target=y_multi)\n",
197 "print(df.head())\n",
198 "print(f\"\\nShape: {df.shape}\")"
201 ]
202 },
203 {
204 "cell_type": "code",
205 "execution_count": null,
206 "metadata": {},
207 "outputs": [],
208 "source": [
209 "# Train a multiple linear regression model on an 80:20 train/test split\n",
210 "X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(\n",
211 "    X_multi, y_multi, test_size=0.2, random_state=42\n",
212 ")\n",
213 "\n",
214 "model_multi = LinearRegression()\n",
215 "model_multi.fit(X_train_m, y_train_m)\n",
216 "\n",
217 "# Report the fitted parameters, then the R² on the test split\n",
218 "print(\"=== 다중 선형 회귀 ===\")\n",
219 "print(f\"Coefficients: {model_multi.coef_}\")\n",
220 "print(f\"Intercept: {model_multi.intercept_:.4f}\")\n",
221 "\n",
222 "y_pred_m = model_multi.predict(X_test_m)\n",
223 "print(f\"\\nR² Score: {r2_score(y_test_m, y_pred_m):.4f}\")"
224 ]
225 },
226 {
227 "cell_type": "markdown",
228 "metadata": {},
229 "source": [
230 "## 6. ์ค์ ๋ฐ์ดํฐ ์์ (Boston Housing ๋์ฒด)\n",
231 "\n",
232 "sklearn์ Boston Housing ๋ฐ์ดํฐ์
์ deprecated๋์์ผ๋ฏ๋ก California Housing์ ์ฌ์ฉํฉ๋๋ค."
233 ]
234 },
235 {
236 "cell_type": "code",
237 "execution_count": null,
238 "metadata": {},
239 "outputs": [],
240 "source": [
241 "# Load the California Housing dataset. scikit-learn downloads it on first use\n",
242 "# and caches it locally, so the first run needs network access.\n",
243 "housing = fetch_california_housing()\n",
244 "X_housing = housing.data\n",
245 "y_housing = housing.target\n",
246 "\n",
247 "# Summarize the feature names, the feature matrix shape, and the target\n",
248 "print(f\"Features: {housing.feature_names}\")\n",
249 "print(f\"Shape: {X_housing.shape}\")\n",
250 "print(\"Target: Median house value (in $100,000s)\")"
251 ]
252 },
253 {
254 "cell_type": "code",
255 "execution_count": null,
256 "metadata": {},
257 "outputs": [],
258 "source": [
259 "# Split the housing data 80:20, fit the model, and score it on the test split\n",
260 "X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(\n",
261 "    X_housing, y_housing, test_size=0.2, random_state=42\n",
262 ")\n",
263 "\n",
264 "model_housing = LinearRegression()\n",
265 "model_housing.fit(X_train_h, y_train_h)\n",
266 "\n",
267 "y_pred_h = model_housing.predict(X_test_h)\n",
268 "\n",
269 "print(\"=== California Housing 회귀 결과 ===\")\n",
270 "print(f\"R² Score: {r2_score(y_test_h, y_pred_h):.4f}\")\n",
271 "print(f\"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}\")"
272 ]
273 },
274 {
275 "cell_type": "code",
276 "execution_count": null,
277 "metadata": {},
278 "outputs": [],
279 "source": [
280 "# Plot the fitted coefficients ordered by absolute magnitude.\n",
281 "# NOTE(review): the model is fit on unscaled features, so magnitudes may not be\n",
282 "# comparable across features as an importance ranking - confirm before relying on it.\n",
283 "importance = pd.DataFrame(\n",
284 "    {'Feature': housing.feature_names, 'Coefficient': model_housing.coef_}\n",
285 ").sort_values('Coefficient', key=abs, ascending=True)\n",
286 "\n",
287 "fig, ax = plt.subplots(figsize=(10, 6))\n",
288 "ax.barh(importance['Feature'], importance['Coefficient'])\n",
289 "ax.set(xlabel='Coefficient', title='Feature Coefficients - California Housing')\n",
290 "ax.grid(True, alpha=0.3)\n",
291 "fig.tight_layout()\n",
292 "plt.show()"
293 ]
294 },
295 {
296 "cell_type": "markdown",
297 "metadata": {},
298 "source": [
299 "## 정리\n",
300 "\n",
301 "### 핵심 개념\n",
302 "- **선형 회귀**: y = wx + b 형태의 선형 관계 학습\n",
303 "- **최소제곱법 (OLS)**: `LinearRegression`은 손실 함수(MSE)를 최소화하는 파라미터를 해석적으로 계산 (경사 하강법은 같은 손실을 최소화하는 방향으로 파라미터를 반복 업데이트하는 대안)\n",
304 "- **R² Score**: 모델이 데이터의 분산을 얼마나 설명하는지 (최대 1, 높을수록 좋음; 테스트 데이터에서는 0보다 작을 수도 있음)\n",
305 "\n",
306 "### 다음 단계\n",
307 "- 다항 회귀 (Polynomial Regression)\n",
308 "- 정규화 (Ridge, Lasso)\n",
309 "- 특성 스케일링의 중요성"
310 ]
311 }
312 ],
313 "metadata": {
314 "kernelspec": {
315 "display_name": "Python 3",
316 "language": "python",
317 "name": "python3"
318 },
319 "language_info": {
320 "name": "python",
321 "version": "3.10.0"
322 }
323 },
324 "nbformat": 4,
325 "nbformat_minor": 4
326}