# Linear Regression

Overview

Linear regression is the most basic regression algorithm that predicts continuous values. It models the linear relationship between input and output variables.


1. Simple Linear Regression

1.1 Concept

Predict dependent variable (y) using one independent variable (X).

y = β₀ + β₁x + ε

- β₀: intercept
- β₁: slope
- ε: error term

1.2 Implementation

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Generate synthetic data from a known line: y = 4 + 3x plus Gaussian noise ---
np.random.seed(42)
X = 2 * np.random.rand(100, 1)           # 100 samples, one feature in [0, 2)
noise = np.random.randn(100, 1)
y = 4 + 3 * X + noise                    # y = 4 + 3x + noise

# --- Fit an ordinary least-squares model ---
model = LinearRegression()
model.fit(X, y)

# --- Inspect the learned parameters (should be close to 4 and 3) ---
print(f"Intercept (β₀): {model.intercept_[0]:.4f}")
print(f"Slope (β₁): {model.coef_[0][0]:.4f}")

# --- Predict at the two ends of the feature range ---
X_new = np.array([[0], [2]])
y_pred = model.predict(X_new)
print(f"\nPredictions: X=0 → y={y_pred[0][0]:.2f}, X=2 → y={y_pred[1][0]:.2f}")

# --- Plot the samples together with the fitted line ---
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.7, label='Data')
plt.plot(X_new, y_pred, 'r-', linewidth=2, label='Regression line')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Simple Linear Regression')
plt.legend()
plt.show()

1.3 Ordinary Least Squares (OLS)

# OLS: Minimize residual sum of squares (RSS)
# RSS = Σ(yᵢ - ŷᵢ)²

# Analytical solution via the normal equation: (XᵀX) θ = Xᵀ y
# - len(X) instead of the hard-coded 100, so this works for any sample count
# - np.linalg.solve is numerically more stable (and cheaper) than forming
#   the explicit inverse with np.linalg.inv
X_b = np.c_[np.ones((len(X), 1)), X]  # Add bias column
theta_best = np.linalg.solve(X_b.T @ X_b, X_b.T @ y)

print(f"Analytical solution:")
print(f"θ₀ = {theta_best[0][0]:.4f}")
print(f"θ₁ = {theta_best[1][0]:.4f}")

2. Multiple Linear Regression

2.1 Concept

Predict dependent variable using multiple independent variables.

y = β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ + ε

2.2 Implementation

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd  # BUG FIX: pd is used below but was never imported

# Diabetes dataset
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
print(f"Features: {diabetes.feature_names}")
print(f"Data shape: {X.shape}")

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features; fit the scaler on the training split only
# to avoid leaking test-set statistics into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict and evaluate on the held-out split
y_pred = model.predict(X_test_scaled)

print(f"\nMSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")

# Coefficient table, sorted by absolute magnitude (largest influence first)
coefficients = pd.DataFrame({
    'feature': diabetes.feature_names,
    'coefficient': model.coef_
}).sort_values('coefficient', key=abs, ascending=False)
print(f"\nRegression coefficients:")
print(coefficients)

3. Gradient Descent

3.1 Batch Gradient Descent

# Cost function: J(θ) = (1/2m) Σ(h(xᵢ) - yᵢ)²
# Update: θ = θ - α * ∇J(θ)

def batch_gradient_descent(X, y, learning_rate=0.01, n_iterations=1000):
    """Fit linear regression by full-batch gradient descent.

    Parameters
    ----------
    X : ndarray of shape (m, n_features)
        Feature matrix (a bias column is added internally).
    y : ndarray of shape (m, 1)
        Target column vector.
    learning_rate : float
        Step size α applied to each gradient update.
    n_iterations : int
        Number of full-batch update steps.

    Returns
    -------
    theta : ndarray of shape (n_features + 1, 1)
        Learned parameters, bias first.
    cost_history : list of float
        Cost J(θ) after each iteration.
    """
    m = len(y)
    X_b = np.c_[np.ones((m, 1)), X]  # Add bias column
    # Size theta from the design matrix so any feature count works
    # (the original hard-coded shape (2, 1), i.e. exactly one feature).
    theta = np.random.randn(X_b.shape[1], 1)  # Random initialization

    cost_history = []

    for iteration in range(n_iterations):
        # ∇J(θ) = (1/m) Xᵀ (Xθ - y)
        gradients = (1/m) * X_b.T @ (X_b @ theta - y)
        theta = theta - learning_rate * gradients

        # J(θ) = (1/2m) Σ(h(xᵢ) - yᵢ)²
        cost = (1/(2*m)) * np.sum((X_b @ theta - y)**2)
        cost_history.append(cost)

    return theta, cost_history

# Run batch gradient descent on freshly generated data
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

theta, cost_history = batch_gradient_descent(
    X, y, learning_rate=0.1, n_iterations=1000
)

for idx, label in enumerate(['θ₀', 'θ₁']):
    print(f"{label} = {theta[idx][0]:.4f}")

# Plot how the cost drops over the first 100 iterations
plt.figure(figsize=(10, 4))
plt.plot(cost_history[:100])
plt.xlabel('Iteration')
plt.ylabel('Cost')
plt.title('Gradient Descent Convergence')
plt.show()

3.2 Stochastic Gradient Descent (SGD)

from sklearn.linear_model import SGDRegressor

# Split the single-feature data; SGDRegressor expects a 1-D target, hence ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size=0.2)

# Feature scaling is essential for SGD: one step size is shared by all features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Stochastic gradient descent regressor, no regularization penalty
sgd_params = dict(max_iter=1000, tol=1e-3, penalty=None,
                  eta0=0.01, random_state=42)
sgd_reg = SGDRegressor(**sgd_params)
sgd_reg.fit(X_train_scaled, y_train)

print(f"SGD intercept: {sgd_reg.intercept_[0]:.4f}")
print(f"SGD coefficient: {sgd_reg.coef_[0]:.4f}")

3.3 Mini-batch Gradient Descent

def mini_batch_gradient_descent(X, y, batch_size=20, learning_rate=0.01, n_epochs=50):
    """Fit linear regression with mini-batch gradient descent.

    Parameters
    ----------
    X : ndarray of shape (m, n_features)
        Feature matrix (a bias column is added internally).
    y : ndarray of shape (m, 1)
        Target column vector.
    batch_size : int
        Samples per gradient update; the last batch of an epoch may be smaller.
    learning_rate : float
        Step size applied to each update.
    n_epochs : int
        Number of full passes over the shuffled data.

    Returns
    -------
    theta : ndarray of shape (n_features + 1, 1)
        Learned parameters, bias first.
    """
    m = len(y)
    X_b = np.c_[np.ones((m, 1)), X]
    # Size theta from the design matrix so any feature count works
    # (the original hard-coded shape (2, 1), i.e. exactly one feature).
    theta = np.random.randn(X_b.shape[1], 1)

    for epoch in range(n_epochs):
        # Reshuffle every epoch so batches differ between passes
        shuffled_indices = np.random.permutation(m)
        X_b_shuffled = X_b[shuffled_indices]
        y_shuffled = y[shuffled_indices]

        for i in range(0, m, batch_size):
            xi = X_b_shuffled[i:i+batch_size]
            yi = y_shuffled[i:i+batch_size]
            # Normalize by the actual batch length (handles a short final batch)
            gradients = (1/len(yi)) * xi.T @ (xi @ theta - yi)
            theta = theta - learning_rate * gradients

    return theta

# Fit with the defaults (batch_size=20, learning_rate=0.01, n_epochs=50)
theta = mini_batch_gradient_descent(X, y)
print(f"Mini-batch GD result: θ₀={theta[0][0]:.4f}, θ₁={theta[1][0]:.4f}")

4. Regularization

Penalize model complexity to prevent overfitting.

4.1 Ridge Regression (L2 Regularization)

from sklearn.linear_model import Ridge

# Cost function: J(θ) = MSE + α * Σθᵢ²

# Experiment with different alpha values.
# NOTE: alpha=0 makes Ridge equivalent to plain least squares;
# scikit-learn may emit a warning for that setting.
alphas = [0, 0.1, 1, 10, 100]

# (Removed a stray plt.figure(...) call: nothing was ever plotted or shown,
# so it only opened an empty figure.)
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    y_pred = ridge.predict(X_test_scaled)
    # Larger alpha shrinks the coefficients, lowering the absolute-coef sum
    print(f"Alpha={alpha}: R²={r2_score(y_test, y_pred):.4f}, Coef sum={sum(abs(ridge.coef_)):.4f}")

4.2 Lasso Regression (L1 Regularization)

from sklearn.linear_model import Lasso

# Cost function: J(θ) = MSE + α * Σ|θᵢ|
# Feature: Sets some coefficients to zero (feature selection)

# Fit the L1-regularized model on the scaled training data
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)

# Count how many features survived the L1 penalty
non_zero = int(np.sum(lasso.coef_ != 0))
print(f"Number of non-zero coefficients: {non_zero}/{len(lasso.coef_)}")

# Held-out performance
y_pred = lasso.predict(X_test_scaled)
print(f"Lasso R²: {r2_score(y_test, y_pred):.4f}")

4.3 Elastic Net

from sklearn.linear_model import ElasticNet

# Combines L1 and L2
# Cost function: J(θ) = MSE + r*α*Σ|θᵢ| + (1-r)*α*Σθᵢ²/2

# l1_ratio (= r) balances the penalties: 1.0 → pure Lasso, 0.0 → pure Ridge
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train_scaled, y_train)

y_pred = elastic.predict(X_test_scaled)
print(f"Elastic Net R²: {r2_score(y_test, y_pred):.4f}")

4.4 Regularization Comparison

from sklearn.datasets import make_regression

# Generate an under-determined problem (more features than samples),
# where plain least squares overfits badly
X, y = make_regression(n_samples=50, n_features=100, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Compare models
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1),
    'Lasso': Lasso(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5)
}

print("Regularization method comparison:")
for name, model in models.items():
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    # BUG FIX: the old fallback `len(model.coef_)` itself dereferenced coef_
    # in exactly the branch where hasattr said it was absent (AttributeError).
    # Every fitted linear model here exposes coef_, so count directly.
    non_zero = np.sum(model.coef_ != 0)
    print(f"{name:12}: Train R²={train_score:.3f}, Test R²={test_score:.3f}, Non-zero coefs={non_zero}")

5. Polynomial Regression

Model nonlinear relationships using linear regression.

from sklearn.preprocessing import PolynomialFeatures

# Nonlinear data: quadratic ground truth with Gaussian noise
np.random.seed(42)
X = 6 * np.random.rand(100, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(100, 1)

# Expand X into [x, x²] so a *linear* model can fit a curve
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
print(f"Original features: {X.shape}")
print(f"Polynomial features: {X_poly.shape}")
print(f"Feature names: {poly.get_feature_names_out()}")

# Ordinary linear regression on the expanded feature matrix
model = LinearRegression()
model.fit(X_poly, y)

print(f"\nCoefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

# Dense grid, pushed through the same expander, gives a smooth curve to draw
X_plot = np.linspace(-3, 3, 100).reshape(-1, 1)
y_plot = model.predict(poly.transform(X_plot))

plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.7)
plt.plot(X_plot, y_plot, 'r-', linewidth=2)
plt.xlabel('X')
plt.ylabel('y')
plt.title('Polynomial Regression (degree=2)')
plt.show()

6. Regression Evaluation Metrics

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)

# A tiny hand-made example: four targets and four predictions
y_true = np.array([3, -0.5, 2, 7])
y_pred = np.array([2.5, 0.0, 2, 8])

mae = mean_absolute_error(y_true, y_pred)    # MAE: mean of |error|
print(f"MAE: {mae:.4f}")

mse = mean_squared_error(y_true, y_pred)     # MSE: mean of squared error
print(f"MSE: {mse:.4f}")

rmse = np.sqrt(mse)                          # RMSE: back in the units of y
print(f"RMSE: {rmse:.4f}")

r2 = r2_score(y_true, y_pred)                # R²: fraction of variance explained
print(f"R²: {r2:.4f}")

# MAPE: relative (percentage) error
mape = mean_absolute_percentage_error(y_true, y_pred)
print(f"MAPE: {mape:.4f}")

Practice Problems

Problem 1: Simple Linear Regression

Train a linear regression model with the following data and predict the value when X=7.

X = np.array([[1], [2], [3], [4], [5], [6]])
y = np.array([2, 4, 5, 4, 5, 7])

# Solution: fit on the six points, then extrapolate to X=7
model = LinearRegression().fit(X, y)
prediction = model.predict([[7]])
print(f"Prediction when X=7: {prediction[0]:.2f}")
print(f"R²: {model.score(X, y):.4f}")

Problem 2: Ridge vs Lasso

Compare the performance of Ridge and Lasso on diabetes data.

from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

# Solution: standardize (fit on train only), then score each regularizer
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

for estimator_cls, name in [(Ridge, 'Ridge'), (Lasso, 'Lasso')]:
    model = estimator_cls(alpha=1)
    model.fit(X_train_s, y_train)
    print(f"{name} R²: {model.score(X_test_s, y_test):.4f}")

Summary

| Method | Features | When to Use |
|---|---|---|
| Linear Regression | Basic, interpretable | Baseline model |
| Ridge (L2) | Shrinks coefficients, prevents overfitting | Multicollinearity |
| Lasso (L1) | Feature selection, sparse model | Many features |
| Elastic Net | L1+L2 combination | Correlated features |
| Polynomial Regression | Nonlinear relationships | Curved patterns |
Use the lesson navigation links to move to the previous or next lesson.