Logistic Regression
Logistic Regression¶
Overview¶
Despite its name, logistic regression is a classification algorithm. It predicts probabilities for binary and multi-class classification problems.
1. Binary Classification¶
1.1 Sigmoid Function¶
import numpy as np
import matplotlib.pyplot as plt
def sigmoid(z):
return 1 / (1 + np.exp(-z))
# Visualize sigmoid function
z = np.linspace(-10, 10, 100)
plt.figure(figsize=(10, 5))
plt.plot(z, sigmoid(z), 'b-', linewidth=2)
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.5)
plt.axvline(x=0, color='r', linestyle='--', alpha=0.5)
plt.xlabel('z')
plt.ylabel('σ(z)')
plt.title('Sigmoid Function')
plt.grid(True, alpha=0.3)
plt.ylim(-0.1, 1.1)
plt.show()
# Properties:
# - Output range: (0, 1) → interpretable as probability
# - 0.5 when z=0
# - 1 when z → ∞, 0 when z → -∞
1.2 Logistic Regression Model¶
P(y=1|X) = σ(θᵀX) = 1 / (1 + e^(-θᵀX))
Decision boundary:
- P(y=1|X) >= 0.5 → Predict class 1
- P(y=1|X) < 0.5 → Predict class 0
1.3 sklearn Implementation¶
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
# Breast cancer dataset (binary classification)
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
print(f"Classes: {cancer.target_names}")
print(f"Number of features: {X.shape[1]}")
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
# Predict
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)
# Evaluate
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))
# Example prediction probabilities
print(f"\nFirst 5 sample prediction probabilities:")
for i in range(5):
print(f" Sample {i}: {cancer.target_names[0]}={y_proba[i][0]:.3f}, "
f"{cancer.target_names[1]}={y_proba[i][1]:.3f} → Prediction: {cancer.target_names[y_pred[i]]}")
2. Cost Function and Optimization¶
2.1 Log Loss (Binary Cross-Entropy)¶
# Cost function:
# J(θ) = -1/m * Σ[yᵢlog(ŷᵢ) + (1-yᵢ)log(1-ŷᵢ)]
from sklearn.metrics import log_loss
# Example
y_true = [0, 0, 1, 1]
y_proba = [0.1, 0.4, 0.35, 0.8]
loss = log_loss(y_true, y_proba)
print(f"Log Loss: {loss:.4f}")
# Perfect prediction
y_proba_perfect = [0.0, 0.0, 1.0, 1.0]
loss_perfect = log_loss(y_true, y_proba_perfect)
print(f"Perfect prediction Log Loss: {loss_perfect:.4f}")
2.2 Gradient Descent¶
def logistic_regression_gd(X, y, learning_rate=0.1, n_iterations=1000):
m, n = X.shape
X_b = np.c_[np.ones((m, 1)), X] # Add bias
theta = np.zeros(n + 1)
for _ in range(n_iterations):
z = X_b @ theta
h = sigmoid(z)
gradient = (1/m) * X_b.T @ (h - y)
theta = theta - learning_rate * gradient
return theta
# Test
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
n_informative=2, random_state=42)
theta = logistic_regression_gd(X, y)
print(f"Learned coefficients: {theta}")
3. Regularization¶
3.1 L2 Regularization (default)¶
# penalty='l2' (default)
# C = 1/λ (smaller values mean stronger regularization)
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
for C in Cs:
model = LogisticRegression(C=C, max_iter=1000)
model.fit(X_train_scaled, y_train)
train_acc = model.score(X_train_scaled, y_train)
test_acc = model.score(X_test_scaled, y_test)
print(f"C={C:6}: Train={train_acc:.4f}, Test={test_acc:.4f}")
3.2 L1 Regularization (Lasso)¶
# Feature selection effect
model_l1 = LogisticRegression(penalty='l1', solver='saga', C=0.1, max_iter=1000)
model_l1.fit(X_train_scaled, y_train)
# Number of non-zero coefficients
non_zero = np.sum(model_l1.coef_ != 0)
print(f"L1 regularization: non-zero coefficients = {non_zero}/{X.shape[1]}")
print(f"Accuracy: {model_l1.score(X_test_scaled, y_test):.4f}")
3.3 Elastic Net¶
model_en = LogisticRegression(penalty='elasticnet', solver='saga',
l1_ratio=0.5, C=1, max_iter=1000)
model_en.fit(X_train_scaled, y_train)
print(f"Elastic Net accuracy: {model_en.score(X_test_scaled, y_test):.4f}")
4. Multi-class Classification¶
4.1 One-vs-Rest (OvR)¶
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# OvR (default for multi_class='ovr')
model_ovr = LogisticRegression(multi_class='ovr', max_iter=1000)
model_ovr.fit(X_train, y_train)
print(f"OvR accuracy: {model_ovr.score(X_test, y_test):.4f}")
print(f"Coefficient shape: {model_ovr.coef_.shape}") # (3, 4) = num_classes x num_features
4.2 Softmax (Multinomial)¶
# Softmax function: outputs probability for each class
# P(y=k|X) = exp(θₖᵀX) / Σexp(θⱼᵀX)
model_softmax = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_softmax.fit(X_train, y_train)
print(f"Softmax accuracy: {model_softmax.score(X_test, y_test):.4f}")
# Prediction probabilities
y_proba = model_softmax.predict_proba(X_test[:3])
print("\nPrediction probabilities (first 3 samples):")
for i, proba in enumerate(y_proba):
print(f" Sample {i}: {proba} → Prediction: {iris.target_names[np.argmax(proba)]}")
4.3 Comparison¶
from sklearn.model_selection import cross_val_score
models = {
'OvR': LogisticRegression(multi_class='ovr', max_iter=1000),
'Multinomial': LogisticRegression(multi_class='multinomial', max_iter=1000)
}
for name, model in models.items():
scores = cross_val_score(model, X, y, cv=5)
print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
5. Decision Boundary Visualization¶
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# Generate 2D data
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0,
n_informative=2, n_clusters_per_class=1,
random_state=42)
# Train model
model = LogisticRegression()
model.fit(X, y)
# Visualize decision boundary
def plot_decision_boundary(model, X, y):
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='black', cmap='RdYlBu')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Logistic Regression Decision Boundary')
plt.show()
plot_decision_boundary(model, X, y)
# Visualize probability boundary
def plot_probability_boundary(model, X, y):
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, levels=20, alpha=0.8, cmap='RdYlBu')
plt.colorbar(label='P(y=1)')
plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='black', cmap='RdYlBu')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Prediction Probability and Decision Boundary (0.5)')
plt.show()
plot_probability_boundary(model, X, y)
6. Threshold Adjustment¶
from sklearn.metrics import precision_recall_curve, roc_curve
# Prepare data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_s, y_train)
y_proba = model.predict_proba(X_test_s)[:, 1]
# Predict with various thresholds
thresholds = [0.3, 0.5, 0.7]
print("Performance by threshold:")
for thresh in thresholds:
y_pred_thresh = (y_proba >= thresh).astype(int)
from sklearn.metrics import precision_score, recall_score
prec = precision_score(y_test, y_pred_thresh)
rec = recall_score(y_test, y_pred_thresh)
print(f" threshold={thresh}: Precision={prec:.3f}, Recall={rec:.3f}")
# Precision-Recall curve
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_proba)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(thresholds_pr, precision[:-1], 'b-', label='Precision')
plt.plot(thresholds_pr, recall[:-1], 'r-', label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision/Recall vs Threshold')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)
plt.tight_layout()
plt.show()
7. Handling Imbalanced Data¶
from sklearn.datasets import make_classification
# Generate imbalanced data
X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.9, 0.1],
n_features=10, random_state=42)
print(f"Class distribution: {np.bincount(y)}")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Default model
model_default = LogisticRegression(max_iter=1000)
model_default.fit(X_train, y_train)
# class_weight='balanced'
model_balanced = LogisticRegression(class_weight='balanced', max_iter=1000)
model_balanced.fit(X_train, y_train)
# Compare
from sklearn.metrics import classification_report
print("=== Default Model ===")
print(classification_report(y_test, model_default.predict(X_test)))
print("=== class_weight='balanced' ===")
print(classification_report(y_test, model_balanced.predict(X_test)))
Practice Problems¶
Problem 1: Binary Classification¶
Train a logistic regression model on breast cancer data and compute the F1-score.
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, test_size=0.2, random_state=42
)
# Solution
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
Problem 2: Multi-class Classification¶
Perform 3-class classification on Iris data.
from sklearn.datasets import load_iris
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42
)
# Solution
model = LogisticRegression(multi_class='multinomial', max_iter=1000)
model.fit(X_train, y_train)
print(f"Accuracy: {model.score(X_test, y_test):.4f}")
print(f"\nPrediction probability (first sample): {model.predict_proba(X_test[:1])}")
Summary¶
| Concept | Description |
|---|---|
| Sigmoid | Probability output (0~1) |
| Log Loss | Cost function (Binary Cross-Entropy) |
| OvR | Multi-class (One-vs-Rest) |
| Softmax | Multi-class (Multinomial) |
| C | Regularization strength (1/λ) |
| class_weight | Handle imbalanced data |