로지스틱 회귀 (Logistic Regression)
개요
로지스틱 회귀는 이름과 달리 회귀가 아닌 분류 알고리즘입니다. 시그모이드(이진 분류) 또는 소프트맥스(다중 분류) 함수로 각 클래스에 속할 확률을 예측합니다.
1. 이진 분류
1.1 시그모이드 함수
import numpy as np
import matplotlib.pyplot as plt
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# 시그모이드 함수 시각화
z = np.linspace(-10, 10, 100)
plt.figure(figsize=(10, 5))
plt.plot(z, sigmoid(z), 'b-', linewidth=2)
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.5)
plt.axvline(x=0, color='r', linestyle='--', alpha=0.5)
plt.xlabel('z')
plt.ylabel('σ(z)')
plt.title('시그모이드 함수')
plt.grid(True, alpha=0.3)
plt.ylim(-0.1, 1.1)
plt.show()
# 특성:
# - 출력 범위: (0, 1) → 확률로 해석 가능
# - z=0일 때 0.5
# - z → ∞ 일 때 1, z → -∞ 일 때 0
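# 참고로 시그모이드의 도함수는 σ'(z) = σ(z)(1 - σ(z))로 자기 자신으로 표현되어,
# 뒤에서 다룰 경사하강법의 기울기 계산이 간단해집니다. 아래는 이 성질을
# 수치 미분으로 확인하는 간단한 스케치입니다.
z0 = np.array([-2.0, 0.0, 2.0])
eps = 1e-6
numeric = (sigmoid(z0 + eps) - sigmoid(z0 - eps)) / (2 * eps)  # 중앙 차분 수치 미분
analytic = sigmoid(z0) * (1 - sigmoid(z0))                     # 해석적 도함수
print(np.allclose(numeric, analytic, atol=1e-6))  # True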
1.2 로지스틱 회귀 모델
P(y=1|X) = σ(θᵀX) = 1 / (1 + e^(-θᵀX))
결정 경계:
- P(y=1|X) >= 0.5 → 클래스 1 예측
- P(y=1|X) < 0.5 → 클래스 0 예측
σ(z) >= 0.5 ⟺ z >= 0이므로, 결정 경계는 θᵀX = 0을 만족하는 초평면입니다.
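이 결정 규칙을 코드로 옮기면 한 줄입니다. 아래는 설명을 위해 가정한 θ 값으로 확률과 클래스를 구하는 스케치입니다.
# 결정 규칙 스케치 (theta_demo는 설명용 가정값)
theta_demo = np.array([0.5, -1.2, 0.8])   # [절편, 계수1, 계수2] — 가정
x_new = np.array([1.0, 0.3, 2.0])         # 절편 항(1)을 포함한 입력
prob = sigmoid(x_new @ theta_demo)        # P(y=1|X)
pred = int(prob >= 0.5)                   # 0.5 기준 클래스 결정
print(f"P(y=1|X)={prob:.3f} → 클래스 {pred}")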
1.3 sklearn 구현
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
# 유방암 데이터셋 (이진 분류)
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
print(f"클래스: {cancer.target_names}")
print(f"특성 수: {X.shape[1]}")
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 모델 학습
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
# 예측
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)
# 평가
print(f"\n정확도: {accuracy_score(y_test, y_pred):.4f}")
print("\n분류 리포트:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))
# 예측 확률 예시
print(f"\n첫 5개 샘플 예측 확률:")
for i in range(5):
    print(f" 샘플 {i}: {cancer.target_names[0]}={y_proba[i][0]:.3f}, "
          f"{cancer.target_names[1]}={y_proba[i][1]:.3f} → 예측: {cancer.target_names[y_pred[i]]}")
2. 비용 함수와 최적화
2.1 로그 손실 (Log Loss / Binary Cross-Entropy)
# 비용 함수:
# J(θ) = -1/m * Σ[yᵢlog(ŷᵢ) + (1-yᵢ)log(1-ŷᵢ)]
from sklearn.metrics import log_loss
# 예시
y_true = [0, 0, 1, 1]
y_proba = [0.1, 0.4, 0.35, 0.8]
loss = log_loss(y_true, y_proba)
print(f"Log Loss: {loss:.4f}")
# 완벽한 예측 (sklearn은 내부적으로 확률을 클리핑하므로 log(0) 오류 없이 0에 가까운 값이 나옵니다)
y_proba_perfect = [0.0, 0.0, 1.0, 1.0]
loss_perfect = log_loss(y_true, y_proba_perfect)
print(f"완벽한 예측 Log Loss: {loss_perfect:.4f}")
2.2 경사하강법
def logistic_regression_gd(X, y, learning_rate=0.1, n_iterations=1000):
    """경사하강법으로 로지스틱 회귀 파라미터를 학습한다."""
    m, n = X.shape
    X_b = np.c_[np.ones((m, 1)), X]  # bias(절편) 항 추가
    theta = np.zeros(n + 1)
    for _ in range(n_iterations):
        z = X_b @ theta
        h = sigmoid(z)                      # 예측 확률
        gradient = (1/m) * X_b.T @ (h - y)  # 로그 손실의 기울기
        theta = theta - learning_rate * gradient
    return theta
# 테스트
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                           n_informative=2, random_state=42)
theta = logistic_regression_gd(X, y)
print(f"학습된 계수: {theta}")
3. 정규화
3.1 L2 정규화 (기본값)
# penalty='l2' (기본값)
# C = 1/λ (작을수록 강한 정규화)
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
for C in Cs:
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train_scaled, y_train)
    train_acc = model.score(X_train_scaled, y_train)
    test_acc = model.score(X_test_scaled, y_test)
    print(f"C={C:6}: Train={train_acc:.4f}, Test={test_acc:.4f}")
3.2 L1 정규화 (Lasso)
# 특성 선택 효과
model_l1 = LogisticRegression(penalty='l1', solver='saga', C=0.1, max_iter=1000)
model_l1.fit(X_train_scaled, y_train)
# 0이 아닌 계수 수 (X는 위에서 make_classification 데이터로 덮어썼으므로 X_train_scaled 기준으로 계산)
non_zero = np.sum(model_l1.coef_ != 0)
print(f"L1 정규화: 0이 아닌 계수 = {non_zero}/{X_train_scaled.shape[1]}")
print(f"정확도: {model_l1.score(X_test_scaled, y_test):.4f}")
3.3 Elastic Net
model_en = LogisticRegression(penalty='elasticnet', solver='saga',
                              l1_ratio=0.5, C=1, max_iter=1000)
model_en.fit(X_train_scaled, y_train)
print(f"Elastic Net 정확도: {model_en.score(X_test_scaled, y_test):.4f}")
4. 다중 클래스 분류
4.1 One-vs-Rest (OvR)
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# OvR: 클래스마다 '해당 클래스 vs 나머지' 이진 분류기를 하나씩 학습
model_ovr = LogisticRegression(multi_class='ovr', max_iter=1000)
model_ovr.fit(X_train, y_train)
print(f"OvR 정확도: {model_ovr.score(X_test, y_test):.4f}")
print(f"계수 형태: {model_ovr.coef_.shape}") # (3, 4) = 클래스 수 x 특성 수
4.2 Softmax (Multinomial)
# Softmax 함수: 각 클래스 확률 출력
# P(y=k|X) = exp(θₖᵀX) / Σexp(θⱼᵀX)
model_softmax = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_softmax.fit(X_train, y_train)
print(f"Softmax 정확도: {model_softmax.score(X_test, y_test):.4f}")
# 예측 확률
y_proba = model_softmax.predict_proba(X_test[:3])
print("\n예측 확률 (첫 3개 샘플):")
for i, proba in enumerate(y_proba):
    print(f" 샘플 {i}: {proba} → 예측: {iris.target_names[np.argmax(proba)]}")
4.3 비교
from sklearn.model_selection import cross_val_score
models = {
    'OvR': LogisticRegression(multi_class='ovr', max_iter=1000),
    'Multinomial': LogisticRegression(multi_class='multinomial', max_iter=1000)
}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
5. 결정 경계 시각화
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# 2D 데이터 생성
X, y = make_classification(n_samples=200, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=1,
                           random_state=42)
# 모델 학습
model = LogisticRegression()
model.fit(X, y)
# 결정 경계 시각화
def plot_decision_boundary(model, X, y):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='black', cmap='RdYlBu')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('로지스틱 회귀 결정 경계')
    plt.show()
plot_decision_boundary(model, X, y)
# 확률 경계 시각화
def plot_probability_boundary(model, X, y):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, levels=20, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='P(y=1)')
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='black', cmap='RdYlBu')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('예측 확률과 결정 경계 (0.5)')
    plt.show()
plot_probability_boundary(model, X, y)
6. 임계값 조정
from sklearn.metrics import precision_recall_curve, roc_curve
# 데이터 준비
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_s, y_train)
y_proba = model.predict_proba(X_test_s)[:, 1]
# 다양한 임계값으로 예측
thresholds = [0.3, 0.5, 0.7]
print("임계값에 따른 성능:")
from sklearn.metrics import precision_score, recall_score
for thresh in thresholds:
    y_pred_thresh = (y_proba >= thresh).astype(int)  # 임계값을 직접 적용해 분류
    prec = precision_score(y_test, y_pred_thresh)
    rec = recall_score(y_test, y_pred_thresh)
    print(f" threshold={thresh}: Precision={prec:.3f}, Recall={rec:.3f}")
# Precision-Recall 곡선
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_proba)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(thresholds_pr, precision[:-1], 'b-', label='Precision')
plt.plot(thresholds_pr, recall[:-1], 'r-', label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision/Recall vs Threshold')
plt.legend()
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)
plt.tight_layout()
plt.show()
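precision_recall_curve의 결과를 이용하면 F1-score를 최대화하는 임계값도 직접 찾을 수 있습니다.
# F1-score를 최대화하는 임계값 탐색
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)
best_idx = np.argmax(f1_scores)
print(f"F1 최대 임계값: {thresholds_pr[best_idx]:.3f} (F1={f1_scores[best_idx]:.3f})")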
7. 불균형 데이터 처리
from sklearn.datasets import make_classification
# 불균형 데이터 생성
X, y = make_classification(n_samples=1000, n_classes=2, weights=[0.9, 0.1],
                           n_features=10, random_state=42)
print(f"클래스 분포: {np.bincount(y)}")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # 불균형 데이터이므로 층화 분할
)
# 기본 모델
model_default = LogisticRegression(max_iter=1000)
model_default.fit(X_train, y_train)
# class_weight='balanced'
model_balanced = LogisticRegression(class_weight='balanced', max_iter=1000)
model_balanced.fit(X_train, y_train)
# 비교
from sklearn.metrics import classification_report
print("=== 기본 모델 ===")
print(classification_report(y_test, model_default.predict(X_test)))
print("=== class_weight='balanced' ===")
print(classification_report(y_test, model_balanced.predict(X_test)))
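class_weight='balanced'가 부여하는 가중치는 n_samples / (n_classes * np.bincount(y)) 공식으로 계산되며, compute_class_weight로 직접 확인할 수 있습니다.
# 'balanced' 가중치 직접 계산 — 소수 클래스에 더 큰 가중치가 부여됨
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
print(f"클래스 가중치: {dict(zip(classes.tolist(), weights.round(3).tolist()))}")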
연습 문제
문제 1: 이진 분류
유방암 데이터로 로지스틱 회귀 모델을 학습하고 F1-score를 구하세요.
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)
# 풀이
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
문제 2: 다중 분류
Iris 데이터로 3-클래스 분류를 수행하세요.
from sklearn.datasets import load_iris
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)
# 풀이
model = LogisticRegression(multi_class='multinomial', max_iter=1000)
model.fit(X_train, y_train)
print(f"정확도: {model.score(X_test, y_test):.4f}")
print(f"\n예측 확률 (첫 샘플): {model.predict_proba(X_test[:1])}")
요약
| 개념 | 설명 |
|---|---|
| 시그모이드 | 확률 출력 (0~1) |
| Log Loss | 비용 함수 (Binary Cross-Entropy) |
| OvR | 다중 분류 (One-vs-Rest) |
| Softmax | 다중 분류 (Multinomial) |
| C | 정규화 강도 (1/λ) |
| class_weight | 불균형 데이터 처리 |