Model Evaluation

Overview

Model evaluation is the process of objectively measuring the performance of trained models. Different evaluation metrics are used for classification and regression problems.


1. Classification Evaluation Metrics

1.1 Confusion Matrix

import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Example data
y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0, 1, 1]

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Visualization
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
disp.plot(ax=ax, cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

# Confusion matrix elements
tn, fp, fn, tp = cm.ravel()
print(f"\nTN (True Negative): {tn}")
print(f"FP (False Positive): {fp} - Type I Error")
print(f"FN (False Negative): {fn} - Type II Error")
print(f"TP (True Positive): {tp}")

1.2 Accuracy

from sklearn.metrics import accuracy_score

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Manual calculation
accuracy_manual = (tp + tn) / (tp + tn + fp + fn)
print(f"Accuracy (manual): {accuracy_manual:.4f}")

# Warning: accuracy alone is inadequate for imbalanced data
# Example: 99% negative → predicting all negative gives 99% accuracy
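
A minimal sketch of the warning above (the synthetic 99:1 labels are an assumption, not data from the lesson): a model that always predicts the negative class reaches 99% accuracy yet has zero recall for the positive class.

import numpy as np
from sklearn.metrics import accuracy_score, recall_score

# Hypothetical imbalanced labels: 99 negatives, 1 positive
y_true_imb = np.array([0] * 99 + [1])
y_pred_all_neg = np.zeros_like(y_true_imb)  # always predict the negative class

print(f"Accuracy: {accuracy_score(y_true_imb, y_pred_all_neg):.2f}")                 # 0.99
print(f"Recall:   {recall_score(y_true_imb, y_pred_all_neg, zero_division=0):.2f}")  # 0.00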

1.3 Precision, Recall, F1-score

from sklearn.metrics import precision_score, recall_score, f1_score

# Precision = TP / (TP + FP)
# "Proportion of actual positives among predicted positives"
precision = precision_score(y_true, y_pred)
print(f"Precision: {precision:.4f}")

# Recall (Sensitivity) = TP / (TP + FN)
# "Proportion of predicted positives among actual positives"
recall = recall_score(y_true, y_pred)
print(f"Recall: {recall:.4f}")

# F1-Score = 2 * (Precision * Recall) / (Precision + Recall)
# Harmonic mean of precision and recall
f1 = f1_score(y_true, y_pred)
print(f"F1-Score: {f1:.4f}")

# Manual calculation
precision_manual = tp / (tp + fp)
recall_manual = tp / (tp + fn)
f1_manual = 2 * precision_manual * recall_manual / (precision_manual + recall_manual)
print(f"\nManual calculation:")
print(f"Precision: {precision_manual:.4f}")
print(f"Recall: {recall_manual:.4f}")
print(f"F1: {f1_manual:.4f}")

1.4 Classification Report

from sklearn.metrics import classification_report

y_true = [0, 0, 0, 1, 1, 1, 2, 2, 2]
y_pred = [0, 0, 1, 1, 1, 2, 2, 2, 0]

report = classification_report(y_true, y_pred, target_names=['Class A', 'Class B', 'Class C'])
print("Classification Report:")
print(report)

# Return as dictionary
report_dict = classification_report(y_true, y_pred, target_names=['Class A', 'Class B', 'Class C'], output_dict=True)
print(f"\nClass B F1-score: {report_dict['Class B']['f1-score']:.4f}")

1.5 ROC Curve and AUC

from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Prepare data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Prediction probabilities
y_proba = model.predict_proba(X_test)[:, 1]

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Visualization
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, 'b-', linewidth=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'r--', linewidth=1, label='Random Classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

print(f"AUC Score: {roc_auc:.4f}")
print(f"AUC Score (sklearn): {roc_auc_score(y_test, y_proba):.4f}")

1.6 PR Curve (Precision-Recall)

from sklearn.metrics import precision_recall_curve, average_precision_score

# PR curve
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
ap = average_precision_score(y_test, y_proba)

# Visualization
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, 'b-', linewidth=2, label=f'PR Curve (AP = {ap:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Average Precision: {ap:.4f}")

# ROC vs PR
# - ROC: Relatively insensitive to class imbalance, but can look optimistic when positives are rare,
#        because FPR stays small while true negatives are abundant
# - PR:  More informative on imbalanced data; focuses on positive-class prediction performance
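
To make the comparison above concrete, a hedged sketch on a synthetic imbalanced dataset (generated with make_classification, an assumption not in the original): ROC-AUC stays high while average precision is noticeably lower.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

# Synthetic problem with roughly 2% positives (illustrative)
X_imb, y_imb = make_classification(n_samples=5000, weights=[0.98, 0.02], random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_imb, y_imb, stratify=y_imb, random_state=42)

clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
proba = clf.predict_proba(X_te)[:, 1]

# ROC-AUC tends to look comfortable; PR-AUC (average precision) is usually much lower here
print(f"ROC-AUC:     {roc_auc_score(y_te, proba):.4f}")
print(f"PR-AUC (AP): {average_precision_score(y_te, proba):.4f}")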

2. Multi-class Classification Evaluation

2.1 Multi-class Metrics

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# F1-score (various averaging methods)
print(f"\nF1-Score (macro): {f1_score(y_test, y_pred, average='macro'):.4f}")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1-Score (micro): {f1_score(y_test, y_pred, average='micro'):.4f}")

# macro: Simple average of F1 for each class
# weighted: Weighted average by number of samples per class
# micro: Calculate by summing all TP, FP, FN
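
To illustrate the three averaging modes, a short sketch recomputing them from per-class scores (it reuses y_test and y_pred from the iris model above):

# Per-class F1 scores and class supports
per_class_f1 = f1_score(y_test, y_pred, average=None)
supports = np.bincount(y_test)

print(f"macro F1:    {per_class_f1.mean():.4f}")                          # simple mean over classes
print(f"weighted F1: {np.average(per_class_f1, weights=supports):.4f}")   # weighted by class support
print(f"micro F1:    {accuracy_score(y_test, y_pred):.4f}")               # equals accuracy for single-label multi-class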

2.2 Multi-class ROC

from sklearn.preprocessing import label_binarize

# Binarize labels
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_proba = model.predict_proba(X_test)

# ROC for each class
plt.figure(figsize=(10, 6))
colors = ['blue', 'red', 'green']

for i, (color, name) in enumerate(zip(colors, iris.target_names)):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color=color, linewidth=2,
             label=f'{name} (AUC = {roc_auc:.4f})')

plt.plot([0, 1], [0, 1], 'k--', linewidth=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class ROC Curves')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

3. Regression Evaluation Metrics

import numpy as np
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)

# Example data
y_true = np.array([3, -0.5, 2, 7, 4.5])
y_pred = np.array([2.5, 0.0, 2, 8, 4.0])

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_true, y_pred)
print(f"MAE: {mae:.4f}")
# Interpretation: On average, predictions deviate from actual values by MAE (in target units)

# MSE (Mean Squared Error)
mse = mean_squared_error(y_true, y_pred)
print(f"MSE: {mse:.4f}")
# Feature: Larger penalties for larger errors

# RMSE (Root Mean Squared Error)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")
# Interpretation: Same units as target, interpretable

# R² (Coefficient of Determination)
r2 = r2_score(y_true, y_pred)
print(f"R²: {r2:.4f}")
# Interpretation: at most 1, closer to 1 is better (can be negative for poor models); proportion of variance explained by the model

# MAPE (Mean Absolute Percentage Error)
mape = mean_absolute_percentage_error(y_true, y_pred)
print(f"MAPE: {mape:.4f}")
# Warning: Unstable when y_true is close to 0

# Manual calculation
print("\n=== Manual Calculation ===")
print(f"MAE: {np.mean(np.abs(y_true - y_pred)):.4f}")
print(f"MSE: {np.mean((y_true - y_pred)**2):.4f}")
print(f"R²: {1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2):.4f}")

3.1 R² Score Interpretation

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

diabetes = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")
print(f"Interpretation: The model explains {r2*100:.1f}% of target variance.")

# Visualization
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'Actual vs Predicted (R² = {r2:.4f})')
plt.grid(True, alpha=0.3)
plt.show()

4. Evaluation Metric Selection Guide

"""
Classification Problems:

1. Balanced data
   - Accuracy, F1-score

2. Imbalanced data
   - Precision, Recall, F1-score, PR-AUC
   - Positive class important: Emphasize Recall
   - False positive cost: Emphasize Precision

3. Probability prediction quality
   - ROC-AUC, PR-AUC, Log Loss

4. Multi-class
   - Macro F1: Equal importance per class
   - Weighted F1: Importance proportional to sample count
   - Micro F1: Similar to overall accuracy


Regression Problems:

1. Basic
   - MSE, RMSE, MAE

2. Outlier sensitivity
   - MAE (robust), MSE (sensitive)

3. Relative error
   - MAPE, R²

4. Model comparison
   - R² (scale-free, at most 1, so comparable across different targets)
"""

# Evaluation metric comparison functions
def evaluate_classification(y_true, y_pred, y_proba=None):
    """Comprehensive classification evaluation"""
    print("=== Classification Evaluation Results ===")
    print(f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall:    {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1-Score:  {f1_score(y_true, y_pred, average='weighted'):.4f}")
    if y_proba is not None:
        print(f"ROC-AUC:   {roc_auc_score(y_true, y_proba):.4f}")

def evaluate_regression(y_true, y_pred):
    """Comprehensive regression evaluation"""
    print("=== Regression Evaluation Results ===")
    print(f"MAE:  {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"MSE:  {mean_squared_error(y_true, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print(f"R²:   {r2_score(y_true, y_pred):.4f}")

5. Learning Curves and Validation Curves

5.1 Learning Curve

from sklearn.model_selection import learning_curve

# Prepare data
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Calculate learning curve
train_sizes, train_scores, val_scores = learning_curve(
    LogisticRegression(max_iter=1000),
    X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy'
)

# Mean and standard deviation
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)

# Visualization
plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='orange')
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')
plt.plot(train_sizes, val_mean, 'o-', color='orange', label='Validation Score')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.title('Learning Curve')
plt.legend(loc='best')
plt.grid(True, alpha=0.3)
plt.show()

# Interpretation:
# - Both curves low → Underfitting
# - Training curve high, validation curve low → Overfitting
# - Curves converge → Good fit
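
To see the overfitting pattern described above, a hedged sketch that swaps in an unconstrained decision tree (this comparison is an addition, not part of the original lesson): training accuracy stays near 1.0 while the validation score lags behind.

from sklearn.tree import DecisionTreeClassifier

# Unconstrained tree memorizes the training data: a large train/validation gap signals overfitting
sizes_dt, train_dt, val_dt = learning_curve(
    DecisionTreeClassifier(random_state=42),
    X, y,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='accuracy'
)
print("Training accuracy:  ", train_dt.mean(axis=1).round(3))
print("Validation accuracy:", val_dt.mean(axis=1).round(3))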

5.2 Validation Curve

from sklearn.model_selection import validation_curve

# Hyperparameter range
param_range = np.logspace(-4, 2, 10)

# Calculate validation curve
train_scores, val_scores = validation_curve(
    LogisticRegression(max_iter=1000),
    X, y,
    param_name='C',
    param_range=param_range,
    cv=5,
    scoring='accuracy'
)

train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)

# Visualization
plt.figure(figsize=(10, 6))
plt.semilogx(param_range, train_mean, 'o-', color='blue', label='Training Score')
plt.semilogx(param_range, val_mean, 'o-', color='orange', label='Validation Score')
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
plt.fill_between(param_range, val_mean - val_std, val_mean + val_std, alpha=0.1, color='orange')
plt.xlabel('C (Regularization Parameter)')
plt.ylabel('Accuracy')
plt.title('Validation Curve')
plt.legend(loc='best')
plt.grid(True, alpha=0.3)
plt.show()
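
As a follow-up, the best-scoring C can be read straight off the arrays computed above; a minimal sketch:

# Select the C with the highest mean cross-validated accuracy
best_idx = val_mean.argmax()
print(f"Best C: {param_range[best_idx]:.4g} (CV accuracy: {val_mean[best_idx]:.4f})")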

Practice Problems

Problem 1: Classification Evaluation

Calculate Precision, Recall, and F1-score from the confusion matrix.

# TN=50, FP=10, FN=5, TP=35

# Solution
tn, fp, fn, tp = 50, 10, 5, 35
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Problem 2: Regression Evaluation

Calculate R² from predictions and actual values.

y_true = [100, 150, 200, 250, 300]
y_pred = [110, 140, 210, 240, 290]

# Solution
from sklearn.metrics import r2_score
print(f"R² Score: {r2_score(y_true, y_pred):.4f}")

Summary

Metric    | Task           | Range  | Description
Accuracy  | Classification | 0–1    | Overall proportion of correct predictions
Precision | Classification | 0–1    | Actual positives among positive predictions
Recall    | Classification | 0–1    | Correctly identified positives among actual positives
F1-Score  | Classification | 0–1    | Harmonic mean of Precision and Recall
ROC-AUC   | Classification | 0–1    | Ranking quality of the classifier across thresholds
MAE       | Regression     | 0–∞    | Mean absolute error
MSE       | Regression     | 0–∞    | Mean squared error
R²        | Regression     | -∞–1   | Proportion of explained variance