Cross-Validation and Hyperparameter Tuning

Overview

Cross-validation estimates a model's generalization performance more reliably than a single train/test split, and hyperparameter tuning is the search for the model settings that maximize that performance.
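
A single train/test split can mislead because the score depends on which rows happen to land in the test set. A minimal sketch of the contrast (the data and model here mirror the examples below):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000)

# Two single splits with different seeds can give noticeably different scores
for seed in (0, 1):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=seed)
    print(f"Single split (seed={seed}): {model.fit(X_tr, y_tr).score(X_te, y_te):.4f}")

# Cross-validation averages over several splits into one steadier estimate
print(f"5-fold CV mean: {cross_val_score(model, X, y, cv=5).mean():.4f}")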


1. Cross-Validation

1.1 K-Fold Cross-Validation

import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# Create model
model = LogisticRegression(max_iter=1000)

# K-Fold cross-validation. Note: passing cv=5 with a classifier would use
# stratified folds by default, so build an explicit KFold for plain K-Fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

print("K-Fold Cross-Validation (K=5)")
print(f"Fold scores: {scores}")
print(f"Mean accuracy: {scores.mean():.4f}")
print(f"Standard deviation: {scores.std():.4f}")
print(f"95% confidence interval: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

1.2 Stratified K-Fold

from sklearn.model_selection import StratifiedKFold

# Preserve class ratios
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')

print("\nStratified K-Fold")
print(f"Mean accuracy: {scores.mean():.4f}")

# Check class distribution in each fold
print("\nClass distribution per fold:")
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    train_classes = np.bincount(y[train_idx])
    val_classes = np.bincount(y[val_idx])
    print(f"  Fold {fold}: Train={train_classes}, Val={val_classes}")

1.3 Various Cross-Validation Methods

from sklearn.model_selection import (
    LeaveOneOut,
    LeavePOut,
    ShuffleSplit,
    RepeatedKFold,
    RepeatedStratifiedKFold
)

# Leave-One-Out (LOO)
loo = LeaveOneOut()
print(f"LOO splits: {loo.get_n_splits(X)}")  # Equal to number of samples

# Shuffle Split (random split)
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
scores = cross_val_score(model, X, y, cv=ss)
print(f"\nShuffle Split mean: {scores.mean():.4f}")

# Repeated K-Fold
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)
scores = cross_val_score(model, X, y, cv=rkf)
print(f"Repeated K-Fold mean: {scores.mean():.4f}")
print(f"Repeated K-Fold total splits: {len(scores)}")  # 5 * 10 = 50

1.4 Time Series Cross-Validation

from sklearn.model_selection import TimeSeriesSplit

# For time series data (past → future prediction)
tscv = TimeSeriesSplit(n_splits=5)

print("Time Series Split:")
for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
    print(f"  Fold {fold}: Train=[{train_idx[0]}:{train_idx[-1]}], Test=[{test_idx[0]}:{test_idx[-1]}]")

2. cross_val_score vs cross_validate

from sklearn.model_selection import cross_validate

# Evaluate multiple metrics simultaneously
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

cv_results = cross_validate(
    model, X, y,
    cv=5,
    scoring=scoring,
    return_train_score=True
)

print("cross_validate results:")
for metric in scoring:
    train_key = f'train_{metric}'
    test_key = f'test_{metric}'
    print(f"\n{metric}:")
    print(f"  Train: {cv_results[train_key].mean():.4f} (+/- {cv_results[train_key].std():.4f})")
    print(f"  Test:  {cv_results[test_key].mean():.4f} (+/- {cv_results[test_key].std():.4f})")

# Training time information
print(f"\nAverage training time: {cv_results['fit_time'].mean():.4f}s")
print(f"Average prediction time: {cv_results['score_time'].mean():.4f}s")

3. Hyperparameter Tuning

3.1 Grid Search

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Prepare data
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Grid Search
grid_search = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1  # Use all CPUs
)

grid_search.fit(X_scaled, y)

print("\nGrid Search results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# View all results
import pandas as pd
results = pd.DataFrame(grid_search.cv_results_)
print(f"\nTop 5 combinations:")
print(results.nsmallest(5, 'rank_test_score')[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']])
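
Note that best_score_ is a cross-validation score on the same data used to choose the parameters, so it is mildly optimistic. A common safeguard (short of the nested CV in Section 5) is to hold out a test set before searching; a sketch:

from sklearn.model_selection import train_test_split

# Hold out a final test set before running the search
X_tr, X_te, y_tr, y_te = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y)

search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
search.fit(X_tr, y_tr)

print(f"CV score during search: {search.best_score_:.4f}")
print(f"Held-out test score:    {search.score(X_te, y_te):.4f}")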

3.2 Randomized Search

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Hyperparameter distributions
param_distributions = {
    'C': uniform(0.1, 100),  # Uniform distribution from 0.1 to 100.1
    'gamma': uniform(0.001, 1),
    'kernel': ['rbf', 'linear', 'poly']
}

# Randomized Search
random_search = RandomizedSearchCV(
    SVC(),
    param_distributions,
    n_iter=50,  # Try 50 combinations
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_scaled, y)

print("Randomized Search results:")
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_:.4f}")
"""
Grid Search:
- Pros: Exhaustive search, optimal solution guaranteed (within grid)
- Cons: Exponential growth in combinations

Randomized Search:
- Pros: Computationally efficient, can explore continuous distributions
- Cons: No guarantee of optimal solution

Selection criteria:
- Few parameters with clear range → Grid Search
- Many parameters or uncertain range → Randomized Search
"""

4. Advanced Tuning Techniques

4.1 Successive Halving (Halving Grid Search)

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Progressively allocate resources during search
halving_search = HalvingGridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    factor=3,  # Reduce candidates to 1/3 each round
    resource='n_samples',
    random_state=42
)

halving_search.fit(X_scaled, y)

print("Halving Grid Search results:")
print(f"Best parameters: {halving_search.best_params_}")
print(f"Best score: {halving_search.best_score_:.4f}")

4.2 Bayesian Optimization (Optuna)

# pip install optuna

import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    return scores.mean()

# Run optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# print(f"Best parameters: {study.best_params}")
# print(f"Best score: {study.best_value:.4f}")

5. Nested Cross-Validation

from sklearn.model_selection import cross_val_score, GridSearchCV

# Outer loop: Model evaluation
# Inner loop: Hyperparameter tuning

# Inner CV (hyperparameter tuning)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01]}
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(SVC(), param_grid, cv=inner_cv, scoring='accuracy')

# Outer CV (model evaluation)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nested_scores = cross_val_score(grid_search, X_scaled, y, cv=outer_cv, scoring='accuracy')

print("Nested cross-validation results:")
print(f"Outer fold scores: {nested_scores}")
print(f"Mean score: {nested_scores.mean():.4f} (+/- {nested_scores.std():.4f})")

# Compare: Regular CV vs Nested CV
grid_search.fit(X_scaled, y)
print(f"\nRegular CV best score: {grid_search.best_score_:.4f}")
print(f"Nested CV mean score: {nested_scores.mean():.4f}")
# Nested CV provides more realistic generalization performance estimate

6. Using with Pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Parameter names: step__parameter
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__gamma': [0.1, 0.01, 0.001],
    'svm__kernel': ['rbf', 'linear']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print("Pipeline Grid Search results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

7. Practical Tips

7.1 Scoring Functions

from sklearn.metrics import make_scorer, f1_score, mean_squared_error

# Built-in scoring
# Classification: 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
# Regression: 'r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'

# Custom scoring function
def custom_score(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted')

custom_scorer = make_scorer(custom_score)

scores = cross_val_score(model, X, y, cv=5, scoring=custom_scorer)
print(f"Custom score: {scores.mean():.4f}")

7.2 Early Stopping Callbacks

# Early stopping in Optuna
# import optuna

# def objective(trial):
#     # ...
#     for epoch in range(100):
#         accuracy = train_epoch()
#         trial.report(accuracy, epoch)
#         if trial.should_prune():
#             raise optuna.TrialPruned()
#     return accuracy

# study = optuna.create_study(direction='maximize',
#                            pruner=optuna.pruners.MedianPruner())

7.3 Saving Results

import joblib
import json

# Save best model
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_model.pkl')

# Save results
results = {
    'best_params': grid_search.best_params_,
    'best_score': grid_search.best_score_,
    'cv_results': {k: v.tolist() if isinstance(v, np.ndarray) else v
                   for k, v in grid_search.cv_results_.items()}
}

with open('tuning_results.json', 'w') as f:
    json.dump(results, f, indent=2)
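
Loading everything back later is symmetric (a short sketch):

# Reload the tuned model and its tuning record
loaded_model = joblib.load('best_model.pkl')
with open('tuning_results.json') as f:
    saved = json.load(f)
print(saved['best_params'], saved['best_score'])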

Exercises

Exercise 1: K-Fold Cross-Validation

Perform 10-Fold cross-validation on the Iris dataset.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

iris = load_iris()
model = LogisticRegression(max_iter=1000)

# Solution
scores = cross_val_score(model, iris.data, iris.target, cv=10)
print(f"Mean accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

Exercise 2: Hyperparameter Tuning

Tune the C parameter of logistic regression.

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Solution
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid.fit(iris.data, iris.target)
print(f"Best C: {grid.best_params_['C']}")
print(f"Best score: {grid.best_score_:.4f}")

Summary

Technique            Purpose              Features
K-Fold               Model evaluation     Split data into K parts
Stratified K-Fold    Imbalanced data      Preserve class ratios
Time Series Split    Time series          Maintain temporal order
Grid Search          Parameter tuning     Exhaustive search
Randomized Search    Parameter tuning     Random sampling
Nested CV            Reliable evaluation  Separate tuning and evaluation