110 MINS intermediate
10. Modeling Techniques
Module 10: Modeling
Regression, Classification, Ensembles, and Hyperparameter Tuning
Knowing when to use which model is the mark of a senior data scientist. This module covers the algorithms that power 90% of production data science — linear models, tree-based methods, and gradient boosting ensembles — with the model selection principles, hyperparameter tuning workflows, and interpretability tools needed to deploy and explain them.
📈 Linear Models: Interpretable Baselines
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Linear Regression — always start here.
# The scaler lives inside a Pipeline so that, during cross-validation, it is
# re-fitted on the training part of every fold. Scaling the full matrix up
# front would let statistics from the held-out part leak into the fit.
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='r2')
print(f'Linear Regression R²: {scores.mean():.4f} ± {scores.std():.4f}')

# When coefficients are meaningful, standardize inputs for comparability:
# on standardized inputs each coefficient is the change in the target per one
# standard deviation of the feature, so magnitudes can be ranked directly.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
lr = LinearRegression()
lr.fit(X_scaled, y_train)

# NOTE(review): X_train.columns assumes X_train is a pandas DataFrame.
coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lr.coef_,
    'abs_coefficient': np.abs(lr.coef_),
}).sort_values('abs_coefficient', ascending=False)

print('Feature importance (via standardized coefficients):')
print(coef_df.head(10))
# Ridge (L2) vs Lasso (L1) — when to use which
# Ridge: Shrinks ALL coefficients toward zero, keeps all features. Use when all features potentially relevant.
# Lasso: Shrinks some coefficients to EXACTLY zero (automatic feature selection). Use for sparse models.
# ElasticNet: Combination of both. Flexible.
#
# Regularized models are scale-sensitive, so each estimator is wrapped in a
# Pipeline with its own StandardScaler. Cross-validation then re-fits the
# scaler inside every fold instead of relying on a pre-scaled matrix.
for alpha in [0.01, 0.1, 1.0, 10.0]:
    ridge = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=alpha)),
    ])
    lasso = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=alpha, max_iter=10000)),
    ])
    r_scores = cross_val_score(ridge, X_train, y_train, cv=5, scoring='r2')
    l_scores = cross_val_score(lasso, X_train, y_train, cv=5, scoring='r2')

    # cross_val_score only fits clones, so fit once on the full training set
    # to count how many coefficients Lasso left non-zero at this alpha.
    lasso.fit(X_train, y_train)
    n_features = (lasso.named_steps['model'].coef_ != 0).sum()
    print(f'alpha={alpha}: Ridge R²={r_scores.mean():.4f} | Lasso R²={l_scores.mean():.4f} (uses {n_features} features)')

# 🌲 Tree-Based Models and Gradient Boosting
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
# Random Forest — strong, interpretable baseline
# Hyperparameters are collected in one mapping so the configuration can be
# read (or logged) separately from the estimator construction.
rf_config = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 5,
    'max_features': 'sqrt',
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestRegressor(**rf_config)

# 5-fold cross-validated R² on the training data.
rf_scores = cross_val_score(rf, X_train, y_train, scoring='r2', cv=5)
print(f'Random Forest R²: {rf_scores.mean():.4f}')
# XGBoost — production standard for tabular data
# Early stopping needs a validation set to monitor. It is carved out of the
# training data: monitoring the test set would choose the number of trees on
# the very data later used to report performance, making that score optimistic.
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

xgb_model = xgb.XGBRegressor(
    n_estimators=1000,          # upper bound; early stopping picks the actual count
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=1.0,             # L2 regularization
    random_state=42,
    n_jobs=-1,
    eval_metric='rmse',
    early_stopping_rounds=50,   # stop if no improvement for 50 rounds
)
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=100,
)
# LightGBM — faster than XGBoost on large datasets
# Early stopping monitors a validation split taken from the training data, so
# the test set stays untouched for the final evaluation.
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,      # upper bound; early stopping picks the actual count
    learning_rate=0.05,
    num_leaves=31,          # max leaves per tree (controls complexity)
    min_child_samples=20,
    subsample=0.8,
    subsample_freq=1,       # row subsampling is ignored unless this is > 0
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
lgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
)

# 🎯 Hyperparameter Tuning with Optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated R² of an XGBRegressor.

    Samples one hyperparameter configuration from ``trial``, evaluates it with
    cross-validation on the module-level ``X_train`` / ``y_train``, and returns
    the mean score for the study to maximize.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Mean R² across the 3 cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        # Log scale: learning rate and regularization strengths span orders
        # of magnitude, so sample them uniformly in log space.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1.0, log=True),
        # Fixed (not tuned): reproducible fits, all cores for tree building.
        'random_state': 42,
        'n_jobs': -1,
    }
    model = xgb.XGBRegressor(**params)
    # The model already uses every core; running the folds in parallel as
    # well would nest two levels of parallelism and oversubscribe the CPU,
    # so the folds are evaluated sequentially.
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    return scores.mean()
# Seed the sampler so the search proposes the same sequence of trials on
# every run, in line with the fixed random_state used for the models.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f'Best R²: {study.best_value:.4f}')
print('Best parameters:')
for key, val in study.best_params.items():
    print(f' {key}: {val}')

# Retrain with best params.
# study.best_params holds only the values drawn via trial.suggest_*; the
# fixed settings used during tuning (random_state, n_jobs) are not part of it
# and must be passed again so the final model matches what was evaluated.
best_model = xgb.XGBRegressor(**study.best_params, random_state=42, n_jobs=-1)
best_model.fit(X_train, y_train)

# 🔍 Model Interpretability with SHAP
import shap
import numpy as np

# SHAP provides unified, game-theoretic feature importance
# Works for any model; TreeExplainer is fast for tree-based models
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Global importance — what drives predictions overall?
shap.summary_plot(shap_values, X_test, plot_type='bar', feature_names=X_test.columns)

# Detailed impact plot — direction of each feature's effect
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)

# Local explanation — why did the model make this specific prediction?
# Row 42 is explained when it exists; shorter test sets fall back to the last
# row instead of raising IndexError.
row_index = min(42, len(X_test) - 1)
row_shap = shap_values[row_index]

# The waterfall plot expects a scalar baseline; expected_value may arrive as a
# one-element array depending on the SHAP version.
# NOTE(review): assumes a single-output regressor — confirm for other models.
base_value = float(np.ravel(explainer.expected_value)[0])

shap.waterfall_plot(shap.Explanation(
    values=row_shap,
    base_values=base_value,
    data=X_test.iloc[row_index].values,
    feature_names=X_test.columns.tolist(),
))
print(f'Prediction for row {row_index}: {best_model.predict(X_test.iloc[[row_index]])[0]:.4f}')

# Rank features by absolute contribution so the three reported per direction
# are the strongest ones, not merely the first three in column order.
strongest_first = np.argsort(-np.abs(row_shap))
top_up = [X_test.columns[i] for i in strongest_first if row_shap[i] > 0][:3]
top_down = [X_test.columns[i] for i in strongest_first if row_shap[i] < 0][:3]
print('Features pushing prediction UP:', top_up)
print('Features pushing prediction DOWN:', top_down)

# Data Science: Model Training Arena
Epochs
0
Mean Squared Error (Loss)
---
Independent Variable (X)
Target Variable (Y)
train_model.py
X_train, y_train loaded
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
TRAINING OUTPUT
[11:25:49][SYSTEM] Environment ready. Waiting for model training initialization...
Knowledge Check
Ready to test your understanding of 10. Modeling Techniques?