Data Science | VoidX Academy

10. Modeling Techniques

Module 10: Modeling

Regression, Classification, Ensembles, and Hyperparameter Tuning

Knowing when to use which model is the mark of a senior data scientist. This module covers the algorithms that power 90% of production data science — linear models, tree-based methods, and gradient boosting ensembles — with the model selection principles, hyperparameter tuning workflows, and interpretability tools needed to deploy and explain them.

📈 Linear Models: Interpretable Baselines

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Linear Regression — always start here
from sklearn.preprocessing import StandardScaler

# Cross-validate on the raw features with scaling inside a pipeline, so each
# fold fits its own scaler on that fold's training portion only. The original
# snippet scored a pre-scaled matrix (X_train_scaled) that was never created.
lr = LinearRegression()
lr_cv_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
scores = cross_val_score(lr_cv_pipeline, X_train, y_train, cv=5, scoring='r2')
print(f'Linear Regression R²: {scores.mean():.4f} ± {scores.std():.4f}')

# When coefficients are meaningful, standardize inputs for comparability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
# Other snippets in this module call the scaled training matrix X_train_scaled.
X_train_scaled = X_scaled
lr.fit(X_scaled, y_train)

# Rank features by the magnitude of their standardized coefficient.
# Uses X_train.columns, so X_train is expected to be a pandas DataFrame.
coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lr.coef_,
    'abs_coefficient': np.abs(lr.coef_)
}).sort_values('abs_coefficient', ascending=False)
print('Feature importance (via standardized coefficients):')
print(coef_df.head(10))

# Ridge (L2) vs Lasso (L1) — when to use which
# Ridge: Shrinks ALL coefficients toward zero, keeps all features. Use when all features are potentially relevant.
# Lasso: Shrinks some coefficients to EXACTLY zero (automatic feature selection). Use for sparse models.
# ElasticNet: Combination of both. Flexible.
#
# Regularized models are scale-sensitive, so each estimator is wrapped in a
# pipeline with its own StandardScaler. cross_val_score then refits the scaler
# inside every fold instead of relying on a pre-scaled matrix.
for alpha in [0.01, 0.1, 1.0, 10.0]:
    ridge = Ridge(alpha=alpha)
    lasso = Lasso(alpha=alpha, max_iter=10000)
    ridge_pipe = Pipeline([('scaler', StandardScaler()), ('model', ridge)])
    lasso_pipe = Pipeline([('scaler', StandardScaler()), ('model', lasso)])
    r_scores = cross_val_score(ridge_pipe, X_train, y_train, cv=5, scoring='r2')
    l_scores = cross_val_score(lasso_pipe, X_train, y_train, cv=5, scoring='r2')
    # Fit once on the full training set to count the features Lasso keeps;
    # the pipeline fits its final step in place, so `lasso` itself is fitted.
    lasso_pipe.fit(X_train, y_train)
    n_features = np.count_nonzero(lasso.coef_)
    print(f'alpha={alpha}: Ridge R²={r_scores.mean():.4f} | Lasso R²={l_scores.mean():.4f} (uses {n_features} features)')

🌲 Tree-Based Models and Gradient Boosting

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Random Forest baseline, scored with 5-fold cross-validated R².
# Settings are collected in one dict so they are easy to read and tweak.
rf_params = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 5,
    'max_features': 'sqrt',
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestRegressor(**rf_params)
rf_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, scoring='r2', cv=5)
print(f'Random Forest R²: {rf_scores.mean():.4f}')

# XGBoost — production standard for tabular data
from sklearn.model_selection import train_test_split

# Early stopping picks the number of trees by watching eval_set. Monitoring the
# test set would leak it into model selection, so a validation split is carved
# out of the training data and X_test stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,   # L1 regularization
    reg_lambda=1.0,  # L2 regularization
    random_state=42,
    n_jobs=-1,
    eval_metric='rmse',
    early_stopping_rounds=50  # stop if no improvement for 50 rounds
)
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# LightGBM — faster than XGBoost on large datasets
from sklearn.model_selection import train_test_split

# Early stopping must monitor a validation split taken from the training data;
# using the test set here would leak it into model selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,    # max leaves per tree (controls complexity)
    min_child_samples=20,
    subsample=0.8,
    subsample_freq=1,  # row subsampling is ignored by LightGBM while this is 0
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

🎯 Hyperparameter Tuning with Optuna

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

def objective(trial):
    """Score one Optuna trial: mean 3-fold CV R² of an XGBRegressor.

    Hyperparameters are sampled from ``trial``; the model is evaluated on the
    module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean R² across the 3 cross-validation folds (higher is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1.0, log=True),
        'random_state': 42,
        'n_jobs': -1
    }
    model = xgb.XGBRegressor(**params)
    # Folds run sequentially: the model already uses every core (n_jobs=-1),
    # so parallelizing the folds as well would oversubscribe the CPU.
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    return scores.mean()

# Search for the hyperparameters that maximize the cross-validated R².
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f'Best R²: {study.best_value:.4f}')
print('Best parameters:')
for key, val in study.best_params.items():
    print(f'  {key}: {val}')

# Retrain with best params.
# study.best_params only contains values sampled through trial.suggest_*;
# the fixed settings used during tuning (random_state, n_jobs) must be passed
# again, otherwise the final model is unseeded and not the tuned configuration.
best_model = xgb.XGBRegressor(**study.best_params, random_state=42, n_jobs=-1)
best_model.fit(X_train, y_train)

🔍 Model Interpretability with SHAP

import shap

# SHAP provides unified, game-theoretic feature importance
# Works for any model; TreeExplainer is fast for tree-based models
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Global importance — what drives predictions overall?
shap.summary_plot(shap_values, X_test, plot_type='bar', feature_names=X_test.columns)

# Detailed impact plot — direction of each feature's effect
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)

# Local explanation — why did the model make this specific prediction?
row_index = 42  # explain prediction for row 42
# expected_value may be a scalar or a 1-element array; the waterfall plot
# needs a plain scalar base value.
base_value = float(np.ravel(explainer.expected_value)[0])
shap.waterfall_plot(shap.Explanation(
    values=shap_values[row_index],
    base_values=base_value,
    data=X_test.iloc[row_index].values,
    feature_names=X_test.columns.tolist()
))

# Rank this row's contributions so the lists show the three STRONGEST drivers
# in each direction, not just the first three columns with the right sign.
row_contributions = pd.Series(shap_values[row_index], index=X_test.columns)
top_up = row_contributions[row_contributions > 0].sort_values(ascending=False).head(3)
top_down = row_contributions[row_contributions < 0].sort_values().head(3)

print(f'Prediction for row {row_index}: {best_model.predict(X_test.iloc[[row_index]])[0]:.4f}')
print('Features pushing prediction UP:', top_up.index.tolist())
print('Features pushing prediction DOWN:', top_down.index.tolist())

Data Science: Model Training Arena

Epochs

Mean Squared Error (Loss)

---

Independent Variable (X)

Target Variable (Y)

train_model.py

X_train, y_train loaded

TRAINING OUTPUT

[11:25:49][SYSTEM] Environment ready. Waiting for model training initialization...

10. Modeling Techniques

Module 10: Modeling

Regression, Classification, Ensembles, and Hyperparameter Tuning

📈 Linear Models: Interpretable Baselines

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Linear Regression — always start here
from sklearn.preprocessing import StandardScaler

# Cross-validate on the raw features with scaling inside a pipeline, so each
# fold fits its own scaler on that fold's training portion only. The original
# snippet scored a pre-scaled matrix (X_train_scaled) that was never created.
lr = LinearRegression()
lr_cv_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
scores = cross_val_score(lr_cv_pipeline, X_train, y_train, cv=5, scoring='r2')
print(f'Linear Regression R²: {scores.mean():.4f} ± {scores.std():.4f}')

# When coefficients are meaningful, standardize inputs for comparability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
# Other snippets in this module call the scaled training matrix X_train_scaled.
X_train_scaled = X_scaled
lr.fit(X_scaled, y_train)

# Rank features by the magnitude of their standardized coefficient.
# Uses X_train.columns, so X_train is expected to be a pandas DataFrame.
coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lr.coef_,
    'abs_coefficient': np.abs(lr.coef_)
}).sort_values('abs_coefficient', ascending=False)
print('Feature importance (via standardized coefficients):')
print(coef_df.head(10))

# Ridge (L2) vs Lasso (L1) — when to use which
# Ridge: Shrinks ALL coefficients toward zero, keeps all features. Use when all features are potentially relevant.
# Lasso: Shrinks some coefficients to EXACTLY zero (automatic feature selection). Use for sparse models.
# ElasticNet: Combination of both. Flexible.
#
# Regularized models are scale-sensitive, so each estimator is wrapped in a
# pipeline with its own StandardScaler. cross_val_score then refits the scaler
# inside every fold instead of relying on a pre-scaled matrix.
for alpha in [0.01, 0.1, 1.0, 10.0]:
    ridge = Ridge(alpha=alpha)
    lasso = Lasso(alpha=alpha, max_iter=10000)
    ridge_pipe = Pipeline([('scaler', StandardScaler()), ('model', ridge)])
    lasso_pipe = Pipeline([('scaler', StandardScaler()), ('model', lasso)])
    r_scores = cross_val_score(ridge_pipe, X_train, y_train, cv=5, scoring='r2')
    l_scores = cross_val_score(lasso_pipe, X_train, y_train, cv=5, scoring='r2')
    # Fit once on the full training set to count the features Lasso keeps;
    # the pipeline fits its final step in place, so `lasso` itself is fitted.
    lasso_pipe.fit(X_train, y_train)
    n_features = np.count_nonzero(lasso.coef_)
    print(f'alpha={alpha}: Ridge R²={r_scores.mean():.4f} | Lasso R²={l_scores.mean():.4f} (uses {n_features} features)')

🌲 Tree-Based Models and Gradient Boosting

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Random Forest baseline, scored with 5-fold cross-validated R².
# Settings are collected in one dict so they are easy to read and tweak.
rf_params = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 5,
    'max_features': 'sqrt',
    'n_jobs': -1,
    'random_state': 42,
}
rf = RandomForestRegressor(**rf_params)
rf_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, scoring='r2', cv=5)
print(f'Random Forest R²: {rf_scores.mean():.4f}')

# XGBoost — production standard for tabular data
from sklearn.model_selection import train_test_split

# Early stopping picks the number of trees by watching eval_set. Monitoring the
# test set would leak it into model selection, so a validation split is carved
# out of the training data and X_test stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,   # L1 regularization
    reg_lambda=1.0,  # L2 regularization
    random_state=42,
    n_jobs=-1,
    eval_metric='rmse',
    early_stopping_rounds=50  # stop if no improvement for 50 rounds
)
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# LightGBM — faster than XGBoost on large datasets
from sklearn.model_selection import train_test_split

# Early stopping must monitor a validation split taken from the training data;
# using the test set here would leak it into model selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,    # max leaves per tree (controls complexity)
    min_child_samples=20,
    subsample=0.8,
    subsample_freq=1,  # row subsampling is ignored by LightGBM while this is 0
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

🎯 Hyperparameter Tuning with Optuna

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

def objective(trial):
    """Score one Optuna trial: mean 3-fold CV R² of an XGBRegressor.

    Hyperparameters are sampled from ``trial``; the model is evaluated on the
    module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean R² across the 3 cross-validation folds (higher is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1.0, log=True),
        'random_state': 42,
        'n_jobs': -1
    }
    model = xgb.XGBRegressor(**params)
    # Folds run sequentially: the model already uses every core (n_jobs=-1),
    # so parallelizing the folds as well would oversubscribe the CPU.
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    return scores.mean()

# Search for the hyperparameters that maximize the cross-validated R².
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f'Best R²: {study.best_value:.4f}')
print('Best parameters:')
for key, val in study.best_params.items():
    print(f'  {key}: {val}')

# Retrain with best params.
# study.best_params only contains values sampled through trial.suggest_*;
# the fixed settings used during tuning (random_state, n_jobs) must be passed
# again, otherwise the final model is unseeded and not the tuned configuration.
best_model = xgb.XGBRegressor(**study.best_params, random_state=42, n_jobs=-1)
best_model.fit(X_train, y_train)

🔍 Model Interpretability with SHAP

import shap

# SHAP provides unified, game-theoretic feature importance
# Works for any model; TreeExplainer is fast for tree-based models
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Global importance — what drives predictions overall?
shap.summary_plot(shap_values, X_test, plot_type='bar', feature_names=X_test.columns)

# Detailed impact plot — direction of each feature's effect
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)

# Local explanation — why did the model make this specific prediction?
row_index = 42  # explain prediction for row 42
# expected_value may be a scalar or a 1-element array; the waterfall plot
# needs a plain scalar base value.
base_value = float(np.ravel(explainer.expected_value)[0])
shap.waterfall_plot(shap.Explanation(
    values=shap_values[row_index],
    base_values=base_value,
    data=X_test.iloc[row_index].values,
    feature_names=X_test.columns.tolist()
))

# Rank this row's contributions so the lists show the three STRONGEST drivers
# in each direction, not just the first three columns with the right sign.
row_contributions = pd.Series(shap_values[row_index], index=X_test.columns)
top_up = row_contributions[row_contributions > 0].sort_values(ascending=False).head(3)
top_down = row_contributions[row_contributions < 0].sort_values().head(3)

print(f'Prediction for row {row_index}: {best_model.predict(X_test.iloc[[row_index]])[0]:.4f}')
print('Features pushing prediction UP:', top_up.index.tolist())
print('Features pushing prediction DOWN:', top_down.index.tolist())

Data Science: Model Training Arena

Epochs

Mean Squared Error (Loss)

---

Independent Variable (X)

Target Variable (Y)

train_model.py

X_train, y_train loaded

TRAINING OUTPUT

[11:25:49][SYSTEM] Environment ready. Waiting for model training initialization...

10. Modeling Techniques

Regression, Classification, Ensembles, and Hyperparameter Tuning

📈 Linear Models: Interpretable Baselines

🌲 Tree-Based Models and Gradient Boosting

🎯 Hyperparameter Tuning with Optuna

🔍 Model Interpretability with SHAP

Knowledge Check

10. Modeling Techniques

Regression, Classification, Ensembles, and Hyperparameter Tuning

📈 Linear Models: Interpretable Baselines

🌲 Tree-Based Models and Gradient Boosting

🎯 Hyperparameter Tuning with Optuna

🔍 Model Interpretability with SHAP

Knowledge Check