95 MINS advanced
11. Time Series Analysis
Module 11: Time Series
Trend, Seasonality, and Forecasting
Time series data — anything indexed by time — is the most common data type in business analytics. Revenue trends, user growth, inventory levels, sensor readings, and web traffic all share a fundamental characteristic: the order of observations matters. Techniques that assume independent samples (for example, random train/test splits) break down on temporal data, because neighbouring observations are correlated and the future must never leak into the past. This module covers the complete time series toolkit, from classical decomposition to modern machine learning forecasting.
🔍 Time Series Decomposition
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose, STL
from statsmodels.tsa.stattools import adfuller, kpss
# Load the raw sales data and build a gap-free daily revenue series.
df = pd.read_csv('sales.csv', parse_dates=['date'], index_col='date')
# min_count=1 makes days with no rows come out as NaN instead of 0, so the
# forward-fill actually has something to fill. With the default min_count=0
# every empty day would silently become zero revenue, and zeros are invalid
# input for the multiplicative decomposition below.
series = df['revenue'].resample('D').sum(min_count=1).ffill()

# Decompose into trend, seasonality, and residual
# Use 'multiplicative' when seasonality scales with trend (common for revenue)
# Use 'additive' when seasonal variation is constant regardless of trend level
decomposition = seasonal_decompose(series, model='multiplicative', period=7)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
print('Trend range:', f'${trend.min():,.0f} - ${trend.max():,.0f}')
print('Seasonal factor range:', f'{seasonal.min():.3f} - {seasonal.max():.3f}')
print('Residual std (unexplained variance):', f'{residual.std():.4f}')

# STL decomposition — more robust, handles outliers better
stl = STL(series, period=7, robust=True)
stl_result = stl.fit()
print('\nSTL decomposition complete')
# Strength of seasonality: 1 - Var(resid) / Var(seasonal + resid), clamped at 0
# so a series whose residual dominates reports "no seasonality" rather than a
# negative strength.
seasonality_strength = max(
    0.0,
    1 - stl_result.resid.var() / (stl_result.seasonal + stl_result.resid).var(),
)
print(f'Seasonality strength: {seasonality_strength:.4f}')

# Stationarity testing — required before ARIMA
# Null hypothesis: series has a unit root (non-stationary)
adf_result = adfuller(series.dropna())
print(f'\nADF Test p-value: {adf_result[1]:.6f}')
print('Series is', 'stationary' if adf_result[1] < 0.05 else 'NON-STATIONARY (needs differencing)')

# Make stationary through differencing; the first difference is NaN and is dropped.
series_diff = series.diff().dropna()
adf_diff = adfuller(series_diff)
print(f'After 1st differencing, ADF p-value: {adf_diff[1]:.6f}')
📉 Classical Forecasting: ARIMA and ETS
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_percentage_error
import warnings
# Suppress all warnings for the remainder of the script.
warnings.filterwarnings('ignore')

# Chronological hold-out: the first 80% of observations train the models and
# the final 20% are kept for evaluation. Never shuffle a time series.
train_size = int(len(series) * 0.8)
train = series.iloc[:train_size]
test = series.iloc[train_size:]
horizon = len(test)  # number of steps each model must forecast

# ARIMA(p, d, q) — AutoRegressive Integrated Moving Average
#   p: autoregressive order (lags of the series)
#   d: differencing order (1 = first difference for stationarity)
#   q: moving average order (lags of forecast errors)
# Here: weekly AR (p=7), one difference, MA order 2.
arima = ARIMA(train, order=(7, 1, 2))
arima_fit = arima.fit()
forecast_arima = arima_fit.forecast(steps=horizon)
mape_arima = mean_absolute_percentage_error(test, forecast_arima)
print(f'ARIMA MAPE: {mape_arima:.4%}')

# ETS — Exponential Smoothing with Trend and Seasonality (Holt-Winters):
# additive trend, multiplicative seasonality with a 7-observation period.
ets_config = {
    'trend': 'add',
    'seasonal': 'mul',
    'seasonal_periods': 7,
    'initialization_method': 'estimated',
}
ets = ExponentialSmoothing(train, **ets_config)
ets_fit = ets.fit(optimized=True)
forecast_ets = ets_fit.forecast(horizon)
mape_ets = mean_absolute_percentage_error(test, forecast_ets)
print(f'ETS (Holt-Winters) MAPE: {mape_ets:.4%}')
print(f'\nBetter model: {"ARIMA" if mape_arima < mape_ets else "ETS"}')
🤖 ML-Based Forecasting with Features
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error
def create_lag_features(series: pd.Series, lags: "list[int] | None" = None) -> pd.DataFrame:
    """Turn a time-indexed series into a supervised-learning feature table.

    Args:
        series: Target values. The index must expose ``dayofweek``, ``month``,
            ``day`` and ``isocalendar()`` (e.g. a ``DatetimeIndex``).
        lags: Shift distances, in rows, used for the ``lag_<n>`` columns.
            Defaults to ``[1, 2, 3, 7, 14, 28]`` when omitted.

    Returns:
        A DataFrame with the target in column ``y``, one ``lag_<n>`` column per
        requested lag, rolling statistics, and calendar features. Rows that
        contain any NaN (the warm-up period of the lags and rolling windows)
        are dropped.
    """
    # Build the default inside the call instead of using a mutable default
    # argument, which would be shared across every invocation.
    if lags is None:
        lags = [1, 2, 3, 7, 14, 28]

    df = pd.DataFrame({'y': series})

    for lag in lags:
        df[f'lag_{lag}'] = series.shift(lag)

    # Shift by one row before rolling so each window ends at the previous
    # observation and never includes the value being predicted.
    past = series.shift(1)
    df['rolling_mean_7'] = past.rolling(7).mean()
    df['rolling_std_7'] = past.rolling(7).std()
    df['rolling_mean_28'] = past.rolling(28).mean()

    # Calendar features derived from the index.
    df['day_of_week'] = series.index.dayofweek
    df['month'] = series.index.month
    df['day_of_month'] = series.index.day
    df['is_weekend'] = (series.index.dayofweek >= 5).astype(int)
    df['week_of_year'] = series.index.isocalendar().week.astype(int)

    # Cyclical encoding: places day 6 next to day 0 on the unit circle.
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    return df.dropna()
# Build the feature table, then separate the target column from the predictors.
df_features = create_lag_features(series)
y = df_features['y']
X = df_features.drop(columns='y')

# Chronological 80/20 split: earlier rows train, later rows evaluate.
split_idx = int(len(X) * 0.8)
X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

# Fit the LightGBM regressor on the training window and score the hold-out.
lgb_forecast = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
)
lgb_forecast.fit(X_train, y_train)
y_pred = lgb_forecast.predict(X_test)
mape_lgb = mean_absolute_percentage_error(y_test, y_pred)
print(f'LightGBM MAPE: {mape_lgb:.4%}')

# Rank the predictors by the model's reported importance, highest first.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': lgb_forecast.feature_importances_,
})
importance_df = importance_df.sort_values('importance', ascending=False)
print('\nTop forecast drivers:')
print(importance_df.head(8).to_string(index=False))
Data Science: Model Training Arena
Epochs
0
Mean Squared Error (Loss)
---
Independent Variable (X)
Target Variable (Y)
train_model.py
X_train, y_train loaded
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
TRAINING OUTPUT
[12:54:57][SYSTEM] Environment ready. Waiting for model training initialization...
Knowledge Check
Ready to test your understanding of 11. Time Series Analysis?