90 MINS intermediate
7. Statistics for Decision Making
Module 07: Statistics
Probability, Hypothesis Testing, and A/B Experimentation
Statistics is the science of making decisions under uncertainty. Every business decision involves uncertainty — will the new checkout flow increase conversions? Does the marketing email subject line affect open rates? Is the revenue difference between two segments real or due to random variation? This module covers the statistical reasoning that separates decisions based on evidence from decisions based on gut feeling.
🎲 Probability Distributions in Practice
import numpy as np
from scipy import stats
import pandas as pd
# The distributions you encounter most in data science.
# Each section freezes a scipy distribution and queries it; note that for the
# discrete distributions cdf(k) is P(X <= k) and sf(k) is P(X > k).

# Normal Distribution — the backbone of statistical inference
# Real-world: exam scores, measurement errors, aggregated metrics
mu, sigma = 100, 15  # IQ scores
normal_dist = stats.norm(mu, sigma)
print(f'P(IQ > 130): {normal_dist.sf(130):.4f}')  # 2.28% — 2 std above mean
print(f'90th percentile: {normal_dist.ppf(0.90):.1f}')  # 119.2

# Binomial Distribution — conversion rates, defect rates
# Real-world: click-through rates, A/B test outcomes
n_visitors = 10_000
p_conversion = 0.05
binom = stats.binom(n=n_visitors, p=p_conversion)
print(f'Expected conversions: {binom.mean():.0f}')
# Strictly fewer than 400 means X <= 399; cdf(400) would also count X == 400.
p_under_400 = binom.cdf(399)
print(f'P(< 400 conversions): {p_under_400:.4f}')
# Central 95% range of the conversion count under this binomial model.
print(f'95% CI: [{binom.ppf(0.025):.0f}, {binom.ppf(0.975):.0f}]')

# Poisson Distribution — count of events in fixed time
# Real-world: customer support tickets per hour, errors per day
mu_calls = 12  # average calls per hour
poisson = stats.poisson(mu_calls)
print(f'P(> 20 calls this hour): {poisson.sf(20):.4f}')  # sf(20) = P(X > 20)
print(f'Need capacity for 99% of scenarios: {poisson.ppf(0.99):.0f} calls/hour')
# Central Limit Theorem — the foundation of A/B testing
# Means of repeated samples look approximately normal even when the raw
# values themselves are heavily skewed.
df = pd.read_csv('transactions.csv')
amounts = df['amount']
raw_skew = amounts.skew()
print(f'Transaction distribution skewness: {raw_skew:.2f}')  # likely very skewed

# Draw 10,000 samples of 50 transactions each and keep every sample's mean.
sample_means = []
for _ in range(10000):
    sample_means.append(amounts.sample(50).mean())
means_skew = pd.Series(sample_means).skew()
print(f'Sampling distribution skewness: {means_skew:.2f}')  # near zero

# 🧪 Hypothesis Testing: The Complete Framework
from scipy import stats
import numpy as np
from typing import Dict, Tuple
def hypothesis_test(
    group_a: np.ndarray,
    group_b: np.ndarray,
    alpha: float = 0.05,
    alternative: str = 'two-sided'
) -> Dict:
    '''
    Compare two independent samples and summarize the evidence.

    H0: the two group means are equal.
    H1: the means differ ('two-sided'), or group_a is smaller / larger than
        group_b ('less' / 'greater', using scipy's convention for the
        `alternative` argument).

    Welch's t-test is used when both groups have at least 30 observations
    (the sample means are then close to normal by the CLT); otherwise the
    non-parametric Mann-Whitney U test is used.

    Args:
        group_a: observations for the first (control) group.
        group_b: observations for the second (variant) group.
        alpha: significance level used for the verdict and the power estimate.
        alternative: 'two-sided', 'less' or 'greater'.

    Returns:
        Dict with the keys 'test', 'n_a', 'n_b', 'mean_a', 'mean_b',
        'relative_diff', 'statistic', 'p_value', 'significant', 'cohens_d',
        'effect_size', 'power' and 'underpowered'.

    Raises:
        ValueError: if `alternative` is not recognised or either group has
            fewer than 2 observations.
    '''
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError(
            "alternative must be 'two-sided', 'less' or 'greater', "
            f'got {alternative!r}'
        )

    # Accept any sequence, not only ndarrays.
    group_a = np.asarray(group_a)
    group_b = np.asarray(group_b)
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        # A sample standard deviation (and the size ratio below) is undefined
        # for smaller groups.
        raise ValueError('hypothesis_test needs at least 2 observations per group')
    mean_a, mean_b = np.mean(group_a), np.mean(group_b)

    # Choose the test from the sample sizes. A Shapiro-Wilk pre-check that was
    # only run for n >= 50 could never change this choice, so it is omitted.
    if n_a >= 30 and n_b >= 30:  # CLT kicks in
        # Welch's t-test — does not assume equal variances
        statistic, p_value = stats.ttest_ind(
            group_a, group_b, equal_var=False, alternative=alternative
        )
        test_name = 'Welch t-test'
    else:
        # Mann-Whitney U — non-parametric, for small samples
        statistic, p_value = stats.mannwhitneyu(
            group_a, group_b, alternative=alternative
        )
        test_name = 'Mann-Whitney U'

    # Effect size: Cohen's d with sample variances (ddof=1) pooled by their
    # degrees of freedom, so unequal group sizes are weighted correctly.
    var_a = np.var(group_a, ddof=1)
    var_b = np.var(group_b, ddof=1)
    pooled_std = np.sqrt(
        ((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2)
    )
    cohens_d = (mean_b - mean_a) / pooled_std
    abs_d = abs(cohens_d)

    # Cohen's benchmarks: 0.2 small, 0.5 medium, 0.8 large.
    if abs_d < 0.5:
        effect_label = 'small'
    elif abs_d < 0.8:
        effect_label = 'medium'
    else:
        effect_label = 'large'

    # Power analysis — did we have enough data?
    # This is the power of a two-sample t-test at the observed effect size, so
    # it is only an approximation when Mann-Whitney U was the test used.
    from statsmodels.stats.power import TTestIndPower
    # statsmodels names the one-sided alternatives 'larger' / 'smaller'; the
    # magnitude |d| is passed, so any one-sided test maps to 'larger'.
    power_alternative = 'two-sided' if alternative == 'two-sided' else 'larger'
    power = TTestIndPower().solve_power(
        effect_size=abs_d, nobs1=n_a, alpha=alpha, ratio=n_b / n_a,
        alternative=power_alternative
    )

    return {
        'test': test_name,
        'n_a': n_a, 'n_b': n_b,
        'mean_a': mean_a, 'mean_b': mean_b,
        # inf/nan (numpy division semantics) when mean_a is exactly 0
        'relative_diff': (mean_b - mean_a) / mean_a,
        'statistic': statistic, 'p_value': p_value,
        'significant': p_value < alpha,
        'cohens_d': cohens_d,
        'effect_size': effect_label,
        'power': power,
        'underpowered': power < 0.8
    }
# Simulated A/B test: two normal samples of 500 whose true means differ by 3.
np.random.seed(42)  # fixed seed so the printed report is reproducible
control = np.random.normal(50, 15, 500)  # current version
variant = np.random.normal(53, 15, 500)  # new version
result = hypothesis_test(control, variant)

# Print one "key: value" line per entry; floats are shown to 4 decimals.
for key, value in result.items():
    shown = f'{value:.4f}' if isinstance(value, float) else f'{value}'
    print(f'{key}: {shown}')

# The Multiple Testing Problem
from statsmodels.stats.multitest import multipletests
import numpy as np
# Running 20 tests at alpha=0.05 means ~1 false positive expected by chance alone
np.random.seed(42)
n_tests = 20
p_values = np.random.uniform(0, 1, n_tests)  # simulated p-values from null data
print('Uncorrected significant results:', (p_values < 0.05).sum())

# Bonferroni correction — most conservative
reject_bonf, p_bonf, _, _ = multipletests(p_values, method='bonferroni')
print(f'After Bonferroni correction: {reject_bonf.sum()} significant')

# Benjamini-Hochberg FDR — recommended for most use cases
reject_bh, p_bh, _, _ = multipletests(p_values, method='fdr_bh')
print(f'After BH-FDR correction: {reject_bh.sum()} significant')

# Required sample size calculation
from statsmodels.stats.power import TTestIndPower
analysis = TTestIndPower()
required_n = analysis.solve_power(
    effect_size=0.2,  # Cohen's d: a difference of 0.2 standard deviations
    alpha=0.05,       # 5% false positive rate
    power=0.8         # 80% chance of detecting real effect
)
# Round up: a fractional observation is impossible, and rounding down would
# leave the design slightly short of the requested power.
print(f'Required sample size per group: {int(np.ceil(required_n))}')

# Data Science: A/B Test Simulator
Variant A (Control)
12.00%
1000 visitors
Variant B (Treatment)
12.00%
1000 visitors
Run experiment to visualize distributions
Conversion Rate (%) →
Statistical Verdict
p = ---
Waiting for data...
ab_test.py
Python 3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
CONSOLE OUTPUT
Awaiting experiment execution...
Knowledge Check
Ready to test your understanding of 7. Statistics for Decision Making?