Data Science | VoidX Academy

7. Statistics for Decision Making

Module 07: Statistics

Probability, Hypothesis Testing, and A/B Experimentation

Statistics is the science of making decisions under uncertainty. Every business decision involves uncertainty — will the new checkout flow increase conversions? Does the marketing email subject line affect open rates? Is the revenue difference between two segments real or due to random variation? This module covers the statistical reasoning that separates decisions based on evidence from decisions based on gut feeling.

🎲 Probability Distributions in Practice

import numpy as np
from scipy import stats
import pandas as pd

# The distributions you encounter most in data science

# Normal distribution: the workhorse of statistical inference.
# Typical uses: exam scores, measurement error, aggregated metrics.
mu = 100    # mean of the IQ-score example
sigma = 15  # standard deviation of the IQ-score example
normal_dist = stats.norm(loc=mu, scale=sigma)

# sf is the survival function (1 - cdf); 130 lies two standard deviations above the mean.
tail_above_130 = normal_dist.sf(130)
print(f'P(IQ > 130): {tail_above_130:.4f}')  # roughly 0.0228

# ppf is the inverse of the cdf, i.e. the quantile function.
percentile_90 = normal_dist.ppf(0.90)
print(f'90th percentile: {percentile_90:.1f}')  # roughly 119.2

# Binomial distribution: number of successes in n independent yes/no trials.
# Typical uses: conversion counts, click-throughs, defect counts, A/B test outcomes.
n_visitors = 10_000
p_conversion = 0.05
binom = stats.binom(n=n_visitors, p=p_conversion)
print(f'Expected conversions: {binom.mean():.0f}')

# cdf(k) is P(X <= k), so the strict "fewer than 400" probability is cdf(399).
p_below_400 = binom.cdf(399)
print(f'P(< 400 conversions): {p_below_400:.4f}')

# Central 95% range of the conversion count under this model (2.5th to 97.5th
# percentile). It describes the distribution itself; it is not a confidence
# interval estimated from observed data.
print(f'95% CI: [{binom.ppf(0.025):.0f}, {binom.ppf(0.975):.0f}]')

# Poisson distribution: how many events land in a fixed window of time.
# Typical uses: support tickets per hour, errors per day.
mu_calls = 12  # mean number of calls per hour
poisson = stats.poisson(mu=mu_calls)

# sf(k) is P(X > k): the chance of strictly more than 20 calls in one hour.
p_over_20 = poisson.sf(20)
print(f'P(> 20 calls this hour): {p_over_20:.4f}')

# Smallest hourly call count whose cumulative probability reaches 99%.
capacity_99 = poisson.ppf(0.99)
print(f'Need capacity for 99% of scenarios: {capacity_99:.0f} calls/hour')

# Central Limit Theorem: the reason A/B-test statistics work.
# Means of repeated samples tend toward a normal shape whatever the raw data looks like.
df = pd.read_csv('transactions.csv')
raw_skewness = df.amount.skew()
print(f'Transaction distribution skewness: {raw_skewness:.2f}')  # raw amounts are expected to be strongly skewed

# Draw 10,000 samples of 50 rows each (comfortably above the n=30 rule of thumb)
# and keep the mean of every sample.
sample_means = []
for _ in range(10000):
    sample_means.append(df.amount.sample(50).mean())

means_skewness = pd.Series(sample_means).skew()
print(f'Sampling distribution skewness: {means_skewness:.2f}')  # expected to be close to zero

🧪 Hypothesis Testing: The Complete Framework

from scipy import stats
import numpy as np
from typing import Dict, Tuple

def _looks_normal(sample: np.ndarray) -> bool:
    '''Return True when Shapiro-Wilk does not reject normality at the 5% level.'''
    if len(sample) < 3:  # Shapiro-Wilk needs at least three observations
        return False
    _, p_norm = stats.shapiro(sample)
    return bool(p_norm > 0.05)


def _cohens_d(group_a: np.ndarray, group_b: np.ndarray) -> float:
    '''Return Cohen's d for (mean_b - mean_a) using the pooled sample standard deviation.'''
    n_a, n_b = len(group_a), len(group_b)
    # ddof=1 gives the sample variance; weighting by degrees of freedom keeps
    # the pooled estimate correct when the two groups differ in size.
    var_a = np.var(group_a, ddof=1)
    var_b = np.var(group_b, ddof=1)
    pooled_std = np.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2))
    return float((np.mean(group_b) - np.mean(group_a)) / pooled_std)


def _effect_size_label(cohens_d: float) -> str:
    '''Map |d| onto Cohen's conventional bands (0.2 small, 0.5 medium, 0.8 large).'''
    magnitude = abs(cohens_d)
    if magnitude < 0.2:
        return 'negligible'
    if magnitude < 0.5:
        return 'small'
    if magnitude < 0.8:
        return 'medium'
    return 'large'


def _t_test_power(cohens_d: float, n_a: int, n_b: int,
                  alpha: float, alternative: str) -> float:
    '''Power of a two-sample t-test at the given effect size, via the noncentral t.'''
    dof = n_a + n_b - 2
    # The t statistic compares a to b, while d is defined as (b - a) / s,
    # so the noncentrality parameter carries the opposite sign to d.
    noncentrality = -cohens_d * np.sqrt(n_a * n_b / (n_a + n_b))
    if alternative == 'greater':
        return float(stats.nct.sf(stats.t.isf(alpha, dof), dof, noncentrality))
    if alternative == 'less':
        return float(stats.nct.cdf(stats.t.ppf(alpha, dof), dof, noncentrality))
    critical = stats.t.isf(alpha / 2, dof)
    upper_tail = stats.nct.sf(critical, dof, noncentrality)
    lower_tail = stats.nct.cdf(-critical, dof, noncentrality)
    return float(upper_tail + lower_tail)


def hypothesis_test(
    group_a: np.ndarray,
    group_b: np.ndarray,
    alpha: float = 0.05,
    alternative: str = 'two-sided'
) -> Dict:
    '''
    Compare the means of two independent samples.

    H0: the group means are equal.
    H1: the means differ ('two-sided'), or mean(group_a) is greater / less
        than mean(group_b) ('greater' / 'less', following scipy's convention).

    Test selection:
      * both groups have n >= 30 -> Welch t-test (the CLT makes the sample
        means approximately normal);
      * smaller groups that both pass a Shapiro-Wilk check -> Welch t-test;
      * otherwise -> Mann-Whitney U.

    Parameters:
        group_a: observations for the baseline group (any 1-D sequence of numbers).
        group_b: observations for the comparison group.
        alpha: significance level used for the decision and the power figure.
        alternative: 'two-sided', 'greater' or 'less'.

    Returns:
        A dict with the keys 'test', 'n_a', 'n_b', 'mean_a', 'mean_b',
        'relative_diff', 'statistic', 'p_value', 'significant', 'cohens_d',
        'effect_size', 'power' and 'underpowered'.

    Raises:
        ValueError: if either group has fewer than 2 observations, or if
            scipy rejects the value of `alternative`.

    Notes:
        'power' is the power of a two-sample t-test evaluated at the observed
        effect size, in the direction given by `alternative`. It is a t-test
        approximation even when Mann-Whitney U is the selected test.
        'relative_diff' is NaN when mean_a is zero.
    '''
    group_a = np.asarray(group_a, dtype=float)
    group_b = np.asarray(group_b, dtype=float)
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        # A sample variance (and therefore Cohen's d) needs two or more points.
        raise ValueError('each group needs at least 2 observations')
    mean_a, mean_b = float(np.mean(group_a)), float(np.mean(group_b))

    # The normality check only matters for small samples, so it is evaluated
    # lazily: with n >= 30 in both groups the CLT argument already applies.
    large_samples = n_a >= 30 and n_b >= 30
    if large_samples or (_looks_normal(group_a) and _looks_normal(group_b)):
        # Welch's t-test does not assume equal variances.
        statistic, p_value = stats.ttest_ind(group_a, group_b,
                                             equal_var=False, alternative=alternative)
        test_name = 'Welch t-test'
    else:
        # Mann-Whitney U is non-parametric: for small, non-normal samples.
        statistic, p_value = stats.mannwhitneyu(group_a, group_b, alternative=alternative)
        test_name = 'Mann-Whitney U'

    cohens_d = _cohens_d(group_a, group_b)
    power = _t_test_power(cohens_d, n_a, n_b, alpha, alternative)

    return {
        'test': test_name,
        'n_a': n_a, 'n_b': n_b,
        'mean_a': mean_a, 'mean_b': mean_b,
        'relative_diff': (mean_b - mean_a) / mean_a if mean_a != 0 else float('nan'),
        'statistic': float(statistic), 'p_value': float(p_value),
        'significant': bool(p_value < alpha),
        'cohens_d': cohens_d,
        'effect_size': _effect_size_label(cohens_d),
        'power': power,
        'underpowered': power < 0.8
    }

# Simulated A/B test: 500 observations per arm, same spread, variant mean shifted by +3.
np.random.seed(42)  # fixed seed so the printed numbers are reproducible
control = np.random.normal(50, 15, 500)   # current version
variant = np.random.normal(53, 15, 500)   # new version

result = hypothesis_test(control, variant)

# Print every field of the report; floats are shown with four decimals.
for key, value in result.items():
    shown = f'{value:.4f}' if isinstance(value, float) else value
    print(f'{key}: {shown}')

📉 The Multiple Testing Problem

from statsmodels.stats.multitest import multipletests
import numpy as np

# With 20 tests at alpha=0.05, about one false positive (20 * 0.05) is expected by chance.
np.random.seed(42)
n_tests = 20
# Uniform draws stand in for p-values produced when every null hypothesis is true.
p_values = np.random.uniform(low=0, high=1, size=n_tests)

uncorrected_hits = (p_values < 0.05).sum()
print('Uncorrected significant results:', uncorrected_hits)

# Bonferroni: the most conservative of the two corrections shown here.
reject_bonf, p_bonf, *_ = multipletests(p_values, method='bonferroni')
bonferroni_hits = reject_bonf.sum()
print(f'After Bonferroni correction: {bonferroni_hits} significant')

# Benjamini-Hochberg controls the false discovery rate; recommended for most use cases.
reject_bh, p_bh, *_ = multipletests(p_values, method='fdr_bh')
bh_hits = reject_bh.sum()
print(f'After BH-FDR correction: {bh_hits} significant')

# Required sample size calculation
import math
from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()
required_n = analysis.solve_power(
    effect_size=0.2,  # Cohen's d: a difference of 0.2 standard deviations (not a 20% lift)
    alpha=0.05,       # 5% false positive rate
    power=0.8         # 80% chance of detecting a real effect of that size
)
# Round up: a fractional participant cannot be recruited, and rounding down
# would leave the experiment slightly short of the requested power.
print(f'Required sample size per group: {math.ceil(required_n)}')

Data Science: A/B Test Simulator

Variant A (Control)

12.00%

1000 visitors

Variant B (Treatment)

12.00%

1000 visitors

Run experiment to visualize distributions

Conversion Rate (%) →

Statistical Verdict: p = ---

Waiting for data...

ab_test.py

Python 3

CONSOLE OUTPUT

Awaiting experiment execution...

7. Statistics for Decision Making

Module 07: Statistics

Probability, Hypothesis Testing, and A/B Experimentation

🎲 Probability Distributions in Practice

import numpy as np
from scipy import stats
import pandas as pd

# The distributions you encounter most in data science

# Normal distribution: the workhorse of statistical inference.
# Typical uses: exam scores, measurement error, aggregated metrics.
mu = 100    # mean of the IQ-score example
sigma = 15  # standard deviation of the IQ-score example
normal_dist = stats.norm(loc=mu, scale=sigma)

# sf is the survival function (1 - cdf); 130 lies two standard deviations above the mean.
tail_above_130 = normal_dist.sf(130)
print(f'P(IQ > 130): {tail_above_130:.4f}')  # roughly 0.0228

# ppf is the inverse of the cdf, i.e. the quantile function.
percentile_90 = normal_dist.ppf(0.90)
print(f'90th percentile: {percentile_90:.1f}')  # roughly 119.2

# Binomial distribution: number of successes in n independent yes/no trials.
# Typical uses: conversion counts, click-throughs, defect counts, A/B test outcomes.
n_visitors = 10_000
p_conversion = 0.05
binom = stats.binom(n=n_visitors, p=p_conversion)
print(f'Expected conversions: {binom.mean():.0f}')

# cdf(k) is P(X <= k), so the strict "fewer than 400" probability is cdf(399).
p_below_400 = binom.cdf(399)
print(f'P(< 400 conversions): {p_below_400:.4f}')

# Central 95% range of the conversion count under this model (2.5th to 97.5th
# percentile). It describes the distribution itself; it is not a confidence
# interval estimated from observed data.
print(f'95% CI: [{binom.ppf(0.025):.0f}, {binom.ppf(0.975):.0f}]')

# Poisson distribution: how many events land in a fixed window of time.
# Typical uses: support tickets per hour, errors per day.
mu_calls = 12  # mean number of calls per hour
poisson = stats.poisson(mu=mu_calls)

# sf(k) is P(X > k): the chance of strictly more than 20 calls in one hour.
p_over_20 = poisson.sf(20)
print(f'P(> 20 calls this hour): {p_over_20:.4f}')

# Smallest hourly call count whose cumulative probability reaches 99%.
capacity_99 = poisson.ppf(0.99)
print(f'Need capacity for 99% of scenarios: {capacity_99:.0f} calls/hour')

# Central Limit Theorem: the reason A/B-test statistics work.
# Means of repeated samples tend toward a normal shape whatever the raw data looks like.
df = pd.read_csv('transactions.csv')
raw_skewness = df.amount.skew()
print(f'Transaction distribution skewness: {raw_skewness:.2f}')  # raw amounts are expected to be strongly skewed

# Draw 10,000 samples of 50 rows each (comfortably above the n=30 rule of thumb)
# and keep the mean of every sample.
sample_means = []
for _ in range(10000):
    sample_means.append(df.amount.sample(50).mean())

means_skewness = pd.Series(sample_means).skew()
print(f'Sampling distribution skewness: {means_skewness:.2f}')  # expected to be close to zero

🧪 Hypothesis Testing: The Complete Framework

from scipy import stats
import numpy as np
from typing import Dict, Tuple

def _looks_normal(sample: np.ndarray) -> bool:
    '''Return True when Shapiro-Wilk does not reject normality at the 5% level.'''
    if len(sample) < 3:  # Shapiro-Wilk needs at least three observations
        return False
    _, p_norm = stats.shapiro(sample)
    return bool(p_norm > 0.05)


def _cohens_d(group_a: np.ndarray, group_b: np.ndarray) -> float:
    '''Return Cohen's d for (mean_b - mean_a) using the pooled sample standard deviation.'''
    n_a, n_b = len(group_a), len(group_b)
    # ddof=1 gives the sample variance; weighting by degrees of freedom keeps
    # the pooled estimate correct when the two groups differ in size.
    var_a = np.var(group_a, ddof=1)
    var_b = np.var(group_b, ddof=1)
    pooled_std = np.sqrt(((n_a - 1) * var_a + (n_b - 1) * var_b) / (n_a + n_b - 2))
    return float((np.mean(group_b) - np.mean(group_a)) / pooled_std)


def _effect_size_label(cohens_d: float) -> str:
    '''Map |d| onto Cohen's conventional bands (0.2 small, 0.5 medium, 0.8 large).'''
    magnitude = abs(cohens_d)
    if magnitude < 0.2:
        return 'negligible'
    if magnitude < 0.5:
        return 'small'
    if magnitude < 0.8:
        return 'medium'
    return 'large'


def _t_test_power(cohens_d: float, n_a: int, n_b: int,
                  alpha: float, alternative: str) -> float:
    '''Power of a two-sample t-test at the given effect size, via the noncentral t.'''
    dof = n_a + n_b - 2
    # The t statistic compares a to b, while d is defined as (b - a) / s,
    # so the noncentrality parameter carries the opposite sign to d.
    noncentrality = -cohens_d * np.sqrt(n_a * n_b / (n_a + n_b))
    if alternative == 'greater':
        return float(stats.nct.sf(stats.t.isf(alpha, dof), dof, noncentrality))
    if alternative == 'less':
        return float(stats.nct.cdf(stats.t.ppf(alpha, dof), dof, noncentrality))
    critical = stats.t.isf(alpha / 2, dof)
    upper_tail = stats.nct.sf(critical, dof, noncentrality)
    lower_tail = stats.nct.cdf(-critical, dof, noncentrality)
    return float(upper_tail + lower_tail)


def hypothesis_test(
    group_a: np.ndarray,
    group_b: np.ndarray,
    alpha: float = 0.05,
    alternative: str = 'two-sided'
) -> Dict:
    '''
    Compare the means of two independent samples.

    H0: the group means are equal.
    H1: the means differ ('two-sided'), or mean(group_a) is greater / less
        than mean(group_b) ('greater' / 'less', following scipy's convention).

    Test selection:
      * both groups have n >= 30 -> Welch t-test (the CLT makes the sample
        means approximately normal);
      * smaller groups that both pass a Shapiro-Wilk check -> Welch t-test;
      * otherwise -> Mann-Whitney U.

    Parameters:
        group_a: observations for the baseline group (any 1-D sequence of numbers).
        group_b: observations for the comparison group.
        alpha: significance level used for the decision and the power figure.
        alternative: 'two-sided', 'greater' or 'less'.

    Returns:
        A dict with the keys 'test', 'n_a', 'n_b', 'mean_a', 'mean_b',
        'relative_diff', 'statistic', 'p_value', 'significant', 'cohens_d',
        'effect_size', 'power' and 'underpowered'.

    Raises:
        ValueError: if either group has fewer than 2 observations, or if
            scipy rejects the value of `alternative`.

    Notes:
        'power' is the power of a two-sample t-test evaluated at the observed
        effect size, in the direction given by `alternative`. It is a t-test
        approximation even when Mann-Whitney U is the selected test.
        'relative_diff' is NaN when mean_a is zero.
    '''
    group_a = np.asarray(group_a, dtype=float)
    group_b = np.asarray(group_b, dtype=float)
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        # A sample variance (and therefore Cohen's d) needs two or more points.
        raise ValueError('each group needs at least 2 observations')
    mean_a, mean_b = float(np.mean(group_a)), float(np.mean(group_b))

    # The normality check only matters for small samples, so it is evaluated
    # lazily: with n >= 30 in both groups the CLT argument already applies.
    large_samples = n_a >= 30 and n_b >= 30
    if large_samples or (_looks_normal(group_a) and _looks_normal(group_b)):
        # Welch's t-test does not assume equal variances.
        statistic, p_value = stats.ttest_ind(group_a, group_b,
                                             equal_var=False, alternative=alternative)
        test_name = 'Welch t-test'
    else:
        # Mann-Whitney U is non-parametric: for small, non-normal samples.
        statistic, p_value = stats.mannwhitneyu(group_a, group_b, alternative=alternative)
        test_name = 'Mann-Whitney U'

    cohens_d = _cohens_d(group_a, group_b)
    power = _t_test_power(cohens_d, n_a, n_b, alpha, alternative)

    return {
        'test': test_name,
        'n_a': n_a, 'n_b': n_b,
        'mean_a': mean_a, 'mean_b': mean_b,
        'relative_diff': (mean_b - mean_a) / mean_a if mean_a != 0 else float('nan'),
        'statistic': float(statistic), 'p_value': float(p_value),
        'significant': bool(p_value < alpha),
        'cohens_d': cohens_d,
        'effect_size': _effect_size_label(cohens_d),
        'power': power,
        'underpowered': power < 0.8
    }

# Simulated A/B test: 500 observations per arm, same spread, variant mean shifted by +3.
np.random.seed(42)  # fixed seed so the printed numbers are reproducible
control = np.random.normal(50, 15, 500)   # current version
variant = np.random.normal(53, 15, 500)   # new version

result = hypothesis_test(control, variant)

# Print every field of the report; floats are shown with four decimals.
for key, value in result.items():
    shown = f'{value:.4f}' if isinstance(value, float) else value
    print(f'{key}: {shown}')

📉 The Multiple Testing Problem

from statsmodels.stats.multitest import multipletests
import numpy as np

# With 20 tests at alpha=0.05, about one false positive (20 * 0.05) is expected by chance.
np.random.seed(42)
n_tests = 20
# Uniform draws stand in for p-values produced when every null hypothesis is true.
p_values = np.random.uniform(low=0, high=1, size=n_tests)

uncorrected_hits = (p_values < 0.05).sum()
print('Uncorrected significant results:', uncorrected_hits)

# Bonferroni: the most conservative of the two corrections shown here.
reject_bonf, p_bonf, *_ = multipletests(p_values, method='bonferroni')
bonferroni_hits = reject_bonf.sum()
print(f'After Bonferroni correction: {bonferroni_hits} significant')

# Benjamini-Hochberg controls the false discovery rate; recommended for most use cases.
reject_bh, p_bh, *_ = multipletests(p_values, method='fdr_bh')
bh_hits = reject_bh.sum()
print(f'After BH-FDR correction: {bh_hits} significant')

# Required sample size calculation
import math
from statsmodels.stats.power import TTestIndPower

analysis = TTestIndPower()
required_n = analysis.solve_power(
    effect_size=0.2,  # Cohen's d: a difference of 0.2 standard deviations (not a 20% lift)
    alpha=0.05,       # 5% false positive rate
    power=0.8         # 80% chance of detecting a real effect of that size
)
# Round up: a fractional participant cannot be recruited, and rounding down
# would leave the experiment slightly short of the requested power.
print(f'Required sample size per group: {math.ceil(required_n)}')

Data Science: A/B Test Simulator

Variant A (Control)

12.00%

1000 visitors

Variant B (Treatment)

12.00%

1000 visitors

Run experiment to visualize distributions

Conversion Rate (%) →

Statistical Verdict: p = ---

Waiting for data...

ab_test.py

Python 3

CONSOLE OUTPUT

Awaiting experiment execution...

7. Statistics for Decision Making

Probability, Hypothesis Testing, and A/B Experimentation

🎲 Probability Distributions in Practice

🧪 Hypothesis Testing: The Complete Framework

📉 The Multiple Testing Problem

Variant A (Control)

Variant B (Treatment)

Knowledge Check

7. Statistics for Decision Making

Probability, Hypothesis Testing, and A/B Experimentation

🎲 Probability Distributions in Practice

🧪 Hypothesis Testing: The Complete Framework

📉 The Multiple Testing Problem

Variant A (Control)

Variant B (Treatment)

Knowledge Check