85 MINS intermediate
5. Exploratory Data Analysis
Module 05: EDA
Statistical Summaries, Distributions, and Correlation
Exploratory Data Analysis is the most undervalued step in data science. Models built without EDA are black boxes — you have no intuition about what's driving predictions, what data quality issues lurk, or whether the model's outputs make any sense. Professional data scientists spend significant time in EDA because it prevents expensive downstream mistakes and often reveals the answer before any model is needed.
📊 The EDA Diagnostic Framework
import pandas as pd
import numpy as np
from scipy import stats
# Load the dataset used by the analyses below; parse_dates makes pandas
# parse the 'date' column into datetimes while reading the CSV.
df = pd.read_csv('sales_dataset.csv', parse_dates=['date'])
# ===== UNIVARIATE ANALYSIS =====
# For each numeric column, understand its distribution completely
def describe_numeric(series: pd.Series) -> dict:
    """Build a distribution profile for a single numeric column.

    Args:
        series: The numeric column to summarise.

    Returns:
        A dict with the keys ``count``, ``mean``, ``median``, ``std``,
        ``skewness``, ``kurtosis``, ``iqr``, ``cv``, ``pct_zero`` and
        ``pct_negative`` (in that order).
    """
    summary = series.describe()
    avg = summary['mean']
    spread = summary['std']

    # The coefficient of variation divides by the mean, so it is reported as
    # NaN when the mean is exactly zero.
    if avg != 0:
        rel_spread = spread / avg
    else:
        rel_spread = np.nan

    profile = {
        'count': summary['count'],
        'mean': avg,
        'median': series.median(),
        'std': spread,
        'skewness': series.skew(),
        'kurtosis': series.kurtosis(),
    }
    # Interquartile range from the quartiles describe() already computed.
    profile['iqr'] = summary['75%'] - summary['25%']
    profile['cv'] = rel_spread
    # The mean of a boolean mask is the fraction of entries that match.
    profile['pct_zero'] = series.eq(0).mean()
    profile['pct_negative'] = series.lt(0).mean()
    return profile
# Print one summary section per numeric column of df.
numeric_columns = df.select_dtypes(include='number').columns
for col in numeric_columns:
    stats_dict = describe_numeric(df[col])
    # Assemble the header and one line per statistic, then emit them together.
    report_lines = [f'\n=== {col} ===']
    report_lines.extend(f' {k}: {v:.4f}' for k, v in stats_dict.items())
    print('\n'.join(report_lines))
# ===== BIVARIATE ANALYSIS =====
# Correlation matrix with statistical significance
def correlation_with_significance(df: pd.DataFrame) -> tuple:
    """Compute Pearson correlations and their p-values for all numeric columns.

    Each pair of columns is tested on the rows where BOTH values are present,
    so the two inputs to ``pearsonr`` always have equal length and stay
    row-aligned.

    Args:
        df: Input frame; non-numeric columns are ignored.

    Returns:
        ``(corr_matrix, p_matrix)``: two square DataFrames indexed by the
        numeric column names. The diagonal of ``p_matrix`` is left at 1.0.
        A pair with fewer than two complete observations gets a NaN p-value.
    """
    numeric_df = df.select_dtypes(include='number')
    corr_matrix = numeric_df.corr()
    columns = list(corr_matrix.columns)
    p_matrix = pd.DataFrame(
        np.ones((len(columns), len(columns))),
        index=corr_matrix.index,
        columns=corr_matrix.columns,
    )
    for position, col1 in enumerate(columns):
        # The test is symmetric, so each unordered pair is computed once and
        # written to both cells.
        for col2 in columns[position + 1:]:
            # Drop rows jointly: dropping NaNs per column would misalign the
            # observations or produce arrays of different lengths.
            pair = numeric_df[[col1, col2]].dropna()
            if len(pair) < 2:
                p_val = np.nan
            else:
                _, p_val = stats.pearsonr(pair[col1], pair[col2])
            p_matrix.loc[col1, col2] = p_val
            p_matrix.loc[col2, col1] = p_val
    return corr_matrix, p_matrix
# Unpack the correlation matrix and the matching p-value matrix.
corr, pvals = correlation_with_significance(df)
print('Significant correlations (|r| > 0.5 and p < 0.05):')
for col1 in corr.columns:
    for col2 in corr.index:
        # col1 < col2 keeps a single ordering of each pair and skips the diagonal.
        if col1 < col2 and abs(corr.loc[col1, col2]) > 0.5 and pvals.loc[col1, col2] < 0.05:
            print(f' {col1} ↔ {col2}: r={corr.loc[col1, col2]:.3f}, p={pvals.loc[col1, col2]:.4f}')

# 🎯 Target Variable Analysis
import pandas as pd
import numpy as np
from scipy import stats
def analyze_target(df: pd.DataFrame, target: str, features: list) -> pd.DataFrame:
    """Compute the relationship between each feature and the target variable.

    Numeric features are scored with a Pearson correlation (effect size r²);
    object, string and categorical features with a one-way ANOVA F-test
    (effect size eta-squared). Rows with a missing feature or target value are
    excluded per feature, and every statistic for that feature is computed on
    the same remaining rows.

    Features of any other dtype (for example datetimes) are skipped, because
    neither test applies to them.

    Args:
        df: Frame containing the target column and the feature columns.
        target: Name of the numeric target column.
        features: Names of the columns to evaluate.

    Returns:
        One row per evaluated feature with the columns ``feature``, ``type``,
        ``statistic``, ``p_value``, ``effect_size`` and ``significant``
        (``p_value < 0.05``), sorted by ``effect_size`` descending. When a test
        cannot be computed (fewer than two usable rows or groups) its values
        are NaN.
    """
    columns = ['feature', 'type', 'statistic', 'p_value', 'effect_size']
    results = []
    for feature in features:
        dtype = df[feature].dtype
        # Keep only rows where both the feature and the target are present.
        valid = pd.DataFrame({'x': df[feature], 'y': df[target]}).dropna()

        if pd.api.types.is_numeric_dtype(dtype):
            if len(valid) < 2:
                r, p_val = np.nan, np.nan
            else:
                r, p_val = stats.pearsonr(valid['x'].astype(float),
                                          valid['y'].astype(float))
            results.append({'feature': feature, 'type': 'numeric',
                            'statistic': r, 'p_value': p_val, 'effect_size': r ** 2})
        elif (isinstance(dtype, pd.CategoricalDtype)
              or pd.api.types.is_object_dtype(dtype)
              or pd.api.types.is_string_dtype(dtype)):
            grouped = valid.groupby('x', observed=True, sort=False)['y']
            groups = [values.to_numpy(dtype=float) for _, values in grouped]
            groups = [g for g in groups if len(g) > 0]
            if len(groups) < 2:
                # ANOVA needs at least two groups to compare.
                f_stat, p_val, eta_sq = np.nan, np.nan, np.nan
            else:
                f_stat, p_val = stats.f_oneway(*groups)
                # Eta-squared = SS_between / SS_total, both taken over exactly
                # the observations that entered the F-test.
                pooled = np.concatenate(groups)
                grand_mean = pooled.mean()
                ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups)
                ss_total = ((pooled - grand_mean) ** 2).sum()
                eta_sq = ss_between / ss_total if ss_total > 0 else np.nan
            results.append({'feature': feature, 'type': 'categorical',
                            'statistic': f_stat, 'p_value': p_val, 'effect_size': eta_sq})
        else:
            # Neither test is defined for this dtype (e.g. datetime columns).
            continue

    # Passing columns explicitly keeps the schema intact when nothing was evaluated.
    return (
        pd.DataFrame(results, columns=columns)
        .sort_values('effect_size', ascending=False)
        .assign(significant=lambda frame: frame['p_value'] < 0.05)
    )
# Evaluate every column except the target itself against 'revenue'.
feature_importance = analyze_target(df, 'revenue',
                                    features=[c for c in df.columns if c != 'revenue'])
print(feature_importance.to_string(index=False))

# ⏰ Time Series EDA
import pandas as pd
import numpy as np
def time_series_eda(df: pd.DataFrame, date_col: str, value_col: str) -> dict:
    """Summarise a dated value column: totals, seasonality, growth, autocorrelation.

    Args:
        df: Frame holding the observations; it is not modified.
        date_col: Name of the datetime column to index and resample on.
        value_col: Name of the numeric column to analyse.

    Returns:
        A dict with the keys:
          * ``total``: sum of all values.
          * ``daily_avg``: mean of the daily totals.
          * ``best_day`` / ``worst_day``: dates of the highest / lowest daily total.
          * ``mom_growth_mean`` / ``mom_growth_std``: mean and standard deviation
            of the month-over-month growth rates of the monthly totals.
          * ``autocorrelation``: ``{'lag_1', 'lag_7', 'lag_14', 'lag_30'}`` mapped
            to the autocorrelation of the daily totals at that lag in days.
          * ``dow_pattern``: mean daily total per day of week (0 = Monday).
    """
    series = df.set_index(date_col).sort_index()[value_col]
    daily = series.resample('D').sum()
    try:
        monthly = series.resample('ME').sum()
    except ValueError:
        # pandas releases before 2.2 only know the month-end alias as 'M'.
        monthly = series.resample('M').sum()

    # Day-of-week seasonality: `daily` is a Series, so group it by a key
    # derived from its DatetimeIndex rather than by a column.
    dow_effect = daily.groupby(daily.index.dayofweek).mean()

    # Month-over-month growth rates. A month totalling zero makes the next
    # rate infinite, which would poison the mean/std, so those are dropped.
    mom_growth = (
        monthly.pct_change()
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    # Autocorrelation on the daily totals, so that lag k means "k days ago"
    # even when the raw data holds several rows per day.
    autocorr = {f'lag_{lag}': daily.autocorr(lag=lag) for lag in (1, 7, 14, 30)}

    return {
        'total': series.sum(),
        'daily_avg': daily.mean(),
        'best_day': daily.idxmax(),
        'worst_day': daily.idxmin(),
        'mom_growth_mean': mom_growth.mean(),
        'mom_growth_std': mom_growth.std(),
        'autocorrelation': autocorr,
        'dow_pattern': dow_effect.to_dict()
    }
# Run the time-series summary on revenue by date and print each entry.
ts_summary = time_series_eda(df, 'date', 'revenue')
for key, value in ts_summary.items():
    print(f'{key}: {value}')

# Data Science: EDA Terminal
ecommerce_churn.csv
1000 rows × 4 columns

| customer_id | tenure_months | total_spend | churn |
|---|---|---|---|
| usr_892 | 12 | 450.50 | 0 |
| usr_104 | 2 | 45.00 | 1 |
| usr_443 | 36 | 2100.00 | 0 |
| usr_991 | 1 | 12.99 | 1 |
| usr_202 | 24 | 1250.75 | 0 |
| usr_331 | 48 | 3400.20 | 0 |
| usr_705 | 3 | 110.00 | 1 |
Run `df.describe()` in Python to see statistical summaries.

analyze.py
Jupyter Runtime
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
Awaiting execution...
Knowledge Check
Ready to test your understanding of 5. Exploratory Data Analysis?