Data Science | VoidX Academy

5. Exploratory Data Analysis

Module 05: EDA

Statistical Summaries, Distributions, and Correlation

Exploratory Data Analysis is the most undervalued step in data science. Models built without EDA are black boxes — you have no intuition about what's driving predictions, what data quality issues lurk, or whether the model's outputs make any sense. Professional data scientists spend significant time in EDA because it prevents expensive downstream mistakes and often reveals the answer before any model is needed.

📊 The EDA Diagnostic Framework

import pandas as pd
import numpy as np
from scipy import stats

df = pd.read_csv('sales_dataset.csv', parse_dates=['date'])

# ===== UNIVARIATE ANALYSIS =====
# For each numeric column, understand its distribution completely
def describe_numeric(series: pd.Series) -> dict:
    '''Summarise the distribution of a single numeric column.

    Returns a dict with the keys ``count``, ``mean``, ``median``, ``std``,
    ``skewness``, ``kurtosis``, ``iqr``, ``cv``, ``pct_zero`` and
    ``pct_negative`` (in that order). ``cv`` is the coefficient of
    variation (std / mean) and is NaN when the mean is exactly zero.
    '''
    summary = series.describe()
    avg = summary['mean']
    spread = summary['std']

    # std / mean is undefined for a zero mean, so report NaN instead.
    if avg != 0:
        coeff_of_variation = spread / avg
    else:
        coeff_of_variation = np.nan

    report = {}
    report['count'] = summary['count']
    report['mean'] = avg
    report['median'] = series.median()
    report['std'] = spread
    report['skewness'] = series.skew()
    report['kurtosis'] = series.kurtosis()
    report['iqr'] = summary['75%'] - summary['25%']
    report['cv'] = coeff_of_variation
    # The mean of a boolean mask is the share of rows where it is True.
    report['pct_zero'] = (series == 0).mean()
    report['pct_negative'] = (series < 0).mean()
    return report

# Print the distribution profile of every numeric column, one section per column.
numeric_columns = df.select_dtypes(include='number').columns
for column_name in numeric_columns:
    profile = describe_numeric(df[column_name])
    print(f'\n=== {column_name} ===')
    for metric, value in profile.items():
        print(f'  {metric}: {value:.4f}')

# ===== BIVARIATE ANALYSIS =====
# Correlation matrix with statistical significance
def correlation_with_significance(df: pd.DataFrame) -> tuple:
    '''Return Pearson correlations and their p-values for the numeric columns.

    Args:
        df: Input frame; non-numeric columns are ignored.

    Returns:
        ``(corr_matrix, p_matrix)``: two square DataFrames indexed by the
        numeric column names. ``p_matrix`` holds the two-sided p-value of
        each pair, 1.0 on the diagonal, and NaN where fewer than two
        complete pairs of observations exist.
    '''
    numeric_df = df.select_dtypes(include='number')
    corr_matrix = numeric_df.corr()
    p_matrix = pd.DataFrame(
        np.ones(corr_matrix.shape),
        index=corr_matrix.index,
        columns=corr_matrix.columns,
    )

    columns = list(numeric_df.columns)
    for i, col1 in enumerate(columns):
        # The test is symmetric, so compute each pair once and mirror it.
        for col2 in columns[i + 1:]:
            # Drop a row when EITHER value is missing. Dropping NaNs from
            # each column separately would shift the values against each
            # other (or leave the two inputs with different lengths).
            pair = numeric_df[[col1, col2]].dropna()
            if len(pair) < 2:
                p_val = np.nan  # pearsonr needs at least two observations
            else:
                _, p_val = stats.pearsonr(
                    pair[col1].to_numpy(dtype=float),
                    pair[col2].to_numpy(dtype=float),
                )
            p_matrix.loc[col1, col2] = p_val
            p_matrix.loc[col2, col1] = p_val
    return corr_matrix, p_matrix

corr, pvals = correlation_with_significance(df)
print('Significant correlations (|r| > 0.5 and p < 0.05):')
for col1 in corr.columns:
    for col2 in corr.index:
        # Comparing the names keeps one orientation of each pair and skips
        # the diagonal.
        if not col1 < col2:
            continue
        r_value = corr.loc[col1, col2]
        p_value = pvals.loc[col1, col2]
        if not (abs(r_value) > 0.5 and p_value < 0.05):
            continue
        print(f'  {col1} ↔ {col2}: r={r_value:.3f}, p={p_value:.4f}')

🎯 Target Variable Analysis

import pandas as pd
import numpy as np
from scipy import stats

def analyze_target(df: pd.DataFrame, target: str, features: list) -> pd.DataFrame:
    '''Measure how strongly each feature relates to a numeric target.

    Object, string and categorical features are tested with a one-way ANOVA
    F-test (effect size: eta-squared). Numeric features are tested with the
    Pearson correlation (effect size: r squared). Each test uses only the
    rows where both the feature and the target are present.

    Args:
        df: Frame holding the target and the feature columns.
        target: Name of the numeric target column.
        features: Names of the columns to evaluate.

    Returns:
        One row per feature with the columns ``feature``, ``type``,
        ``statistic``, ``p_value``, ``effect_size`` and ``significant``
        (``p_value < 0.05``), sorted by ``effect_size`` descending. The
        statistics are NaN when a test cannot be computed (fewer than two
        groups or observations, or a target with no variance).

    Raises:
        TypeError: If a feature is neither numeric nor categorical
            (e.g. a datetime column).
    '''
    columns = ['feature', 'type', 'statistic', 'p_value', 'effect_size']
    results = []

    for feature in features:
        x_all = df[feature]
        y_all = df[target]
        # Use one shared mask so feature and target stay row-aligned and a
        # NaN in either column cannot leak into the statistics.
        complete = x_all.notna() & y_all.notna()
        x = x_all[complete]
        y = y_all[complete]

        is_categorical = (
            isinstance(x.dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(x)
            or pd.api.types.is_string_dtype(x)
        )

        if is_categorical:
            # observed=True skips declared-but-unused categories, which
            # would otherwise become empty groups.
            groups = [
                values.to_numpy(dtype=float)
                for _, values in y.groupby(x, observed=True)
            ]
            if len(groups) < 2:
                # ANOVA needs at least two groups to compare.
                f_stat = p_val = eta_sq = np.nan
            else:
                f_stat, p_val = stats.f_oneway(*groups)
                # Eta-squared: share of target variance explained by the
                # grouping, computed on the same rows as the F-test.
                grand_mean = y.mean()
                ss_between = sum(
                    len(g) * (g.mean() - grand_mean) ** 2 for g in groups
                )
                ss_total = float(((y - grand_mean) ** 2).sum())
                eta_sq = ss_between / ss_total if ss_total > 0 else np.nan
            results.append({'feature': feature, 'type': 'categorical',
                            'statistic': f_stat, 'p_value': p_val,
                            'effect_size': eta_sq})
        elif pd.api.types.is_numeric_dtype(x):
            if len(x) < 2:
                # pearsonr needs at least two observations.
                r = p_val = np.nan
            else:
                r, p_val = stats.pearsonr(
                    x.to_numpy(dtype=float), y.to_numpy(dtype=float)
                )
            results.append({'feature': feature, 'type': 'numeric',
                            'statistic': r, 'p_value': p_val,
                            'effect_size': r ** 2})
        else:
            raise TypeError(
                f'Unsupported dtype {x_all.dtype} for feature {feature!r}; '
                'expected a numeric or categorical column'
            )

    # Passing the column names keeps the result well-formed (and sortable)
    # even when no features were given.
    return (
        pd.DataFrame(results, columns=columns)
        .sort_values('effect_size', ascending=False)
        .assign(significant=lambda table: table['p_value'] < 0.05)
    )

# analyze_target applies Pearson correlation to numeric features and ANOVA
# to object/category ones. Date/time columns (such as the parsed 'date'
# column) fit neither test, so leave them out of the candidate list.
candidate_features = (
    df.drop(columns='revenue')
    .select_dtypes(exclude=['datetime', 'datetimetz', 'timedelta'])
    .columns
    .tolist()
)
feature_importance = analyze_target(df, 'revenue', features=candidate_features)
print(feature_importance.to_string(index=False))

⏰ Time Series EDA

import pandas as pd
import numpy as np

def time_series_eda(df: pd.DataFrame, date_col: str, value_col: str) -> dict:
    df = df.set_index(date_col).sort_index()
    series = df[value_col]
    
    daily = series.resample('D').sum()
    weekly = series.resample('W').sum()
    monthly = series.resample('ME').sum()
    
    # Detect seasonality patterns
    daily['day_of_week'] = pd.to_datetime(daily.index).dayofweek
    dow_effect = daily.groupby('day_of_week')[value_col].mean()
    
    # Month-over-month growth rates
    mom_growth = monthly.pct_change().dropna()
    
    # Autocorrelation — is today's value related to yesterday's?
    autocorr = {f'lag_{i}': series.autocorr(lag=i) for i in [1, 7, 14, 30]}
    
    return {
        'total': series.sum(),
        'daily_avg': daily.mean(),
        'best_day': daily.idxmax(),
        'worst_day': daily.idxmin(),
        'mom_growth_mean': mom_growth.mean(),
        'mom_growth_std': mom_growth.std(),
        'autocorrelation': autocorr,
        'dow_pattern': dow_effect.to_dict()
    }

# Profile revenue over time and print one "name: value" line per entry.
ts_summary = time_series_eda(df=df, date_col='date', value_col='revenue')
for name in ts_summary:
    print(f'{name}: {ts_summary[name]}')

Data Science: EDA Terminal

ecommerce_churn.csv

1000 rows × 4 columns

customer_id	tenure_months	total_spend	churn
usr_892	12	450.50	0
usr_104	2	45.00	1
usr_443	36	2100.00	0
usr_991	1	12.99	1
usr_202	24	1250.75	0
usr_331	48	3400.20	0
usr_705	3	110.00	1

Run df.describe() in Python to see statistical summaries.

analyze.py

Jupyter Runtime

Awaiting execution...

5. Exploratory Data Analysis

Module 05: EDA

Statistical Summaries, Distributions, and Correlation

📊 The EDA Diagnostic Framework

import pandas as pd
import numpy as np
from scipy import stats

df = pd.read_csv('sales_dataset.csv', parse_dates=['date'])

# ===== UNIVARIATE ANALYSIS =====
# For each numeric column, understand its distribution completely
def describe_numeric(series: pd.Series) -> dict:
    '''Summarise the distribution of a single numeric column.

    Returns a dict with the keys ``count``, ``mean``, ``median``, ``std``,
    ``skewness``, ``kurtosis``, ``iqr``, ``cv``, ``pct_zero`` and
    ``pct_negative`` (in that order). ``cv`` is the coefficient of
    variation (std / mean) and is NaN when the mean is exactly zero.
    '''
    summary = series.describe()
    avg = summary['mean']
    spread = summary['std']

    # std / mean is undefined for a zero mean, so report NaN instead.
    if avg != 0:
        coeff_of_variation = spread / avg
    else:
        coeff_of_variation = np.nan

    report = {}
    report['count'] = summary['count']
    report['mean'] = avg
    report['median'] = series.median()
    report['std'] = spread
    report['skewness'] = series.skew()
    report['kurtosis'] = series.kurtosis()
    report['iqr'] = summary['75%'] - summary['25%']
    report['cv'] = coeff_of_variation
    # The mean of a boolean mask is the share of rows where it is True.
    report['pct_zero'] = (series == 0).mean()
    report['pct_negative'] = (series < 0).mean()
    return report

# Print the distribution profile of every numeric column, one section per column.
numeric_columns = df.select_dtypes(include='number').columns
for column_name in numeric_columns:
    profile = describe_numeric(df[column_name])
    print(f'\n=== {column_name} ===')
    for metric, value in profile.items():
        print(f'  {metric}: {value:.4f}')

# ===== BIVARIATE ANALYSIS =====
# Correlation matrix with statistical significance
def correlation_with_significance(df: pd.DataFrame) -> tuple:
    '''Return Pearson correlations and their p-values for the numeric columns.

    Args:
        df: Input frame; non-numeric columns are ignored.

    Returns:
        ``(corr_matrix, p_matrix)``: two square DataFrames indexed by the
        numeric column names. ``p_matrix`` holds the two-sided p-value of
        each pair, 1.0 on the diagonal, and NaN where fewer than two
        complete pairs of observations exist.
    '''
    numeric_df = df.select_dtypes(include='number')
    corr_matrix = numeric_df.corr()
    p_matrix = pd.DataFrame(
        np.ones(corr_matrix.shape),
        index=corr_matrix.index,
        columns=corr_matrix.columns,
    )

    columns = list(numeric_df.columns)
    for i, col1 in enumerate(columns):
        # The test is symmetric, so compute each pair once and mirror it.
        for col2 in columns[i + 1:]:
            # Drop a row when EITHER value is missing. Dropping NaNs from
            # each column separately would shift the values against each
            # other (or leave the two inputs with different lengths).
            pair = numeric_df[[col1, col2]].dropna()
            if len(pair) < 2:
                p_val = np.nan  # pearsonr needs at least two observations
            else:
                _, p_val = stats.pearsonr(
                    pair[col1].to_numpy(dtype=float),
                    pair[col2].to_numpy(dtype=float),
                )
            p_matrix.loc[col1, col2] = p_val
            p_matrix.loc[col2, col1] = p_val
    return corr_matrix, p_matrix

corr, pvals = correlation_with_significance(df)
print('Significant correlations (|r| > 0.5 and p < 0.05):')
for col1 in corr.columns:
    for col2 in corr.index:
        # Comparing the names keeps one orientation of each pair and skips
        # the diagonal.
        if not col1 < col2:
            continue
        r_value = corr.loc[col1, col2]
        p_value = pvals.loc[col1, col2]
        if not (abs(r_value) > 0.5 and p_value < 0.05):
            continue
        print(f'  {col1} ↔ {col2}: r={r_value:.3f}, p={p_value:.4f}')

🎯 Target Variable Analysis

import pandas as pd
import numpy as np
from scipy import stats

def analyze_target(df: pd.DataFrame, target: str, features: list) -> pd.DataFrame:
    '''Measure how strongly each feature relates to a numeric target.

    Object, string and categorical features are tested with a one-way ANOVA
    F-test (effect size: eta-squared). Numeric features are tested with the
    Pearson correlation (effect size: r squared). Each test uses only the
    rows where both the feature and the target are present.

    Args:
        df: Frame holding the target and the feature columns.
        target: Name of the numeric target column.
        features: Names of the columns to evaluate.

    Returns:
        One row per feature with the columns ``feature``, ``type``,
        ``statistic``, ``p_value``, ``effect_size`` and ``significant``
        (``p_value < 0.05``), sorted by ``effect_size`` descending. The
        statistics are NaN when a test cannot be computed (fewer than two
        groups or observations, or a target with no variance).

    Raises:
        TypeError: If a feature is neither numeric nor categorical
            (e.g. a datetime column).
    '''
    columns = ['feature', 'type', 'statistic', 'p_value', 'effect_size']
    results = []

    for feature in features:
        x_all = df[feature]
        y_all = df[target]
        # Use one shared mask so feature and target stay row-aligned and a
        # NaN in either column cannot leak into the statistics.
        complete = x_all.notna() & y_all.notna()
        x = x_all[complete]
        y = y_all[complete]

        is_categorical = (
            isinstance(x.dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(x)
            or pd.api.types.is_string_dtype(x)
        )

        if is_categorical:
            # observed=True skips declared-but-unused categories, which
            # would otherwise become empty groups.
            groups = [
                values.to_numpy(dtype=float)
                for _, values in y.groupby(x, observed=True)
            ]
            if len(groups) < 2:
                # ANOVA needs at least two groups to compare.
                f_stat = p_val = eta_sq = np.nan
            else:
                f_stat, p_val = stats.f_oneway(*groups)
                # Eta-squared: share of target variance explained by the
                # grouping, computed on the same rows as the F-test.
                grand_mean = y.mean()
                ss_between = sum(
                    len(g) * (g.mean() - grand_mean) ** 2 for g in groups
                )
                ss_total = float(((y - grand_mean) ** 2).sum())
                eta_sq = ss_between / ss_total if ss_total > 0 else np.nan
            results.append({'feature': feature, 'type': 'categorical',
                            'statistic': f_stat, 'p_value': p_val,
                            'effect_size': eta_sq})
        elif pd.api.types.is_numeric_dtype(x):
            if len(x) < 2:
                # pearsonr needs at least two observations.
                r = p_val = np.nan
            else:
                r, p_val = stats.pearsonr(
                    x.to_numpy(dtype=float), y.to_numpy(dtype=float)
                )
            results.append({'feature': feature, 'type': 'numeric',
                            'statistic': r, 'p_value': p_val,
                            'effect_size': r ** 2})
        else:
            raise TypeError(
                f'Unsupported dtype {x_all.dtype} for feature {feature!r}; '
                'expected a numeric or categorical column'
            )

    # Passing the column names keeps the result well-formed (and sortable)
    # even when no features were given.
    return (
        pd.DataFrame(results, columns=columns)
        .sort_values('effect_size', ascending=False)
        .assign(significant=lambda table: table['p_value'] < 0.05)
    )

# analyze_target applies Pearson correlation to numeric features and ANOVA
# to object/category ones. Date/time columns (such as the parsed 'date'
# column) fit neither test, so leave them out of the candidate list.
candidate_features = (
    df.drop(columns='revenue')
    .select_dtypes(exclude=['datetime', 'datetimetz', 'timedelta'])
    .columns
    .tolist()
)
feature_importance = analyze_target(df, 'revenue', features=candidate_features)
print(feature_importance.to_string(index=False))

⏰ Time Series EDA

import pandas as pd
import numpy as np

def time_series_eda(df: pd.DataFrame, date_col: str, value_col: str) -> dict:
    df = df.set_index(date_col).sort_index()
    series = df[value_col]
    
    daily = series.resample('D').sum()
    weekly = series.resample('W').sum()
    monthly = series.resample('ME').sum()
    
    # Detect seasonality patterns
    daily['day_of_week'] = pd.to_datetime(daily.index).dayofweek
    dow_effect = daily.groupby('day_of_week')[value_col].mean()
    
    # Month-over-month growth rates
    mom_growth = monthly.pct_change().dropna()
    
    # Autocorrelation — is today's value related to yesterday's?
    autocorr = {f'lag_{i}': series.autocorr(lag=i) for i in [1, 7, 14, 30]}
    
    return {
        'total': series.sum(),
        'daily_avg': daily.mean(),
        'best_day': daily.idxmax(),
        'worst_day': daily.idxmin(),
        'mom_growth_mean': mom_growth.mean(),
        'mom_growth_std': mom_growth.std(),
        'autocorrelation': autocorr,
        'dow_pattern': dow_effect.to_dict()
    }

# Profile revenue over time and print one "name: value" line per entry.
ts_summary = time_series_eda(df=df, date_col='date', value_col='revenue')
for name in ts_summary:
    print(f'{name}: {ts_summary[name]}')

Data Science: EDA Terminal

ecommerce_churn.csv

1000 rows × 4 columns

customer_id	tenure_months	total_spend	churn
usr_892	12	450.50	0
usr_104	2	45.00	1
usr_443	36	2100.00	0
usr_991	1	12.99	1
usr_202	24	1250.75	0
usr_331	48	3400.20	0
usr_705	3	110.00	1

Run df.describe() in Python to see statistical summaries.

analyze.py

Jupyter Runtime

Awaiting execution...

5. Exploratory Data Analysis

Statistical Summaries, Distributions, and Correlation

📊 The EDA Diagnostic Framework

🎯 Target Variable Analysis

⏰ Time Series EDA

ecommerce_churn.csv

Knowledge Check

5. Exploratory Data Analysis

Statistical Summaries, Distributions, and Correlation

📊 The EDA Diagnostic Framework

🎯 Target Variable Analysis

⏰ Time Series EDA

ecommerce_churn.csv

Knowledge Check