Data Science | VoidX Academy

4. Data Cleaning & Preparation

Module 04: Data Cleaning

Turning Raw Data Into Trusted Data

Data cleaning is the unglamorous core of data science. No matter how sophisticated your model, it cannot overcome systematically corrupted inputs. This module teaches the complete data cleaning workflow: detecting and handling missing values, identifying and treating outliers, fixing data types, standardizing formats, and deduplicating records — all with production-ready patterns that scale beyond notebooks.

🔍 Missing Data: Strategies and Trade-offs

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

df = pd.read_csv('raw_sales.csv')

# Step 1: Understand the missingness pattern
missing_report = pd.DataFrame({
    'count': df.isnull().sum(),
    'pct': (df.isnull().sum() / len(df) * 100).round(2),
    'dtype': df.dtypes
}).query('count > 0').sort_values('pct', ascending=False)
print(missing_report)

# MCAR, MAR, MNAR analysis — critical for choosing the right strategy
# MCAR (Missing Completely At Random): Can safely drop or impute
# MAR (Missing At Random): Imputation with related features works
# MNAR (Missing Not At Random): Missingness carries signal — create indicator column

# Strategy 1: Drop rows with > 50% missing (clearly corrupted records)
df_clean = df[df.isnull().mean(axis=1) < 0.5].copy()

# Strategy 2: Forward fill for time series data (value persists until changed)
df['price'] = df['price'].ffill().bfill()

# Strategy 3: Statistical imputation for numeric columns
for col in df_clean.select_dtypes(include='number').columns:
    pct_missing = df_clean[col].isnull().mean()
    if pct_missing < 0.05:  # < 5% missing: use median (robust to outliers)
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())
    elif pct_missing < 0.30:  # 5-30% missing: KNN imputation
        pass  # handle with KNNImputer below
    else:  # > 30% missing: drop column or create 'unknown' category
        df_clean = df_clean.drop(columns=[col])

# Strategy 4: KNN Imputation — uses similar records to estimate missing values
knn_imputer = KNNImputer(n_neighbors=5)
numeric_cols = df_clean.select_dtypes(include='number').columns
df_clean[numeric_cols] = knn_imputer.fit_transform(df_clean[numeric_cols])

# Strategy 5: Create missingness indicator before imputing (preserve signal)
df_clean['had_missing_revenue'] = df['revenue'].isnull().astype(int)
df_clean['revenue'] = df['revenue'].fillna(df['revenue'].median())

📊 Outlier Detection and Treatment

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest

def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.Series:
    '''Returns boolean mask: True = outlier'''
    series = df[column].dropna()
    
    if method == 'iqr':
        Q1, Q3 = series.quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        return ~df[column].between(lower, upper)
    
    elif method == 'zscore':
        z_scores = np.abs(stats.zscore(series))
        return pd.Series(z_scores > 3, index=series.index).reindex(df.index, fill_value=False)
    
    elif method == 'modified_zscore':  # Robust to non-normal distributions
        median = series.median()
        mad = np.median(np.abs(series - median))  # Median Absolute Deviation
        modified_z = 0.6745 * (series - median) / (mad + 1e-10)
        return pd.Series(np.abs(modified_z) > 3.5, index=series.index).reindex(df.index, fill_value=False)

# Compare outlier methods
df = pd.DataFrame({'revenue': np.concatenate([
    np.random.lognormal(7, 0.5, 995),  # Normal data
    [0, -100, 1e9, 5e9, -500]  # Obvious outliers
])})

for method in ['iqr', 'zscore', 'modified_zscore']:
    outlier_mask = detect_outliers(df, 'revenue', method)
    print(f'{method}: {outlier_mask.sum()} outliers detected')

# Treatment options
# 1. Remove (only if clearly erroneous)
df_no_outliers = df[~detect_outliers(df, 'revenue', 'iqr')]

# 2. Winsorize (cap at percentiles) — preserves data, limits extreme influence
df['revenue_winsorized'] = df['revenue'].clip(
    lower=df['revenue'].quantile(0.01),
    upper=df['revenue'].quantile(0.99)
)

# 3. Log transform — compresses right-skewed distributions
df['revenue_log'] = np.log1p(df['revenue'].clip(lower=0))  # log(1+x) handles zeros

print('Skewness before:', df['revenue'].skew().round(3))
print('Skewness after log:', df['revenue_log'].skew().round(3))

🔄 Data Type Fixing and Standardization

import pandas as pd
import re

df = pd.DataFrame({
    'date': ['2024-01-15', '15/01/2024', 'Jan 15, 2024', '20240115'],
    'revenue': ['$1,234.56', '£2,000', '1500.00 EUR', '999'],
    'customer': ['  john doe  ', 'JANE SMITH', 'Bob   Jones', 'alice WANG'],
    'phone': ['+1-555-0100', '(555) 0101', '5550102', '+1.555.0103'],
})

# Standardize dates — handle multiple formats
df['date_clean'] = pd.to_datetime(df['date'], infer_datetime_format=True, dayfirst=False)

# Standardize currency values
def parse_currency(val: str) -> float:
    '''Extract numeric value from any currency string'''
    if pd.isna(val):
        return np.nan
    cleaned = re.sub(r'[^\d.]', '', str(val).replace(',', ''))
    return float(cleaned) if cleaned else np.nan

df['revenue_clean'] = df['revenue'].apply(parse_currency)

# Standardize text
df['customer_clean'] = (
    df['customer']
    .str.strip()           # remove leading/trailing whitespace
    .str.lower()           # normalize case
    .str.replace(r'\s+', ' ', regex=True)  # collapse multiple spaces
    .str.title()           # title case
)

# Standardize phone numbers
def standardize_phone(phone: str) -> str:
    digits = re.sub(r'\D', '', str(phone))
    if len(digits) == 11 and digits.startswith('1'):
        digits = digits[1:]
    if len(digits) == 10:
        return f'({digits[:3]}) {digits[3:6]}-{digits[6:]}'
    return 'INVALID'

df['phone_clean'] = df['phone'].apply(standardize_phone)
print(df[['date_clean', 'revenue_clean', 'customer_clean', 'phone_clean']])

Data Science: Feature Forge

ID	Age	Tier	Income	Score
101	28	Pro	$85,000	7.2
102	NaN	Free	$42,000	4.1
103	45	Max	$150,000	9.8
104	NaN	Pro	$92,000	8
105	22	Free	$35,000	5.5

pipeline.py

Python 3

CONSOLE OUTPUT

Awaiting transformation pipeline...

4. Data Cleaning & Preparation

Module 04: Data Cleaning

Turning Raw Data Into Trusted Data

🔍 Missing Data: Strategies and Trade-offs

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

df = pd.read_csv('raw_sales.csv')

# Step 1: Understand the missingness pattern
missing_report = pd.DataFrame({
    'count': df.isnull().sum(),
    'pct': (df.isnull().sum() / len(df) * 100).round(2),
    'dtype': df.dtypes
}).query('count > 0').sort_values('pct', ascending=False)
print(missing_report)

# MCAR, MAR, MNAR analysis — critical for choosing the right strategy
# MCAR (Missing Completely At Random): Can safely drop or impute
# MAR (Missing At Random): Imputation with related features works
# MNAR (Missing Not At Random): Missingness carries signal — create indicator column

# Strategy 1: Drop rows with > 50% missing (clearly corrupted records)
df_clean = df[df.isnull().mean(axis=1) < 0.5].copy()

# Strategy 2: Forward fill for time series data (value persists until changed)
df['price'] = df['price'].ffill().bfill()

# Strategy 3: Statistical imputation for numeric columns
for col in df_clean.select_dtypes(include='number').columns:
    pct_missing = df_clean[col].isnull().mean()
    if pct_missing < 0.05:  # < 5% missing: use median (robust to outliers)
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())
    elif pct_missing < 0.30:  # 5-30% missing: KNN imputation
        pass  # handle with KNNImputer below
    else:  # > 30% missing: drop column or create 'unknown' category
        df_clean = df_clean.drop(columns=[col])

# Strategy 4: KNN Imputation — uses similar records to estimate missing values
knn_imputer = KNNImputer(n_neighbors=5)
numeric_cols = df_clean.select_dtypes(include='number').columns
df_clean[numeric_cols] = knn_imputer.fit_transform(df_clean[numeric_cols])

# Strategy 5: Create missingness indicator before imputing (preserve signal)
df_clean['had_missing_revenue'] = df['revenue'].isnull().astype(int)
df_clean['revenue'] = df['revenue'].fillna(df['revenue'].median())

📊 Outlier Detection and Treatment

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest

def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.Series:
    '''Returns boolean mask: True = outlier'''
    series = df[column].dropna()
    
    if method == 'iqr':
        Q1, Q3 = series.quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        return ~df[column].between(lower, upper)
    
    elif method == 'zscore':
        z_scores = np.abs(stats.zscore(series))
        return pd.Series(z_scores > 3, index=series.index).reindex(df.index, fill_value=False)
    
    elif method == 'modified_zscore':  # Robust to non-normal distributions
        median = series.median()
        mad = np.median(np.abs(series - median))  # Median Absolute Deviation
        modified_z = 0.6745 * (series - median) / (mad + 1e-10)
        return pd.Series(np.abs(modified_z) > 3.5, index=series.index).reindex(df.index, fill_value=False)

# Compare outlier methods
df = pd.DataFrame({'revenue': np.concatenate([
    np.random.lognormal(7, 0.5, 995),  # Normal data
    [0, -100, 1e9, 5e9, -500]  # Obvious outliers
])})

for method in ['iqr', 'zscore', 'modified_zscore']:
    outlier_mask = detect_outliers(df, 'revenue', method)
    print(f'{method}: {outlier_mask.sum()} outliers detected')

# Treatment options
# 1. Remove (only if clearly erroneous)
df_no_outliers = df[~detect_outliers(df, 'revenue', 'iqr')]

# 2. Winsorize (cap at percentiles) — preserves data, limits extreme influence
df['revenue_winsorized'] = df['revenue'].clip(
    lower=df['revenue'].quantile(0.01),
    upper=df['revenue'].quantile(0.99)
)

# 3. Log transform — compresses right-skewed distributions
df['revenue_log'] = np.log1p(df['revenue'].clip(lower=0))  # log(1+x) handles zeros

print('Skewness before:', df['revenue'].skew().round(3))
print('Skewness after log:', df['revenue_log'].skew().round(3))

🔄 Data Type Fixing and Standardization

import pandas as pd
import re

df = pd.DataFrame({
    'date': ['2024-01-15', '15/01/2024', 'Jan 15, 2024', '20240115'],
    'revenue': ['$1,234.56', '£2,000', '1500.00 EUR', '999'],
    'customer': ['  john doe  ', 'JANE SMITH', 'Bob   Jones', 'alice WANG'],
    'phone': ['+1-555-0100', '(555) 0101', '5550102', '+1.555.0103'],
})

# Standardize dates — handle multiple formats
df['date_clean'] = pd.to_datetime(df['date'], infer_datetime_format=True, dayfirst=False)

# Standardize currency values
def parse_currency(val: str) -> float:
    '''Extract numeric value from any currency string'''
    if pd.isna(val):
        return np.nan
    cleaned = re.sub(r'[^\d.]', '', str(val).replace(',', ''))
    return float(cleaned) if cleaned else np.nan

df['revenue_clean'] = df['revenue'].apply(parse_currency)

# Standardize text
df['customer_clean'] = (
    df['customer']
    .str.strip()           # remove leading/trailing whitespace
    .str.lower()           # normalize case
    .str.replace(r'\s+', ' ', regex=True)  # collapse multiple spaces
    .str.title()           # title case
)

# Standardize phone numbers
def standardize_phone(phone: str) -> str:
    digits = re.sub(r'\D', '', str(phone))
    if len(digits) == 11 and digits.startswith('1'):
        digits = digits[1:]
    if len(digits) == 10:
        return f'({digits[:3]}) {digits[3:6]}-{digits[6:]}'
    return 'INVALID'

df['phone_clean'] = df['phone'].apply(standardize_phone)
print(df[['date_clean', 'revenue_clean', 'customer_clean', 'phone_clean']])

Data Science: Feature Forge

ID	Age	Tier	Income	Score
101	28	Pro	$85,000	7.2
102	NaN	Free	$42,000	4.1
103	45	Max	$150,000	9.8
104	NaN	Pro	$92,000	8
105	22	Free	$35,000	5.5

pipeline.py

Python 3

CONSOLE OUTPUT

Awaiting transformation pipeline...

4. Data Cleaning & Preparation

Turning Raw Data Into Trusted Data

🔍 Missing Data: Strategies and Trade-offs

📊 Outlier Detection and Treatment

🔄 Data Type Fixing and Standardization

Knowledge Check

4. Data Cleaning & Preparation

Turning Raw Data Into Trusted Data

🔍 Missing Data: Strategies and Trade-offs

📊 Outlier Detection and Treatment

🔄 Data Type Fixing and Standardization

Knowledge Check