90 MINS beginner
4. Data Cleaning & Preparation
Module 04: Data Cleaning
Turning Raw Data Into Trusted Data
Data cleaning is the unglamorous core of data science. No matter how sophisticated your model, it cannot overcome systematically corrupted inputs. This module teaches the complete data cleaning workflow: detecting and handling missing values, identifying and treating outliers, fixing data types, standardizing formats, and deduplicating records — all with production-ready patterns that scale beyond notebooks.
🔍 Missing Data: Strategies and Trade-offs
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
df = pd.read_csv('raw_sales.csv')
# Step 1: Understand the missingness pattern
missing_report = pd.DataFrame({
'count': df.isnull().sum(),
'pct': (df.isnull().sum() / len(df) * 100).round(2),
'dtype': df.dtypes
}).query('count > 0').sort_values('pct', ascending=False)
print(missing_report)
# MCAR, MAR, MNAR analysis — critical for choosing the right strategy
# MCAR (Missing Completely At Random): Can safely drop or impute
# MAR (Missing At Random): Imputation with related features works
# MNAR (Missing Not At Random): Missingness carries signal — create indicator column
# Strategy 1: Drop rows with > 50% missing (clearly corrupted records)
df_clean = df[df.isnull().mean(axis=1) < 0.5].copy()
# Strategy 2: Forward fill for time series data (value persists until changed)
df['price'] = df['price'].ffill().bfill()
# Strategy 3: Statistical imputation for numeric columns
for col in df_clean.select_dtypes(include='number').columns:
pct_missing = df_clean[col].isnull().mean()
if pct_missing < 0.05: # < 5% missing: use median (robust to outliers)
df_clean[col] = df_clean[col].fillna(df_clean[col].median())
elif pct_missing < 0.30: # 5-30% missing: KNN imputation
pass # handle with KNNImputer below
else: # > 30% missing: drop column or create 'unknown' category
df_clean = df_clean.drop(columns=[col])
# Strategy 4: KNN Imputation — uses similar records to estimate missing values
knn_imputer = KNNImputer(n_neighbors=5)
numeric_cols = df_clean.select_dtypes(include='number').columns
df_clean[numeric_cols] = knn_imputer.fit_transform(df_clean[numeric_cols])
# Strategy 5: Create missingness indicator before imputing (preserve signal)
df_clean['had_missing_revenue'] = df['revenue'].isnull().astype(int)
df_clean['revenue'] = df['revenue'].fillna(df['revenue'].median())📊 Outlier Detection and Treatment
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest
def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.Series:
'''Returns boolean mask: True = outlier'''
series = df[column].dropna()
if method == 'iqr':
Q1, Q3 = series.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
return ~df[column].between(lower, upper)
elif method == 'zscore':
z_scores = np.abs(stats.zscore(series))
return pd.Series(z_scores > 3, index=series.index).reindex(df.index, fill_value=False)
elif method == 'modified_zscore': # Robust to non-normal distributions
median = series.median()
mad = np.median(np.abs(series - median)) # Median Absolute Deviation
modified_z = 0.6745 * (series - median) / (mad + 1e-10)
return pd.Series(np.abs(modified_z) > 3.5, index=series.index).reindex(df.index, fill_value=False)
# Compare outlier methods
df = pd.DataFrame({'revenue': np.concatenate([
np.random.lognormal(7, 0.5, 995), # Normal data
[0, -100, 1e9, 5e9, -500] # Obvious outliers
])})
for method in ['iqr', 'zscore', 'modified_zscore']:
outlier_mask = detect_outliers(df, 'revenue', method)
print(f'{method}: {outlier_mask.sum()} outliers detected')
# Treatment options
# 1. Remove (only if clearly erroneous)
df_no_outliers = df[~detect_outliers(df, 'revenue', 'iqr')]
# 2. Winsorize (cap at percentiles) — preserves data, limits extreme influence
df['revenue_winsorized'] = df['revenue'].clip(
lower=df['revenue'].quantile(0.01),
upper=df['revenue'].quantile(0.99)
)
# 3. Log transform — compresses right-skewed distributions
df['revenue_log'] = np.log1p(df['revenue'].clip(lower=0)) # log(1+x) handles zeros
print('Skewness before:', df['revenue'].skew().round(3))
print('Skewness after log:', df['revenue_log'].skew().round(3))🔄 Data Type Fixing and Standardization
import pandas as pd
import re
df = pd.DataFrame({
'date': ['2024-01-15', '15/01/2024', 'Jan 15, 2024', '20240115'],
'revenue': ['$1,234.56', '£2,000', '1500.00 EUR', '999'],
'customer': [' john doe ', 'JANE SMITH', 'Bob Jones', 'alice WANG'],
'phone': ['+1-555-0100', '(555) 0101', '5550102', '+1.555.0103'],
})
# Standardize dates — handle multiple formats
df['date_clean'] = pd.to_datetime(df['date'], infer_datetime_format=True, dayfirst=False)
# Standardize currency values
def parse_currency(val: str) -> float:
'''Extract numeric value from any currency string'''
if pd.isna(val):
return np.nan
cleaned = re.sub(r'[^\d.]', '', str(val).replace(',', ''))
return float(cleaned) if cleaned else np.nan
df['revenue_clean'] = df['revenue'].apply(parse_currency)
# Standardize text
df['customer_clean'] = (
df['customer']
.str.strip() # remove leading/trailing whitespace
.str.lower() # normalize case
.str.replace(r'\s+', ' ', regex=True) # collapse multiple spaces
.str.title() # title case
)
# Standardize phone numbers
def standardize_phone(phone: str) -> str:
digits = re.sub(r'\D', '', str(phone))
if len(digits) == 11 and digits.startswith('1'):
digits = digits[1:]
if len(digits) == 10:
return f'({digits[:3]}) {digits[3:6]}-{digits[6:]}'
return 'INVALID'
df['phone_clean'] = df['phone'].apply(standardize_phone)
print(df[['date_clean', 'revenue_clean', 'customer_clean', 'phone_clean']])Data Science: Feature Forge
| ID | Age | Tier | Income | Score |
|---|---|---|---|---|
| 101 | 28 | Pro | $85,000 | 7.2 |
| 102 | NaN | Free | $42,000 | 4.1 |
| 103 | 45 | Max | $150,000 | 9.8 |
| 104 | NaN | Pro | $92,000 | 8 |
| 105 | 22 | Free | $35,000 | 5.5 |
pipeline.py
Python 3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
CONSOLE OUTPUT
Awaiting transformation pipeline...
Knowledge Check
Ready to test your understanding of 4. Data Cleaning & Preparation?