Data Science | VoidX Academy

2. Python for Data Systems

Module 02: Python Core

NumPy, Pandas, and the Data I/O Stack

Python dominates data science not because of Python itself, but because of the ecosystem built on top of it. NumPy provides the mathematical foundation. Pandas provides the data manipulation interface. Together, they handle 90% of everything a data scientist does before reaching a model. This module covers the patterns you will use in every project for the rest of your career.

We focus on data systems patterns — not basic Python tutorials. The code here mirrors what production data engineering teams use daily.

🔢 NumPy: The Mathematical Core

NumPy's ndarray is the foundation of every scientific computing library in Python. Its power comes from vectorized operations that run in compiled C rather than interpreted Python — making operations on millions of data points feasible in milliseconds.

import numpy as np

# The core performance insight: vectorization vs loops
# NOTE: the data is random and unseeded, so every printed number below
# changes from run to run.
arr = np.random.randn(1_000_000)  # 1 million draws from a standard normal (mean 0, std 1)

# WRONG: Python loop — processes one element at a time in Python
# (kept on purpose as the slow baseline; its result is overwritten below)
total = 0
for x in arr:
    total += x ** 2

# RIGHT: Vectorized — entire array processed in compiled C
total = np.sum(arr ** 2)  # ~100x faster

# Broadcasting: operations on arrays of different shapes
# prices is a 3x3 integer array; the diff along axis 0 further down treats
# each row as one observation (day) and each column as one series.
prices = np.array([[100, 200, 150], [90, 210, 145], [105, 195, 160]])
daily_mean = prices.mean(axis=0)  # reduces over rows: one mean per column, shape (3,)
# The (3,) mean and std vectors are broadcast against the (3, 3) matrix,
# giving a per-column z-score.
normalized = (prices - daily_mean) / prices.std(axis=0)  # broadcast subtract and divide
print('Normalized shape:', normalized.shape)  # (3, 3)

# Statistical operations
# Row-to-row change divided by the earlier row: fractional returns
# (0.05 means 5%), shape (2, 3). Multiply by 100 for percentages.
returns = np.diff(prices, axis=0) / prices[:-1]
print(f'Mean daily return: {returns.mean():.4f}')
print(f'Return volatility: {returns.std():.4f}')
# Plain mean/std over all entries: no risk-free rate, no annualization.
print(f'Sharpe ratio (approx): {returns.mean() / returns.std():.4f}')

# Linear algebra — the backbone of ML
# Solves 2x + y = 5 and x + 3y = 7.
A = np.array([[2, 1], [1, 3]])
b = np.array([5, 7])
x = np.linalg.solve(A, b)  # solve system of linear equations (rebinds the loop variable x)
print(f'Solution: {x}')  # x ≈ 1.6, y ≈ 1.8; NumPy prints arrays without commas: [1.6 1.8]

# Boolean indexing — filter without loops
# |value| > 3 only means "more than 3 std from the mean" because arr comes
# from randn (mean 0, std 1); for other data, standardize first.
outlier_mask = np.abs(arr) > 3
outliers = arr[outlier_mask]
print(f'Outliers found: {len(outliers)} ({len(outliers)/len(arr):.4%} of data)')

🐼 Pandas: Data Manipulation at Scale

import pandas as pd
import numpy as np
from pathlib import Path

# Creating a DataFrame from a dict of columns (10 synthetic customers).
# Values are random and unseeded, so the printed output differs on every run.
df_from_dict = pd.DataFrame({
    'customer_id': range(1000, 1010),
    # lognormal(mean, sigma, size): parameters of the underlying normal, 10 samples
    'revenue': np.random.lognormal(7, 1, 10),
    'region': np.random.choice(['North', 'South', 'East', 'West'], 10),
    # 'ME' is the month-end alias; NOTE(review): older pandas releases only
    # accept 'M' here — confirm the minimum pandas version for the course.
    'signup_date': pd.date_range('2023-01-01', periods=10, freq='ME'),
    # True is drawn with probability 0.2, False with probability 0.8
    'is_churned': np.random.choice([True, False], 10, p=[0.2, 0.8])
})

# The essential diagnostic workflow — run this on every new dataset
print('=== DATASET OVERVIEW ===')
print(f'Shape: {df_from_dict.shape}')
# deep=True also measures the Python objects held in object columns; bytes / 1024 = KB
print(f'Memory: {df_from_dict.memory_usage(deep=True).sum() / 1024:.1f} KB')
print('\nData types:')
print(df_from_dict.dtypes)
print('\nMissing values:')
print(df_from_dict.isnull().sum())  # per-column count of missing values
print('\nStatistical summary:')
print(df_from_dict.describe(include='all'))  # include='all' also summarizes non-numeric columns

# Advanced groupby patterns
# Named aggregation: output_column=(input_column, aggregation).
region_stats = df_from_dict.groupby('region').agg(
    total_revenue=('revenue', 'sum'),
    avg_revenue=('revenue', 'mean'),
    customer_count=('customer_id', 'count'),
    churn_rate=('is_churned', 'mean')  # mean of booleans = fraction that are True
).round(2)
print('\nRegion statistics:')
print(region_stats)

# Method chaining — the professional pandas style
# Average revenue and customer count of active (non-churned) customers per
# region and revenue tier, highest average first.
analysis = (
    df_from_dict
    .query('is_churned == False')                        # filter active customers
    .assign(revenue_tier=lambda active: pd.cut(          # create bins
        active['revenue'],
        bins=[0, 500, 2000, float('inf')],               # (0, 500], (500, 2000], (2000, inf)
        labels=['Low', 'Medium', 'High']
    ))
    # revenue_tier is categorical: observed=True keeps only the
    # (region, tier) pairs that actually occur. Without it, pandas emits a
    # FutureWarning and adds a NaN / 0-count row for every empty tier.
    .groupby(['region', 'revenue_tier'], observed=True)['revenue']
    # Named aggregation yields the final column names directly.
    .agg(avg_revenue='mean', customers='count')
    .reset_index()
    .sort_values('avg_revenue', ascending=False)
)
print(analysis)

📥 Data I/O: Reading and Writing Everything

import pandas as pd
import sqlite3
from pathlib import Path

# CSV — the universal data format
# Expects a local 'data.csv' with at least customer_id, revenue and
# signup_date columns (the later SQL example also needs region and is_churned).
df = pd.read_csv('data.csv',
    # NOTE(review): a plain 'int32' column cannot hold missing values; if
    # customer_id can be blank, the nullable 'Int32' dtype is needed instead.
    dtype={'customer_id': 'int32', 'revenue': 'float32'},  # memory optimization
    parse_dates=['signup_date'],
    # These are added to pandas' default NA markers; presumably '', 'NULL'
    # and 'N/A' are already defaults, leaving '-' as the only new marker.
    na_values=['', 'NULL', 'N/A', '-'],  # recognize all null patterns
    low_memory=False  # infer dtypes from the whole file at once: prevents mixed type warnings, uses more RAM
)

# Parquet — the professional format for large datasets
# Parquet is columnar, compressed, and 10-100x faster than CSV for analytics
# engine='pyarrow' requires the pyarrow package to be installed.
df.to_parquet('data.parquet', engine='pyarrow', compression='snappy')
df_back = pd.read_parquet('data.parquet', columns=['customer_id', 'revenue'])  # read only needed cols

# SQL databases — the source of most enterprise data
# Read with SQL query — push computation to database, not Python
# NOTE(review): the filter assumes is_churned is stored as 0/1 — confirm
# against the dtype of the column being written.
query = '''
    SELECT region,
           COUNT(*) as customers,
           SUM(revenue) as total_revenue,
           AVG(revenue) as avg_revenue
    FROM customers
    WHERE is_churned = 0
    GROUP BY region
    ORDER BY total_revenue DESC
'''

conn = sqlite3.connect('analytics.db')
try:
    # Write DataFrame to SQL (replaces any existing 'customers' table)
    df.to_sql('customers', conn, if_exists='replace', index=False)

    result = pd.read_sql(query, conn)
    print(result)
finally:
    # Close even when the write or the query raises. `with conn:` would not
    # do this: on sqlite3 it only commits or rolls back the transaction.
    conn.close()

# Chunked reading for files larger than RAM
chunk_results = []
# usecols: parse only the two columns the aggregation needs, which cuts the
# memory and time spent on each chunk.
# The chunked reader is a context manager, so the file handle is closed even
# if processing a chunk raises.
with pd.read_csv('huge_file.csv',
                 usecols=['region', 'revenue'],
                 chunksize=100_000) as reader:
    for chunk in reader:
        # Process each chunk without loading all data
        summary = chunk.groupby('region')['revenue'].sum()
        chunk_results.append(summary)

# A region can appear in several chunks: stack the partial sums and add up
# the entries that share an index label.
final_result = pd.concat(chunk_results).groupby(level=0).sum()
print('Total revenue by region:')
print(final_result)

⚡ Performance Optimization Patterns

import pandas as pd
import numpy as np

# Memory optimization: choose the right data types
def optimize_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    '''Shrink a DataFrame's memory footprint by tightening column dtypes.

    - int64 columns are downcast to the smallest integer type that fits.
    - float64 columns are downcast to a smaller float type where possible
      (this can lose precision).
    - object columns whose distinct-value ratio is below 0.5 become
      'category'.

    The frame is modified IN PLACE and also returned; pass ``df.copy()`` to
    keep the original untouched. A before/after summary is printed. The
    actual saving depends on the data (the course quotes 50-80% as typical).

    Args:
        df: Frame to optimize. Zero-row frames are supported.

    Returns:
        The same DataFrame object, with tightened dtypes.
    '''
    original_memory = df.memory_usage(deep=True).sum()
    row_count = len(df)

    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    # The cardinality ratio is undefined for a zero-row frame (0 / 0), so the
    # category conversion is skipped there instead of raising
    # ZeroDivisionError.
    if row_count:
        for col in df.select_dtypes(include=['object']).columns:
            # Convert low-cardinality string columns to category
            if df[col].nunique() / row_count < 0.5:
                df[col] = df[col].astype('category')

    new_memory = df.memory_usage(deep=True).sum()
    # Guard the percentage against a zero-byte baseline.
    reduction = 1 - new_memory / original_memory if original_memory else 0.0
    print(f'Memory reduced: {original_memory/1024:.0f}KB → {new_memory/1024:.0f}KB '
          f'({reduction:.1%} reduction)')
    return df

# Apply vectorized operations instead of apply() where possible
# 100,000 random rows (unseeded). The 'category' column is created but not
# used by the two computations below.
df = pd.DataFrame({'value': np.random.randn(100_000), 'category': np.random.choice(['A', 'B', 'C'], 100_000)})

# SLOW: apply() uses Python loop
# The lambda is called once per element: value**2 + 1 for positive values, else 0.
df['slow_result'] = df['value'].apply(lambda x: x ** 2 + 1 if x > 0 else 0)

# FAST: vectorized NumPy operations
# np.where computes value**2 + 1 for every row first, then picks it or 0
# according to the mask — the same rule as the lambda above.
df['fast_result'] = np.where(df['value'] > 0, df['value'] ** 2 + 1, 0)

# NOTE: equality is asserted in the message but never checked here;
# np.allclose(df['slow_result'], df['fast_result']) would verify it.
print('Both produce the same result; vectorized is 10-50x faster on large datasets')

2. Python for Data Systems

Module 02: Python Core

NumPy, Pandas, and the Data I/O Stack

We focus on data systems patterns — not basic Python tutorials. The code here mirrors what production data engineering teams use daily.

🔢 NumPy: The Mathematical Core

import numpy as np

# The core performance insight: vectorization vs loops
# NOTE: the data is random and unseeded, so every printed number below
# changes from run to run.
arr = np.random.randn(1_000_000)  # 1 million draws from a standard normal (mean 0, std 1)

# WRONG: Python loop — processes one element at a time in Python
# (kept on purpose as the slow baseline; its result is overwritten below)
total = 0
for x in arr:
    total += x ** 2

# RIGHT: Vectorized — entire array processed in compiled C
total = np.sum(arr ** 2)  # ~100x faster

# Broadcasting: operations on arrays of different shapes
# prices is a 3x3 integer array; the diff along axis 0 further down treats
# each row as one observation (day) and each column as one series.
prices = np.array([[100, 200, 150], [90, 210, 145], [105, 195, 160]])
daily_mean = prices.mean(axis=0)  # reduces over rows: one mean per column, shape (3,)
# The (3,) mean and std vectors are broadcast against the (3, 3) matrix,
# giving a per-column z-score.
normalized = (prices - daily_mean) / prices.std(axis=0)  # broadcast subtract and divide
print('Normalized shape:', normalized.shape)  # (3, 3)

# Statistical operations
# Row-to-row change divided by the earlier row: fractional returns
# (0.05 means 5%), shape (2, 3). Multiply by 100 for percentages.
returns = np.diff(prices, axis=0) / prices[:-1]
print(f'Mean daily return: {returns.mean():.4f}')
print(f'Return volatility: {returns.std():.4f}')
# Plain mean/std over all entries: no risk-free rate, no annualization.
print(f'Sharpe ratio (approx): {returns.mean() / returns.std():.4f}')

# Linear algebra — the backbone of ML
# Solves 2x + y = 5 and x + 3y = 7.
A = np.array([[2, 1], [1, 3]])
b = np.array([5, 7])
x = np.linalg.solve(A, b)  # solve system of linear equations (rebinds the loop variable x)
print(f'Solution: {x}')  # x ≈ 1.6, y ≈ 1.8; NumPy prints arrays without commas: [1.6 1.8]

# Boolean indexing — filter without loops
# |value| > 3 only means "more than 3 std from the mean" because arr comes
# from randn (mean 0, std 1); for other data, standardize first.
outlier_mask = np.abs(arr) > 3
outliers = arr[outlier_mask]
print(f'Outliers found: {len(outliers)} ({len(outliers)/len(arr):.4%} of data)')

🐼 Pandas: Data Manipulation at Scale

import pandas as pd
import numpy as np
from pathlib import Path

# Creating a DataFrame from a dict of columns (10 synthetic customers).
# Values are random and unseeded, so the printed output differs on every run.
df_from_dict = pd.DataFrame({
    'customer_id': range(1000, 1010),
    # lognormal(mean, sigma, size): parameters of the underlying normal, 10 samples
    'revenue': np.random.lognormal(7, 1, 10),
    'region': np.random.choice(['North', 'South', 'East', 'West'], 10),
    # 'ME' is the month-end alias; NOTE(review): older pandas releases only
    # accept 'M' here — confirm the minimum pandas version for the course.
    'signup_date': pd.date_range('2023-01-01', periods=10, freq='ME'),
    # True is drawn with probability 0.2, False with probability 0.8
    'is_churned': np.random.choice([True, False], 10, p=[0.2, 0.8])
})

# The essential diagnostic workflow — run this on every new dataset
print('=== DATASET OVERVIEW ===')
print(f'Shape: {df_from_dict.shape}')
# deep=True also measures the Python objects held in object columns; bytes / 1024 = KB
print(f'Memory: {df_from_dict.memory_usage(deep=True).sum() / 1024:.1f} KB')
print('\nData types:')
print(df_from_dict.dtypes)
print('\nMissing values:')
print(df_from_dict.isnull().sum())  # per-column count of missing values
print('\nStatistical summary:')
print(df_from_dict.describe(include='all'))  # include='all' also summarizes non-numeric columns

# Advanced groupby patterns
# Named aggregation: output_column=(input_column, aggregation).
region_stats = df_from_dict.groupby('region').agg(
    total_revenue=('revenue', 'sum'),
    avg_revenue=('revenue', 'mean'),
    customer_count=('customer_id', 'count'),
    churn_rate=('is_churned', 'mean')  # mean of booleans = fraction that are True
).round(2)
print('\nRegion statistics:')
print(region_stats)

# Method chaining — the professional pandas style
# Average revenue and customer count of active (non-churned) customers per
# region and revenue tier, highest average first.
analysis = (
    df_from_dict
    .query('is_churned == False')                        # filter active customers
    .assign(revenue_tier=lambda active: pd.cut(          # create bins
        active['revenue'],
        bins=[0, 500, 2000, float('inf')],               # (0, 500], (500, 2000], (2000, inf)
        labels=['Low', 'Medium', 'High']
    ))
    # revenue_tier is categorical: observed=True keeps only the
    # (region, tier) pairs that actually occur. Without it, pandas emits a
    # FutureWarning and adds a NaN / 0-count row for every empty tier.
    .groupby(['region', 'revenue_tier'], observed=True)['revenue']
    # Named aggregation yields the final column names directly.
    .agg(avg_revenue='mean', customers='count')
    .reset_index()
    .sort_values('avg_revenue', ascending=False)
)
print(analysis)

📥 Data I/O: Reading and Writing Everything

import pandas as pd
import sqlite3
from pathlib import Path

# CSV — the universal data format
# Expects a local 'data.csv' with at least customer_id, revenue and
# signup_date columns (the later SQL example also needs region and is_churned).
df = pd.read_csv('data.csv',
    # NOTE(review): a plain 'int32' column cannot hold missing values; if
    # customer_id can be blank, the nullable 'Int32' dtype is needed instead.
    dtype={'customer_id': 'int32', 'revenue': 'float32'},  # memory optimization
    parse_dates=['signup_date'],
    # These are added to pandas' default NA markers; presumably '', 'NULL'
    # and 'N/A' are already defaults, leaving '-' as the only new marker.
    na_values=['', 'NULL', 'N/A', '-'],  # recognize all null patterns
    low_memory=False  # infer dtypes from the whole file at once: prevents mixed type warnings, uses more RAM
)

# Parquet — the professional format for large datasets
# Parquet is columnar, compressed, and 10-100x faster than CSV for analytics
# engine='pyarrow' requires the pyarrow package to be installed.
df.to_parquet('data.parquet', engine='pyarrow', compression='snappy')
df_back = pd.read_parquet('data.parquet', columns=['customer_id', 'revenue'])  # read only needed cols

# SQL databases — the source of most enterprise data
# Read with SQL query — push computation to database, not Python
# NOTE(review): the filter assumes is_churned is stored as 0/1 — confirm
# against the dtype of the column being written.
query = '''
    SELECT region,
           COUNT(*) as customers,
           SUM(revenue) as total_revenue,
           AVG(revenue) as avg_revenue
    FROM customers
    WHERE is_churned = 0
    GROUP BY region
    ORDER BY total_revenue DESC
'''

conn = sqlite3.connect('analytics.db')
try:
    # Write DataFrame to SQL (replaces any existing 'customers' table)
    df.to_sql('customers', conn, if_exists='replace', index=False)

    result = pd.read_sql(query, conn)
    print(result)
finally:
    # Close even when the write or the query raises. `with conn:` would not
    # do this: on sqlite3 it only commits or rolls back the transaction.
    conn.close()

# Chunked reading for files larger than RAM
chunk_results = []
# usecols: parse only the two columns the aggregation needs, which cuts the
# memory and time spent on each chunk.
# The chunked reader is a context manager, so the file handle is closed even
# if processing a chunk raises.
with pd.read_csv('huge_file.csv',
                 usecols=['region', 'revenue'],
                 chunksize=100_000) as reader:
    for chunk in reader:
        # Process each chunk without loading all data
        summary = chunk.groupby('region')['revenue'].sum()
        chunk_results.append(summary)

# A region can appear in several chunks: stack the partial sums and add up
# the entries that share an index label.
final_result = pd.concat(chunk_results).groupby(level=0).sum()
print('Total revenue by region:')
print(final_result)

⚡ Performance Optimization Patterns

import pandas as pd
import numpy as np

# Memory optimization: choose the right data types
def optimize_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    '''Shrink a DataFrame's memory footprint by tightening column dtypes.

    - int64 columns are downcast to the smallest integer type that fits.
    - float64 columns are downcast to a smaller float type where possible
      (this can lose precision).
    - object columns whose distinct-value ratio is below 0.5 become
      'category'.

    The frame is modified IN PLACE and also returned; pass ``df.copy()`` to
    keep the original untouched. A before/after summary is printed. The
    actual saving depends on the data (the course quotes 50-80% as typical).

    Args:
        df: Frame to optimize. Zero-row frames are supported.

    Returns:
        The same DataFrame object, with tightened dtypes.
    '''
    original_memory = df.memory_usage(deep=True).sum()
    row_count = len(df)

    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    # The cardinality ratio is undefined for a zero-row frame (0 / 0), so the
    # category conversion is skipped there instead of raising
    # ZeroDivisionError.
    if row_count:
        for col in df.select_dtypes(include=['object']).columns:
            # Convert low-cardinality string columns to category
            if df[col].nunique() / row_count < 0.5:
                df[col] = df[col].astype('category')

    new_memory = df.memory_usage(deep=True).sum()
    # Guard the percentage against a zero-byte baseline.
    reduction = 1 - new_memory / original_memory if original_memory else 0.0
    print(f'Memory reduced: {original_memory/1024:.0f}KB → {new_memory/1024:.0f}KB '
          f'({reduction:.1%} reduction)')
    return df

# Apply vectorized operations instead of apply() where possible
# 100,000 random rows (unseeded). The 'category' column is created but not
# used by the two computations below.
df = pd.DataFrame({'value': np.random.randn(100_000), 'category': np.random.choice(['A', 'B', 'C'], 100_000)})

# SLOW: apply() uses Python loop
# The lambda is called once per element: value**2 + 1 for positive values, else 0.
df['slow_result'] = df['value'].apply(lambda x: x ** 2 + 1 if x > 0 else 0)

# FAST: vectorized NumPy operations
# np.where computes value**2 + 1 for every row first, then picks it or 0
# according to the mask — the same rule as the lambda above.
df['fast_result'] = np.where(df['value'] > 0, df['value'] ** 2 + 1, 0)

# NOTE: equality is asserted in the message but never checked here;
# np.allclose(df['slow_result'], df['fast_result']) would verify it.
print('Both produce the same result; vectorized is 10-50x faster on large datasets')

2. Python for Data Systems

NumPy, Pandas, and the Data I/O Stack

🔢 NumPy: The Mathematical Core

🐼 Pandas: Data Manipulation at Scale

📥 Data I/O: Reading and Writing Everything

⚡ Performance Optimization Patterns

Knowledge Check

2. Python for Data Systems

NumPy, Pandas, and the Data I/O Stack

🔢 NumPy: The Mathematical Core

🐼 Pandas: Data Manipulation at Scale

📥 Data I/O: Reading and Writing Everything

⚡ Performance Optimization Patterns

Knowledge Check