3. Web Scraping Core

Module 03: Web Scraping

HTTP, HTML, and BeautifulSoup

Web scraping is the extraction of structured data from the chaos of HTML. Every webpage you visit is just text — HTML markup interspersed with content. Your job as an automation engineer is to write code that fetches that text and extracts exactly what you need with surgical precision.

This module covers the complete scraping pipeline: making HTTP requests correctly, parsing HTML with BeautifulSoup, extracting data using CSS selectors and XPath, and handling the common edge cases that break amateur scrapers in production.

🌐 How HTTP Works for Scrapers

Before you scrape, you must think like a browser. When you visit a webpage, your browser sends an HTTP request with specific headers. Websites use these headers to identify who is making the request. A raw Python request sent with only the library's default headers (for example, a `python-requests` User-Agent) looks nothing like a browser — and many sites will immediately block it or return empty content.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
import random

# Build a session that looks like a real browser
def create_session() -> requests.Session:
    """Build a ``requests.Session`` with browser-like headers and automatic retries.

    Returns:
        A session whose default headers imitate desktop Chrome and whose
        ``http://`` and ``https://`` adapters retry failed requests.
    """
    session = requests.Session()

    # Accept-Encoding is deliberately NOT overridden: requests already sets it
    # to the encodings it can actually decode. Advertising 'br' without a
    # Brotli decoder installed can produce an undecodable response body.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
    })

    # Retry on connection errors and on the throttling/server statuses below.
    # urllib3 sleeps backoff_factor * 2**(attempt - 1) before a retry but
    # issues the first retry immediately, so the waits are roughly 0s, 4s, 8s.
    retry_strategy = Retry(
        total=3,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('https://', adapter)
    session.mount('http://', adapter)

    return session

# Module-level session; polite_get() sends its requests through this object.
session = create_session()

def polite_get(url: str, min_delay: float = 1.0, max_delay: float = 3.0,
               timeout: float = 15) -> requests.Response:
    """GET ``url`` through the module-level session after a random pause.

    Args:
        url: Address to fetch.
        min_delay: Lower bound, in seconds, of the pause before the request.
        max_delay: Upper bound, in seconds, of the pause before the request.
        timeout: Seconds passed to ``session.get`` as its timeout
            (defaults to 15, the previously hard-coded value).

    Returns:
        The response, guaranteed not to carry a 4xx/5xx status.

    Raises:
        requests.HTTPError: If the final response has a 4xx/5xx status.
        requests.RequestException: For other request failures raised by
            ``session.get``.
    """
    # Randomised delay so consecutive requests are not evenly spaced.
    time.sleep(random.uniform(min_delay, max_delay))  # be polite
    response = session.get(url, timeout=timeout)
    response.raise_for_status()  # raises exception for 4xx/5xx
    return response

🔍 BeautifulSoup: Parsing HTML

BeautifulSoup turns raw HTML into a navigable tree structure. You can find elements by tag, class, id, attribute, or any CSS selector. The lxml parser is significantly faster than Python's built-in html.parser:

from bs4 import BeautifulSoup
import re

# NOTE(review): `response` is not defined in this snippet; presumably it is
# the result of an earlier polite_get() call — confirm before running.
html = response.text
soup = BeautifulSoup(html, 'lxml')  # always use lxml

# Find by tag — find() returns None when the tag is absent, so guard first
heading = soup.find('h1')
title = heading.get_text(strip=True) if heading is not None else None

# Find by CSS class
products = soup.find_all('div', class_='product-card')

# CSS selectors (most powerful)
prices = soup.select('div.product-card span.price')

# Find by attribute
links = soup.find_all('a', href=re.compile(r'/product/\d+'))

# Navigate the tree
for product in products:
    name_tag = product.select_one('h2.product-title')
    price_tag = product.select_one('span.price')
    image_tag = product.select_one('img')
    link_tag = product.select_one('a.product-link')

    # select_one() returns None for a missing element; skip incomplete cards
    # instead of letting one malformed card abort the whole loop.
    if name_tag is None or price_tag is None:
        continue

    name = name_tag.get_text(strip=True)
    price_text = price_tag.get_text(strip=True)
    try:
        price = float(re.sub(r'[^\d.]', '', price_text))  # extract number
    except ValueError:
        continue  # price text contained no parseable number

    # .get() yields None rather than raising when the attribute is absent
    image_url = image_tag.get('src') if image_tag is not None else None
    product_url = link_tag.get('href') if link_tag is not None else None

    print(f'{name}: ${price:.2f}')

🏭 Production Scraper Pattern

A production scraper handles pagination, errors, deduplication, and structured output. Here is the complete pattern:

import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict
from typing import List, Optional, Iterator
import csv
import json
from loguru import logger
from datetime import datetime

@dataclass
class Product:
    """One scraped catalogue item.

    ``scraped_at`` is an ISO-8601 timestamp. When it is left empty it is set
    to the current local time at construction; an explicitly supplied value
    is preserved.
    """
    name: str
    price: float
    url: str
    image_url: str
    scraped_at: str = ''

    def __post_init__(self):
        # Stamp only when the caller did not provide a timestamp, so records
        # rebuilt from stored data keep their original scrape time.
        if not self.scraped_at:
            self.scraped_at = datetime.now().isoformat()

class EcommerceScraper:
    """Paginated catalogue scraper for books.toscrape.com.

    Walks ``/catalogue/page-N.html`` in order, converts every
    ``article.product_pod`` element into a ``Product`` and yields each
    product URL at most once.
    """

    BASE_URL = 'https://books.toscrape.com'

    def __init__(self):
        # NOTE(review): get_page() fetches through polite_get(), which uses the
        # module-level session, so this per-instance session is currently unused.
        self.session = create_session()
        self.seen_urls = set()  # product URLs already yielded (deduplication)

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch ``url`` and return the parsed document, or ``None`` on failure.

        Request errors are logged rather than raised.
        """
        try:
            resp = polite_get(url)
        except requests.RequestException as e:
            logger.error(f'Failed to fetch {url}: {e}')
            return None

        # Parse the raw bytes instead of resp.text: when the server sends no
        # charset, requests falls back to ISO-8859-1 for text/* and UTF-8
        # pages turn into mojibake (e.g. '£' becomes 'Â£'). A charset declared
        # in the Content-Type header is still honoured.
        content_type = resp.headers.get('Content-Type', '')
        declared = resp.encoding if 'charset' in content_type.lower() else None
        return BeautifulSoup(resp.content, 'lxml', from_encoding=declared)

    def parse_product(self, article: BeautifulSoup, base_url: str) -> Optional[Product]:
        """Convert one ``article.product_pod`` element into a ``Product``.

        Args:
            article: The product element to read.
            base_url: Currently unused; URLs are built from ``BASE_URL``.

        Returns:
            The parsed product, or ``None`` (after logging a warning) when an
            expected element or attribute is missing or the price is not a
            number.
        """
        try:
            link = article.select_one('h3 a')
            name = link['title']
            price_text = article.select_one('p.price_color').get_text(strip=True)
            # 'Â' is stripped as well in case the page was mis-decoded upstream.
            price = float(price_text.replace('£', '').replace('Â', ''))
            rel_url = link['href'].replace('../', '')
            url = f'{self.BASE_URL}/catalogue/{rel_url}'
            image = self.BASE_URL + article.select_one('img')['src'].replace('..', '')
            return Product(name=name, price=price, url=url, image_url=image)
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            # TypeError covers subscripting the None that select_one() returns
            # for a missing element (e.g. None['title']).
            logger.warning(f'Parse error: {e}')
            return None

    def scrape_all(self) -> Iterator[Product]:
        """Yield unique products page by page.

        Stops when a page cannot be fetched or contains no product elements.
        """
        page = 1
        while True:
            url = f'{self.BASE_URL}/catalogue/page-{page}.html'
            logger.info(f'Scraping page {page}')
            soup = self.get_page(url)
            if soup is None:
                break

            articles = soup.select('article.product_pod')
            if not articles:
                logger.info('No more products — scraping complete')
                break

            for article in articles:
                product = self.parse_product(article, self.BASE_URL)
                if product and product.url not in self.seen_urls:
                    self.seen_urls.add(product.url)
                    yield product

            page += 1

# Collect every product the scraper yields
scraper = EcommerceScraper()
products = [*scraper.scrape_all()]

# Write one CSV row per product, preceded by a header row
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(
        f,
        fieldnames=['name', 'price', 'url', 'image_url', 'scraped_at'],
    )
    writer.writeheader()
    writer.writerows(map(asdict, products))

logger.success(f'Scraped {len(products)} unique products')

🤖 Respecting robots.txt

from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin, urlparse

def can_fetch(base_url: str, path: str, user_agent: str = '*') -> bool:
    """Report whether the site's robots.txt permits fetching ``path``.

    The robots.txt file is downloaded from the root of ``base_url`` on every
    call.

    Args:
        base_url: Site address; ``/robots.txt`` and ``path`` are resolved
            against it.
        path: Path (or URL) to check.
        user_agent: Agent name to evaluate the rules for.

    Returns:
        ``True`` if the parsed rules allow ``user_agent`` to fetch the target.
    """
    parser = RobotFileParser(urljoin(base_url, '/robots.txt'))
    parser.read()  # network request for robots.txt
    return parser.can_fetch(user_agent, urljoin(base_url, path))

# Consult robots.txt first and report the verdict
print(
    'Scraping is permitted'
    if can_fetch('https://example.com', '/products/')
    else 'robots.txt disallows scraping this path'
)

Automation Arena: Target Practice

VOID MART.

CyberDeck Keyboard

$149.99

In Stock

Neural-Link Headset

$899.00

Low Stock

Haptic Gloves v2

$245.50

Out of Stock

Quantum SSD (2TB)

$120.00

In Stock

Target Selector

Scraper Output: 0 matches

No elements matched.

3. Web Scraping Core

Module 03: Web Scraping

HTTP, HTML, and BeautifulSoup

🌐 How HTTP Works for Scrapers

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
import random

# Build a session that looks like a real browser
def create_session() -> requests.Session:
    """Build a ``requests.Session`` with browser-like headers and automatic retries.

    Returns:
        A session whose default headers imitate desktop Chrome and whose
        ``http://`` and ``https://`` adapters retry failed requests.
    """
    session = requests.Session()

    # Accept-Encoding is deliberately NOT overridden: requests already sets it
    # to the encodings it can actually decode. Advertising 'br' without a
    # Brotli decoder installed can produce an undecodable response body.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
    })

    # Retry on connection errors and on the throttling/server statuses below.
    # urllib3 sleeps backoff_factor * 2**(attempt - 1) before a retry but
    # issues the first retry immediately, so the waits are roughly 0s, 4s, 8s.
    retry_strategy = Retry(
        total=3,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('https://', adapter)
    session.mount('http://', adapter)

    return session

# Module-level session; polite_get() sends its requests through this object.
session = create_session()

def polite_get(url: str, min_delay: float = 1.0, max_delay: float = 3.0,
               timeout: float = 15) -> requests.Response:
    """GET ``url`` through the module-level session after a random pause.

    Args:
        url: Address to fetch.
        min_delay: Lower bound, in seconds, of the pause before the request.
        max_delay: Upper bound, in seconds, of the pause before the request.
        timeout: Seconds passed to ``session.get`` as its timeout
            (defaults to 15, the previously hard-coded value).

    Returns:
        The response, guaranteed not to carry a 4xx/5xx status.

    Raises:
        requests.HTTPError: If the final response has a 4xx/5xx status.
        requests.RequestException: For other request failures raised by
            ``session.get``.
    """
    # Randomised delay so consecutive requests are not evenly spaced.
    time.sleep(random.uniform(min_delay, max_delay))  # be polite
    response = session.get(url, timeout=timeout)
    response.raise_for_status()  # raises exception for 4xx/5xx
    return response

🔍 BeautifulSoup: Parsing HTML

from bs4 import BeautifulSoup
import re

# NOTE(review): `response` is not defined in this snippet; presumably it is
# the result of an earlier polite_get() call — confirm before running.
html = response.text
soup = BeautifulSoup(html, 'lxml')  # always use lxml

# Find by tag — find() returns None when the tag is absent, so guard first
heading = soup.find('h1')
title = heading.get_text(strip=True) if heading is not None else None

# Find by CSS class
products = soup.find_all('div', class_='product-card')

# CSS selectors (most powerful)
prices = soup.select('div.product-card span.price')

# Find by attribute
links = soup.find_all('a', href=re.compile(r'/product/\d+'))

# Navigate the tree
for product in products:
    name_tag = product.select_one('h2.product-title')
    price_tag = product.select_one('span.price')
    image_tag = product.select_one('img')
    link_tag = product.select_one('a.product-link')

    # select_one() returns None for a missing element; skip incomplete cards
    # instead of letting one malformed card abort the whole loop.
    if name_tag is None or price_tag is None:
        continue

    name = name_tag.get_text(strip=True)
    price_text = price_tag.get_text(strip=True)
    try:
        price = float(re.sub(r'[^\d.]', '', price_text))  # extract number
    except ValueError:
        continue  # price text contained no parseable number

    # .get() yields None rather than raising when the attribute is absent
    image_url = image_tag.get('src') if image_tag is not None else None
    product_url = link_tag.get('href') if link_tag is not None else None

    print(f'{name}: ${price:.2f}')

🏭 Production Scraper Pattern

A production scraper handles pagination, errors, deduplication, and structured output. Here is the complete pattern:

import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict
from typing import List, Optional, Iterator
import csv
import json
from loguru import logger
from datetime import datetime

@dataclass
class Product:
    """One scraped catalogue item.

    ``scraped_at`` is an ISO-8601 timestamp. When it is left empty it is set
    to the current local time at construction; an explicitly supplied value
    is preserved.
    """
    name: str
    price: float
    url: str
    image_url: str
    scraped_at: str = ''

    def __post_init__(self):
        # Stamp only when the caller did not provide a timestamp, so records
        # rebuilt from stored data keep their original scrape time.
        if not self.scraped_at:
            self.scraped_at = datetime.now().isoformat()

class EcommerceScraper:
    """Paginated catalogue scraper for books.toscrape.com.

    Walks ``/catalogue/page-N.html`` in order, converts every
    ``article.product_pod`` element into a ``Product`` and yields each
    product URL at most once.
    """

    BASE_URL = 'https://books.toscrape.com'

    def __init__(self):
        # NOTE(review): get_page() fetches through polite_get(), which uses the
        # module-level session, so this per-instance session is currently unused.
        self.session = create_session()
        self.seen_urls = set()  # product URLs already yielded (deduplication)

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch ``url`` and return the parsed document, or ``None`` on failure.

        Request errors are logged rather than raised.
        """
        try:
            resp = polite_get(url)
        except requests.RequestException as e:
            logger.error(f'Failed to fetch {url}: {e}')
            return None

        # Parse the raw bytes instead of resp.text: when the server sends no
        # charset, requests falls back to ISO-8859-1 for text/* and UTF-8
        # pages turn into mojibake (e.g. '£' becomes 'Â£'). A charset declared
        # in the Content-Type header is still honoured.
        content_type = resp.headers.get('Content-Type', '')
        declared = resp.encoding if 'charset' in content_type.lower() else None
        return BeautifulSoup(resp.content, 'lxml', from_encoding=declared)

    def parse_product(self, article: BeautifulSoup, base_url: str) -> Optional[Product]:
        """Convert one ``article.product_pod`` element into a ``Product``.

        Args:
            article: The product element to read.
            base_url: Currently unused; URLs are built from ``BASE_URL``.

        Returns:
            The parsed product, or ``None`` (after logging a warning) when an
            expected element or attribute is missing or the price is not a
            number.
        """
        try:
            link = article.select_one('h3 a')
            name = link['title']
            price_text = article.select_one('p.price_color').get_text(strip=True)
            # 'Â' is stripped as well in case the page was mis-decoded upstream.
            price = float(price_text.replace('£', '').replace('Â', ''))
            rel_url = link['href'].replace('../', '')
            url = f'{self.BASE_URL}/catalogue/{rel_url}'
            image = self.BASE_URL + article.select_one('img')['src'].replace('..', '')
            return Product(name=name, price=price, url=url, image_url=image)
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            # TypeError covers subscripting the None that select_one() returns
            # for a missing element (e.g. None['title']).
            logger.warning(f'Parse error: {e}')
            return None

    def scrape_all(self) -> Iterator[Product]:
        """Yield unique products page by page.

        Stops when a page cannot be fetched or contains no product elements.
        """
        page = 1
        while True:
            url = f'{self.BASE_URL}/catalogue/page-{page}.html'
            logger.info(f'Scraping page {page}')
            soup = self.get_page(url)
            if soup is None:
                break

            articles = soup.select('article.product_pod')
            if not articles:
                logger.info('No more products — scraping complete')
                break

            for article in articles:
                product = self.parse_product(article, self.BASE_URL)
                if product and product.url not in self.seen_urls:
                    self.seen_urls.add(product.url)
                    yield product

            page += 1

# Collect every product the scraper yields
scraper = EcommerceScraper()
products = [*scraper.scrape_all()]

# Write one CSV row per product, preceded by a header row
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(
        f,
        fieldnames=['name', 'price', 'url', 'image_url', 'scraped_at'],
    )
    writer.writeheader()
    writer.writerows(map(asdict, products))

logger.success(f'Scraped {len(products)} unique products')

🤖 Respecting robots.txt

from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin, urlparse

def can_fetch(base_url: str, path: str, user_agent: str = '*') -> bool:
    """Report whether the site's robots.txt permits fetching ``path``.

    The robots.txt file is downloaded from the root of ``base_url`` on every
    call.

    Args:
        base_url: Site address; ``/robots.txt`` and ``path`` are resolved
            against it.
        path: Path (or URL) to check.
        user_agent: Agent name to evaluate the rules for.

    Returns:
        ``True`` if the parsed rules allow ``user_agent`` to fetch the target.
    """
    parser = RobotFileParser(urljoin(base_url, '/robots.txt'))
    parser.read()  # network request for robots.txt
    return parser.can_fetch(user_agent, urljoin(base_url, path))

# Consult robots.txt first and report the verdict
print(
    'Scraping is permitted'
    if can_fetch('https://example.com', '/products/')
    else 'robots.txt disallows scraping this path'
)

Automation Arena: Target Practice

VOID MART.

CyberDeck Keyboard

$149.99

In Stock

Neural-Link Headset

$899.00

Low Stock

Haptic Gloves v2

$245.50

Out of Stock

Quantum SSD (2TB)

$120.00

In Stock

Target Selector

Scraper Output: 0 matches

No elements matched.

3. Web Scraping Core

HTTP, HTML, and BeautifulSoup

🌐 How HTTP Works for Scrapers

🔍 BeautifulSoup: Parsing HTML

🏭 Production Scraper Pattern

🤖 Respecting robots.txt

VOID MART.

CyberDeck Keyboard

Neural-Link Headset

Haptic Gloves v2

Quantum SSD (2TB)

Knowledge Check

3. Web Scraping Core

HTTP, HTML, and BeautifulSoup

🌐 How HTTP Works for Scrapers

🔍 BeautifulSoup: Parsing HTML

🏭 Production Scraper Pattern

🤖 Respecting robots.txt

VOID MART.

CyberDeck Keyboard

Neural-Link Headset

Haptic Gloves v2

Quantum SSD (2TB)

Knowledge Check