3. Web Scraping Core
HTTP, HTML, and BeautifulSoup
Web scraping is the extraction of structured data from the chaos of HTML. Every webpage you visit is just text — HTML markup interspersed with content. Your job as an automation engineer is to write code that fetches that text and extracts exactly what you need with surgical precision.
This module covers the complete scraping pipeline: making HTTP requests correctly, parsing HTML with BeautifulSoup, extracting data using CSS selectors and XPath, and handling the common edge cases that break amateur scrapers in production.
How HTTP Works for Scrapers
Before you scrape, you must think like a browser. When you visit a webpage, your browser sends an HTTP request with specific headers. Websites use these headers to identify who is making the request. A raw Python request with no headers looks nothing like a browser — and many sites will immediately block it or return empty content.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
import random
# Build a session that looks like a real browser
def create_session() -> requests.Session:
    """Create a requests session with browser-like headers and automatic retries.

    Returns:
        A ``requests.Session`` whose default headers mimic desktop Chrome and
        whose ``http://`` and ``https://`` adapters retry failed requests.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        # Accept-Encoding is deliberately NOT overridden. requests already
        # advertises exactly the encodings it can decode; hard-coding 'br'
        # invites Brotli responses that come back undecoded unless the
        # optional brotli package is installed.
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
    })

    # Automatic retry on network errors and on the listed HTTP statuses.
    # Delays grow exponentially from backoff_factor; the exact schedule
    # (notably whether the first retry waits) depends on the urllib3 version.
    retry_strategy = Retry(
        total=3,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    return session
# Shared session used by polite_get()
session = create_session()


def polite_get(url: str, min_delay: float = 1.0, max_delay: float = 3.0,
               timeout: float = 15) -> requests.Response:
    """Fetch *url* through the shared session after a random courtesy delay.

    Args:
        url: Address to request.
        min_delay: Lower bound, in seconds, of the pause before the request.
        max_delay: Upper bound, in seconds, of the pause before the request.
        timeout: Seconds passed to ``session.get`` as its timeout.

    Returns:
        The response object returned by ``session.get``.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` for 4xx/5xx responses.
    """
    time.sleep(random.uniform(min_delay, max_delay))  # be polite
    response = session.get(url, timeout=timeout)
    response.raise_for_status()  # raises exception for 4xx/5xx
    return response


# BeautifulSoup: Parsing HTML
BeautifulSoup turns raw HTML into a navigable tree structure. You can find elements by tag, class, id, attribute, or any CSS selector. The lxml parser is significantly faster than Python's built-in html.parser:
from bs4 import BeautifulSoup
import re
# NOTE(review): assumes `response` holds a previously fetched page,
# e.g. the return value of polite_get() - confirm against the caller.
html = response.text
soup = BeautifulSoup(html, 'lxml')  # always use lxml

# Lookup by tag name
title = soup.find('h1').get_text(strip=True)

# Lookup by CSS class
products = soup.find_all('div', class_='product-card')

# Lookup by CSS selector (most powerful)
prices = soup.select('div.product-card span.price')

# Lookup by attribute, matching href against a regular expression
links = soup.find_all('a', href=re.compile(r'/product/\d+'))

# Walk the tree: pull the fields out of every product card
for product in products:
    title_tag = product.select_one('h2.product-title')
    name = title_tag.get_text(strip=True)
    price_tag = product.select_one('span.price')
    price_text = price_tag.get_text(strip=True)
    # Drop everything except digits and the decimal point before converting
    numeric_part = re.sub(r'[^\d.]', '', price_text)
    price = float(numeric_part)
    image_url = product.select_one('img')['src']
    product_url = product.select_one('a.product-link')['href']
    print(f'{name}: ${price:.2f}')

# Production Scraper Pattern
A production scraper handles pagination, errors, deduplication, and structured output. Here is the complete pattern:
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict
from typing import List, Optional, Iterator
import csv
import json
from loguru import logger
from datetime import datetime
@dataclass
class Product:
    """One scraped catalogue item.

    ``scraped_at`` is an ISO-8601 timestamp string. When it is left empty it
    is filled with the local time at construction; an explicitly supplied
    value is preserved.
    """

    name: str
    price: float
    url: str
    image_url: str
    scraped_at: str = ''

    def __post_init__(self):
        # Stamp the record only when the caller did not provide a timestamp.
        # Overwriting unconditionally would discard values passed in
        # explicitly (for example when rebuilding records from saved data).
        if not self.scraped_at:
            self.scraped_at = datetime.now().isoformat()
class EcommerceScraper:
    """Paginated scraper for the books.toscrape.com catalogue.

    Walks ``/catalogue/page-N.html`` pages, converts every
    ``article.product_pod`` element into a ``Product`` and skips products
    whose URL has already been yielded.
    """

    BASE_URL = 'https://books.toscrape.com'

    def __init__(self):
        # NOTE(review): get_page() fetches through polite_get(), which uses the
        # module-level session, so this per-instance session is currently unused.
        self.session = create_session()
        self.seen_urls = set()  # product URLs already yielded (deduplication)

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch and parse *url*; log and return ``None`` if the request fails."""
        try:
            resp = polite_get(url)
        except requests.RequestException as e:
            logger.error(f'Failed to fetch {url}: {e}')
            return None
        return BeautifulSoup(resp.text, 'lxml')

    @staticmethod
    def _parse_price(price_text: str) -> float:
        """Convert a displayed price such as ``'£51.77'`` to a float.

        Keeps only ASCII digits and the decimal point, so any currency symbol
        (or stray mis-decoded byte in front of it) is ignored instead of
        having to be listed explicitly.

        Raises:
            ValueError: if nothing numeric remains.
        """
        numeric = ''.join(ch for ch in price_text if ch in '0123456789.')
        return float(numeric)

    def parse_product(self, article: BeautifulSoup, base_url: str) -> Optional[Product]:
        """Build a ``Product`` from one product element.

        Args:
            article: The parsed ``article.product_pod`` element.
            base_url: Unused; URLs are built from ``BASE_URL``. Kept so
                existing callers keep working.

        Returns:
            The parsed product, or ``None`` (after logging a warning) when an
            expected element or attribute is missing or the price is not
            numeric.
        """
        try:
            link = article.select_one('h3 a')
            name = link['title']
            price_text = article.select_one('p.price_color').get_text(strip=True)
            price = self._parse_price(price_text)
            rel_url = link['href'].replace('../', '')
            url = f'{self.BASE_URL}/catalogue/{rel_url}'
            image = self.BASE_URL + article.select_one('img')['src'].replace('..', '')
            return Product(name=name, price=price, url=url, image_url=image)
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            # TypeError covers subscripting the None that select_one() returns
            # when an element is absent; without it one malformed card would
            # abort the whole scrape.
            logger.warning(f'Parse error: {e}')
            return None

    def scrape_all(self) -> Iterator[Product]:
        """Yield each unique product, page by page.

        Stops at the first page that cannot be fetched or that contains no
        product elements.
        """
        page = 1
        while True:
            url = f'{self.BASE_URL}/catalogue/page-{page}.html'
            logger.info(f'Scraping page {page}')
            soup = self.get_page(url)
            if soup is None:
                break
            articles = soup.select('article.product_pod')
            if not articles:
                logger.info('No more products — scraping complete')
                break
            for article in articles:
                product = self.parse_product(article, self.BASE_URL)
                if product and product.url not in self.seen_urls:
                    self.seen_urls.add(product.url)
                    yield product
            page += 1
# Run the scraper and collect every product in memory
scraper = EcommerceScraper()
products = list(scraper.scrape_all())

# Save to CSV: one row per product, columns in dataclass field order
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(
        f,
        fieldnames=['name', 'price', 'url', 'image_url', 'scraped_at'],
    )
    writer.writeheader()
    writer.writerows(map(asdict, products))

logger.success(f'Scraped {len(products)} unique products')

# Respecting robots.txt
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin, urlparse
def can_fetch(base_url: str, path: str, user_agent: str = '*') -> bool:
    """Report whether the site's robots.txt allows *user_agent* to fetch *path*.

    Args:
        base_url: Site address; ``/robots.txt`` is resolved against it.
        path: Path (or URL) to check, resolved against *base_url*.
        user_agent: Agent name to look up in robots.txt.

    Returns:
        The parser's verdict for the resolved target URL.

    Note:
        robots.txt is downloaded on every call.
    """
    parser = RobotFileParser(urljoin(base_url, '/robots.txt'))
    parser.read()
    return parser.can_fetch(user_agent, urljoin(base_url, path))
# Always check before scraping
print(
    'Scraping is permitted'
    if can_fetch('https://example.com', '/products/')
    else 'robots.txt disallows scraping this path'
)

# VOID MART.
CyberDeck Keyboard
Neural-Link Headset
Haptic Gloves v2
Quantum SSD (2TB)
No elements matched.
Knowledge Check
Ready to test your understanding of 3. Web Scraping Core?