4. Advanced Scraping
JavaScript Rendering and Headless Browsers
Up to 40% of the modern web is built with JavaScript frameworks (React, Vue, Angular) that render content client-side after the page loads. A standard requests scraper fetches only the initial HTML shell — the actual data never appears because it's populated by JavaScript after load. To scrape these sites, you need a real browser engine that executes JavaScript.
Playwright is the modern standard for headless browser automation. It controls Chromium, Firefox, or WebKit and gives you programmatic control over everything a user can do in a browser.
🎭 Playwright: The Complete Guide
from playwright.sync_api import sync_playwright, Page, Browser
from typing import Optional
import json
def create_stealth_browser(playwright, *, headless=True):
    """Launch Chromium and return a ``(browser, context)`` pair set up for scraping.

    The context uses a 1920x1080 viewport, a Windows desktop Chrome
    user-agent string, the ``en-US`` locale and the ``America/New_York``
    timezone. Requests for common image and font files are aborted so pages
    load faster.

    Args:
        playwright: The object yielded by ``sync_playwright()``; only
            ``playwright.chromium.launch`` is used here.
        headless: Run without a visible window (the default). Pass ``False``
            to watch the browser while debugging.

    Returns:
        A ``(browser, context)`` tuple. Nothing is closed on the success
        path; the caller owns both objects.

    Raises:
        Exception: Whatever ``new_context`` or ``route`` raises. The browser
            launched by this call is closed before the error propagates.
    """
    # NOTE(review): these flags are the ones usually needed to run Chromium
    # inside containers/CI — confirm they are wanted on a desktop machine.
    launch_args = [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
    ]
    browser = playwright.chromium.launch(headless=headless, args=launch_args)

    try:
        context = browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            locale='en-US',
            timezone_id='America/New_York',
        )
        # Block images and fonts to speed up scraping
        context.route(
            '**/*.{png,jpg,jpeg,gif,webp,svg,woff,woff2,ttf}',
            lambda route: route.abort(),
        )
    except Exception:
        # The caller never receives the browser on this path, so it has to be
        # released here or the Chromium process is left running.
        browser.close()
        raise

    return browser, context
# JavaScript run inside the page: builds one {name, price, rating} record per
# div.product-card element, falling back to '' / '0' when a field is missing.
PRODUCT_CARDS_JS = '''
() => {
return Array.from(document.querySelectorAll('div.product-card')).map(card => ({
name: card.querySelector('h2')?.innerText || '',
price: card.querySelector('.price')?.innerText || '',
rating: card.querySelector('.stars')?.dataset.rating || '0',
}));
}
'''

with sync_playwright() as p:
    browser, context = create_stealth_browser(p)
    page = context.new_page()

    # Open the listing and hold until the network has gone quiet.
    page.goto('https://spa-example.com/products', wait_until='networkidle')

    # The grid is rendered client-side, so wait (up to 10 s) for it to exist.
    page.wait_for_selector('div.product-grid', timeout=10000)

    # Jump to the bottom five times, pausing 1.5 s after each jump so
    # lazily-loaded cards have a chance to be appended.
    scrolls_left = 5
    while scrolls_left > 0:
        page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        page.wait_for_timeout(1500)
        scrolls_left -= 1

    # Pull the card data out of the DOM in a single round trip.
    products = page.evaluate(PRODUCT_CARDS_JS)

    # Show only the first three records as a preview.
    preview = products[:3]
    print(json.dumps(preview, indent=2))
    browser.close()
📡 Intercepting Network Requests
The most powerful advanced scraping technique: instead of scraping HTML, intercept the underlying API calls the JavaScript makes. This gives you clean JSON data directly from the source:
from playwright.sync_api import sync_playwright
import json
# Module-level store the handler appends to. Each entry has the shape
# {'url': <response URL>, 'data': <parsed JSON body>}.
api_responses = []


def handle_response(response):
    """Record the JSON body of a 200 response whose URL contains ``'api'``.

    Matching responses are appended to the module-level ``api_responses``
    list and announced on stdout. Responses whose body cannot be read as JSON
    are reported and skipped rather than raising, so one bad response never
    interrupts the page load.

    Args:
        response: A response object exposing ``url``, ``status`` and
            ``json()``.

    Returns:
        None. The result is the side effect on ``api_responses``.
    """
    # Capture JSON API responses
    if 'api' not in response.url or response.status != 200:
        return

    try:
        data = response.json()
    except Exception as exc:
        # Deliberately best-effort: the substring match above also hits
        # non-JSON resources, so say what was skipped instead of failing
        # silently or crashing the event handler.
        print(f'Skipped response without JSON body: {response.url} ({exc})')
        return

    api_responses.append({'url': response.url, 'data': data})
    print(f'Captured API call: {response.url}')
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()

    # Register the response handler BEFORE navigating
    page.on('response', handle_response)
    page.goto('https://example-spa.com')
    # Let in-flight requests finish before reading what was captured.
    page.wait_for_load_state('networkidle')

    # Now you have clean JSON from the API
    for captured in api_responses:
        data = captured['data']
        # A JSON body is not always an object: a top-level array is counted
        # directly, and anything else (string, number, null) has no items.
        # Calling .get() unconditionally would raise on those payloads.
        if isinstance(data, dict):
            items = data.get('items') or []
        elif isinstance(data, list):
            items = data
        else:
            items = []
        print(f"URL: {captured['url']}")
        print(f"Items: {len(items)}")
    browser.close()
🚀 Async Playwright for Scale
import asyncio
from playwright.async_api import async_playwright
from typing import List, Dict
# JavaScript evaluated in the page: reads the text of the first <h1> and of
# the first element with class "price".
_PAGE_FIELDS_JS = '''
() => ({
title: document.querySelector('h1')?.innerText,
price: document.querySelector('.price')?.innerText,
})
'''


async def scrape_page(context, url: str, *, selector: str = 'main',
                      nav_timeout: int = 15000,
                      selector_timeout: int = 5000) -> Dict:
    """Scrape one URL in a fresh page and return its title and price.

    Failures are reported in the returned dict instead of being raised, so a
    single bad URL cannot abort a whole batch.

    Args:
        context: Browser context used to open the page.
        url: Address to load.
        selector: Element that must appear before extraction starts.
        nav_timeout: Navigation timeout passed to ``page.goto`` (Playwright
            timeouts are in milliseconds).
        selector_timeout: Timeout passed to ``page.wait_for_selector``
            (milliseconds).

    Returns:
        ``{'url': url, 'title': ..., 'price': ...}`` on success, or
        ``{'url': url, 'error': <message>}`` if anything after the page was
        opened raised.
    """
    page = await context.new_page()
    try:
        await page.goto(url, wait_until='domcontentloaded', timeout=nav_timeout)
        await page.wait_for_selector(selector, timeout=selector_timeout)
        data = await page.evaluate(_PAGE_FIELDS_JS)
        return {'url': url, **data}
    except Exception as e:
        # Best-effort by design: the error is returned to the caller as data.
        return {'url': url, 'error': str(e)}
    finally:
        await page.close()  # always close pages
async def scrape_batch(urls: List[str], concurrency: int = 5):
    """Scrape many URLs with one shared browser, a few pages at a time.

    Args:
        urls: Addresses to scrape.
        concurrency: Maximum number of pages scraped at the same moment.

    Returns:
        A list with one ``scrape_page`` result per URL, in the same order as
        ``urls``. An empty ``urls`` yields ``[]`` without starting a browser.

    Raises:
        ValueError: If ``concurrency`` is less than 1. A zero-slot semaphore
            would make every task wait forever.
    """
    if concurrency < 1:
        raise ValueError(f'concurrency must be at least 1, got {concurrency}')
    if not urls:
        return []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            semaphore = asyncio.Semaphore(concurrency)

            async def bounded_scrape(url):
                # Hold a slot for the whole scrape so at most `concurrency`
                # pages are open at once.
                async with semaphore:
                    return await scrape_page(context, url)

            return await asyncio.gather(*(bounded_scrape(url) for url in urls))
        finally:
            # Close the browser even when a task raised.
            await browser.close()
# Build the 50 item URLs (ids 1 through 50) from a single template.
url_template = 'https://example.com/item/{}'
urls = [url_template.format(item_id) for item_id in range(1, 51)]

# Run the whole batch to completion, at most five pages at a time.
results = asyncio.run(scrape_batch(urls, concurrency=5))
print(f'Scraped {len(results)} pages')
VOID MART.
CyberDeck Keyboard
Neural-Link Headset
Haptic Gloves v2
Quantum SSD (2TB)
No elements matched.
Knowledge Check
Ready to test your understanding of 4. Advanced Scraping?