Web Scraping and API Data Collection
Web scraping and APIs are essential for collecting data that is not available in standard datasets. This lesson covers the main tools (BeautifulSoup, Scrapy, and REST APIs) along with ethical, rate-limited approaches to data collection.
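HTTP Basics
Scrapers and API clients both talk to servers over HTTP: the client sends a request (a method such as GET or POST, a URL, and headers), and the server answers with a status code (for example 200 OK, 404 Not Found, or 429 Too Many Requests) and a response body.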
BeautifulSoup for HTML Scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Basic scraping workflow: fetch one page, parse it, and tabulate the products.
url = "https://example.com/products"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of parsing an error page as if it were data.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find elements
titles = soup.find_all('h2', class_='product-title')
prices = soup.find_all('span', class_='price')

# Extract data.
# NOTE: zip() pairs titles and prices purely by position and stops at the
# shorter list, so a product missing either element misaligns the rows.
data = [
    {
        'title': title.text.strip(),
        # Drop the currency symbol and thousands separators ("$1,299.00").
        'price': float(price.text.strip().replace('$', '').replace(',', '')),
    }
    for title, price in zip(titles, prices)
]
df = pd.DataFrame(data)
# Handle pagination: walk pages 1-10 and pool every product found.
all_products = []
for page in range(1, 11):
    url = f"https://example.com/products?page={page}"
    # Bound each request and stop on HTTP errors rather than parsing an
    # error page as if it were a product listing.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract and append products.
    # NOTE(review): extract_products is not defined in this snippet;
    # presumably it returns an iterable of product records — confirm.
    products = extract_products(soup)
    all_products.extend(products)
Scrapy Framework
import scrapy
class ProductSpider(scrapy.Spider):
    """Spider that scrapes product entries and follows the "next" link."""

    name = 'products'
    start_urls = ['https://example.com/products']

    def parse(self, response):
        """Yield one item per ``div.product``, then a request for the next page.

        Each item is a dict with the keys ``title``, ``price`` and ``rating``,
        holding whatever the corresponding CSS selector's ``.get()`` returns.
        When an ``a.next`` link with a non-empty ``href`` is present, a
        follow-up request handled by this same method is yielded last.
        """
        for card in response.css('div.product'):
            title = card.css('h2::text').get()
            price = card.css('span.price::text').get()
            rating = card.css('div.rating::attr(data-rating)').get()
            yield {'title': title, 'price': price, 'rating': rating}

        # Follow pagination
        next_href = response.css('a.next::attr(href)').get()
        if not next_href:
            return
        yield response.follow(next_href, self.parse)
REST API Integration
import requests
import pandas as pd
from datetime import datetime
# Basic API request
api_url = "https://api.example.com/v1/data"
headers = {
    # Placeholder token: load the real key from configuration or the
    # environment rather than committing it to source control.
    "Authorization": "Bearer YOUR_API_KEY",
    "Content-Type": "application/json",
}
params = {
    "start_date": "2024-01-01",
    "end_date": "2024-12-31",
    "limit": 1000,
}
# A timeout stops the script from blocking indefinitely on a stalled server.
response = requests.get(api_url, headers=headers, params=params, timeout=10)
if response.status_code == 200:
    data = response.json()
    # assumes the payload is an object with a 'results' list of records
    # — TODO confirm against the API's documentation
    df = pd.DataFrame(data['results'])
else:
    print(f"Error: {response.status_code}")
# Pagination handling
def fetch_all_pages(api_url, headers, params, timeout=10):
    """Collect the ``results`` of every page of a paginated API endpoint.

    Requests pages 1, 2, 3, ... until a response has a status other than 200
    or its JSON body has no truthy ``has_next_page`` field.

    Args:
        api_url: Endpoint URL to query.
        headers: HTTP headers sent with every request.
        params: Base query parameters. The mapping is copied, so the caller's
            object is left unmodified.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list with the ``results`` entries of all fetched pages, in page
        order. If a request returns a non-200 status, the items collected up
        to that point are returned.
    """
    # Work on a copy so the caller's dict does not pick up a stray 'page' key.
    query = dict(params or {})
    all_data = []
    page = 1
    while True:
        query['page'] = page
        response = requests.get(
            api_url, headers=headers, params=query, timeout=timeout
        )
        if response.status_code != 200:
            # Best effort: keep what was fetched so far instead of raising.
            break
        data = response.json()
        all_data.extend(data['results'])
        if not data.get('has_next_page'):
            break
        page += 1
    return all_data
Rate Limiting and Ethics
import time
import random
from urllib.robotparser import RobotFileParser
# Check robots.txt
def can_scrape(url, user_agent="*"):
    """Return whether the site's robots.txt allows fetching ``url``.

    Args:
        url: Absolute URL of the page that is about to be requested.
        user_agent: User-agent token passed to the robots.txt check;
            defaults to ``"*"``.

    Returns:
        The result of ``RobotFileParser.can_fetch(user_agent, url)`` after
        downloading the robots.txt of the host that serves ``url``.
    """
    from urllib.parse import urljoin

    # robots.txt lives at the root of the host, not underneath the page path:
    # https://example.com/products?page=2 -> https://example.com/robots.txt
    # (a bare site root such as https://example.com maps to the same URL the
    # previous string concatenation produced).
    robots_url = urljoin(url, "/robots.txt")

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()  # performs the network request for robots.txt
    return rp.can_fetch(user_agent, url)
# Rate limiting
class RateLimiter:
    """Enforce a minimum interval between consecutive requests."""

    def __init__(self, requests_per_second=1):
        """Configure the limiter.

        Args:
            requests_per_second: Maximum request rate; must be positive.

        Raises:
            ValueError: If ``requests_per_second`` is zero or negative.
        """
        if requests_per_second <= 0:
            raise ValueError("requests_per_second must be positive")
        # Minimum number of seconds between two requests.
        self.delay = 1 / requests_per_second
        # Monotonic timestamp of the latest request; -inf means "none yet",
        # so the very first wait() never sleeps.
        self.last_request = float("-inf")

    def wait(self):
        """Sleep just long enough to respect the rate, then record the time."""
        # time.monotonic() cannot jump when the system clock is adjusted,
        # which would otherwise cause a missed or over-long sleep.
        elapsed = time.monotonic() - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request = time.monotonic()
# Usage with polite scraping
limiter = RateLimiter(requests_per_second=2)
# Identify the bot and give site owners a way to reach its operator.
# Built once because it is the same for every request.
request_headers = {
    'User-Agent': 'DataScienceBot/1.0 (contact@example.com)'
}
# NOTE(review): `urls` is not defined in this snippet; it is presumably an
# iterable of absolute page URLs supplied by the reader — confirm.
for url in urls:
    # Skip anything robots.txt disallows.
    if not can_scrape(url):
        continue
    limiter.wait()
    # A timeout keeps one stalled server from blocking the whole crawl.
    response = requests.get(url, headers=request_headers, timeout=10)
    # Process response
Key Takeaways
- Always check robots.txt and terms of service
- Implement rate limiting to be respectful
- Prefer official APIs when available - they are more stable than scraping HTML, which breaks whenever a page's layout changes
- Handle errors and retries gracefully
- Store data responsibly and check legal compliance