Python CSV — Reading & Writing Tabular Data
CSV (Comma-Separated Values) is the most common format for tabular data exchange. Python's csv module and libraries like pandas provide powerful tools for working with CSV files.
Learning Objectives
- Read and write CSV files with the csv module
- Handle different delimiters, quoting styles, and encodings
- Process large CSV files efficiently with generators
- Use DictReader and DictWriter for named column access
- Convert between CSV and other formats (JSON, database)
- Handle common real-world data quality issues
Reading CSV Files
The csv module provides several reader classes for different needs:
import csv
# Basic reading with csv.reader
with open('data.csv', 'r', newline='', encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader) # Skip header row
for row in reader:
print(row) # Each row is a list of strings
DictReader — Named Column Access
import csv
# DictReader maps each row to a dictionary using the header row
with open('employees.csv', 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
# Access columns by name — much more readable!
print(f"{row['name']} works in {row['department']}")
print(f" Salary: ${row['salary']}")
Reading with Specific Types
import csv
with open('sales.csv', 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
total = 0.0
count = 0
for row in reader:
# Convert strings to appropriate types
quantity = int(row['quantity'])
price = float(row['price'])
total += quantity * price
count += 1
print(f"Total revenue: ${total:,.2f}")
print(f"Average order: ${total/count:,.2f}")
Writing CSV Files
import csv
# Basic writing with csv.writer
data = [
["Name", "Age", "City", "Salary"],
["Alice", 30, "New York", 75000],
["Bob", 25, "Los Angeles", 65000],
["Charlie", 35, "Chicago", 80000],
["Diana", 28, "Houston", 70000]
]
with open('employees.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerows(data) # Write all rows at once
DictWriter — Write from Dictionaries
import csv
employees = [
{"name": "Alice", "age": 30, "department": "Engineering", "salary": 75000},
{"name": "Bob", "age": 25, "department": "Marketing", "salary": 65000},
{"name": "Charlie", "age": 35, "department": "Engineering", "salary": 80000},
]
fieldnames = ['name', 'age', 'department', 'salary']
with open('employees.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader() # Write column names first
writer.writerows(employees)
Appending to Existing CSV
import csv
# Open in append mode ('a')
new_employee = {"name": "Eve", "age": 22, "department": "Sales", "salary": 60000}
with open('employees.csv', 'a', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=['name', 'age', 'department', 'salary'])
writer.writerow(new_employee)
Different Delimiters and Formats
CSV files can use various delimiters depending on locale and application:
# Tab-separated values (TSV)
with open('data.tsv', 'r', newline='', encoding='utf-8') as f:
reader = csv.reader(f, delimiter='\t')
for row in reader:
print(row)
# Semicolon-separated (common in European locales)
with open('data.csv', 'r', newline='', encoding='utf-8') as f:
reader = csv.reader(f, delimiter=';')
for row in reader:
print(row)
# Pipe-separated (common in SQL exports)
with open('data.psv', 'r', newline='', encoding='utf-8') as f:
reader = csv.reader(f, delimiter='|')
for row in reader:
print(row)
Dialect Options
import csv
# Register a custom dialect
csv.register_dialect('pipe', delimiter='|', quoting=csv.QUOTE_MINIMAL)
# Use the registered dialect
with open('data.psv', 'r', newline='', encoding='utf-8') as f:
reader = csv.reader(f, dialect='pipe')
for row in reader:
print(row)
# List available dialects
print(csv.list_dialects()) # ['excel', 'excel-tab', 'unix']
Quoting Styles
import csv
# QUOTE_MINIMAL — quote only when necessary (default)
# QUOTE_ALL — quote every field
# QUOTE_NONNUMERIC — quote all non-numeric fields
# QUOTE_NONE — never quote (escape instead)
data = [
["Name", "Description"],
["Widget", 'A "nice" widget'],
["Gadget", "Comma, in description"]
]
with open('quoted.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f, quoting=csv.QUOTE_ALL)
writer.writerows(data)
Handling Encoding Issues
Real-world CSV files often have encoding problems:
import csv
# UTF-8 with BOM (common from Excel exports)
with open('excel_export.csv', 'r', encoding='utf-8-sig') as f:
reader = csv.DictReader(f)
for row in reader:
print(row['name'])
# Latin-1 encoding (common in legacy systems)
with open('legacy_data.csv', 'r', encoding='latin-1') as f:
reader = csv.DictReader(f)
for row in reader:
print(row['name'])
# Handle encoding errors gracefully
with open('data.csv', 'r', encoding='utf-8', errors='replace') as f:
reader = csv.reader(f)
for row in reader:
print(row)
Processing Large Files
For files that don't fit in memory, process row-by-row:
import csv
def process_large_csv(filename):
"""Calculate statistics without loading entire file into memory."""
total_amount = 0.0
count = 0
max_amount = float('-inf')
min_amount = float('inf')
with open(filename, 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
amount = float(row['amount'])
total_amount += amount
count += 1
max_amount = max(max_amount, amount)
min_amount = min(min_amount, amount)
return {
'total': total_amount,
'count': count,
'average': total_amount / count if count else 0,
'max': max_amount,
'min': min_amount
}
Generator-Based Processing
import csv
def read_csv_chunks(filename, chunk_size=1000):
"""Read CSV in chunks for memory-efficient processing."""
with open(filename, 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
chunk = []
for row in reader:
chunk.append(row)
if len(chunk) >= chunk_size:
yield chunk
chunk = []
if chunk:
yield chunk
# Process file in chunks
for chunk in read_csv_chunks('huge_file.csv'):
# Process each chunk (e.g., insert into database)
process_chunk(chunk)
Real-World Examples
Example 1: Data Processing Pipeline
import csv
from datetime import datetime
def process_sales_data(input_file, output_file):
"""Clean and transform sales data."""
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
open(output_file, 'w', newline='', encoding='utf-8') as outfile:
reader = csv.DictReader(infile)
writer = csv.DictWriter(outfile, fieldnames=[
'date', 'product', 'quantity', 'price', 'total', 'category'
])
writer.writeheader()
for row in reader:
# Clean and validate data
try:
quantity = int(row['quantity'])
price = float(row['price'].replace('$', '').replace(',', ''))
total = quantity * price
date = datetime.strptime(row['date'], '%Y-%m-%d').strftime('%m/%d/%Y')
writer.writerow({
'date': date,
'product': row['product'].strip().title(),
'quantity': quantity,
'price': f"${price:.2f}",
'total': f"${total:.2f}",
'category': categorize_product(row['product'])
})
except (ValueError, KeyError) as e:
print(f"Skipping row due to error: {e}")
continue
def categorize_product(product_name):
"""Simple product categorization."""
name_lower = product_name.lower()
if any(word in name_lower for word in ['laptop', 'computer', 'phone']):
return 'Electronics'
elif any(word in name_lower for word in ['shirt', 'pants', 'dress']):
return 'Clothing'
else:
return 'Other'
Example 2: CSV to JSON Conversion
import csv
import json
def csv_to_json(csv_file, json_file):
"""Convert CSV file to JSON format."""
data = []
with open(csv_file, 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
# Convert numeric fields
for key, value in row.items():
try:
row[key] = int(value)
except ValueError:
try:
row[key] = float(value)
except ValueError:
pass # Keep as string
data.append(row)
with open(json_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2)
print(f"Converted {len(data)} rows from CSV to JSON")
return data
Example 3: Merging Multiple CSV Files
import csv
import glob
def merge_csv_files(pattern, output_file):
"""Merge multiple CSV files with the same structure."""
all_rows = []
fieldnames = None
for filename in glob.glob(pattern):
with open(filename, 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
if fieldnames is None:
fieldnames = reader.fieldnames
for row in reader:
all_rows.append(row)
with open(output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(all_rows)
print(f"Merged {len(all_rows)} rows from {len(glob.glob(pattern))} files")
Example 4: CSV Validation and Reporting
import csv
from collections import defaultdict
def validate_csv(filename, required_columns=None):
"""Validate CSV file structure and data quality."""
issues = []
row_count = 0
column_counts = defaultdict(int)
with open(filename, 'r', newline='', encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader)
# Check required columns
if required_columns:
missing = set(required_columns) - set(header)
if missing:
issues.append(f"Missing required columns: {missing}")
for row_num, row in enumerate(reader, start=2):
row_count += 1
# Check for empty rows
if not any(cell.strip() for cell in row):
issues.append(f"Row {row_num}: Empty row")
continue
# Check for inconsistent column counts
if len(row) != len(header):
issues.append(f"Row {row_num}: Expected {len(header)} columns, got {len(row)}")
# Track column usage
for i, cell in enumerate(row):
if i < len(header):
column_counts[header[i]] += 1
# Generate report
report = {
'total_rows': row_count,
'columns': list(header),
'issues': issues,
'column_usage': dict(column_counts)
}
return report
Common Mistakes
| Mistake | Problem | Solution |
|---|---|---|
Forgetting newline='' | Extra blank lines on Windows | Always use newline='' in open() |
| Not specifying encoding | UnicodeDecodeError on special characters | Use encoding='utf-8' or encoding='latin-1' |
| Reading entire file into memory | Memory issues with large files | Process row-by-row or use generators |
Using csv.writer without writerows() | Only writes one row | Use writerows() for multiple rows |
| Not handling BOM | First column name has \ufeff prefix | Use encoding='utf-8-sig' for BOM files |
| Assuming all rows have same length | IndexError on malformed data | Check len(row) before accessing |
Best Practices
import csv
from contextlib import contextmanager
# 1. Always use context managers
@contextmanager
def read_csv(filename):
with open(filename, 'r', newline='', encoding='utf-8') as f:
yield csv.DictReader(f)
# 2. Use DictReader for readable code
with read_csv('data.csv') as reader:
for row in reader:
process(row['name'], row['value'])
# 3. Validate data before processing
def safe_float(value, default=0.0):
try:
return float(value)
except (ValueError, TypeError):
return default
# 4. Use type hints for clarity
from typing import List, Dict, Any
def process_rows(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
return [transform(row) for row in rows]
# 5. Log errors and continue processing
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def robust_csv_processor(filename):
with open(filename, 'r', newline='', encoding='utf-8') as f:
reader = csv.DictReader(f)
for i, row in enumerate(reader, start=2):
try:
process(row)
except Exception as e:
logger.error(f"Error processing row {i}: {e}")
continue
Pandas CSV Operations
For more complex CSV operations, pandas provides additional functionality:
import pandas as pd
# Read CSV with pandas
df = pd.read_csv('data.csv')
print(df.head())
print(df.describe())
# Read only specific columns
df = pd.read_csv('data.csv', usecols=['name', 'age', 'salary'])
# Read in chunks
chunks = pd.read_csv('huge_file.csv', chunksize=10000)
for chunk in chunks:
process(chunk)
# Write CSV with pandas
df.to_csv('output.csv', index=False)
# Handle different encodings
df = pd.read_csv('data.csv', encoding='latin-1')
Key Takeaways
- Always use
newline=''when opening CSV files in Python to prevent extra blank lines DictReaderandDictWriterprovide named column access for more readable code- Always specify
encodingparameter — useutf-8-sigfor files with BOM - Process row-by-row for large files to avoid memory issues
- Use context managers (
withstatements) for safe file handling - Validate data quality before processing — check for empty rows, missing columns, and type errors
- For complex transformations, consider using pandas which handles edge cases automatically