File I/O and Data Import
Loading data from files and databases is where every data science project begins. This lesson covers reading and writing structured data formats, file handling best practices, and techniques for working with large files.
Data Import Pipeline
File Basics
CSV format: Each row is a record, columns separated by delimiter: .
JSON format: Nested key-value pairs: where values can be arrays or nested objects.
File size: .
Opening and Closing Files
# Basic file open/close
f = open("data.txt", "r")
content = f.read()
f.close()
# Better: use context manager (auto-closes)
with open("data.txt", "r") as f:
content = f.read()
File Modes
"r" Read (default). File must exist.
"w" Write. Creates new or truncates existing.
"a" Append. Creates new or appends to existing.
"x" Exclusive create. Fails if file exists.
"rb" Read binary.
"wb" Write binary.
"r+" Read and write. File must exist.
"w+" Write and read. Truncates existing.
Reading Files
# Read entire file
with open("data.txt", "r") as f:
content = f.read()
# Read line by line (memory efficient for large files)
with open("large.txt", "r") as f:
for line in f:
process(line.strip())
# Read all lines into a list
with open("data.txt", "r") as f:
lines = f.readlines() # Includes newline characters
# Read without newlines
with open("data.txt", "r") as f:
lines = [line.strip() for line in f]
Writing Files
# Write strings
with open("output.txt", "w") as f:
f.write("Hello, World!\n")
f.write("Second line\n")
# Write multiple lines
lines = ["line 1", "line 2", "line 3"]
with open("output.txt", "w") as f:
f.writelines(line + "\n" for line in lines)
# Append to existing file
with open("log.txt", "a") as f:
f.write("New log entry\n")
Reading CSV Files
CSV is the most common data format. Python's csv module and pandas handle it well.
With the csv Module
import csv
# Basic reading
with open("data.csv", "r") as f:
reader = csv.reader(f)
header = next(reader) # First row as header
for row in reader:
print(row) # Each row is a list of strings
# Reading as dictionaries
with open("data.csv", "r") as f:
reader = csv.DictReader(f)
for row in reader:
print(row["name"], row["age"]) # Access by column name
# Writing CSV
with open("output.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["name", "age", "city"]) # Header
writer.writerow(["Alice", 30, "NYC"])
writer.writerow(["Bob", 25, "LA"])
With pandas
import pandas as pd
# Basic read
df = pd.read_csv("data.csv")
print(df.head())
# Common options
df = pd.read_csv(
"data.csv",
sep=",", # Delimiter (default comma)
header=0, # Row number for column names
names=["col1", "col2"], # Custom column names
index_col="id", # Use column as index
usecols=["name", "age"], # Read only these columns
dtype={"age": int}, # Specify column types
na_values=["N/A", ""], # Custom NA values
parse_dates=["date"], # Parse date columns
nrows=1000, # Read only first 1000 rows
skiprows=[1, 3], # Skip specific rows
encoding="utf-8", # File encoding
)
# Writing CSV
df.to_csv("output.csv", index=False)
df.to_csv("output.csv", columns=["name", "age"]) # Specific columns
Reading JSON Files
JSON is the standard for API responses and nested data.
import json
# Read JSON
with open("data.json", "r") as f:
data = json.load(f) # Parses to Python dict/list
# Read JSON lines (one JSON object per line)
records = []
with open("events.jsonl", "r") as f:
for line in f:
records.append(json.loads(line))
# Write JSON
with open("output.json", "w") as f:
json.dump(data, f, indent=2)
# Pretty print
print(json.dumps(data, indent=2, sort_keys=True))
JSON with pandas
import pandas as pd
# Simple JSON (flat)
df = pd.read_json("data.json")
# Nested JSON
import json
with open("nested.json", "r") as f:
raw = json.load(f)
# Flatten nested structure
df = pd.json_normalize(
raw,
record_path="events", # Key containing the list
meta=["user_id", "session"], # Parent fields to keep
errors="ignore"
)
# JSON Lines format (common in ML datasets)
df = pd.read_json("data.jsonl", lines=True)
# Write JSON
df.to_json("output.json", orient="records", indent=2)
df.to_json("output.jsonl", orient="records", lines=True)
Reading Excel Files
import pandas as pd
# Basic read
df = pd.read_excel("data.xlsx")
# With options
df = pd.read_excel(
"data.xlsx",
sheet_name="Sheet1", # Or index 0
header=0, # Row number for header
names=["col1", "col2"], # Custom names
skiprows=2, # Skip first 2 rows
usecols="A:C", # Excel range notation
nrows=1000,
na_values=["N/A"],
engine="openpyxl", # Required engine
)
# Read multiple sheets
sheets = pd.read_excel("data.xlsx", sheet_name=None) # Dict of DataFrames
for name, df in sheets.items():
print(f"Sheet '{name}': {df.shape}")
# Write Excel
df.to_excel("output.xlsx", index=False, sheet_name="Results")
# Multiple sheets
with pd.ExcelWriter("output.xlsx") as writer:
df1.to_excel(writer, sheet_name="Summary", index=False)
df2.to_excel(writer, sheet_name="Details", index=False)
Reading from Databases
SQLite (Built-in)
import sqlite3
import pandas as pd
# Connect to database (creates if doesn't exist)
conn = sqlite3.connect("my_database.db")
# Read query results into DataFrame
df = pd.read_sql_query("SELECT * FROM users WHERE age > 25", conn)
# Execute queries
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY,
name TEXT,
age INTEGER
)
""")
cursor.execute("INSERT INTO users (name, age) VALUES (?, ?)", ("Alice", 30))
conn.commit()
# Close connection
conn.close()
With SQLAlchemy (Works with Most Databases)
from sqlalchemy import create_engine
import pandas as pd
# Create engine (supports many databases)
engine = create_engine("sqlite:///my_database.db")
# engine = create_engine("postgresql://user:pass@localhost:5432/mydb")
# engine = create_engine("mysql+pymysql://user:pass@localhost/mydb")
# engine = create_engine("mssql+pyodbc://user:pass@dsn")
# Read
df = pd.read_sql("SELECT * FROM users", engine)
# Write (creates table automatically)
df.to_sql("users", engine, if_exists="replace", index=False)
df.to_sql("new_users", engine, if_exists="append", index=False) # Add rows
Chunked Reading for Large Files
When files are too large to fit in memory, process them in chunks.
CSV Chunks
import pandas as pd
# Process CSV in chunks
chunk_size = 10_000
results = []
for chunk in pd.read_csv("huge_file.csv", chunksize=chunk_size):
# Process each chunk
filtered = chunk[chunk["amount"] > 100]
results.append(filtered)
# Combine results
df = pd.concat(results, ignore_index=True)
print(f"Total rows processed: {len(df)}")
# Or process and write incrementally
with pd.open("output.csv", "w") as writer:
for chunk in pd.read_csv("huge_file.csv", chunksize=10_000):
processed = chunk.groupby("category").sum()
processed.to_csv(writer)
JSON Lines Chunks
import json
import pandas as pd
# Process JSONL in chunks
chunk_size = 5000
chunk_num = 0
records = []
with open("events.jsonl", "r") as f:
for line in f:
records.append(json.loads(line))
if len(records) >= chunk_size:
df = pd.DataFrame(records)
process_chunk(df)
records = []
chunk_num += 1
# Process remaining records
if records:
df = pd.DataFrame(records)
process_chunk(df)
Working with Compressed Files
import pandas as pd
import gzip
import json
# Pandas handles compression automatically
df = pd.read_csv("data.csv.gz") # gzip
df = pd.read_csv("data.csv.bz2") # bzip2
df = pd.read_csv("data.csv.xz") # xz
df = pd.read_json("data.json.gz", compression="gzip")
# Writing compressed
df.to_csv("output.csv.gz", compression="gzip")
# Manual gzip reading
with gzip.open("data.json.gz", "rt", encoding="utf-8") as f:
data = json.load(f)
Encoding Issues
# Common encodings
df = pd.read_csv("data.csv", encoding="utf-8") # Default, most common
df = pd.read_csv("data.csv", encoding="latin-1") # Western European
df = pd.read_csv("data.csv", encoding="cp1252") # Windows
df = pd.read_csv("data.csv", encoding="utf-8-sig") # UTF-8 with BOM
# Detect encoding (install chardet first)
import chardet
with open("mystery.csv", "rb") as f:
raw = f.read(10000)
result = chardet.detect(raw)
print(result) # {'encoding': 'utf-8', 'confidence': 0.99}
Practical Pipeline
import pandas as pd
from pathlib import Path
def load_all_csvs(folder_path):
"""Load and concatenate all CSV files in a folder."""
folder = Path(folder_path)
files = sorted(folder.glob("*.csv"))
dfs = []
for f in files:
df = pd.read_csv(f)
df["source_file"] = f.name # Track origin
dfs.append(df)
combined = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(files)} files, {len(combined)} total rows")
return combined
def auto_detect_and_read(filepath):
"""Read a file based on its extension."""
path = Path(filepath)
ext = path.suffix.lower()
readers = {
".csv": lambda: pd.read_csv(path),
".tsv": lambda: pd.read_csv(path, sep="\t"),
".json": lambda: pd.read_json(path),
".jsonl": lambda: pd.read_json(path, lines=True),
".xlsx": lambda: pd.read_excel(path),
".parquet": lambda: pd.read_parquet(path),
}
reader = readers.get(ext)
if reader is None:
raise ValueError(f"Unsupported format: {ext}")
df = reader()
print(f"Loaded {ext} file: {df.shape[0]} rows, {df.shape[1]} columns")
return df
Key Takeaways
- Always use context managers (
withstatements) for file operations. - Pandas provides the most convenient API for structured data files.
- Use chunked reading for files larger than available memory.
- JSON Lines format is ideal for streaming and large datasets.
- Choose the right encoding (UTF-8 is the safe default).
- Parquet is more efficient than CSV for columnar data (covered later).