File I/O and Data Import

Loading data from files and databases is where every data science project begins. This lesson covers reading and writing structured data formats, file handling best practices, and techniques for working with large files.

Data Import Pipeline

File Basics

CSV format: Each row $r_i$ is a record, columns $c_j$ separated by delimiter: $r_i = c_1, c_2, \ldots, c_m$ .

JSON format: Nested key-value pairs: $\text{obj} = \{k_1: v_1, k_2: v_2, \ldots\}$ where values can be arrays $[v_1, v_2]$ or nested objects.

File size: $N_{bytes} = n_{rows} \times n_{cols} \times \text{bytes\_per\_value}$ .

Opening and Closing Files

# Basic file open/close
f = open("data.txt", "r")
content = f.read()
f.close()

# Better: use context manager (auto-closes)
with open("data.txt", "r") as f:
    content = f.read()

File Modes

Architecture Diagram

"r"   Read (default). File must exist.
"w"   Write. Creates new or truncates existing.
"a"   Append. Creates new or appends to existing.
"x"   Exclusive create. Fails if file exists.
"rb"  Read binary.
"wb"  Write binary.
"r+"  Read and write. File must exist.
"w+"  Write and read. Truncates existing.

Reading Files

# Read entire file
with open("data.txt", "r") as f:
    content = f.read()

# Read line by line (memory efficient for large files)
with open("large.txt", "r") as f:
    for line in f:
        process(line.strip())

# Read all lines into a list
with open("data.txt", "r") as f:
    lines = f.readlines()  # Includes newline characters

# Read without newlines
with open("data.txt", "r") as f:
    lines = [line.strip() for line in f]

Writing Files

# Write strings
with open("output.txt", "w") as f:
    f.write("Hello, World!\n")
    f.write("Second line\n")

# Write multiple lines
lines = ["line 1", "line 2", "line 3"]
with open("output.txt", "w") as f:
    f.writelines(line + "\n" for line in lines)

# Append to existing file
with open("log.txt", "a") as f:
    f.write("New log entry\n")

Reading CSV Files

CSV is the most common data format. Python's csv module and pandas handle it well.

With the csv Module

import csv

# Basic reading
with open("data.csv", "r") as f:
    reader = csv.reader(f)
    header = next(reader)  # First row as header
    for row in reader:
        print(row)  # Each row is a list of strings

# Reading as dictionaries
with open("data.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row["name"], row["age"])  # Access by column name

# Writing CSV
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "age", "city"])  # Header
    writer.writerow(["Alice", 30, "NYC"])
    writer.writerow(["Bob", 25, "LA"])

With pandas

import pandas as pd

# Basic read
df = pd.read_csv("data.csv")
print(df.head())

# Common options
df = pd.read_csv(
    "data.csv",
    sep=",",                    # Delimiter (default comma)
    header=0,                   # Row number for column names
    names=["col1", "col2"],     # Custom column names
    index_col="id",             # Use column as index
    usecols=["name", "age"],    # Read only these columns
    dtype={"age": int},         # Specify column types
    na_values=["N/A", ""],      # Custom NA values
    parse_dates=["date"],       # Parse date columns
    nrows=1000,                 # Read only first 1000 rows
    skiprows=[1, 3],            # Skip specific rows
    encoding="utf-8",           # File encoding
)

# Writing CSV
df.to_csv("output.csv", index=False)
df.to_csv("output.csv", columns=["name", "age"])  # Specific columns

Reading JSON Files

JSON is the standard for API responses and nested data.

import json

# Read JSON
with open("data.json", "r") as f:
    data = json.load(f)  # Parses to Python dict/list

# Read JSON lines (one JSON object per line)
records = []
with open("events.jsonl", "r") as f:
    for line in f:
        records.append(json.loads(line))

# Write JSON
with open("output.json", "w") as f:
    json.dump(data, f, indent=2)

# Pretty print
print(json.dumps(data, indent=2, sort_keys=True))

JSON with pandas

import pandas as pd

# Simple JSON (flat)
df = pd.read_json("data.json")

# Nested JSON
import json

with open("nested.json", "r") as f:
    raw = json.load(f)

# Flatten nested structure
df = pd.json_normalize(
    raw,
    record_path="events",         # Key containing the list
    meta=["user_id", "session"],  # Parent fields to keep
    errors="ignore"
)

# JSON Lines format (common in ML datasets)
df = pd.read_json("data.jsonl", lines=True)

# Write JSON
df.to_json("output.json", orient="records", indent=2)
df.to_json("output.jsonl", orient="records", lines=True)

Reading Excel Files

import pandas as pd

# Basic read
df = pd.read_excel("data.xlsx")

# With options
df = pd.read_excel(
    "data.xlsx",
    sheet_name="Sheet1",        # Or index 0
    header=0,                   # Row number for header
    names=["col1", "col2"],     # Custom names
    skiprows=2,                 # Skip first 2 rows
    usecols="A:C",              # Excel range notation
    nrows=1000,
    na_values=["N/A"],
    engine="openpyxl",          # Required engine
)

# Read multiple sheets
sheets = pd.read_excel("data.xlsx", sheet_name=None)  # Dict of DataFrames
for name, df in sheets.items():
    print(f"Sheet '{name}': {df.shape}")

# Write Excel
df.to_excel("output.xlsx", index=False, sheet_name="Results")

# Multiple sheets
with pd.ExcelWriter("output.xlsx") as writer:
    df1.to_excel(writer, sheet_name="Summary", index=False)
    df2.to_excel(writer, sheet_name="Details", index=False)

Reading from Databases

SQLite (Built-in)

import sqlite3
import pandas as pd

# Connect to database (creates if doesn't exist)
conn = sqlite3.connect("my_database.db")

# Read query results into DataFrame
df = pd.read_sql_query("SELECT * FROM users WHERE age > 25", conn)

# Execute queries
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS users (
        id INTEGER PRIMARY KEY,
        name TEXT,
        age INTEGER
    )
""")
cursor.execute("INSERT INTO users (name, age) VALUES (?, ?)", ("Alice", 30))
conn.commit()

# Close connection
conn.close()

With SQLAlchemy (Works with Most Databases)

from sqlalchemy import create_engine
import pandas as pd

# Create engine (supports many databases)
engine = create_engine("sqlite:///my_database.db")
# engine = create_engine("postgresql://user:pass@localhost:5432/mydb")
# engine = create_engine("mysql+pymysql://user:pass@localhost/mydb")
# engine = create_engine("mssql+pyodbc://user:pass@dsn")

# Read
df = pd.read_sql("SELECT * FROM users", engine)

# Write (creates table automatically)
df.to_sql("users", engine, if_exists="replace", index=False)
df.to_sql("new_users", engine, if_exists="append", index=False)  # Add rows

Chunked Reading for Large Files

When files are too large to fit in memory, process them in chunks.

CSV Chunks

import pandas as pd

# Process CSV in chunks
chunk_size = 10_000
results = []

for chunk in pd.read_csv("huge_file.csv", chunksize=chunk_size):
    # Process each chunk
    filtered = chunk[chunk["amount"] > 100]
    results.append(filtered)

# Combine results
df = pd.concat(results, ignore_index=True)
print(f"Total rows processed: {len(df)}")

# Or process and write incrementally
with pd.open("output.csv", "w") as writer:
    for chunk in pd.read_csv("huge_file.csv", chunksize=10_000):
        processed = chunk.groupby("category").sum()
        processed.to_csv(writer)

JSON Lines Chunks

import json
import pandas as pd

# Process JSONL in chunks
chunk_size = 5000
chunk_num = 0

records = []
with open("events.jsonl", "r") as f:
    for line in f:
        records.append(json.loads(line))

        if len(records) >= chunk_size:
            df = pd.DataFrame(records)
            process_chunk(df)
            records = []
            chunk_num += 1

# Process remaining records
if records:
    df = pd.DataFrame(records)
    process_chunk(df)

Working with Compressed Files

import pandas as pd
import gzip
import json

# Pandas handles compression automatically
df = pd.read_csv("data.csv.gz")          # gzip
df = pd.read_csv("data.csv.bz2")         # bzip2
df = pd.read_csv("data.csv.xz")          # xz
df = pd.read_json("data.json.gz", compression="gzip")

# Writing compressed
df.to_csv("output.csv.gz", compression="gzip")

# Manual gzip reading
with gzip.open("data.json.gz", "rt", encoding="utf-8") as f:
    data = json.load(f)

Encoding Issues

# Common encodings
df = pd.read_csv("data.csv", encoding="utf-8")       # Default, most common
df = pd.read_csv("data.csv", encoding="latin-1")     # Western European
df = pd.read_csv("data.csv", encoding="cp1252")      # Windows
df = pd.read_csv("data.csv", encoding="utf-8-sig")   # UTF-8 with BOM

# Detect encoding (install chardet first)
import chardet

with open("mystery.csv", "rb") as f:
    raw = f.read(10000)
    result = chardet.detect(raw)
    print(result)  # {'encoding': 'utf-8', 'confidence': 0.99}

Practical Pipeline

import pandas as pd
from pathlib import Path

def load_all_csvs(folder_path):
    """Load and concatenate all CSV files in a folder."""
    folder = Path(folder_path)
    files = sorted(folder.glob("*.csv"))

    dfs = []
    for f in files:
        df = pd.read_csv(f)
        df["source_file"] = f.name  # Track origin
        dfs.append(df)

    combined = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(files)} files, {len(combined)} total rows")
    return combined

def auto_detect_and_read(filepath):
    """Read a file based on its extension."""
    path = Path(filepath)
    ext = path.suffix.lower()

    readers = {
        ".csv": lambda: pd.read_csv(path),
        ".tsv": lambda: pd.read_csv(path, sep="\t"),
        ".json": lambda: pd.read_json(path),
        ".jsonl": lambda: pd.read_json(path, lines=True),
        ".xlsx": lambda: pd.read_excel(path),
        ".parquet": lambda: pd.read_parquet(path),
    }

    reader = readers.get(ext)
    if reader is None:
        raise ValueError(f"Unsupported format: {ext}")

    df = reader()
    print(f"Loaded {ext} file: {df.shape[0]} rows, {df.shape[1]} columns")
    return df

Key Takeaways

Always use context managers (with statements) for file operations.
Pandas provides the most convenient API for structured data files.
Use chunked reading for files larger than available memory.
JSON Lines format is ideal for streaming and large datasets.
Choose the right encoding (UTF-8 is the safe default).
Parquet is more efficient than CSV for columnar data (covered later).

File I/O and Data Import

File I/O and Data Import

Data Import Pipeline

File Basics

Opening and Closing Files

File Modes

Reading Files

Writing Files

Reading CSV Files

With the csv Module

With pandas

Reading JSON Files

JSON with pandas

Reading Excel Files

Reading from Databases

SQLite (Built-in)

With SQLAlchemy (Works with Most Databases)

Chunked Reading for Large Files

CSV Chunks

JSON Lines Chunks

Working with Compressed Files

Encoding Issues

Practical Pipeline

Key Takeaways

Premium Content

Need Expert Data Science Help?