GroupBy, Merge, and Pivot Tables
Data rarely comes in the shape you need. You split data into groups, combine multiple tables, and reshape structures to answer your questions. This lesson covers the essential reshaping and combination operations in pandas.
Split-Apply-Combine Flow
Split-Apply-Combine
GroupBy implements the split-apply-combine pattern:
- Split: Divide data into groups based on one or more keys.
- Apply: Apply a function to each group independently.
- Combine: Merge the results back together.
import pandas as pd
import numpy as np
sales = pd.DataFrame({
"region": ["East", "West", "East", "West", "East", "West"],
"product": ["A", "A", "B", "B", "A", "B"],
"revenue": [100, 150, 200, 120, 130, 180],
"quantity": [10, 15, 20, 12, 13, 18],
})
# Simple groupby
region_totals = sales.groupby("region")["revenue"].sum()
print(region_totals)
# region
# East 430
# West 450
# Multiple aggregations
print(sales.groupby("region")["revenue"].agg(["sum", "mean", "count"]))
# sum mean count
# East 430 143.333333 3
# West 450 150.000000 3
# Multiple columns grouped
print(sales.groupby(["region", "product"])["revenue"].sum())
# region product
# East A 230
# B 200
# West A 150
# B 300
# Custom aggregation functions
def revenue_range(x):
return x.max() - x.min()
print(sales.groupby("region")["revenue"].agg(revenue_range))
Aggregation Methods
df = pd.DataFrame({
"category": ["A", "A", "A", "B", "B", "B"],
"value": [10, 20, 30, 40, 50, 60],
})
grouped = df.groupby("category")
print(grouped["value"].sum()) # Sum per group
print(grouped["value"].mean()) # Mean per group
print(grouped["value"].median()) # Median per group
print(grouped["value"].std()) # Std per group
print(grouped["value"].min()) # Min per group
print(grouped["value"].max()) # Max per group
print(grouped["value"].first()) # First value per group
print(grouped["value"].last()) # Last value per group
print(grouped["value"].nth(1)) # Nth value per group
print(grouped.size()) # Count of rows per group
print(grouped.count()) # Non-null count per group
Named Aggregation
result = sales.groupby("region").agg(
total_revenue=("revenue", "sum"),
avg_revenue=("revenue", "mean"),
total_quantity=("quantity", "sum"),
num_transactions=("revenue", "count"),
)
print(result)
Transform
transform returns a result the same size as the input β useful for adding group-level calculations as new columns.
sales["region_avg"] = sales.groupby("region")["revenue"].transform("mean")
sales["revenue_vs_avg"] = sales["revenue"] - sales["region_avg"]
print(sales)
# region product revenue quantity region_avg revenue_vs_avg
# 0 East A 100 10 143.333333 -43.333333
# 1 West A 150 15 150.000000 0.000000
# 2 East B 200 20 143.333333 56.666667
# ...
Filter
# Keep only groups that satisfy a condition
big_regions = sales.groupby("region").filter(lambda x: x["revenue"].sum() > 400)
print(big_regions)
# Keep groups above a threshold
above_avg = sales.groupby("region")["revenue"].transform(lambda x: x > x.mean())
print(sales[above_avg])
Merging and Joining
Merge combines two DataFrames based on common columns or indices.
Types of Joins
employees = pd.DataFrame({
"emp_id": [1, 2, 3, 4],
"name": ["Alice", "Bob", "Charlie", "Diana"],
"dept_id": [101, 102, 101, 103],
})
departments = pd.DataFrame({
"dept_id": [101, 102, 104],
"dept_name": ["Engineering", "Marketing", "Sales"],
})
# Inner join (only matching rows)
inner = pd.merge(employees, departments, on="dept_id", how="inner")
print(inner)
# emp_id name dept_id dept_name
# 0 1 Alice 101 Engineering
# 1 3 Charlie 101 Engineering
# 2 2 Bob 102 Marketing
# Left join (all left, matching right)
left = pd.merge(employees, departments, on="dept_id", how="left")
print(left)
# emp_id name dept_id dept_name
# 0 1 Alice 101 Engineering
# 1 2 Bob 102 Marketing
# 2 3 Charlie 101 Engineering
# 3 4 Diana 103 NaN (no match)
# Right join (all right, matching left)
right = pd.merge(employees, departments, on="dept_id", how="right")
# Outer join (all rows from both)
outer = pd.merge(employees, departments, on="dept_id", how="outer")
Merge on Different Column Names
df1 = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
df2 = pd.DataFrame({"user_id": [2, 3, 4], "score": [10, 20, 30]})
merged = pd.merge(df1, df2, left_on="id", right_on="user_id", how="inner")
print(merged)
# id value user_id score
# 0 2 b 2 10
# 1 3 c 3 20
# Drop redundant column
merged = merged.drop(columns=["user_id"])
Merge on Index
df1 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
df2 = pd.DataFrame({"B": [4, 5, 6]}, index=["b", "c", "d"])
# Join on index
result = df1.join(df2, how="inner")
print(result)
# A B
# b 2 4
# c 3 5
# Using merge with index
result = pd.merge(df1, df2, left_index=True, right_index=True, how="outer")
Cross Join
sizes = pd.DataFrame({"size": ["S", "M", "L"]})
colors = pd.DataFrame({"color": ["red", "blue"]})
cross = pd.merge(sizes, colors, how="cross")
print(cross)
# size color
# 0 S red
# 1 S blue
# 2 M red
# 3 M blue
# 4 L red
# 5 L blue
Pivot Tables
Pivot tables aggregate data and reshape it from long to wide format.
sales = pd.DataFrame({
"date": ["2024-01", "2024-01", "2024-02", "2024-02", "2024-03", "2024-03"],
"region": ["East", "West", "East", "West", "East", "West"],
"revenue": [100, 150, 120, 180, 130, 200],
})
# Basic pivot table
pivot = pd.pivot_table(
sales,
values="revenue", # Values to aggregate
index="region", # Rows
columns="date", # Columns
aggfunc="sum", # Aggregation function
fill_value=0, # Fill missing with 0
)
print(pivot)
# date 2024-01 2024-02 2024-03
# region
# East 100 120 130
# West 150 180 200
# Multiple aggregations
pivot_multi = pd.pivot_table(
sales,
values="revenue",
index="region",
columns="date",
aggfunc=["sum", "mean", "count"],
)
print(pivot_multi)
# Multiple value columns
pivot_values = pd.pivot_table(
sales,
values=["revenue"],
index="region",
columns="date",
aggfunc="sum",
margins=True, # Add row/column totals
margins_name="Total",
)
print(pivot_values)
Pivot vs Pivot Table
# pivot: reshapes without aggregation (no duplicates allowed)
df = pd.DataFrame({
"name": ["Alice", "Bob", "Alice", "Bob"],
"subject": ["math", "math", "science", "science"],
"score": [90, 85, 88, 92],
})
# pivot_table: handles duplicates via aggregation
pivot = pd.pivot_table(
df,
values="score",
index="name",
columns="subject",
aggfunc="mean", # Handles duplicates
)
print(pivot)
# subject math science
# name
# Alice 90 88
# Bob 85 92
Melt (Unpivot)
Melt does the opposite of pivot β it converts wide format to long format.
# Wide format
df = pd.DataFrame({
"name": ["Alice", "Bob"],
"math": [90, 85],
"science": [88, 92],
"english": [95, 78],
})
print(df)
# name math science english
# 0 Alice 90 88 95
# 1 Bob 85 92 78
# Melt to long format
melted = pd.melt(
df,
id_vars=["name"], # Columns to keep
value_vars=["math", "science", "english"], # Columns to unpivot
var_name="subject", # Name for variable column
value_name="score", # Name for value column
)
print(melted)
# name subject score
# 0 Alice math 90
# 1 Bob math 85
# 2 Alice science 88
# 3 Bob science 92
# 4 Alice english 95
# 5 Bob english 78
# Melt all non-id columns automatically
melted = pd.melt(df, id_vars=["name"])
Stack and Unstack
Stack moves columns to rows (wide to long). Unstack moves rows to columns (long to wide).
# Create multi-index DataFrame
arrays = [
["East", "East", "West", "West"],
["Q1", "Q2", "Q1", "Q2"],
]
index = pd.MultiIndex.from_arrays(arrays, names=["region", "quarter"])
df = pd.DataFrame({
"revenue": [100, 120, 150, 180],
"cost": [60, 70, 80, 90],
}, index=index)
print(df)
# revenue cost
# region quarter
# East Q1 100 60
# Q2 120 70
# West Q1 150 80
# Q2 180 90
# Unstack: move quarter level to columns
unstacked = df.unstack(level="quarter")
print(unstacked)
# revenue cost
# quarter Q1 Q2 Q1 Q2
# region
# East 100 120 60 70
# West 150 180 80 90
# Stack: move quarter back to rows
stacked = unstacked.stack()
print(stacked)
# revenue cost
# region quarter
# East Q1 100 60
# Q2 120 70
# West Q1 150 80
# Q2 180 90
Cross-Tabulation
# Create sample data
data = pd.DataFrame({
"gender": ["M", "F", "M", "F", "M", "F", "M", "F"],
"preference": ["A", "B", "A", "A", "B", "B", "A", "B"],
})
# Cross-tabulation (frequency table)
ct = pd.crosstab(data["gender"], data["preference"])
print(ct)
# preference A B
# gender
# F 1 3
# M 3 1
# With margins
ct = pd.crosstab(data["gender"], data["preference"], margins=True)
print(ct)
# With normalization (proportions)
ct = pd.crosstab(data["gender"], data["preference"], normalize="index")
print(ct)
# preference A B
# gender
# F 0.250000 0.750000
# M 0.750000 0.250000
Aggregation Formulas
The GroupBy operation can be expressed mathematically:
Common aggregation functions:
Named aggregation:
Practical Pipeline
import pandas as pd
def analyze_sales(sales_df):
"""Complete sales analysis pipeline."""
# 1. Pivot: revenue by region and month
pivot = pd.pivot_table(
sales_df,
values="revenue",
index="region",
columns=sales_df["date"].dt.to_period("M"),
aggfunc="sum",
fill_value=0,
)
# 2. Add growth rate
pivot["growth_rate"] = pivot.pct_change(axis=1).iloc[:, -1]
# 3. Melt for visualization
melted = pd.melt(
pivot.reset_index(),
id_vars=["region"],
var_name="month",
value_name="revenue",
)
# 4. Cross-tab for summary
summary = pd.crosstab(
sales_df["region"],
sales_df["product"],
values=sales_df["revenue"],
aggfunc="sum",
margins=True,
)
return pivot, melted, summary
Key Takeaways
- GroupBy splits data by keys and applies functions independently to each group.
- Use
agg()for multiple aggregations andtransform()to add group stats as columns. - Merge types: inner (matching), left (all left), right (all right), outer (all).
- Pivot tables aggregate and reshape simultaneously β they are your most powerful reshaping tool.
- Melt is the inverse of pivot β use it to convert wide to long format.
- Stack/unstack operate on multi-index levels for hierarchical data.
- Always inspect your data shape before and after reshaping operations.