Pandas Series and DataFrames
Pandas is the most important library in Python data science. It provides two primary data structures – Series and DataFrame – that make working with structured data intuitive and powerful.
Series
A Series is a one-dimensional labeled array. Think of it as a single column of data with an index.
import pandas as pd
import numpy as np
# Creating a Series
s = pd.Series([10, 20, 30, 40, 50])
print(s)
# 0 10
# 1 20
# 2 30
# 3 40
# 4 50
# dtype: int64
# With custom index
s = pd.Series(
[10, 20, 30],
index=["a", "b", "c"],
name="values"
)
print(s["b"]) # 20
# From dictionary
s = pd.Series({"apple": 1.5, "banana": 0.75, "cherry": 2.0})
print(s["banana"]) # 0.75
# From scalar value (broadcast)
s = pd.Series(5, index=["a", "b", "c"])
print(s)
# a 5
# b 5
# c 5
Series Operations
s = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
# Element-wise operations
print(s + 10) # Add 10 to every element
print(s * 2) # Multiply every element
print(np.sqrt(s)) # Square root of each
# Boolean operations
print(s > 3) # [False False False True True]
# Filtering
print(s[s > 3]) # d 4, e 5
# Aggregations
print(s.sum()) # 15
print(s.mean()) # 3.0
print(s.std()) # 1.58...
print(s.describe())
# count 5.000000
# mean 3.000000
# std 1.581139
# min 1.000000
# 25% 2.000000
# 50% 3.000000
# 75% 4.000000
# max 5.000000
DataFrame Structure
DataFrame
A DataFrame is a two-dimensional labeled data structure – like a spreadsheet or SQL table.
Creating DataFrames
# From dictionary
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana"],
"age": [25, 30, 35, 28],
"city": ["NYC", "LA", "Chicago", "Houston"],
"salary": [70000, 80000, 90000, 75000],
})
print(df)
# name age city salary
# 0 Alice 25 NYC 70000
# 1 Bob 30 LA 80000
# 2 Charlie 35 Chicago 90000
# 3 Diana 28 Houston 75000
# From list of dictionaries
records = [
{"name": "Alice", "age": 25},
{"name": "Bob", "age": 30},
]
df = pd.DataFrame(records)
# From NumPy array
data = np.random.randint(0, 100, size=(4, 3))
df = pd.DataFrame(data, columns=["A", "B", "C"])
# From CSV
df = pd.read_csv("data.csv")
Basic Properties
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
"salary": [70000, 80000, 90000],
})
print(df.shape) # (3, 3) – rows x columns
print(df.columns) # Index(['name', 'age', 'salary'], dtype='object')
print(df.index) # RangeIndex(start=0, stop=3, step=1)
print(df.dtypes) # name=object, age=int64, salary=int64
print(df.values) # NumPy array
print(df.size) # 9 (total elements)
print(df.ndim) # 2
Indexing: loc and iloc
loc accesses by label. iloc accesses by position. Understanding this distinction is critical.
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana"],
"age": [25, 30, 35, 28],
"salary": [70000, 80000, 90000, 75000],
}, index=["emp1", "emp2", "emp3", "emp4"])
# iloc – integer position based
print(df.iloc[0]) # First row (as Series)
print(df.iloc[0, 1]) # First row, second column (25)
print(df.iloc[0:2]) # First two rows
print(df.iloc[0:2, 0:2]) # First two rows, first two columns
print(df.iloc[[0, 2]]) # Rows 0 and 2
# loc – label based
print(df.loc["emp1"]) # Row with index "emp1"
print(df.loc["emp1", "age"]) # Row "emp1", column "age"
print(df.loc["emp1":"emp3"]) # Rows emp1 through emp3 (inclusive!)
print(df.loc["emp1":"emp3", "name":"age"]) # Sliced rows and columns
print(df.loc[["emp1", "emp3"]]) # Specific rows
# Boolean indexing
print(df[df["age"] > 28]) # Filter rows
print(df.loc[df["age"] > 28, "name"]) # Filter + select column
loc vs iloc Cheat Sheet
Architecture Diagram
loc: Uses labels (index names, column names)
Slicing is INCLUSIVE on both ends
df.loc["a":"c"] includes "c"
iloc: Uses integer positions (0, 1, 2...)
Slicing is EXCLUSIVE on end (like Python)
df.iloc[0:3] stops at index 2
Selecting Columns
df = pd.DataFrame({
"name": ["Alice", "Bob"],
"age": [25, 30],
"salary": [70000, 80000],
})
# Single column (returns Series)
print(df["name"])
# Single column (returns DataFrame)
print(df[["name"]])
# Multiple columns
print(df[["name", "salary"]])
# Dot notation (works for valid Python identifiers)
print(df.name)
Adding and Removing Columns
df = pd.DataFrame({
"name": ["Alice", "Bob"],
"age": [25, 30],
})
# Add new column
df["salary"] = [70000, 80000]
# Derived column
df["salary_k"] = df["salary"] / 1000
# Conditional column
df["senior"] = df["age"] >= 30
# Insert at specific position
df.insert(1, "id", [101, 102])
# Drop columns
df = df.drop(columns=["senior"])
df = df.drop(columns=["id", "salary_k"])
# Rename columns
df = df.rename(columns={"name": "employee_name", "age": "employee_age"})
dtypes and Type Conversion
df = pd.DataFrame({
"price": ["19.99", "42.50", "100.00"], # Strings!
"quantity": ["3", "5", "2"],
"date": ["2024-01-15", "2024-02-20", "2024-03-25"],
})
print(df.dtypes)
# price object
# quantity object
# date object
# Convert types
df["price"] = df["price"].astype(float)
df["quantity"] = df["quantity"].astype(int)
df["date"] = pd.to_datetime(df["date"])
# Smart conversion
df["price"] = pd.to_numeric(df["price"], errors="coerce") # Bad values become NaN
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
print(df.dtypes)
# price float64
# quantity int32
# date datetime64[ns]
describe and info
df = pd.DataFrame({
"age": [25, 30, 35, 28, 40],
"salary": [70000, 80000, 90000, 75000, 95000],
"department": ["eng", "eng", "sales", "eng", "sales"],
})
# describe – statistical summary
print(df.describe())
# age salary
# count 5.000000 5.000000
# mean 31.600000 82000.000000
# std 5.549775 9617.692035
# min 25.000000 70000.000000
# 25% 28.000000 75000.000000
# 50% 30.000000 80000.000000
# 75% 35.000000 90000.000000
# max 40.000000 95000.000000
# describe for all columns (including categorical)
print(df.describe(include="all"))
# info – DataFrame structure
print(df.info())
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 age 5 non-null int64
# 1 salary 5 non-null int64
# 2 department 5 non-null object
# dtypes: int64(2), object(1)
# memory usage: 208.0+ bytes
# value_counts – frequency of unique values
print(df["department"].value_counts())
# eng 3
# sales 2
DataFrame Indexing Notation
Selection examples:
Handling Missing Data
df = pd.DataFrame({
"name": ["Alice", "Bob", None, "Diana"],
"age": [25, np.nan, 35, 28],
"salary": [70000, 80000, np.nan, 75000],
})
# Detect missing values
print(df.isnull())
print(df.isnull().sum()) # Count per column
# Drop rows with any missing values
print(df.dropna())
# Drop rows where specific columns are missing
print(df.dropna(subset=["name"]))
# Fill missing values
print(df.fillna(0)) # Fill with constant
print(df.fillna({"age": df["age"].mean()}) ) # Different fill per column
print(df["age"].fillna(method="ffill")) # Forward fill
# Interpolation
print(df["salary"].interpolate())
apply and map
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie"],
"age": [25, 30, 35],
})
# apply with function
df["age_group"] = df["age"].apply(lambda x: "junior" if x < 30 else "senior")
# apply with custom function
def categorize(row):
if row["age"] < 28:
return "young"
elif row["age"] < 33:
return "mid"
else:
return "senior"
df["category"] = df.apply(categorize, axis=1)
# map on Series (for value transformation)
name_lengths = df["name"].map(len)
print(name_lengths)
# 0 5
# 1 3
# 2 7
# replace values
df["age"] = df["age"].replace({25: 26, 30: 31})
Sorting
df = pd.DataFrame({
"name": ["Alice", "Bob", "Charlie", "Diana"],
"age": [25, 30, 35, 28],
"salary": [70000, 80000, 90000, 75000],
})
# Sort by column
print(df.sort_values("age"))
print(df.sort_values("age", ascending=False))
# Sort by multiple columns
print(df.sort_values(["age", "salary"], ascending=[True, False]))
# Sort by index
print(df.sort_index())
# nlargest / nsmallest
print(df.nlargest(2, "salary")) # Top 2 earners
print(df.nsmallest(2, "age")) # 2 youngest
Practical Example: Exploring a Dataset
import pandas as pd
# Load dataset
df = pd.read_csv("titanic.csv")
# First look
print(f"Shape: {df.shape}")
print(df.head())
print(df.info())
print(df.describe())
# Quick analysis
print("\n--- Survival rates ---")
print(df.groupby("Survived")["Age"].mean())
print("\n--- Missing values ---")
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print(missing)
print("\n--- Categorical distributions ---")
print(df["Pclass"].value_counts())
print(df["Sex"].value_counts())
Key Takeaways
- Series is one-dimensional; DataFrame is two-dimensional – both are labeled.
- Use
locfor label-based access andilocfor position-based access. describe()gives you instant statistical insight;info()shows structure and types.- Always check
dtypesand convert types before analysis. - Boolean indexing is your primary filtering tool.
apply()is powerful but slower than vectorized operations – prefer vectorized when possible.