Functions, Lambda, and Comprehensions
Functions are the building blocks of clean, reusable code. In data science, you will write functions to transform data, create features, and build pipelines. This lesson covers everything from basic definitions to advanced patterns.
Defining Functions
A function maps inputs to outputs: $y = f(x)$. In Python, you define one with `def`:
# Basic function
def greet(name):
    """Return a greeting message for ``name``."""
    return f"Hello, {name}!"


print(greet("Alice"))  # Hello, Alice!
# Function with default arguments
def power(base, exponent=2):
    """Return ``base`` raised to ``exponent`` (squares when ``exponent`` is omitted)."""
    return base ** exponent


print(power(3))  # 9
print(power(3, 3))  # 27
Parameters and Arguments
Positional and Keyword Arguments
def create_profile(name, age, city, occupation="Unknown"):
    """Bundle a person's details into a dictionary.

    ``occupation`` is optional and falls back to ``"Unknown"``.
    """
    return {
        "name": name,
        "age": age,
        "city": city,
        "occupation": occupation,
    }


# Positional
p1 = create_profile("Alice", 30, "NYC")
# Keyword (order doesn't matter)
p2 = create_profile(age=25, name="Bob", city="LA")
# Mixing (positional must come first)
p3 = create_profile("Charlie", city="Chicago", age=35)
*args and **kwargs
*args collects extra positional arguments as a tuple. **kwargs collects extra keyword arguments as a dictionary.
def flexible(*args, **kwargs):
    """Print the extra positional arguments (a tuple) and keyword arguments (a dict)."""
    print(f"Positional args: {args}")
    print(f"Keyword args: {kwargs}")


flexible(1, 2, 3, name="Alice", age=30)
# Positional args: (1, 2, 3)
# Keyword args: {'name': 'Alice', 'age': 30}
# Practical use: flexible aggregation
def aggregate(series, method="mean", **kwargs):
    """Summarize ``series`` with the statistic named by ``method``.

    Parameters
    ----------
    series : object providing ``mean()``, ``median()`` and ``quantile(q)``
        For example a ``pandas.Series``.
    method : str
        One of ``"mean"``, ``"median"`` or ``"quantile"``.
    **kwargs
        Only ``q`` is read, and only when ``method="quantile"``
        (defaults to 0.5). Any other keyword is ignored.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "mean":
        return series.mean()
    elif method == "median":
        return series.median()
    elif method == "quantile":
        q = kwargs.get("q", 0.5)
        return series.quantile(q)
    else:
        raise ValueError(f"Unknown method: {method}")
Unpacking Arguments
def add(a, b, c):
    """Return the sum of the three arguments."""
    return a + b + c


nums = [1, 2, 3]
print(add(*nums))  # Unpack list as positional args
config = {"a": 1, "b": 2, "c": 3}
print(add(**config))  # Unpack dict as keyword args
Return Values
# Single return value
def square(n):
    """Return ``n`` squared."""
    return n ** 2
# Multiple return values (returns a tuple)
def min_max(numbers):
    """Return ``(smallest, largest)`` of ``numbers`` as a tuple."""
    return min(numbers), max(numbers)


low, high = min_max([3, 1, 4, 1, 5, 9])  # Tuple unpacking
print(f"Min: {low}, Max: {high}")  # Min: 1, Max: 9
# Returning early
def divide(a, b):
    """Return ``a / b``, or ``None`` when ``b`` is zero."""
    if b == 0:
        return None  # Early return
    return a / b
# No explicit return returns None
def log_message(msg):
    """Print ``msg``; there is no ``return`` statement, so the result is ``None``."""
    print(msg)
    # Implicitly returns None
Scope and Closures
# Local vs global scope
x = 10


def modify():
    """Assign to a local ``x`` and print it; the module-level ``x`` is untouched."""
    x = 20  # Local variable (does not affect global)
    print(x)  # 20


modify()
print(x)  # 10
# Using global (avoid this)
counter = 0


def increment():
    """Add 1 to the module-level ``counter``."""
    global counter  # Without this line, ``counter += 1`` would target a local name
    counter += 1
# Closures – functions that remember their environment
def make_multiplier(factor):
    """Return a one-argument function that multiplies its input by ``factor``."""

    def multiply(x):
        # ``factor`` is read from the enclosing call, even after that call has returned
        return x * factor

    return multiply


double = make_multiplier(2)
triple = make_multiplier(3)
print(double(5))  # 10
print(triple(5))  # 15
Lambda Functions
Lambdas are anonymous functions: `lambda args: expression`. Equivalent to `def f(args): return expression`.
# Basic lambda
# (Lambdas are bound to names below only to demonstrate the syntax; PEP 8
# recommends a regular `def` whenever the function is given a name.)
# Note: these assignments rebind `square`, `add` and `greet` from earlier in the lesson.
square = lambda x: x ** 2
print(square(5)) # 25
# Lambda with multiple arguments
add = lambda a, b: a + b
print(add(3, 4)) # 7
# Lambda with default argument
greet = lambda name, greeting="Hello": f"{greeting}, {name}!"
print(greet("Alice")) # Hello, Alice!
print(greet("Bob", greeting="Hey")) # Hey, Bob!
When to Use Lambdas
# Good: short callback for sorting
students = [("Alice", 90), ("Bob", 80), ("Charlie", 95)]
# Sorts the list in place by the score (index 1 of each tuple), highest first
students.sort(key=lambda s: s[1], reverse=True)
print(students) # [('Charlie', 95), ('Alice', 90), ('Bob', 80)]
# Good: transforming in apply()
import pandas as pd
df = pd.DataFrame({"name": ["alice", "bob"], "score": [85, 92]})
# apply() calls the lambda once for each value in the "name" column.
# NOTE: for plain upper-casing, df["name"].str.upper() is the built-in alternative.
df["name_upper"] = df["name"].apply(lambda x: x.upper())
# Bad: complex logic (use def instead)
# Don't do this:
# process = lambda x: x * 2 if x > 0 else -x if x < 0 else 0
# Do this:
def process(x):
    """Double positive values, negate negative values, and return 0 for zero."""
    if x > 0:
        return x * 2
    elif x < 0:
        return -x
    return 0
map, filter, and reduce
map
map() applies a function to every element of an iterable.
numbers = [1, 2, 3, 4, 5]
# Using map
# map() returns a lazy iterator; list() materializes the results
squared = list(map(lambda x: x ** 2, numbers))
print(squared) # [1, 4, 9, 16, 25]
# Equivalent list comprehension (preferred)
squared = [x ** 2 for x in numbers]
# map with multiple iterables
# The lambda receives one element from each iterable per call (x from a, y from b)
a = [1, 2, 3]
b = [10, 20, 30]
sums = list(map(lambda x, y: x + y, a, b))
print(sums) # [11, 22, 33]
# Practical: parsing strings to integers
# An existing callable such as int can be passed directly; no lambda is needed
str_nums = ["1", "2", "3", "4", "5"]
int_nums = list(map(int, str_nums))
print(int_nums) # [1, 2, 3, 4, 5]
filter
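filter() keeps the elements for which the function returns a truthy value. If None is passed instead of a function, it keeps the elements that are themselves truthy.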
numbers = range(1, 21)
# Even numbers only
evens = list(filter(lambda x: x % 2 == 0, numbers))
print(evens)  # [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# Equivalent list comprehension
evens = [x for x in numbers if x % 2 == 0]
# Filter out None values (test identity explicitly so valid zeros survive)
data = [1, None, 3, None, 5]
clean = list(filter(lambda x: x is not None, data))
print(clean)  # [1, 3, 5]
# Caution: filter(None, ...) drops EVERY falsy value (0, "", False, ...), not just None
print(list(filter(None, [0, 1, None, "", 2])))  # [1, 2]
reduce
reduce() accumulates elements into a single value. It is in the functools module.
from functools import reduce
numbers = [1, 2, 3, 4, 5]
# Sum all elements
# acc carries the running result; x is the next element of numbers.
# Caveat: with no initial value, reduce() raises TypeError on an empty sequence;
# pass one as a third argument (e.g. 0) when the input may be empty.
total = reduce(lambda acc, x: acc + x, numbers)
print(total) # 15
# Find maximum
# Keeps whichever of the two compared values is larger at each step
maximum = reduce(lambda a, b: a if a > b else b, numbers)
print(maximum) # 5
# Equivalent to built-in functions
total = sum(numbers)
maximum = max(numbers)
Comprehension Transformation Flow
List Comprehensions
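List comprehensions create new lists: `[f(x) for x in iterable]`, the code analogue of the set-builder notation $\{f(x) \mid x \in S\}$. They are usually faster and more readable than the equivalent loop.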
Basic Syntax
# [expression for item in iterable]
squares = [x ** 2 for x in range(10)]
print(squares)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
# Equivalent loop: start with an empty list and append one result per iteration
squares = []
for x in range(10):
    squares.append(x ** 2)
Filtering
# [expression for item in iterable if condition]
evens = [x for x in range(20) if x % 2 == 0]
print(evens) # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
# With if/else (use a conditional expression BEFORE the `for`; it labels every item instead of filtering)
labels = ["even" if x % 2 == 0 else "odd" for x in range(5)]
print(labels) # ['even', 'odd', 'even', 'odd', 'even']
Nested Comprehensions
# Flattening a matrix (the `for` clauses read left to right, like nested loops)
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flat = [num for row in matrix for num in row]
print(flat)  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Transposing a matrix (rows become columns)
# The column count is taken from the first row instead of a hard-coded 3,
# so the same line works for any non-empty rectangular matrix.
transposed = [[row[i] for row in matrix] for i in range(len(matrix[0]))]
print(transposed)  # [[1, 4, 7], [2, 5, 8], [3, 6, 9]]
Practical Data Science Examples
import pandas as pd
# Clean column names
# Steps, in order: trim outer whitespace, lowercase, spaces -> underscores,
# remove "($)", then strip the trailing underscore that the removal leaves behind.
columns = ["First Name", "Last Name", " Age ", "Annual Income ($)", " Debt "]
cleaned = [col.strip().lower().replace(" ", "_").replace("($)", "").rstrip("_") for col in columns]
print(cleaned)
# ['first_name', 'last_name', 'age', 'annual_income', 'debt']
# Extract numeric values from strings
# Remove the "$" sign, then convert the remaining text to float
raw_prices = ["$19.99", "$42.50", "$100.00", "$5.25"]
prices = [float(p.replace("$", "")) for p in raw_prices]
print(prices) # [19.99, 42.5, 100.0, 5.25]
# Filter only: keep the passing scores (>= 70); the values themselves are not transformed
scores = [85, 92, 78, 95, 60, 88, 72, 100, 45, 81]
passing_high = [s for s in scores if s >= 70]
print(passing_high) # [85, 92, 78, 95, 88, 72, 100, 81]
Dictionary Comprehensions
# {key_expression: value_expression for item in iterable}
word = "mississippi"
# dict.fromkeys(word) removes duplicate letters while keeping first-seen order,
# so the result prints exactly as shown (iterating set(word) gives an arbitrary order).
# For long inputs, collections.Counter(word) counts in a single pass.
char_count = {c: word.count(c) for c in dict.fromkeys(word)}
print(char_count)  # {'m': 1, 'i': 4, 's': 4, 'p': 2}
# Invert a dictionary (if two keys share a value, the later key wins)
original = {"a": 1, "b": 2, "c": 3}
inverted = {v: k for k, v in original.items()}
print(inverted)  # {1: 'a', 2: 'b', 3: 'c'}
# Filter a dictionary (strictly greater than 1.5, so "apple" is excluded)
prices = {"apple": 1.5, "banana": 0.75, "cherry": 2.0, "date": 5.0}
expensive = {k: v for k, v in prices.items() if v > 1.5}
print(expensive)  # {'cherry': 2.0, 'date': 5.0}
# From two lists (shown as a comprehension; dict(zip(keys, values)) is the shorter equivalent)
keys = ["name", "age", "city"]
values = ["Alice", 30, "NYC"]
person = {k: v for k, v in zip(keys, values)}
print(person)  # {'name': 'Alice', 'age': 30, 'city': 'NYC'}
Set Comprehensions
# {expression for item in iterable}
words = ["hello", "world", "hello", "python", "world"]
# A set keeps each value once: the words have lengths 5, 5, 5, 6, 5 -> {5, 6}
unique_lengths = {len(w) for w in words}
print(unique_lengths) # {5, 6}
Generator Expressions
Generator expressions look like list comprehensions but use parentheses. They produce values lazily, which saves memory.
import sys
# List comprehension (loads everything into memory)
squares_list = [x ** 2 for x in range(1_000_000)]
# getsizeof is shallow: ~8 MB is the list object alone, not the int objects it refers to
print(sys.getsizeof(squares_list)) # ~8 MB
# Generator expression (generates on demand)
squares_gen = (x ** 2 for x in range(1_000_000))
# Nothing has been computed yet; this is just the small generator object
print(sys.getsizeof(squares_gen)) # ~200 bytes
# Use generators for large datasets
# When the generator is the only argument, the call's own parentheses are enough
total = sum(x ** 2 for x in range(1_000_000)) # No list created
Function Best Practices
# 1. Use descriptive names
# (The bodies are placeholders; only the names matter for this example.)
def calculate_average(values):  # Good
    pass


def calc(v):  # Bad
    pass
# 2. Keep functions small and focused
def load_data(filepath):
    """Load data from a CSV file and return it as a DataFrame."""
    return pd.read_csv(filepath)


def clean_data(df):
    """Return a copy of ``df`` with missing-value rows, then duplicate rows, removed."""
    return df.dropna().drop_duplicates()


def analyze_data(df):
    """Compute summary statistics with ``df.describe()``."""
    return df.describe()
# 3. Document with docstrings
def feature_engineer(df):
    """
    Create new features from existing columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input data with 'price' and 'quantity' columns.

    Returns
    -------
    pd.DataFrame
        Data with added 'total' column.
    """
    df = df.copy()  # Work on a copy so the caller's DataFrame is not modified
    df["total"] = df["price"] * df["quantity"]
    return df
# 4. Avoid mutable default arguments
def add_item(item, lst=None):  # Good
    """Append ``item`` to ``lst`` and return the list.

    A new list is created on each call when ``lst`` is omitted. A default of
    ``lst=[]`` would instead be created once and shared by every call.
    """
    if lst is None:
        lst = []
    lst.append(item)
    return lst
Key Takeaways
- Functions encapsulate logic and make code reusable.
- Use `*args` and `**kwargs` for flexible function signatures.
- Lambdas are best for short, one-off operations (sorting, mapping).
- List comprehensions are preferred over `map`/`filter` for readability.
- Generator expressions save memory for large datasets.
- Always write descriptive function names and document with docstrings.