OOP for Data Science
Object-oriented programming (OOP) is how Python organizes complex systems. Libraries like pandas, scikit-learn, and PyTorch are built on OOP. Understanding OOP helps you use these libraries effectively and build your own reusable components.
Classes and Objects
A class is a blueprint: `class Dataset: ...`. An object is an instance created from that blueprint: `ds = Dataset(data)`.
Inheritance: `class Child(Parent):` means Child inherits Parent's attributes and methods.
class Dataset:
    """Thin wrapper pairing a data container with a display name."""

    def __init__(self, data, name="unnamed"):
        # The caller's container is stored as-is; no copy is made.
        self.name = name
        self.data = data
        # Placeholder; nothing in this class ever fills it in.
        self.created_at = None

    def summary(self):
        """Return a dict with the dataset's name, length, and container type name."""
        container = self.data
        return dict(
            name=self.name,
            rows=len(container),
            type=type(container).__name__,
        )
# Create instances: one wraps a list under an explicit name, the other wraps
# a dict and falls back to the default name "unnamed".
ds1 = Dataset([1, 2, 3], name="scores")
ds2 = Dataset({"a": 1, "b": 2})
# "rows" is len() of the wrapped container, so the dict reports its key count.
print(ds1.summary()) # {'name': 'scores', 'rows': 3, 'type': 'list'}
print(ds2.summary()) # {'name': 'unnamed', 'rows': 2, 'type': 'dict'}
Instance Variables and Methods
class DataProcessor:
    """Load a CSV on demand and expose chainable cleaning steps."""

    def __init__(self, filepath):
        self.filepath = filepath
        self._processed = False
        self._data = None  # Private by convention; filled in lazily by ``data``

    @property
    def data(self):
        """Return the current frame, reading the CSV the first time it is needed."""
        if self._data is not None:
            return self._data
        # Imported here so pandas is only required once data is actually touched.
        import pandas as pd

        self._data = pd.read_csv(self.filepath)
        return self._data

    @property
    def shape(self):
        """Shape of the current (possibly cleaned or filtered) frame."""
        frame = self.data
        return frame.shape

    def clean(self, drop_na=True):
        """Optionally drop rows with missing values, then flag the data as processed.

        Returns ``self`` so further steps can be chained.
        """
        if drop_na:
            frame = self.data
            self._data = frame.dropna()
        self._processed = True
        return self

    def filter_rows(self, column, value):
        """Keep only the rows whose ``column`` equals ``value``; returns ``self``."""
        frame = self.data
        matches = frame[column] == value
        self._data = frame[matches]
        return self
# Method chaining
# clean() and filter_rows() each return the processor itself, so `result`
# is the same object as `processor`. The file "sales.csv" is only read when
# .data is first touched, which happens inside clean() here.
processor = DataProcessor("sales.csv")
result = processor.clean().filter_rows("region", "East")
print(result.shape)
Class Variables vs Instance Variables
class Feature:
    """A named, typed feature that maintains a class-wide count of live instances."""

    # Class variable: one counter shared by every instance.
    total_features = 0

    def __init__(self, name, dtype):
        # Register the new instance on the shared counter first.
        Feature.total_features += 1
        self.dtype = dtype  # Instance variable
        self.name = name  # Instance variable

    def __del__(self):
        # Runs when the object is finalized; undoes the increment from __init__.
        Feature.total_features -= 1
# Each construction bumps the counter stored on the class itself.
f1 = Feature("age", "int")
f2 = Feature("salary", "float")
print(Feature.total_features) # 2
print(f1.total_features) # 2 (accessed via class)
# Removing the last reference lets __del__ decrement the counter. The value
# printed below assumes the object is finalized immediately on `del`
# (true under CPython reference counting; not guaranteed elsewhere).
del f1
print(Feature.total_features) # 1
Class Hierarchy in Data Science
Inheritance
Inheritance lets you extend existing classes without modifying them.
class BaseEstimator:
    """Parent class laying out a fit/predict contract (scikit-learn inspired)."""

    def __init__(self):
        # Subclasses are expected to flip this once training has happened.
        self.is_fitted = False

    def fit(self, X, y):
        """Train on ``X``/``y``. Always raises here; subclasses override it."""
        message = "Subclass must implement fit()"
        raise NotImplementedError(message)

    def predict(self, X):
        """Predict for ``X``. Always raises here; subclasses override it."""
        message = "Subclass must implement predict()"
        raise NotImplementedError(message)

    def fit_predict(self, X, y):
        """Call fit(X, y), then return whatever predict(X) produces."""
        self.fit(X, y)
        predictions = self.predict(X)
        return predictions
class MeanPredictor(BaseEstimator):
    """Baseline model that always answers with the training-target average."""

    def __init__(self):
        super().__init__()
        # Learned in fit(); stays None until then.
        self.mean_ = None

    def fit(self, X, y):
        """Store the arithmetic mean of ``y`` and mark the model as fitted.

        ``X`` is accepted for interface parity but is not read. Returns ``self``.
        """
        total = sum(y)
        count = len(y)
        self.mean_ = total / count
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return the stored mean repeated once per element of ``X``.

        Raises:
            ValueError: if called before fit().
        """
        if self.is_fitted:
            return [self.mean_] * len(X)
        raise ValueError("Model not fitted yet")
# Fit on three samples, then predict for two unseen rows.
model = MeanPredictor()
X_train = [[1], [2], [3]]
y_train = [10, 20, 30]
model.fit(X_train, y_train)
print(model.mean_) # 20.0
# predict() does not look at the feature values; it repeats the mean per row.
print(model.predict([[4], [5]])) # [20.0, 20.0]
Multiple Inheritance
class Loggable:
    """Mixin that prints messages prefixed with the concrete class name."""

    def log(self, message):
        """Print ``message`` to stdout, tagged with ``[ClassName]``."""
        tag = self.__class__.__name__
        print(f"[{tag}] {message}")
class Validatable:
    """Mixin offering a minimal emptiness check."""

    def validate(self, data):
        """Return True for truthy ``data``; raise ValueError when it is falsy."""
        if data:
            return True
        raise ValueError("Data cannot be empty")
class Pipeline(Loggable, Validatable):
    """Run named steps in sequence, logging progress and validating the input."""

    def __init__(self, steps):
        # ``steps`` is iterated in run() as (name, callable) pairs.
        self.steps = steps
        self.log("Pipeline created")

    def run(self, data):
        """Validate ``data``, thread it through every step, and return the result."""
        self.validate(data)
        current = data
        for label, step_fn in self.steps:
            self.log(f"Running {label}")
            current = step_fn(current)
        self.log("Pipeline complete")
        return current
# Each step is a (name, callable) pair; run() feeds one step's output to the next.
pipeline = Pipeline([
("strip", str.strip),
("lower", str.lower),
])
# Besides returning the final value, run() prints a "[Pipeline] ..." line per step.
result = pipeline.run(" Hello World ")
print(result) # "hello world"
Dunder (Magic) Methods
Dunder methods define operator behavior: `a + b` calls `__add__`, `len(obj)` calls `__len__`, `repr(obj)` calls `__repr__`.
Property pattern: reading `obj.attr` triggers the `@property` getter, and `obj.attr = val` triggers the `@attr.setter`.
class Vector:
    """A 2D vector with math operations.

    Supports ``+``, ``-``, scalar ``*`` (on either side), scalar ``/``,
    ``==`` (component-wise), ``<`` (by magnitude), ``abs()``, ``len()`` and
    indexing with ``0``/``1``. Because ``__eq__`` is defined without
    ``__hash__``, instances are unhashable.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    # String representations
    def __repr__(self):
        """Unambiguous form, e.g. ``Vector(3, 4)``."""
        return f"Vector({self.x}, {self.y})"

    def __str__(self):
        """Compact form used by print(), e.g. ``(3, 4)``."""
        return f"({self.x}, {self.y})"

    # Arithmetic operators
    def __add__(self, other):
        """Component-wise sum with another vector."""
        return Vector(self.x + other.x, self.y + other.y)

    def __sub__(self, other):
        """Component-wise difference with another vector."""
        return Vector(self.x - other.x, self.y - other.y)

    def __mul__(self, scalar):
        """Scale both components by ``scalar`` (``vector * scalar``)."""
        return Vector(self.x * scalar, self.y * scalar)

    def __rmul__(self, scalar):
        """Support ``scalar * vector`` by delegating to ``__mul__``."""
        return self.__mul__(scalar)

    def __truediv__(self, scalar):
        """Divide both components by ``scalar``."""
        return Vector(self.x / scalar, self.y / scalar)

    # Comparison
    def __eq__(self, other):
        """Component-wise equality.

        Objects without ``x``/``y`` attributes (``None``, numbers, strings)
        yield ``NotImplemented`` so Python reports them as unequal instead
        of raising AttributeError.
        """
        try:
            other_x, other_y = other.x, other.y
        except AttributeError:
            return NotImplemented
        return self.x == other_x and self.y == other_y

    def __lt__(self, other):
        """Order vectors by magnitude, not by components."""
        return self.magnitude() < other.magnitude()

    # Container protocol
    def __len__(self):
        """A 2D vector always has two components."""
        return 2

    def __getitem__(self, index):
        """Return ``x`` for index 0 and ``y`` for index 1.

        Raises:
            IndexError: for any other index (this also ends iteration).
        """
        if index == 0:
            return self.x
        elif index == 1:
            return self.y
        raise IndexError("Index out of range")

    # Math
    def __abs__(self):
        """``abs(vector)`` is its magnitude."""
        return self.magnitude()

    def magnitude(self):
        """Euclidean length of the vector."""
        return (self.x ** 2 + self.y ** 2) ** 0.5

    def dot(self, other):
        """Dot product with another vector."""
        return self.x * other.x + self.y * other.y
# Usage
# print() goes through __str__, so vectors show as "(x, y)" rather than "Vector(x, y)".
v1 = Vector(3, 4)
v2 = Vector(1, 2)
print(v1 + v2) # (4, 6)
print(v1 - v2) # (2, 2)
print(v1 * 3) # (9, 12)
# int * Vector is handled by __rmul__, which delegates to __mul__.
print(3 * v1) # (9, 12)
print(abs(v1)) # 5.0
print(v1[0]) # 3
print(len(v1)) # 2
print(v1 == Vector(3, 4)) # True
# "<" compares magnitudes, not components.
print(v1 < v2) # False (5 > 2.236)
Common Dunder Methods
class DataRecord:
    """Record whose fields are supplied as keyword arguments.

    Fields live directly in the instance ``__dict__``, so ``record.field``
    works via normal attribute lookup. The class also behaves like a small
    read-only mapping of field names (``len``, ``in``, iteration) and is
    hashable by content.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    # Called by print()
    def __str__(self):
        return str(self.__dict__)

    # Called in interpreter / debugger
    def __repr__(self):
        return f"DataRecord({self.__dict__})"

    # Called by len()
    def __len__(self):
        return len(self.__dict__)

    # Called by 'in' operator
    def __contains__(self, key):
        return key in self.__dict__

    # Called for iteration
    def __iter__(self):
        return iter(self.__dict__)

    # Called to check truthiness
    def __bool__(self):
        return bool(self.__dict__)

    # Called for attribute access, but only after normal lookup has failed
    def __getattr__(self, name):
        """Fallback for missing attributes.

        Ordinary names keep returning the friendly message string, as
        before. Underscore-prefixed names raise AttributeError instead:
        tools such as ``copy``, ``pickle`` and rich-display hooks probe for
        optional methods (``__deepcopy__``, ``__setstate__``,
        ``_repr_html_``) and would otherwise receive a string they try to
        call.
        """
        if name.startswith("_"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return f"Attribute '{name}' not found"

    # Called by ==; kept consistent with __hash__ below
    def __eq__(self, other):
        """Two records are equal when they hold the same fields and values."""
        if not isinstance(other, DataRecord):
            return NotImplemented
        return self.__dict__ == other.__dict__

    # Called for hashing (required for sets/dict keys)
    def __hash__(self):
        """Hash of the sorted (field, value) pairs.

        Raises TypeError if any field value is itself unhashable.
        """
        return hash(tuple(sorted(self.__dict__.items())))
Properties and Encapsulation
class TemperatureSensor:
    """Temperature reading stored in Celsius with validated, converted views.

    Every write path (constructor, ``celsius`` setter, ``fahrenheit``
    setter) goes through the same absolute-zero check.
    """

    def __init__(self, celsius=0):
        # Assign through the property so the initial value is validated too.
        self.celsius = celsius

    @property
    def celsius(self):
        """Get temperature in Celsius."""
        return self._celsius

    @celsius.setter
    def celsius(self, value):
        """Set temperature with validation.

        Raises:
            ValueError: if ``value`` is below -273.15 (absolute zero).
        """
        if value < -273.15:
            raise ValueError("Temperature below absolute zero")
        self._celsius = value  # Convention: underscore = private

    @property
    def fahrenheit(self):
        """Computed property: the stored Celsius value converted to Fahrenheit."""
        return self._celsius * 9/5 + 32

    @fahrenheit.setter
    def fahrenheit(self, value):
        """Set the temperature from a Fahrenheit value.

        Converts to Celsius and delegates to the ``celsius`` setter, so the
        absolute-zero check applies here as well.
        """
        self.celsius = (value - 32) * 5/9

    @property
    def kelvin(self):
        """Computed property: the stored Celsius value converted to Kelvin."""
        return self._celsius + 273.15
# Read the same stored value through three different properties.
sensor = TemperatureSensor(25)
print(sensor.celsius) # 25
print(sensor.fahrenheit) # 77.0
print(sensor.kelvin) # 298.15
# Assigning to .fahrenheit converts to Celsius before storing.
sensor.fahrenheit = 32
print(sensor.celsius) # 0.0
How OOP Is Used in Data Science Libraries
scikit-learn Pattern
# scikit-learn uses this exact pattern for all models
class CustomTransformer:
    """Mimics scikit-learn's transformer interface.

    Centers values on the mean learned in ``fit`` and scales them by
    ``factor``.
    """

    def __init__(self, factor=1.0):
        self.factor = factor
        self.is_fitted = False
        self.mean_ = None  # Learned in fit()

    def fit(self, X, y=None):
        """Learn parameters from data.

        Args:
            X: sized collection of numbers to average.
            y: ignored; accepted for interface compatibility.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: if ``X`` is empty (there is no mean to learn).
        """
        if len(X) == 0:
            raise ValueError("Cannot fit on empty data")
        self.mean_ = sum(X) / len(X)
        self.is_fitted = True
        return self  # Always return self for chaining

    def transform(self, X):
        """Apply learned parameters to data.

        Returns a list with ``(x - mean_) * factor`` for each ``x`` in ``X``.

        Raises:
            ValueError: if called before ``fit``.
        """
        if not self.is_fitted:
            # Without this guard the subtraction below fails on mean_ = None
            # with an unhelpful TypeError.
            raise ValueError("Transformer not fitted yet")
        return [(x - self.mean_) * self.factor for x in X]

    def fit_transform(self, X, y=None):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)
# Usage (identical to scikit-learn)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Build a scikit-learn Pipeline holding a single named step.
pipe = Pipeline([
("scaler", StandardScaler()),
# NOTE(review): the custom step is left disabled. CustomTransformer defines
# only fit/transform/fit_transform; confirm it meets scikit-learn's estimator
# requirements (e.g. get_params) before enabling it.
# ("custom", CustomTransformer(factor=2)),
])
Custom DataFrame Extension
import pandas as pd
class AnalysisMixin:
    """Mixin that adds analysis methods to DataFrames.

    The methods rely on the pandas DataFrame API of ``self`` (``shape``,
    ``dtypes``, ``isnull``, ``memory_usage``, column indexing,
    ``select_dtypes``), so the mixin must be combined with
    ``pd.DataFrame`` in a subclass to be usable.
    """

    def quick_stats(self):
        """Quick summary statistics.

        Returns a dict with the frame's shape, per-column dtypes, per-column
        null counts, and deep memory usage in megabytes (bytes / 1e6).
        """
        return {
            "shape": self.shape,
            "dtypes": self.dtypes.to_dict(),
            "nulls": self.isnull().sum().to_dict(),
            "memory_mb": self.memory_usage(deep=True).sum() / 1e6,
        }

    def outliers_iqr(self, column, threshold=1.5):
        """Find outliers using IQR method.

        Returns the rows whose ``column`` value lies more than
        ``threshold`` interquartile ranges below Q1 or above Q3.
        """
        q1 = self[column].quantile(0.25)
        q3 = self[column].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - threshold * iqr
        upper = q3 + threshold * iqr
        return self[(self[column] < lower) | (self[column] > upper)]

    def correlation_with(self, target, top_n=10):
        """Top correlations with a target variable.

        Args:
            target: either a column name of this frame or a Series (or
                anything else ``DataFrame.corrwith`` accepts).
            top_n: number of strongest correlations to return.

        Returns:
            Absolute correlations of the numeric columns with ``target``,
            sorted from strongest to weakest and truncated to ``top_n``.
        """
        numeric = self.select_dtypes(include="number")
        if isinstance(target, str):
            # corrwith() needs the data itself, so resolve a column name first.
            target = self[target]
        correlations = numeric.corrwith(target).abs().sort_values(ascending=False)
        return correlations.head(top_n)
# Apply mixin to DataFrame
# A plain pd.DataFrame does not know about quick_stats()/outliers_iqr(); the
# mixin only takes effect once it is part of the object's class, so build a
# subclass that combines the two.
class AnalysisDataFrame(AnalysisMixin, pd.DataFrame):
    """DataFrame that also exposes the AnalysisMixin helpers."""

    @property
    def _constructor(self):
        # pandas uses this hook to decide which class results of operations
        # (filtering, slicing, ...) are wrapped in; keep them as this subclass.
        return AnalysisDataFrame


df = AnalysisDataFrame({
    "age": [25, 30, 35, 40, 100],  # 100 is an outlier
    "salary": [50000, 60000, 70000, 80000, 90000],
})
# Now df has the mixin methods
stats = df.quick_stats()
print(stats)
outliers = df.outliers_iqr("age")
print(outliers)
Design Patterns for Data Science
Strategy Pattern
class OutlierDetector:
    """Pick different detection strategies at runtime.

    Args:
        strategy: one of ``"iqr"``, ``"zscore"`` or ``"percentile"``. Any
            other name falls back to ``"iqr"`` and emits a ``UserWarning``
            so that typos do not go unnoticed.
    """

    def __init__(self, strategy="iqr"):
        strategies = {
            "iqr": self._iqr_strategy,
            "zscore": self._zscore_strategy,
            "percentile": self._percentile_strategy,
        }
        if strategy not in strategies:
            import warnings

            # Keep the historical fallback, but make it visible.
            warnings.warn(
                f"Unknown outlier strategy {strategy!r}; falling back to 'iqr'. "
                f"Valid options: {sorted(strategies)}",
                UserWarning,
                stacklevel=2,
            )
        self._strategy = strategies.get(strategy, self._iqr_strategy)

    def detect(self, data):
        """Return the elements of ``data`` flagged by the chosen strategy.

        ``data`` must offer the pandas Series operations the strategies
        use (``quantile``, ``mean``, ``std``, boolean-mask indexing).
        """
        return self._strategy(data)

    @staticmethod
    def _iqr_strategy(data):
        """Flag values more than 1.5 IQRs below Q1 or above Q3."""
        q1, q3 = data.quantile([0.25, 0.75])
        iqr = q3 - q1
        return data[(data < q1 - 1.5*iqr) | (data > q3 + 1.5*iqr)]

    @staticmethod
    def _zscore_strategy(data):
        """Flag values whose absolute z-score exceeds 3."""
        z = (data - data.mean()) / data.std()
        return data[z.abs() > 3]

    @staticmethod
    def _percentile_strategy(data):
        """Flag values below the 1st or above the 99th percentile."""
        lower = data.quantile(0.01)
        upper = data.quantile(0.99)
        return data[(data < lower) | (data > upper)]
# Select the z-score strategy by name; unrecognized names fall back to IQR.
detector = OutlierDetector(strategy="zscore")
# Illustrative call (left disabled because no series is defined here):
# detector.detect(my_series)
Key Takeaways
- Classes encapsulate data and behavior together.
- Inheritance lets you extend functionality without rewriting code.
- Dunder methods make your objects work naturally with Python's syntax.
- Properties provide controlled access to internal state.
- Data science libraries follow consistent OOP patterns (fit/transform/predict).
- Mixins and composition are often more flexible than deep inheritance.