OOP for Data Science

Object-oriented programming (OOP) is the main way Python code organizes complex systems: related data and behavior are grouped into classes, and programs work with instances of those classes. Libraries like pandas, scikit-learn, and PyTorch are built on OOP. Understanding OOP helps you use these libraries effectively and build your own reusable components.

Classes and Objects

A class is a blueprint: $\text{Class} = (\text{Attributes}, \text{Methods})$. An object is an instance of a class, created by calling the class: $\text{obj} = \text{Class}(\text{init\_args})$.

Inheritance: $\text{Child} \leftarrow \text{Parent}$ means Child inherits Parent's attributes and methods.

class Dataset:
    """Thin wrapper that pairs a sized data container with a display name."""

    def __init__(self, data, name="unnamed"):
        """Store the container and its name.

        Args:
            data: Any object supporting ``len()``; it is kept by reference.
            name: Label reported by ``summary()``.
        """
        self.data, self.name = data, name
        # Placeholder only: nothing in this class assigns a real value.
        self.created_at = None

    def summary(self):
        """Return a dict with the name, ``len(data)`` and the data's type name."""
        container = self.data
        return dict(
            name=self.name,
            rows=len(container),
            type=type(container).__name__,
        )

# Create two instances: one wraps a list and passes an explicit name,
# the other wraps a dict and relies on the default name "unnamed".
ds1 = Dataset([1, 2, 3], name="scores")
ds2 = Dataset({"a": 1, "b": 2})

# "rows" is len(data), so for the dict it counts keys.
print(ds1.summary())  # {'name': 'scores', 'rows': 3, 'type': 'list'}
print(ds2.summary())  # {'name': 'unnamed', 'rows': 2, 'type': 'dict'}

Instance Variables and Methods

class DataProcessor:
    """Chainable wrapper around a CSV file that is loaded on demand.

    The file at ``filepath`` is read with ``pandas.read_csv`` the first time
    ``data`` is accessed. ``clean`` and ``filter_rows`` replace the cached
    frame and return the processor itself so calls can be chained.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Leading underscore marks internal state (a naming convention only).
        self._data = None
        self._processed = False

    @property
    def data(self):
        """Return the cached frame, reading the CSV on the first access."""
        if self._data is not None:
            return self._data
        # pandas is imported only when a load is actually needed.
        import pandas as pd
        self._data = pd.read_csv(self.filepath)
        return self._data

    @property
    def shape(self):
        """Return the ``shape`` of the current frame (loads it if needed)."""
        frame = self.data
        return frame.shape

    def clean(self, drop_na=True):
        """Optionally drop rows with missing values and mark as processed.

        Args:
            drop_na: When truthy, replace the frame with ``frame.dropna()``.

        Returns:
            This processor, to allow method chaining.
        """
        if drop_na:
            frame = self.data
            self._data = frame.dropna()
        self._processed = True
        return self

    def filter_rows(self, column, value):
        """Keep only the rows whose ``column`` equals ``value``.

        Returns:
            This processor, to allow method chaining.
        """
        frame = self.data
        matches = frame[column] == value
        self._data = frame[matches]
        return self

# Method chaining: clean() and filter_rows() each return the processor itself,
# so the calls can be strung together. "sales.csv" is only read when the
# data is first needed (here, inside clean()).
processor = DataProcessor("sales.csv")
result = processor.clean().filter_rows("region", "East")
print(result.shape)  # shape of the rows left after cleaning and filtering

Class Variables vs Instance Variables

class Feature:
    """Named, typed feature that keeps a class-wide count of live instances."""

    # Class variable: one counter shared by every instance.
    total_features = 0

    def __init__(self, name, dtype):
        # Instance variables: each object gets its own name and dtype.
        self.name, self.dtype = name, dtype
        Feature.total_features = Feature.total_features + 1

    def __del__(self):
        # Runs when the object is finalized; undoes the increment from __init__.
        Feature.total_features = Feature.total_features - 1

# Each construction bumps the shared class-level counter.
f1 = Feature("age", "int")
f2 = Feature("salary", "float")

print(Feature.total_features)  # 2
print(f1.total_features)       # 2 (instance lookup falls back to the class attribute)

# Removing the last reference lets __del__ run, which decrements the counter.
# NOTE: seeing the new value immediately relies on CPython's reference
# counting; other interpreters may finalize the object later.
del f1
print(Feature.total_features)  # 1

Class Hierarchy in Data Science

Inheritance

Inheritance lets you extend existing classes without modifying them.

class BaseEstimator:
    """Minimal estimator base class modelled on scikit-learn's fit/predict API.

    Subclasses override ``fit`` and ``predict``; ``fit_predict`` is provided
    here in terms of those two methods.
    """

    def __init__(self):
        # Subclasses are expected to flip this once training has happened.
        self.is_fitted = False

    def fit(self, X, y):
        """Train on ``X``/``y``. Always raises here; subclasses must override."""
        message = "Subclass must implement fit()"
        raise NotImplementedError(message)

    def predict(self, X):
        """Predict for ``X``. Always raises here; subclasses must override."""
        message = "Subclass must implement predict()"
        raise NotImplementedError(message)

    def fit_predict(self, X, y):
        """Call ``fit(X, y)`` and then return ``predict(X)``.

        The return value of ``fit`` is ignored, so subclasses are not
        required to return ``self`` from it.
        """
        self.fit(X, y)
        predictions = self.predict(X)
        return predictions


class MeanPredictor(BaseEstimator):
    """Baseline model that always predicts the mean of the training targets."""

    def __init__(self):
        super().__init__()
        # Learned in fit(); stays None until then.
        self.mean_ = None

    def fit(self, X, y):
        """Store the arithmetic mean of ``y``; ``X`` is not used.

        Returns:
            This estimator, to allow chaining.
        """
        total, count = sum(y), len(y)
        self.mean_ = total / count
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return a list holding the stored mean once per element of ``X``.

        Raises:
            ValueError: If called before ``fit``.
        """
        if self.is_fitted:
            return [self.mean_ for _ in range(len(X))]
        raise ValueError("Model not fitted yet")


# Three training rows with one feature each; the model only uses the targets.
model = MeanPredictor()
X_train = [[1], [2], [3]]
y_train = [10, 20, 30]

model.fit(X_train, y_train)
print(model.mean_)  # 20.0  (mean of 10, 20, 30)
# predict() returns the stored mean once per input row.
print(model.predict([[4], [5]]))  # [20.0, 20.0]

Multiple Inheritance

class Loggable:
    """Mixin that prints messages prefixed with the concrete class name."""

    def log(self, message):
        """Print ``message`` as ``[ClassName] message``."""
        owner = self.__class__.__name__
        print(f"[{owner}] {message}")


class Validatable:
    """Mixin that rejects falsy input."""

    def validate(self, data):
        """Return True for truthy ``data``.

        Raises:
            ValueError: If ``data`` is falsy (empty, None, 0, ...).
        """
        if data:
            return True
        raise ValueError("Data cannot be empty")


class Pipeline(Loggable, Validatable):
    """Run named callables over a value in order, logging each step.

    ``steps`` is an iterable of ``(name, callable)`` pairs. Logging comes
    from ``Loggable`` and input checking from ``Validatable``.
    """

    def __init__(self, steps):
        self.steps = steps
        self.log("Pipeline created")

    def run(self, data):
        """Validate ``data``, feed it through every step, and return the result.

        Each step receives the previous step's output.
        """
        self.validate(data)
        current = data
        for step in self.steps:
            label, transform = step
            self.log(f"Running {label}")
            current = transform(current)
        self.log("Pipeline complete")
        return current


# Two steps built from plain str methods; each step is a (name, callable) pair.
# Constructing the pipeline prints "[Pipeline] Pipeline created".
pipeline = Pipeline([
    ("strip", str.strip),
    ("lower", str.lower),
])
# run() logs "Running strip", "Running lower" and "Pipeline complete".
result = pipeline.run("  Hello World  ")
print(result)  # hello world

Dunder (Magic) Methods

Dunder methods define operator behavior: $\text{obj} + \text{other}$ calls __add__, len(obj) calls __len__, repr(obj) calls __repr__.

Property pattern: reading $\text{obj.attr}$ calls the getter defined with @property; assigning obj.attr = val calls the setter defined with @attr.setter.

class Vector:
    """A 2D vector with math operations.

    Supports ``+``, ``-``, scalar ``*`` (either side), scalar ``/``, ``==``,
    ``<`` (by magnitude), ``abs()``, ``len()`` and indexing with 0 or 1.

    Binary operators accept any operand exposing ``x`` and ``y`` attributes.
    For anything else they return ``NotImplemented``, so Python falls back to
    its normal behaviour: ``==`` gives False, arithmetic and ordering raise
    ``TypeError``.
    """

    # Defining __eq__ already makes Python set __hash__ to None. It is spelled
    # out so it is obvious that these mutable vectors are unhashable.
    __hash__ = None

    def __init__(self, x, y):
        self.x = x
        self.y = y

    @staticmethod
    def _has_xy(other):
        """Return True when ``other`` exposes both ``x`` and ``y``."""
        return hasattr(other, "x") and hasattr(other, "y")

    # String representations
    def __repr__(self):
        """Return the constructor-style form, e.g. ``Vector(3, 4)``."""
        return f"Vector({self.x}, {self.y})"

    def __str__(self):
        """Return the display form used by ``print``, e.g. ``(3, 4)``."""
        return f"({self.x}, {self.y})"

    # Arithmetic operators
    def __add__(self, other):
        """Return the component-wise sum as a new Vector."""
        if not self._has_xy(other):
            return NotImplemented
        return Vector(self.x + other.x, self.y + other.y)

    def __sub__(self, other):
        """Return the component-wise difference as a new Vector."""
        if not self._has_xy(other):
            return NotImplemented
        return Vector(self.x - other.x, self.y - other.y)

    def __mul__(self, scalar):
        """Return a new Vector with both components multiplied by ``scalar``."""
        return Vector(self.x * scalar, self.y * scalar)

    def __rmul__(self, scalar):
        """Support ``scalar * vector`` by delegating to ``__mul__``."""
        return self.__mul__(scalar)

    def __truediv__(self, scalar):
        """Return a new Vector with both components divided by ``scalar``."""
        return Vector(self.x / scalar, self.y / scalar)

    # Comparison
    def __eq__(self, other):
        """Return True when both components match.

        Operands without ``x``/``y`` compare unequal instead of raising.
        """
        if not self._has_xy(other):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __lt__(self, other):
        """Order by magnitude; ``other`` must provide ``magnitude()``."""
        if not hasattr(other, "magnitude"):
            return NotImplemented
        return self.magnitude() < other.magnitude()

    # Container protocol
    def __len__(self):
        """A 2D vector always has two components."""
        return 2

    def __getitem__(self, index):
        """Return ``x`` for index 0 and ``y`` for index 1.

        Raises:
            IndexError: For any other index. This is also what ends
                iteration over the vector.
        """
        if index == 0:
            return self.x
        elif index == 1:
            return self.y
        raise IndexError("Index out of range")

    # Math
    def __abs__(self):
        """Return the magnitude, so ``abs(v)`` works."""
        return self.magnitude()

    def magnitude(self):
        """Return the Euclidean length ``sqrt(x**2 + y**2)``."""
        return (self.x ** 2 + self.y ** 2) ** 0.5

    def dot(self, other):
        """Return the dot product with ``other``."""
        return self.x * other.x + self.y * other.y


# Usage: print() uses __str__, so vectors show as "(x, y)".
v1 = Vector(3, 4)
v2 = Vector(1, 2)

print(v1 + v2)       # (4, 6)    __add__
print(v1 - v2)       # (2, 2)    __sub__
print(v1 * 3)        # (9, 12)   __mul__
print(3 * v1)        # (9, 12)   __rmul__ (int does not know how to multiply a Vector)
print(abs(v1))       # 5.0       __abs__
print(v1[0])         # 3         __getitem__
print(len(v1))       # 2         __len__
print(v1 == Vector(3, 4))  # True    __eq__ compares components
print(v1 < v2)       # False (5 > 2.236)  __lt__ compares magnitudes

Common Dunder Methods

class DataRecord:
    """Record whose fields are the keyword arguments given at construction.

    Fields are stored in the instance ``__dict__``, so ``record.field`` works
    through normal attribute lookup. The dunder methods below make a record
    behave like a small read-only mapping of its fields.

    Equality and hashing are both based on the field contents. Hashing
    requires every field value to be hashable, and a record should not be
    mutated while it is stored in a set or used as a dict key.
    """

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    # Called by print()
    def __str__(self):
        return str(self.__dict__)

    # Called in interpreter / debugger
    def __repr__(self):
        return f"DataRecord({self.__dict__})"

    # Called by len()
    def __len__(self):
        return len(self.__dict__)

    # Called by 'in' operator
    def __contains__(self, key):
        return key in self.__dict__

    # Called for iteration (yields field names)
    def __iter__(self):
        return iter(self.__dict__)

    # Called to check truthiness (False when there are no fields)
    def __bool__(self):
        return bool(self.__dict__)

    # Called only when normal attribute lookup fails
    def __getattr__(self, name):
        # Must raise AttributeError rather than return a value: hasattr(),
        # getattr() with a default, copy and pickle all rely on it to detect
        # a missing attribute.
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {name!r}"
        )

    # Called by == ; kept consistent with __hash__ so equal records collapse
    # to one entry in sets and dict keys
    def __eq__(self, other):
        if not isinstance(other, DataRecord):
            return NotImplemented
        return self.__dict__ == other.__dict__

    # Called for hashing (required for sets/dict keys)
    def __hash__(self):
        # Sorting makes the hash independent of field insertion order.
        return hash(tuple(sorted(self.__dict__.items())))

Properties and Encapsulation

class TemperatureSensor:
    """Temperature stored in Celsius, with Fahrenheit and Kelvin views.

    Every way of setting the temperature (constructor, ``celsius`` setter,
    ``fahrenheit`` setter) rejects values below absolute zero.
    """

    # Lower bounds in each settable unit. The Fahrenheit bound is checked in
    # Fahrenheit so that float rounding in the unit conversion cannot reject
    # the boundary value itself.
    ABSOLUTE_ZERO_C = -273.15
    ABSOLUTE_ZERO_F = -459.67

    def __init__(self, celsius=0):
        """Create a sensor.

        Args:
            celsius: Initial temperature in degrees Celsius.

        Raises:
            ValueError: If ``celsius`` is below absolute zero.
        """
        # Assign through the property so the constructor is validated too.
        self.celsius = celsius

    @property
    def celsius(self):
        """Get temperature in Celsius."""
        return self._celsius

    @celsius.setter
    def celsius(self, value):
        """Set temperature with validation.

        Raises:
            ValueError: If ``value`` is below -273.15.
        """
        if value < self.ABSOLUTE_ZERO_C:
            raise ValueError("Temperature below absolute zero")
        self._celsius = value  # Convention: underscore = private

    @property
    def fahrenheit(self):
        """Computed property: the stored Celsius value converted to Fahrenheit."""
        return self._celsius * 9/5 + 32

    @fahrenheit.setter
    def fahrenheit(self, value):
        """Set the temperature from a Fahrenheit value, with validation.

        Raises:
            ValueError: If ``value`` is below -459.67.
        """
        if value < self.ABSOLUTE_ZERO_F:
            raise ValueError("Temperature below absolute zero")
        self._celsius = (value - 32) * 5/9

    @property
    def kelvin(self):
        """Computed property: the stored Celsius value shifted to Kelvin."""
        return self._celsius + 273.15


# Properties are read like plain attributes; the conversions run on access.
sensor = TemperatureSensor(25)
print(sensor.celsius)      # 25
print(sensor.fahrenheit)   # 77.0   (25 * 9/5 + 32)
print(sensor.kelvin)       # 298.15

# Assigning to a property calls its setter, which updates the Celsius value.
sensor.fahrenheit = 32
print(sensor.celsius)      # 0.0

How OOP Is Used in Data Science Libraries

scikit-learn Pattern

# Mirrors the fit / transform / fit_transform interface of scikit-learn transformers
class CustomTransformer:
    """Mimics scikit-learn's transformer interface.

    ``fit`` learns the mean of the data; ``transform`` subtracts that mean
    from each value and multiplies the result by ``factor``.
    """

    def __init__(self, factor=1.0):
        self.factor = factor
        self.is_fitted = False
        self.mean_ = None  # learned by fit()

    def fit(self, X, y=None):
        """Learn parameters from data.

        Args:
            X: Sized iterable of numbers.
            y: Ignored; accepted for interface compatibility.

        Returns:
            This transformer, to allow chaining.
        """
        self.mean_ = sum(X) / len(X)
        self.is_fitted = True
        return self  # Always return self for chaining

    def transform(self, X):
        """Apply learned parameters to data.

        Returns:
            A list with ``(x - mean_) * factor`` for each ``x`` in ``X``.

        Raises:
            ValueError: If called before ``fit``.
        """
        # Without this guard an unfitted call fails with an opaque TypeError
        # from `x - None`. Raise the same exception type MeanPredictor uses.
        if not self.is_fitted:
            raise ValueError("Transformer not fitted yet; call fit() first")
        return [(x - self.mean_) * self.factor for x in X]

    def fit_transform(self, X, y=None):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

# Usage with a real scikit-learn pipeline.
# NOTE: this import rebinds the name `Pipeline`; if it runs in the same module
# as the Pipeline class defined earlier in this file, that class is shadowed.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ("scaler", StandardScaler()),
    # Custom step left disabled.
    # NOTE(review): confirm CustomTransformer meets scikit-learn's estimator
    # requirements (e.g. get_params/set_params) before enabling it.
    # ("custom", CustomTransformer(factor=2)),
])

Custom DataFrame Extension

import pandas as pd

class AnalysisMixin:
    """Analysis helpers meant to be mixed into a pandas DataFrame subclass.

    Every method uses only the DataFrame API on ``self``.
    """

    def quick_stats(self):
        """Return shape, dtypes, per-column null counts and memory use in MB."""
        null_counts = self.isnull().sum()
        total_bytes = self.memory_usage(deep=True).sum()
        return dict(
            shape=self.shape,
            dtypes=self.dtypes.to_dict(),
            nulls=null_counts.to_dict(),
            memory_mb=total_bytes / 1e6,
        )

    def outliers_iqr(self, column, threshold=1.5):
        """Return the rows whose ``column`` value falls outside the IQR fences.

        The fences are ``q1 - threshold * iqr`` and ``q3 + threshold * iqr``;
        values exactly on a fence are not reported.
        """
        values = self[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        low_fence = q1 - threshold * spread
        high_fence = q3 + threshold * spread
        below = values < low_fence
        above = values > high_fence
        return self[below | above]

    def correlation_with(self, target, top_n=10):
        """Return the ``top_n`` largest absolute correlations with ``target``.

        Only numeric columns are considered; ``target`` is passed straight
        to ``DataFrame.corrwith``.
        """
        numeric_cols = self.select_dtypes(include="number")
        strengths = numeric_cols.corrwith(target).abs()
        ranked = strengths.sort_values(ascending=False)
        return ranked.head(top_n)


# Apply mixin to DataFrame
# A plain pd.DataFrame never gains the mixin's methods by itself, so calling
# df.quick_stats() on one raises AttributeError. The mixin has to be combined
# with DataFrame in a subclass.
class AnalysisFrame(AnalysisMixin, pd.DataFrame):
    """DataFrame subclass that carries the AnalysisMixin methods."""

    @property
    def _constructor(self):
        # pandas uses this hook to build the results of DataFrame operations;
        # returning the subclass keeps the mixin methods on derived frames.
        return AnalysisFrame


df = AnalysisFrame({
    "age": [25, 30, 35, 40, 100],  # 100 is an outlier
    "salary": [50000, 60000, 70000, 80000, 90000],
})

# df is an AnalysisFrame, so it has the mixin methods
stats = df.quick_stats()
print(stats)

outliers = df.outliers_iqr("age")
print(outliers)  # the single row with age == 100

Design Patterns for Data Science

Strategy Pattern

class OutlierDetector:
    """Outlier detector whose rule is chosen by name at construction time.

    Known names are ``"iqr"``, ``"zscore"`` and ``"percentile"``. Any other
    name falls back to the IQR rule without raising.
    """

    def __init__(self, strategy="iqr"):
        available = dict(
            iqr=self._iqr_strategy,
            zscore=self._zscore_strategy,
            percentile=self._percentile_strategy,
        )
        # Unknown names silently use the IQR rule.
        self._strategy = available.get(strategy, self._iqr_strategy)

    def detect(self, data):
        """Return the elements of ``data`` flagged by the selected rule."""
        chosen = self._strategy
        return chosen(data)

    @staticmethod
    def _iqr_strategy(data):
        """Flag values more than 1.5 * IQR outside the quartiles."""
        q1, q3 = data.quantile([0.25, 0.75])
        spread = q3 - q1
        low_fence = q1 - 1.5*spread
        high_fence = q3 + 1.5*spread
        too_low = data < low_fence
        too_high = data > high_fence
        return data[too_low | too_high]

    @staticmethod
    def _zscore_strategy(data):
        """Flag values whose absolute z-score exceeds 3."""
        centered = data - data.mean()
        scores = centered / data.std()
        return data[scores.abs() > 3]

    @staticmethod
    def _percentile_strategy(data):
        """Flag values below the 1st or above the 99th percentile."""
        floor = data.quantile(0.01)
        ceiling = data.quantile(0.99)
        too_low = data < floor
        too_high = data > ceiling
        return data[too_low | too_high]


# Pick the z-score rule by name; an unrecognised name would fall back to IQR.
detector = OutlierDetector(strategy="zscore")
# Example call, left commented out because my_series is not defined here:
# detector.detect(my_series)

Key Takeaways

Classes encapsulate data and behavior together.
Inheritance lets you extend functionality without rewriting code.
Dunder methods make your objects work naturally with Python's syntax.
Properties provide controlled access to internal state.
Data science libraries follow consistent OOP patterns (fit/transform/predict).
Mixins and composition are often more flexible than deep inheritance.

OOP for Data Science

OOP for Data Science

Classes and Objects

Instance Variables and Methods

Class Variables vs Instance Variables

Class Hierarchy in Data Science

Inheritance

Multiple Inheritance

Dunder (Magic) Methods

Common Dunder Methods

Properties and Encapsulation

How OOP Is Used in Data Science Libraries

scikit-learn Pattern

Custom DataFrame Extension

Design Patterns for Data Science

Strategy Pattern

Key Takeaways

Premium Content

Need Expert Data Science Help?