Survival Analysis

Survival analysis answers a fundamental question: how long until an event happens? Unlike standard regression, it handles censored data – subjects where the event hasn't occurred yet – making it essential for medical research, customer churn, reliability engineering, and more.

Kaplan-Meier Survival Curve

Core Concepts

The event time T is a random variable. We observe it directly only for subjects where the event occurred. For others, we only know the event hadn't happened by the last observation – this is censoring.

import pandas as pd
import numpy as np
from lifelines import (
    KaplanMeierFitter, CoxPHFitter, WeibullFitter,
    LogNormalFitter, ExponentialFitter
)
from lifelines.statistics import logrank_test, multivariate_logrank_test
from lifelines.plotting import plot_lifetimes
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

Understanding Censoring

# Types of censoring
np.random.seed(42)
n = 500

# Generate survival times
true_survival = np.random.exponential(scale=12, size=n)  # months

# Right censoring: study ends before event occurs
study_end = 24  # months
observed_time = np.minimum(true_survival, study_end)
event_observed = (true_survival <= study_end).astype(int)

# Left censoring: event occurred before observation started
# Interval censoring: event occurred between two observations
# Administrative censoring: study ended (most common)

print(f"Right-censored: {(1 - event_observed).mean():.1%} of subjects")
print(f"Events observed: {event_observed.sum()}, Censored: {(1-event_observed).sum()}")

Kaplan-Meier Estimator

The Kaplan-Meier curve estimates the survival function non-parametrically.

# Create survival dataset
survival_df = pd.DataFrame({
    'duration': observed_time,
    'event': event_observed,
    'group': np.random.choice(['treatment', 'control'], n)
})

# Fit Kaplan-Meier
kmf = KaplanMeierFitter()

# Overall survival curve
kmf.fit(survival_df['duration'], survival_df['event'], label='Overall')
print(f"Median survival time: {kmf.median_survival_time_:.1f} months")
print(f"Survival at 12 months: {kmf.predict(12):.3f}")

# Group-specific curves
fig, ax = plt.subplots(figsize=(10, 6))
for group in ['treatment', 'control']:
    mask = survival_df['group'] == group
    kmf.fit(survival_df.loc[mask, 'duration'], survival_df.loc[mask, 'event'], label=group)
    kmf.plot_survival_function(ax=ax)

plt.title('Kaplan-Meier Survival Curves')
plt.ylabel('Survival Probability')
plt.xlabel('Time (months)')
plt.legend()
plt.tight_layout()
plt.savefig('km_curves.png', dpi=150)
print("Kaplan-Meier curves plotted")

Log-Rank Test

Testing whether survival curves differ significantly between groups.

# Two-group comparison
treatment = survival_df[survival_df['group'] == 'treatment']
control = survival_df[survival_df['group'] == 'control']

result = logrank_test(
    treatment['duration'], control['duration'],
    treatment['event'], control['event']
)
print(f"Log-rank test p-value: {result.p_value:.4f}")
print(f"Significant difference: {'Yes' if result.p_value < 0.05 else 'No'}")

# Multi-group comparison
survival_df['risk_group'] = pd.cut(
    survival_df['duration'], bins=3, labels=['low', 'medium', 'high']
)
multi_result = multivariate_logrank_test(
    survival_df['duration'], survival_df['risk_group'], survival_df['event']
)
print(f"Multi-group test p-value: {multi_result.p_value:.4f}")

Cox Proportional Hazards Model

Cox regression estimates hazard ratios while making minimal assumptions about the baseline hazard.

# Prepare data with covariates
np.random.seed(42)
cox_df = pd.DataFrame({
    'duration': np.random.exponential(12, 500),
    'event': np.random.binomial(1, 0.6, 500),
    'age': np.random.normal(60, 10, 500),
    'treatment': np.random.binomial(1, 0.5, 500),
    'biomarker': np.random.normal(5, 2, 500),
    'smoking': np.random.binomial(1, 0.3, 500)
})

# Fit Cox model
cph = CoxPHFitter()
cph.fit(cox_df, duration_col='duration', event_col='event')
cph.print_summary()

# Hazard ratios
print("\nHazard Ratios:")
print(cph.hazard_ratios_)

Assessing Proportional Hazards Assumption

# Test proportional hazards assumption
cph.check_assumptions(cox_df, p_value_threshold=0.05, show_plots=False)

# Schoenfeld residuals for visual check
cph.plot_partial_effects_on_outcome(
    covariates='age', values=[50, 60, 70, 80],
    cmap='coolwarm'
)
plt.title('Partial Effects of Age on Survival')
plt.savefig('partial_effects.png', dpi=150)
print("Proportional hazards assumption checked")

Cox Model with Time-Varying Covariates

# Create time-varying dataset
id_col = np.repeat(range(100), 3)
time_periods = np.tile([0, 8, 16], 100)
treatment_varying = np.random.binomial(1, 0.5, 300)
biomarker_varying = np.random.normal(5, 2, 300) + np.random.normal(0, 0.5, 300)

tv_df = pd.DataFrame({
    'id': id_col,
    'start': time_periods,
    'stop': time_periods + 8,
    'event': 0,
    'treatment': treatment_varying,
    'biomarker': biomarker_varying
})

# Set events at last observation for some subjects
event_subjects = np.random.choice(100, 30, replace=False)
tv_df.loc[(tv_df['id'].isin(event_subjects)) & (tv_df['stop'] == 24), 'event'] = 1

print(f"Time-varying dataset: {tv_df.shape}")
print(f"Events: {tv_df['event'].sum()}")

Parametric Survival Models

When the baseline hazard follows a known distribution, parametric models are more efficient.

# Weibull model
weibull = WeibullFitter()
weibull.fit(cox_df['duration'], cox_df['event'])
print(f"Weibull AIC: {weibull.AIC_:.1f}")
print(f"Weibull lambda: {weibull.lambda_:.3f}, rho: {weibull.rho_:.3f}")

# Exponential model (special case of Weibull with rho=1)
exponential = ExponentialFitter()
exponential.fit(cox_df['duration'], cox_df['event'])
print(f"Exponential AIC: {exponential.AIC_:.1f}")

# Log-normal model
lognormal = LogNormalFitter()
lognormal.fit(cox_df['duration'], cox_df['event'])
print(f"Log-normal AIC: {lognormal.AIC_:.1f}")

# Compare models
models = {
    'Weibull': weibull.AIC_,
    'Exponential': exponential.AIC_,
    'Log-Normal': lognormal.AIC_
}
print(f"\nBest model: {min(models, key=models.get)}")

Survival Prediction

# Predict individual survival curves
cph.fit(cox_df, duration_col='duration', event_col='event')

# New patient
new_patient = pd.DataFrame({
    'age': [65],
    'treatment': [1],
    'biomarker': [6.5],
    'smoking': [0]
})

# Predict survival function
surv_func = cph.predict_survival_function(new_patient)
print("Survival probabilities for new patient:")
print(surv_func.loc[[1, 6, 12, 24, 36]].to_string())

# Predict median survival
median_survival = cph.predict_median(new_patient)
print(f"\nPredicted median survival: {median_survival.values[0]:.1f} months")

# Risk scores
risk_scores = cph.predict_partial_hazard(new_patient)
print(f"Partial hazard: {risk_scores.values[0]:.4f}")

Concordance Index

The C-index measures how well the model ranks survival times.

from lifelines.utils import concordance_index

# Concordance index
c_index = concordance_index(
    cox_df['duration'],
    -cph.predict_partial_hazard(cox_df).values.flatten(),
    cox_df['event']
)
print(f"Concordance index: {c_index:.4f}")
print(f"Interpretation: {'Good' if c_index > 0.7 else 'Fair' if c_index > 0.6 else 'Poor'} discrimination")

Restricted Mean Survival Time

# RMST as an alternative to median survival
kmf.fit(cox_df['duration'], cox_df['event'])

# Restricted to 24 months
rmst_24 = kmf.restricted_mean_survival_time(t=24)
print(f"RMST (24 months): {rmst_24:.2f} months")
print(f"Interpretation: On average, subjects survive {rmst_24:.1f} months within the 24-month window")

Practical Applications

# Customer churn example
np.random.seed(42)
churn_df = pd.DataFrame({
    'tenure_months': np.random.exponential(24, 1000),
    'churned': np.random.binomial(1, 0.3, 1000),
    'monthly_charge': np.random.normal(65, 20, 1000),
    'contract_type': np.random.choice([0, 1, 2], 1000),  # month-to-month, 1yr, 2yr
    'support_tickets': np.random.poisson(2, 1000)
})

# Cox model for churn
cph_churn = CoxPHFitter()
cph_churn.fit(churn_df, duration_col='tenure_months', event_col='churned')
cph_churn.print_summary()

# Risk segments
churn_df['risk_score'] = cph_churn.predict_partial_hazard(churn_df)
churn_df['risk_segment'] = pd.qcut(churn_df['risk_score'], q=3, labels=['low', 'medium', 'high'])

print("\nChurn rates by risk segment:")
print(churn_df.groupby('risk_segment')['churned'].mean())

Best Practices

Check censoring patterns – heavy censoring limits model reliability
Test proportional hazards – violations require stratified or time-varying models
Compare parametric models – AIC/BIC guide model selection
Use RMST for non-proportional hazards – more robust than hazard ratios
Validate with C-index – the standard metric for survival models
Report confidence intervals – survival estimates are inherently uncertain

Summary

Survival analysis handles censored data that standard methods cannot. Kaplan-Meier curves visualize group differences, Cox regression quantifies risk factors, and parametric models enable extrapolation. Master these tools for medical research, customer analytics, and reliability engineering.