Survival Analysis
Survival analysis answers a fundamental question: how long until an event happens? Unlike standard regression, it handles censored data β subjects where the event hasn't occurred yet β making it essential for medical research, customer churn, reliability engineering, and more.
Kaplan-Meier Survival Curve
Core Concepts
The event time T is a random variable. We observe it directly only for subjects where the event occurred. For others, we only know the event hadn't happened by the last observation β this is censoring.
import pandas as pd
import numpy as np
from lifelines import (
KaplanMeierFitter, CoxPHFitter, WeibullFitter,
LogNormalFitter, ExponentialFitter
)
from lifelines.statistics import logrank_test, multivariate_logrank_test
from lifelines.plotting import plot_lifetimes
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
Understanding Censoring
# Types of censoring
np.random.seed(42)
n = 500
# Generate survival times
true_survival = np.random.exponential(scale=12, size=n) # months
# Right censoring: study ends before event occurs
study_end = 24 # months
observed_time = np.minimum(true_survival, study_end)
event_observed = (true_survival <= study_end).astype(int)
# Left censoring: event occurred before observation started
# Interval censoring: event occurred between two observations
# Administrative censoring: study ended (most common)
print(f"Right-censored: {(1 - event_observed).mean():.1%} of subjects")
print(f"Events observed: {event_observed.sum()}, Censored: {(1-event_observed).sum()}")
Kaplan-Meier Estimator
The Kaplan-Meier curve estimates the survival function non-parametrically.
# Create survival dataset
survival_df = pd.DataFrame({
'duration': observed_time,
'event': event_observed,
'group': np.random.choice(['treatment', 'control'], n)
})
# Fit Kaplan-Meier
kmf = KaplanMeierFitter()
# Overall survival curve
kmf.fit(survival_df['duration'], survival_df['event'], label='Overall')
print(f"Median survival time: {kmf.median_survival_time_:.1f} months")
print(f"Survival at 12 months: {kmf.predict(12):.3f}")
# Group-specific curves
fig, ax = plt.subplots(figsize=(10, 6))
for group in ['treatment', 'control']:
mask = survival_df['group'] == group
kmf.fit(survival_df.loc[mask, 'duration'], survival_df.loc[mask, 'event'], label=group)
kmf.plot_survival_function(ax=ax)
plt.title('Kaplan-Meier Survival Curves')
plt.ylabel('Survival Probability')
plt.xlabel('Time (months)')
plt.legend()
plt.tight_layout()
plt.savefig('km_curves.png', dpi=150)
print("Kaplan-Meier curves plotted")
Log-Rank Test
Testing whether survival curves differ significantly between groups.
# Two-group comparison
treatment = survival_df[survival_df['group'] == 'treatment']
control = survival_df[survival_df['group'] == 'control']
result = logrank_test(
treatment['duration'], control['duration'],
treatment['event'], control['event']
)
print(f"Log-rank test p-value: {result.p_value:.4f}")
print(f"Significant difference: {'Yes' if result.p_value < 0.05 else 'No'}")
# Multi-group comparison
survival_df['risk_group'] = pd.cut(
survival_df['duration'], bins=3, labels=['low', 'medium', 'high']
)
multi_result = multivariate_logrank_test(
survival_df['duration'], survival_df['risk_group'], survival_df['event']
)
print(f"Multi-group test p-value: {multi_result.p_value:.4f}")
Cox Proportional Hazards Model
Cox regression estimates hazard ratios while making minimal assumptions about the baseline hazard.
# Prepare data with covariates
np.random.seed(42)
cox_df = pd.DataFrame({
'duration': np.random.exponential(12, 500),
'event': np.random.binomial(1, 0.6, 500),
'age': np.random.normal(60, 10, 500),
'treatment': np.random.binomial(1, 0.5, 500),
'biomarker': np.random.normal(5, 2, 500),
'smoking': np.random.binomial(1, 0.3, 500)
})
# Fit Cox model
cph = CoxPHFitter()
cph.fit(cox_df, duration_col='duration', event_col='event')
cph.print_summary()
# Hazard ratios
print("\nHazard Ratios:")
print(cph.hazard_ratios_)
Assessing Proportional Hazards Assumption
# Test proportional hazards assumption
cph.check_assumptions(cox_df, p_value_threshold=0.05, show_plots=False)
# Schoenfeld residuals for visual check
cph.plot_partial_effects_on_outcome(
covariates='age', values=[50, 60, 70, 80],
cmap='coolwarm'
)
plt.title('Partial Effects of Age on Survival')
plt.savefig('partial_effects.png', dpi=150)
print("Proportional hazards assumption checked")
Cox Model with Time-Varying Covariates
# Create time-varying dataset
id_col = np.repeat(range(100), 3)
time_periods = np.tile([0, 8, 16], 100)
treatment_varying = np.random.binomial(1, 0.5, 300)
biomarker_varying = np.random.normal(5, 2, 300) + np.random.normal(0, 0.5, 300)
tv_df = pd.DataFrame({
'id': id_col,
'start': time_periods,
'stop': time_periods + 8,
'event': 0,
'treatment': treatment_varying,
'biomarker': biomarker_varying
})
# Set events at last observation for some subjects
event_subjects = np.random.choice(100, 30, replace=False)
tv_df.loc[(tv_df['id'].isin(event_subjects)) & (tv_df['stop'] == 24), 'event'] = 1
print(f"Time-varying dataset: {tv_df.shape}")
print(f"Events: {tv_df['event'].sum()}")
Parametric Survival Models
When the baseline hazard follows a known distribution, parametric models are more efficient.
# Weibull model
weibull = WeibullFitter()
weibull.fit(cox_df['duration'], cox_df['event'])
print(f"Weibull AIC: {weibull.AIC_:.1f}")
print(f"Weibull lambda: {weibull.lambda_:.3f}, rho: {weibull.rho_:.3f}")
# Exponential model (special case of Weibull with rho=1)
exponential = ExponentialFitter()
exponential.fit(cox_df['duration'], cox_df['event'])
print(f"Exponential AIC: {exponential.AIC_:.1f}")
# Log-normal model
lognormal = LogNormalFitter()
lognormal.fit(cox_df['duration'], cox_df['event'])
print(f"Log-normal AIC: {lognormal.AIC_:.1f}")
# Compare models
models = {
'Weibull': weibull.AIC_,
'Exponential': exponential.AIC_,
'Log-Normal': lognormal.AIC_
}
print(f"\nBest model: {min(models, key=models.get)}")
Survival Prediction
# Predict individual survival curves
cph.fit(cox_df, duration_col='duration', event_col='event')
# New patient
new_patient = pd.DataFrame({
'age': [65],
'treatment': [1],
'biomarker': [6.5],
'smoking': [0]
})
# Predict survival function
surv_func = cph.predict_survival_function(new_patient)
print("Survival probabilities for new patient:")
print(surv_func.loc[[1, 6, 12, 24, 36]].to_string())
# Predict median survival
median_survival = cph.predict_median(new_patient)
print(f"\nPredicted median survival: {median_survival.values[0]:.1f} months")
# Risk scores
risk_scores = cph.predict_partial_hazard(new_patient)
print(f"Partial hazard: {risk_scores.values[0]:.4f}")
Concordance Index
The C-index measures how well the model ranks survival times.
from lifelines.utils import concordance_index
# Concordance index
c_index = concordance_index(
cox_df['duration'],
-cph.predict_partial_hazard(cox_df).values.flatten(),
cox_df['event']
)
print(f"Concordance index: {c_index:.4f}")
print(f"Interpretation: {'Good' if c_index > 0.7 else 'Fair' if c_index > 0.6 else 'Poor'} discrimination")
Restricted Mean Survival Time
# RMST as an alternative to median survival
kmf.fit(cox_df['duration'], cox_df['event'])
# Restricted to 24 months
rmst_24 = kmf.restricted_mean_survival_time(t=24)
print(f"RMST (24 months): {rmst_24:.2f} months")
print(f"Interpretation: On average, subjects survive {rmst_24:.1f} months within the 24-month window")
Practical Applications
# Customer churn example
np.random.seed(42)
churn_df = pd.DataFrame({
'tenure_months': np.random.exponential(24, 1000),
'churned': np.random.binomial(1, 0.3, 1000),
'monthly_charge': np.random.normal(65, 20, 1000),
'contract_type': np.random.choice([0, 1, 2], 1000), # month-to-month, 1yr, 2yr
'support_tickets': np.random.poisson(2, 1000)
})
# Cox model for churn
cph_churn = CoxPHFitter()
cph_churn.fit(churn_df, duration_col='tenure_months', event_col='churned')
cph_churn.print_summary()
# Risk segments
churn_df['risk_score'] = cph_churn.predict_partial_hazard(churn_df)
churn_df['risk_segment'] = pd.qcut(churn_df['risk_score'], q=3, labels=['low', 'medium', 'high'])
print("\nChurn rates by risk segment:")
print(churn_df.groupby('risk_segment')['churned'].mean())
Best Practices
- Check censoring patterns β heavy censoring limits model reliability
- Test proportional hazards β violations require stratified or time-varying models
- Compare parametric models β AIC/BIC guide model selection
- Use RMST for non-proportional hazards β more robust than hazard ratios
- Validate with C-index β the standard metric for survival models
- Report confidence intervals β survival estimates are inherently uncertain
Summary
Survival analysis handles censored data that standard methods cannot. Kaplan-Meier curves visualize group differences, Cox regression quantifies risk factors, and parametric models enable extrapolation. Master these tools for medical research, customer analytics, and reliability engineering.