The Interview Question
βΉοΈ
Question: You're building a customer segmentation system for a professional networking platform:
- Dataset: 5M users with features: profile views, connections, posts, job applications, industry
- Requirements: Identify distinct user segments for targeted marketing
- Challenges: High-dimensional data, noise, varying cluster densities
Walk through your clustering approach:
- How do you determine the optimal number of clusters?
- Which clustering algorithm would you choose and why?
- How do you evaluate clustering quality?
- How do you interpret and use the segments?
Detailed Answer
1. Determining Optimal Number of Clusters
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings('ignore')
class OptimalClustersFinder:
"""Find optimal number of clusters using multiple methods"""
def __init__(self, X):
self.X = X
self.results = {}
def elbow_method(self, k_range=range(2, 11)):
"""Elbow method using within-cluster sum of squares"""
inertias = []
K = []
for k in k_range:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(self.X)
inertias.append(kmeans.inertia_)
K.append(k)
# Calculate elbow point
deltas = np.diff(inertias)
delta_deltas = np.diff(deltas)
elbow_idx = np.argmax(np.abs(delta_deltas)) + 2
optimal_k_elbow = K[elbow_idx]
self.results['elbow'] = {
'k_values': K,
'inertias': inertias,
'optimal_k': optimal_k_elbow
}
return optimal_k_elbow
def silhouette_method(self, k_range=range(2, 11)):
"""Silhouette analysis"""
silhouette_scores = []
K = []
for k in k_range:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
labels = kmeans.fit_predict(self.X)
score = silhouette_score(self.X, labels)
silhouette_scores.append(score)
K.append(k)
optimal_k_silhouette = K[np.argmax(silhouette_scores)]
self.results['silhouette'] = {
'k_values': K,
'scores': silhouette_scores,
'optimal_k': optimal_k_silhouette,
'best_score': max(silhouette_scores)
}
return optimal_k_silhouette
def gap_statistic(self, k_range=range(2, 11), n_refs=10):
"""Gap statistic method"""
gaps = []
K = []
for k in k_range:
# Calculate WCSS for actual data
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(self.X)
WCSS = kmeans.inertia_
# Calculate WCSS for reference datasets
ref_WCSS = []
for _ in range(n_refs):
# Generate random reference data
random_data = np.random.uniform(
low=self.X.min(axis=0),
high=self.X.max(axis=0),
size=self.X.shape
)
kmeans_ref = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_ref.fit(random_data)
ref_WCSS.append(kmeans_ref.inertia_)
# Calculate gap
gap = np.mean(np.log(ref_WCSS)) - np.log(WCSS)
gaps.append(gap)
K.append(k)
# Find optimal k (first k where gap(k) >= gap(k+1) - std_error)
optimal_k_gap = K[np.argmax(gaps)]
self.results['gap'] = {
'k_values': K,
'gaps': gaps,
'optimal_k': optimal_k_gap
}
return optimal_k_gap
def bayesian_information_criterion(self, k_range=range(2, 11)):
"""BIC for Gaussian Mixture Models"""
from sklearn.mixture import GaussianMixture
bics = []
K = []
for k in k_range:
gmm = GaussianMixture(n_components=k, random_state=42)
gmm.fit(self.X)
bics.append(gmm.bic(self.X))
K.append(k)
optimal_k_bic = K[np.argmin(bics)]
self.results['bic'] = {
'k_values': K,
'bics': bics,
'optimal_k': optimal_k_bic
}
return optimal_k_bic
def find_optimal_k(self, k_range=range(2, 11)):
"""Run all methods and find consensus"""
print("Running multiple methods to find optimal k...")
print("=" * 60)
self.elbow_method(k_range)
self.silhouette_method(k_range)
self.gap_statistic(k_range)
self.bayesian_information_criterion(k_range)
# Consensus
all_k = [
self.results['elbow']['optimal_k'],
self.results['silhouette']['optimal_k'],
self.results['gap']['optimal_k'],
self.results['bic']['optimal_k']
]
# Find most common k
from collections import Counter
k_counts = Counter(all_k)
consensus_k = k_counts.most_common(1)[0][0]
print(f"\nOptimal k estimates:")
print(f" Elbow method: {self.results['elbow']['optimal_k']}")
print(f" Silhouette: {self.results['silhouette']['optimal_k']}")
print(f" Gap statistic: {self.results['gap']['optimal_k']}")
print(f" BIC: {self.results['bic']['optimal_k']}")
print(f"\nConsensus optimal k: {consensus_k}")
return consensus_k
def visualize_methods(self):
"""Visualize all methods"""
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Elbow method
axes[0, 0].plot(self.results['elbow']['k_values'],
self.results['elbow']['inertias'], 'bo-', linewidth=2)
axes[0, 0].axvline(x=self.results['elbow']['optimal_k'],
color='r', linestyle='--', label=f"Optimal k={self.results['elbow']['optimal_k']}")
axes[0, 0].set_xlabel('Number of Clusters (k)')
axes[0, 0].set_ylabel('Inertia')
axes[0, 0].set_title('Elbow Method')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)
# Silhouette method
axes[0, 1].plot(self.results['silhouette']['k_values'],
self.results['silhouette']['scores'], 'go-', linewidth=2)
axes[0, 1].axvline(x=self.results['silhouette']['optimal_k'],
color='r', linestyle='--', label=f"Optimal k={self.results['silhouette']['optimal_k']}")
axes[0, 1].set_xlabel('Number of Clusters (k)')
axes[0, 1].set_ylabel('Silhouette Score')
axes[0, 1].set_title('Silhouette Method')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# Gap statistic
axes[1, 0].plot(self.results['gap']['k_values'],
self.results['gap']['gaps'], 'mo-', linewidth=2)
axes[1, 0].axvline(x=self.results['gap']['optimal_k'],
color='r', linestyle='--', label=f"Optimal k={self.results['gap']['optimal_k']}")
axes[1, 0].set_xlabel('Number of Clusters (k)')
axes[1, 0].set_ylabel('Gap Statistic')
axes[1, 0].set_title('Gap Statistic')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# BIC
axes[1, 1].plot(self.results['bic']['k_values'],
self.results['bic']['bics'], 'co-', linewidth=2)
axes[1, 1].axvline(x=self.results['bic']['optimal_k'],
color='r', linestyle='--', label=f"Optimal k={self.results['bic']['optimal_k']}")
axes[1, 1].set_xlabel('Number of Clusters (k)')
axes[1, 1].set_ylabel('BIC')
axes[1, 1].set_title('Bayesian Information Criterion')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('optimal_k_methods.png', dpi=150, bbox_inches='tight')
plt.show()
# Example usage
# finder = OptimalClustersFinder(X_scaled)
# optimal_k = finder.find_optimal_k()
# finder.visualize_methods()
2. Clustering Algorithms Comparison
class ClusteringComparison:
"""Compare multiple clustering algorithms"""
def __init__(self):
self.models = {}
self.results = {}
def define_models(self, n_clusters=5):
"""Define clustering models"""
self.models = {
'kmeans': KMeans(n_clusters=n_clusters, random_state=42, n_init=10),
'agglomerative': AgglomerativeClustering(n_clusters=n_clusters),
'dbscan': DBSCAN(eps=0.5, min_samples=5),
'hierarchical': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
}
return self.models
def find_dbscan_params(self, X, k=5):
"""Find optimal DBSCAN parameters using k-distance graph"""
# Fit nearest neighbors
neighbors = NearestNeighbors(n_neighbors=k)
neighbors.fit(X)
distances, indices = neighbors.kneighbors(X)
# Sort distances to k-th neighbor
distances = np.sort(distances[:, -1])
# Find elbow point
from kneed import KneeLocator
kneedle = KneeLocator(range(len(distances)), distances, curve='convex', direction='increasing')
optimal_eps = distances[kneedle.elbow] if kneedle.elbow else np.median(distances)
print(f"Optimal eps for DBSCAN: {optimal_eps:.4f}")
return optimal_eps
def compare_performance(self, X):
"""Compare all clustering algorithms"""
results = []
for name, model in self.models.items():
print(f"\nRunning {name}...")
# Fit and predict
if name == 'dbscan':
labels = model.fit_predict(X)
else:
labels = model.fit_predict(X)
# Calculate metrics (skip if only 1 cluster)
n_clusters = len(np.unique(labels[labels >= 0]))
if n_clusters > 1:
# Exclude noise points for DBSCAN
mask = labels >= 0
if mask.sum() > 0:
silhouette = silhouette_score(X[mask], labels[mask])
calinski = calinski_harabasz_score(X[mask], labels[mask])
davies_bouldin = davies_bouldin_score(X[mask], labels[mask])
else:
silhouette, calinski, davies_bouldin = 0, 0, float('inf')
else:
silhouette, calinski, davies_bouldin = 0, 0, float('inf')
result = {
'algorithm': name,
'n_clusters': n_clusters,
'n_noise': (labels == -1).sum() if name == 'dbscan' else 0,
'silhouette': silhouette,
'calinski_harabasz': calinski,
'davies_bouldin': davies_bouldin
}
results.append(result)
self.results = pd.DataFrame(results)
return self.results
def visualize_comparison(self):
"""Visualize algorithm comparison"""
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
metrics = ['silhouette', 'calinski_harabasz', 'davies_bouldin']
titles = ['Silhouette Score\n(higher=better)',
'Calinski-Harabasz\n(higher=better)',
'Davies-Bouldin\n(lower=better)']
for idx, (metric, title) in enumerate(zip(metrics, titles)):
bars = axes[idx].barh(self.results['algorithm'], self.results[metric])
axes[idx].set_xlabel(metric.replace('_', ' ').title())
axes[idx].set_title(title)
# Add value labels
for bar, value in zip(bars, self.results[metric]):
axes[idx].text(value + 0.01 * value, bar.get_y() + bar.get_height()/2,
f'{value:.3f}', va='center', fontsize=9)
plt.tight_layout()
plt.savefig('clustering_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
# Example usage
# comparator = ClusteringComparison()
# comparator.define_models(n_clusters=5)
# results = comparator.compare_performance(X_scaled)
# comparator.visualize_comparison()
3. Clustering Evaluation Metrics
class ClusteringEvaluator:
"""Comprehensive clustering evaluation"""
def __init__(self, X, labels):
self.X = X
self.labels = labels
self.n_clusters = len(np.unique(labels[labels >= 0]))
def calculate_all_metrics(self):
"""Calculate comprehensive clustering metrics"""
# Filter out noise points
mask = self.labels >= 0
X_filtered = self.X[mask]
labels_filtered = self.labels[mask]
metrics = {
'n_clusters': self.n_clusters,
'n_noise': (self.labels == -1).sum(),
'cluster_sizes': pd.Series(labels_filtered).value_counts().to_dict()
}
if self.n_clusters > 1:
# Internal metrics
metrics['silhouette'] = silhouette_score(X_filtered, labels_filtered)
metrics['calinski_harabasz'] = calinski_harabasz_score(X_filtered, labels_filtered)
metrics['davies_bouldin'] = davies_bouldin_score(X_filtered, labels_filtered)
# Inertia
from sklearn.metrics import pairwise_distances_argmin_min
centroids = np.array([X_filtered[labels_filtered == i].mean(axis=0)
for i in range(self.n_clusters)])
_, distances = pairwise_distances_argmin_min(X_filtered, centroids)
metrics['inertia'] = np.sum(distances ** 2)
# Cluster separation
metrics['inter_cluster_distance'] = self._inter_cluster_distance(centroids)
metrics['intra_cluster_distance'] = self._intra_cluster_distance(X_filtered, labels_filtered)
return metrics
def _inter_cluster_distance(self, centroids):
"""Calculate inter-cluster distance"""
from scipy.spatial.distance import pdist
if len(centroids) > 1:
distances = pdist(centroids)
return np.mean(distances)
return 0
def _intra_cluster_distance(self, X, labels):
"""Calculate average intra-cluster distance"""
intra_distances = []
for i in range(self.n_clusters):
cluster_points = X[labels == i]
if len(cluster_points) > 1:
from scipy.spatial.distance import pdist
distances = pdist(cluster_points)
intra_distances.append(np.mean(distances))
return np.mean(intra_distances) if intra_distances else 0
def plot_silhouette_analysis(self):
"""Plot silhouette analysis"""
from sklearn.metrics import silhouette_samples
sample_silhouette_values = silhouette_samples(self.X, self.labels)
fig, ax = plt.subplots(figsize=(8, 6))
y_lower = 10
for i in range(self.n_clusters):
ith_cluster_silhouette_values = sample_silhouette_values[self.labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = plt.cm.tab20(i / self.n_clusters)
ax.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10
ax.set_title("Silhouette Analysis")
ax.set_xlabel("Silhouette Coefficient")
ax.set_ylabel("Cluster")
# Add average silhouette line
avg_silhouette = np.mean(sample_silhouette_values)
ax.axvline(x=avg_silhouette, color="red", linestyle="--",
label=f"Average: {avg_silhouette:.3f}")
ax.legend()
plt.tight_layout()
plt.savefig('silhouette_analysis.png', dpi=150, bbox_inches='tight')
plt.show()
# Example usage
# evaluator = ClusteringEvaluator(X_scaled, labels)
# metrics = evaluator.calculate_all_metrics()
# evaluator.plot_silhouette_analysis()
4. Real-World Application: User Segmentation
class UserSegmentation:
"""Complete user segmentation system"""
def __init__(self):
self.scaler = StandardScaler()
self.pca = PCA(n_components=0.95) # Retain 95% variance
self.model = None
self.segments = None
def preprocess_features(self, df, feature_columns):
"""Preprocess features for clustering"""
# Handle missing values
X = df[feature_columns].fillna(0)
# Log transform skewed features
for col in feature_columns:
if X[col].skew() > 1:
X[col] = np.log1p(X[col])
# Scale features
X_scaled = self.scaler.fit_transform(X)
# Reduce dimensionality if needed
if X.shape[1] > 20:
X_scaled = self.pca.fit_transform(X_scaled)
print(f"PCA reduced dimensions from {len(feature_columns)} to {X_scaled.shape[1]}")
return X_scaled
def find_optimal_segments(self, X_scaled):
"""Find optimal number of segments"""
finder = OptimalClustersFinder(X_scaled)
optimal_k = finder.find_optimal_k(k_range=range(2, 15))
return optimal_k
def create_segments(self, X_scaled, n_clusters):
"""Create user segments"""
self.model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
self.segments = self.model.fit_predict(X_scaled)
return self.segments
def profile_segments(self, df, feature_columns, segment_col='segment'):
"""Create detailed segment profiles"""
df_with_segments = df.copy()
df_with_segments[segment_col] = self.segments
profiles = {}
for segment in range(self.n_clusters):
segment_data = df_with_segments[df_with_segments[segment_col] == segment]
profile = {
'size': len(segment_data),
'percentage': len(segment_data) / len(df) * 100,
'features': {}
}
for feature in feature_columns:
if feature in segment_data.columns:
profile['features'][feature] = {
'mean': segment_data[feature].mean(),
'median': segment_data[feature].median(),
'std': segment_data[feature].std()
}
profiles[f'Segment {segment}'] = profile
return profiles
def interpret_segments(self, profiles, feature_columns):
"""Generate human-readable segment interpretations"""
interpretations = {}
# Calculate global statistics
global_means = {}
for feature in feature_columns:
means = [profiles[seg]['features'][feature]['mean']
for seg in profiles.keys()]
global_means[feature] = np.mean(means)
# Interpret each segment
for segment_name, profile in profiles.items():
interpretation = []
for feature in feature_columns:
segment_mean = profile['features'][feature]['mean']
global_mean = global_means[feature]
if segment_mean > global_mean * 1.2:
interpretation.append(f"High {feature}")
elif segment_mean < global_mean * 0.8:
interpretation.append(f"Low {feature}")
interpretations[segment_name] = {
'profile': interpretation,
'size': profile['size'],
'percentage': profile['percentage']
}
return interpretations
def create_marketing_strategies(self, interpretations):
"""Generate marketing strategies for each segment"""
strategies = {}
for segment_name, info in interpretations.items():
profile = info['profile']
if 'High engagement' in str(profile) and 'High connections' in str(profile):
strategy = 'Power users - Focus on retention and advocacy programs'
elif 'Low engagement' in str(profile) and 'New user' in str(profile):
strategy = 'New users - Focus on onboarding and activation'
elif 'High job applications' in str(profile):
strategy = 'Job seekers - Focus on job-related features'
elif 'High content creation' in str(profile):
strategy = 'Content creators - Focus on creator tools and monetization'
else:
strategy = 'General users - Standard engagement campaigns'
strategies[segment_name] = {
'interpretation': profile,
'strategy': strategy,
'size': info['size'],
'priority': 'High' if info['percentage'] > 20 else 'Medium'
}
return strategies
# Example usage
# segmentation = UserSegmentation()
# X_scaled = segmentation.preprocess_features(df, feature_columns)
# optimal_k = segmentation.find_optimal_segments(X_scaled)
# segments = segmentation.create_segments(X_scaled, optimal_k)
# profiles = segmentation.profile_segments(df, feature_columns)
# interpretations = segmentation.interpret_segments(profiles, feature_columns)
# strategies = segmentation.create_marketing_strategies(interpretations)
π‘
Pro Tip: When profiling segments, don't just look at means. Examine the full distribution, including skewness and outliers, to understand the true nature of each segment.
5. Common Follow-Up Questions
Follow-up 1: How do you handle high-dimensional data in clustering?
def handle_high_dimensionality(X, method='pca'):
"""Handle high-dimensional data for clustering"""
if method == 'pca':
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95) # Retain 95% variance
X_reduced = pca.fit_transform(X)
print(f"PCA: {X.shape[1]} β {X_reduced.shape[1]} dimensions")
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}")
elif method == 'tsne':
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_reduced = tsne.fit_transform(X)
print(f"t-SNE: {X.shape[1]} β {X_reduced.shape[1]} dimensions")
elif method == 'umap':
import umap
reducer = umap.UMAP(n_components=2, random_state=42)
X_reduced = reducer.fit_transform(X)
print(f"UMAP: {X.shape[1]} β {X_reduced.shape[1]} dimensions")
elif method == 'autoencoder':
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
# Simple autoencoder
input_dim = X.shape[1]
encoding_dim = 10
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)
autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X, X, epochs=50, batch_size=256, verbose=0)
X_reduced = encoder.predict(X)
print(f"Autoencoder: {X.shape[1]} β {X_reduced.shape[1]} dimensions")
return X_reduced
Follow-up 2: How do you handle streaming data for clustering?
class StreamingClustering:
"""Handle streaming data for clustering"""
def __init__(self, initial_data, n_clusters=5):
self.n_clusters = n_clusters
self.centroids = None
self.counts = np.zeros(n_clusters)
self.initialize(initial_data)
def initialize(self, X):
"""Initialize centroids using K-Means++"""
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=self.n_clusters, init='k-means++', random_state=42)
kmeans.fit(X)
self.centroids = kmeans.cluster_centers_
def partial_fit(self, X_batch):
"""Update centroids with new data batch"""
# Assign points to nearest centroid
from sklearn.metrics import pairwise_distances_argmin_min
labels, _ = pairwise_distances_argmin_min(X_batch, self.centroids)
# Update centroids using online K-Means
for i in range(self.n_clusters):
mask = labels == i
if mask.any():
self.counts[i] += mask.sum()
# Online update: centroid = centroid + lr * (point - centroid)
lr = 1 / self.counts[i]
self.centroids[i] += lr * (X_batch[mask].mean(axis=0) - self.centroids[i])
def predict(self, X):
"""Predict cluster for new points"""
from sklearn.metrics import pairwise_distances_argmin_min
labels, _ = pairwise_distances_argmin_min(X, self.centroids)
return labels
# Example usage
# streaming = StreamingClustering(X_initial, n_clusters=5)
# for batch in data_stream:
# streaming.partial_fit(batch)
# labels = streaming.predict(X_test)
Company-Specific Tips
βΉοΈ
Google Tips:
- Google values scalable clustering algorithms
- Be prepared to discuss Mini-Batch K-Means for large datasets
- Know how to use clustering for anomaly detection
- Understand hierarchical clustering for taxonomy
LinkedIn Tips:
- LinkedIn focuses on user segmentation for recommendations
- Know how to cluster professional profiles
- Be comfortable with graph-based clustering
- Understand how to use clustering for feed personalization
Quiz Section
Related Topics
- Dimensionality Reduction β PCA, t-SNE for preprocessing
- Anomaly Detection β Using clustering for outlier detection
- Recommendation Systems β Clustering for user similarity
- Customer Segmentation β EDA for segmentation