The Interview Question
"You receive a dataset with 30% missing values, inconsistent formats, and potential biases. How do you handle this before building a model?"
Data quality is the foundation of reliable data science β garbage in, garbage out.
Why Companies Ask This
βΉοΈ
Google and Amazon know that 80% of a data scientist's time is spent on data preparation. They need to verify you can handle real-world messiness, not just clean Kaggle datasets.
Interviewers evaluate:
- Pragmatism β Can you work with imperfect data?
- Systematic Thinking β Do you have a process for data cleaning?
- Bias Awareness β Can you identify and address data biases?
- Trade-off Judgment β Do you know when to clean vs. when to accept imperfection?
- Production Mindset β Can you build robust pipelines?
The Data Quality Framework
Step 1: Profile the Data
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
def comprehensive_data_profile(df: pd.DataFrame) -> Dict:
"""
Generate a comprehensive data quality profile.
"""
profile = {
'basic_stats': {
'rows': len(df),
'columns': len(df.columns),
'memory_mb': df.memory_usage(deep=True).sum() / 1024**2,
},
'missing_values': {},
'data_types': {},
'unique_values': {},
'outliers': {},
'distributions': {},
}
for col in df.columns:
# Missing values
missing_count = df[col].isnull().sum()
missing_pct = missing_count / len(df)
profile['missing_values'][col] = {
'count': missing_count,
'percentage': missing_pct,
'severity': 'critical' if missing_pct > 0.5 else
'warning' if missing_pct > 0.2 else 'ok',
}
# Data types and consistency
profile['data_types'][col] = {
'dtype': str(df[col].dtype),
'sample_values': df[col].dropna().head(5).tolist(),
}
# Unique values
n_unique = df[col].nunique()
profile['unique_values'][col] = {
'count': n_unique,
'ratio': n_unique / len(df),
}
# Outliers for numeric columns
if df[col].dtype in ['int64', 'float64']:
q1 = df[col].quantile(0.25)
q3 = df[col].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outlier_count = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
profile['outliers'][col] = {
'count': outlier_count,
'percentage': outlier_count / len(df),
'bounds': (lower_bound, upper_bound),
}
return profile
# Example
df = pd.read_csv('messy_data.csv')
profile = comprehensive_data_profile(df)
# Print critical issues
for col, info in profile['missing_values'].items():
if info['severity'] == 'critical':
print(f"CRITICAL: {col} has {info['percentage']:.1%} missing values")
Step 2: Handle Missing Values
class MissingValueHandler:
"""
Systematic approach to handling missing values.
"""
def __init__(self, strategy='auto'):
self.strategy = strategy
self.fill_values = {}
def fit(self, df: pd.DataFrame) -> 'MissingValueHandler':
"""
Learn fill values from training data.
"""
for col in df.columns:
if df[col].isnull().sum() > 0:
if df[col].dtype in ['int64', 'float64']:
if self.strategy == 'auto':
# Use median for numeric (robust to outliers)
self.fill_values[col] = df[col].median()
elif self.strategy == 'mean':
self.fill_values[col] = df[col].mean()
elif self.strategy == 'mode':
self.fill_values[col] = df[col].mode()[0]
else:
# Use mode for categorical
self.fill_values[col] = df[col].mode()[0]
return self
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Apply fill values to data.
"""
df_filled = df.copy()
for col, fill_value in self.fill_values.items():
df_filled[col] = df_filled[col].fillna(fill_value)
return df_filled
def create_missing_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Create binary indicators for missing values.
Sometimes the fact that data is missing is informative.
"""
df_with_indicators = df.copy()
for col in df.columns:
if df[col].isnull().sum() > 0:
df_with_indicators[f'{col}_is_missing'] = df[col].isnull().astype(int)
return df_with_indicators
# Advanced: Multiple imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
def multiple_imputation(df, n_imputations=5):
"""
Multiple imputation for uncertainty-aware analysis.
"""
imputed_datasets = []
for i in range(n_imputations):
imputer = IterativeImputer(
random_state=i,
max_iter=10,
sample_posterior=True # Important for multiple imputation
)
imputed_array = imputer.fit_transform(df.select_dtypes(include=[np.number]))
imputed_df = pd.DataFrame(
imputed_array,
columns=df.select_dtypes(include=[np.number]).columns,
index=df.index
)
imputed_datasets.append(imputed_df)
return imputed_datasets
Step 3: Handle Inconsistent Formats
class DataStandardizer:
"""
Standardize inconsistent data formats.
"""
def __init__(self):
self.standardization_rules = {}
def standardize_dates(self, series: pd.Series) -> pd.Series:
"""
Handle multiple date formats.
"""
# Try multiple formats
formats = [
'%Y-%m-%d',
'%m/%d/%Y',
'%d/%m/%Y',
'%Y/%m/%d',
'%m-%d-%Y',
]
for fmt in formats:
try:
return pd.to_datetime(series, format=fmt)
except (ValueError, TypeError):
continue
# If all else fails, let pandas infer
return pd.to_datetime(series, infer_datetime_format=True)
def standardize_strings(self, series: pd.Series) -> pd.Series:
"""
Standardize string formats.
"""
return (
series.str.lower()
.str.strip()
.str.replace(r'\s+', ' ', regex=True) # Multiple spaces to single
)
def standardize_categories(self, series: pd.Series,
mapping: Dict = None) -> pd.Series:
"""
Standardize categorical values.
"""
if mapping is None:
# Create mapping from common variations
unique_values = series.dropna().unique()
mapping = {}
for val in unique_values:
normalized = val.lower().strip()
if normalized not in mapping:
mapping[normalized] = val
return series.map(mapping).fillna(series)
# Example usage
standardizer = DataStandardizer()
df['date'] = standardizer.standardize_dates(df['date'])
df['name'] = standardizer.standardize_strings(df['name'])
df['country'] = standardizer.standardize_categories(df['country'])
Step 4: Handle Outliers
class OutlierHandler:
"""
Handle outliers with multiple strategies.
"""
def __init__(self, strategy='clip'):
self.strategy = strategy
self.bounds = {}
def detect_outliers(self, df: pd.DataFrame, method='iqr') -> pd.DataFrame:
"""
Detect outliers using various methods.
"""
outlier_mask = pd.DataFrame(False, index=df.index, columns=df.columns)
for col in df.select_dtypes(include=[np.number]).columns:
if method == 'iqr':
q1 = df[col].quantile(0.25)
q3 = df[col].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
self.bounds[col] = (lower, upper)
outlier_mask[col] = (df[col] < lower) | (df[col] > upper)
elif method == 'zscore':
z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
outlier_mask[col] = z_scores > 3
return outlier_mask
def handle_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Handle outliers based on strategy.
"""
df_clean = df.copy()
for col, (lower, upper) in self.bounds.items():
if self.strategy == 'clip':
df_clean[col] = df_clean[col].clip(lower, upper)
elif self.strategy == 'remove':
df_clean = df_clean[
(df_clean[col] >= lower) & (df_clean[col] <= upper)
]
elif self.strategy == 'flag':
df_clean[f'{col}_is_outlier'] = (
(df_clean[col] < lower) | (df_clean[col] > upper)
)
elif self.strategy == 'impute':
median = df[col].median()
outlier_mask = (df[col] < lower) | (df[col] > upper)
df_clean.loc[outlier_mask, col] = median
return df_clean
Identifying and Addressing Data Bias
Common Data Biases
data_biases = {
'selection_bias': {
'description': 'Data doesn\'t represent the population',
'example': 'Survey only sent to active users',
'detection': 'Compare sample demographics to population',
'mitigation': 'Stratified sampling, weighting',
},
'survivorship_bias': {
'description': 'Only looking at survivors/winners',
'example': 'Analyzing successful companies without failed ones',
'detection': 'Check if you\'re missing "failed" cases',
'mitigation': 'Include all cases, not just successes',
},
'temporal_bias': {
'description': 'Data from different time periods',
'example': 'Training on 2020 data to predict 2026 behavior',
'detection': 'Check for temporal shifts in distributions',
'mitigation': 'Time-aware validation, recent data weighting',
},
'measurement_bias': {
'description': 'Inconsistent measurement across groups',
'example': 'Different surveys for different segments',
'detection': 'Check for systematic differences in measurement',
'mitigation': 'Standardize measurement, calibrate across groups',
},
'label_bias': {
'description': 'Labels reflect human biases',
'example': 'Resume screening trained on historically biased decisions',
'detection': 'Check label distributions across protected groups',
'mitigation': 'Debias labels, use fairness-aware algorithms',
},
}
Detecting Bias in Data
def detect_bias(df, protected_attribute, target_column):
"""
Detect bias in data across protected groups.
"""
bias_report = {}
groups = df[protected_attribute].unique()
for group in groups:
group_data = df[df[protected_attribute] == group]
bias_report[group] = {
'sample_size': len(group_data),
'target_mean': group_data[target_column].mean(),
'target_std': group_data[target_column].std(),
'feature_means': group_data.select_dtypes(include=[np.number]).mean().to_dict(),
}
# Statistical tests for differences
from scipy.stats import chi2_contingency, mannwhitneyu
for i, group1 in enumerate(groups):
for group2 in groups[i+1:]:
data1 = df[df[protected_attribute] == group1][target_column]
data2 = df[df[protected_attribute] == group2][target_column]
# Mann-Whitney U test (non-parametric)
stat, p_value = mannwhitneyu(data1, data2, alternative='two-sided')
bias_report[f'{group1}_vs_{group2}'] = {
'test': 'Mann-Whitney U',
'statistic': stat,
'p_value': p_value,
'significant': p_value < 0.05,
}
return bias_report
Google-Specific Data Quality Considerations
Scale and Diversity
π‘
Google processes petabytes of data daily. Data quality issues that are minor at small scale become critical at Google's scale. They need data scientists who can handle messy data at scale.
- Multilingual data β Text in 100+ languages with different formats
- Temporal data β Data spanning decades with changing schemas
- Cross-product data β Combining data from Search, Maps, YouTube, etc.
Privacy-Preserving Data Quality
# Google's approach to data quality with privacy
privacy_preserving_quality = {
'differential_privacy': 'Add noise to protect individual privacy while maintaining data utility',
'federated_learning': 'Train models without centralizing data',
'data_minimization': 'Collect only what\'s needed',
'k_anonymity': 'Ensure groups of k individuals are indistinguishable',
}
Amazon-Specific Data Quality Considerations
Customer Data Integrity
amazon_data_quality = {
'customer_obsession': 'Data quality directly affects customer experience',
'scale': 'Billions of transactions, millions of products',
'real_time': 'Data quality checks must happen in real-time',
'global': 'Data from 20+ countries with different formats',
}
Common Mistakes to Avoid
β οΈ
These mistakes can lead to incorrect conclusions and poor models:
- Over-cleaning β Removing too much data can introduce bias
- Under-cleaning β Not addressing critical issues leads to bad models
- Not documenting β Future you won't remember what you did
- Not validating β Cleaning on train+test together causes data leakage
- Ignoring bias β Clean but biased data leads to biased models
- Not monitoring β Data quality degrades over time