The Interview Question
ℹ️
Question: You receive a dataset with 50 columns and 1 million rows for a user engagement analysis project. Walk through your complete EDA process:
- How do you profile the data for quality issues?
- What visualizations do you create and why?
- How do you detect and handle outliers?
- What automated checks would you implement?
Detailed Answer
1. Systematic Data Profiling
Data profiling is the first step in any analysis. It helps you understand data quality, identify issues, and plan your approach.
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
class DataProfiler:
    """Comprehensive data profiling toolkit.

    Each analysis method stores its findings in ``self.profile`` and prints a
    short human-readable summary. ``generate_full_profile`` runs them all.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` (not copied) and start an empty profile."""
        self.df = df
        self.profile = {}

    def generate_full_profile(self):
        """Run every profiling step and return the accumulated profile dict."""
        self.basic_stats()
        self.missing_analysis()
        self.dtype_analysis()
        self.cardinality_analysis()
        self.distribution_analysis()
        return self.profile

    def basic_stats(self):
        """Record shape, memory footprint (GB) and column counts per kind."""
        self.profile['shape'] = self.df.shape
        # deep=True measures the actual size of object (string) columns.
        self.profile['memory_usage'] = self.df.memory_usage(deep=True).sum() / 1e9

        # Numeric columns
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        self.profile['numeric_columns'] = len(numeric_cols)
        # Categorical columns
        cat_cols = self.df.select_dtypes(include=['object', 'category']).columns
        self.profile['categorical_columns'] = len(cat_cols)
        # Date columns; 'datetimetz' is needed to also count tz-aware columns.
        date_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns
        self.profile['datetime_columns'] = len(date_cols)

        print("Basic Statistics")
        print("=" * 50)
        print(f"Shape: {self.profile['shape']}")
        print(f"Memory usage: {self.profile['memory_usage']:.2f} GB")
        print(f"Numeric columns: {self.profile['numeric_columns']}")
        print(f"Categorical columns: {self.profile['categorical_columns']}")
        print(f"Datetime columns: {self.profile['datetime_columns']}")

    def missing_analysis(self):
        """Bucket columns by their share of missing values.

        Buckets: none, low (0-5%], medium (5-20%], high (>20%).
        """
        missing = self.df.isnull().sum()
        # max(..., 1) keeps an empty frame at 0% instead of NaN.
        missing_pct = (missing / max(len(self.df), 1) * 100).round(2)
        missing_df = pd.DataFrame({
            'missing_count': missing,
            'missing_pct': missing_pct
        })

        # Categories of missingness
        pct = missing_df['missing_pct']
        no_missing = missing_df[missing_df['missing_count'] == 0]
        low_missing = missing_df[(pct > 0) & (pct <= 5)]
        medium_missing = missing_df[(pct > 5) & (pct <= 20)]
        high_missing = missing_df[pct > 20]

        self.profile['missing_analysis'] = {
            'no_missing': len(no_missing),
            'low_missing': len(low_missing),
            'medium_missing': len(medium_missing),
            'high_missing': len(high_missing)
        }

        print("\nMissing Value Analysis")
        print("=" * 50)
        print(f"Columns with no missing: {len(no_missing)}")
        print(f"Columns with low missing (0-5%): {len(low_missing)}")
        print(f"Columns with medium missing (5-20%): {len(medium_missing)}")
        print(f"Columns with high missing (>20%): {len(high_missing)}")
        if len(high_missing) > 0:
            print(f"\nHigh missing columns:")
            for col in high_missing.index[:10]:
                print(f" {col}: {high_missing.loc[col, 'missing_pct']:.1f}%")

    def dtype_analysis(self):
        """Suggest smaller dtypes for columns whose values allow it.

        - int64/int32: the narrowest of int8/int16/int32 holding min..max.
        - float64: float32 when the range fits (note: float32 keeps only
          about 7 significant digits, so check precision needs first).
        - object: 'category' when fewer than 10% of rows are distinct.

        A dtype is only recommended when it differs from the current one.
        """
        dtype_recommendations = {}
        n_rows = len(self.df)

        for col in self.df.columns:
            series = self.df[col]
            current = str(series.dtype)

            if current in ('int64', 'int32'):
                # Check if we can downcast
                c_min, c_max = series.min(), series.max()
                for candidate in ('int8', 'int16', 'int32'):
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        if candidate != current:
                            dtype_recommendations[col] = candidate
                        break
            elif current == 'float64':
                # Check if we can downcast (NaN min/max compare False -> skipped)
                c_min, c_max = series.min(), series.max()
                limits = np.finfo(np.float32)
                if limits.min <= c_min and c_max <= limits.max:
                    dtype_recommendations[col] = 'float32'
            elif current == 'object':
                # Check if it should be categorical; guard the empty frame.
                if n_rows and series.nunique() / n_rows < 0.1:
                    dtype_recommendations[col] = 'category'

        self.profile['dtype_recommendations'] = dtype_recommendations

        print("\nData Type Optimization")
        print("=" * 50)
        print(f"Columns that can be optimized: {len(dtype_recommendations)}")
        if dtype_recommendations:
            print("Top recommendations:")
            for col, dtype in list(dtype_recommendations.items())[:10]:
                print(f" {col}: {dtype}")

    def cardinality_analysis(self):
        """Record distinct-value counts and the top 5 values per categorical column."""
        cat_cols = self.df.select_dtypes(include=['object', 'category']).columns
        cardinality_info = {}
        for col in cat_cols:
            cardinality_info[col] = {
                'n_unique': self.df[col].nunique(),
                'top_values': self.df[col].value_counts().head(5).to_dict()
            }
        self.profile['cardinality'] = cardinality_info

        print("\nCardinality Analysis")
        print("=" * 50)
        high_cardinality = {k: v for k, v in cardinality_info.items() if v['n_unique'] > 100}
        low_cardinality = {k: v for k, v in cardinality_info.items() if v['n_unique'] <= 10}
        print(f"High cardinality columns (>100 unique): {len(high_cardinality)}")
        print(f"Low cardinality columns (≤10 unique): {len(low_cardinality)}")

    def distribution_analysis(self, max_columns=10, shapiro_sample_size=1000):
        """Summarise the distribution of the first ``max_columns`` numeric columns.

        For each column stores mean, std, skewness, kurtosis and a
        Shapiro-Wilk p-value computed on a seeded random sample of at most
        ``shapiro_sample_size`` non-null values. The p-value is NaN when
        fewer than 3 values are available (the test's minimum).
        """
        # Imported here because this snippet's import block does not provide it.
        from scipy import stats

        numeric_cols = self.df.select_dtypes(include=[np.number]).columns[:max_columns]
        distribution_stats = {}

        for col in numeric_cols:
            data = self.df[col].dropna()
            sample = data.sample(min(shapiro_sample_size, len(data)), random_state=42)
            if len(sample) >= 3:
                p_value = stats.shapiro(sample)[1]
            else:
                p_value = float('nan')

            distribution_stats[col] = {
                'mean': data.mean(),
                'std': data.std(),
                'skewness': data.skew(),
                'kurtosis': data.kurtosis(),
                'normality_p_value': p_value
            }

        self.profile['distributions'] = distribution_stats
2. Visualization Strategy
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
class EDAVisualizer:
    """Comprehensive EDA visualization toolkit.

    Every ``plot_*`` method draws a figure, saves it as a PNG in the current
    working directory and then shows it.
    """

    def __init__(self, df):
        """Keep a reference to ``df`` and apply the shared plot style."""
        self.df = df
        self.setup_style()

    def setup_style(self):
        """Set up visualization style"""
        plt.style.use('seaborn-v0_8-whitegrid')
        sns.set_palette("husl")

    def plot_missing_values(self, max_heatmap_rows=10000):
        """Visualize missing value patterns.

        Left: bar chart of the 20 columns with the most missing values.
        Right: missingness heatmap of the first 20 columns. Frames longer
        than ``max_heatmap_rows`` are row-sampled (seeded, original order
        kept) because a heatmap cannot resolve millions of rows anyway.
        """
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Missing values by column. After an ascending sort the *tail* holds
        # the most-missing columns, and barh draws the largest bar on top.
        missing = self.df.isnull().sum()
        missing = missing[missing > 0].sort_values(ascending=True).tail(20)
        if len(missing) > 0:
            axes[0].barh(missing.index, missing.values)
        axes[0].set_xlabel('Missing Count')
        axes[0].set_title('Top 20 Columns with Missing Values')

        # Missing value heatmap
        subset = self.df.iloc[:, :20]
        if len(subset) > max_heatmap_rows:
            subset = subset.sample(max_heatmap_rows, random_state=42).sort_index()
        if subset.size:
            sns.heatmap(subset.isnull().astype(int), cbar=True, ax=axes[1],
                        yticklabels=False)
        axes[1].set_title('Missing Value Pattern (First 20 Columns)')

        plt.tight_layout()
        plt.savefig('missing_values.png', dpi=150, bbox_inches='tight')
        plt.show()

    def plot_distributions(self, columns=None, n_cols=3):
        """Plot a histogram + KDE for each numeric column.

        ``columns`` defaults to the first 9 numeric columns; ``n_cols`` is
        the grid width.
        """
        if columns is None:
            columns = self.df.select_dtypes(include=[np.number]).columns[:9]
        columns = list(columns)
        if not columns:
            print("No numeric columns to plot")
            return

        n_rows = (len(columns) + n_cols - 1) // n_cols
        # squeeze=False always returns a 2-D array, so ravel() gives a flat
        # list of axes whether the grid has one row or many.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows),
                                 squeeze=False)
        axes = axes.ravel()

        for ax, col in zip(axes, columns):
            data = self.df[col].dropna()
            # Histogram with KDE
            sns.histplot(data, kde=True, ax=ax, bins=50)
            # Add statistics
            stats_text = f'Mean: {data.mean():.2f}\nStd: {data.std():.2f}\nSkew: {data.skew():.2f}'
            ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
                    verticalalignment='top', horizontalalignment='right',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
            ax.set_title(f'Distribution of {col}')

        # Hide empty subplots
        for ax in axes[len(columns):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.savefig('distributions.png', dpi=150, bbox_inches='tight')
        plt.show()

    def plot_correlations(self, top_n=5):
        """Plot the correlation heatmap and the strongest correlated pairs.

        The right-hand panel shows the ``top_n`` most positive and the
        ``top_n`` most negative pairwise correlations on separate rows.
        """
        numeric_df = self.df.select_dtypes(include=[np.number])
        if numeric_df.shape[1] < 2:
            print("Not enough numeric columns for correlation analysis")
            return

        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # Correlation heatmap (upper triangle masked: it mirrors the lower)
        corr_matrix = numeric_df.corr()
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
                    cmap='coolwarm', center=0, ax=axes[0])
        axes[0].set_title('Correlation Heatmap')

        # Top correlations: one record per unordered column pair.
        names = corr_matrix.columns
        rows, cols = np.triu_indices(len(names), k=1)
        corr_df = pd.DataFrame({
            'pair': [f"{names[i]} - {names[j]}" for i, j in zip(rows, cols)],
            'correlation': corr_matrix.to_numpy()[rows, cols],
        }).dropna(subset=['correlation'])

        # Select by sign so the labels are truthful, and stack the two
        # groups on distinct y positions so the bars do not overdraw.
        top_neg = corr_df[corr_df['correlation'] < 0].nsmallest(top_n, 'correlation')
        top_pos = corr_df[corr_df['correlation'] > 0].nlargest(top_n, 'correlation')
        top_pos = top_pos.iloc[::-1]  # strongest positive ends up on top
        y_neg = list(range(len(top_neg)))
        y_pos = list(range(len(top_neg), len(top_neg) + len(top_pos)))

        if y_pos:
            axes[1].barh(y_pos, top_pos['correlation'].values, color='green',
                         alpha=0.7, label='Positive')
        if y_neg:
            axes[1].barh(y_neg, top_neg['correlation'].values, color='red',
                         alpha=0.7, label='Negative')
        axes[1].set_yticks(y_neg + y_pos)
        axes[1].set_yticklabels(list(top_neg['pair']) + list(top_pos['pair']))
        axes[1].set_xlabel('Correlation')
        axes[1].set_title('Top Correlations')
        if y_pos or y_neg:
            axes[1].legend()
        axes[1].axvline(x=0, color='black', linestyle='-', linewidth=0.5)

        plt.tight_layout()
        plt.savefig('correlations.png', dpi=150, bbox_inches='tight')
        plt.show()

    def plot_categorical_analysis(self, columns=None):
        """Bar-chart the 10 most frequent values of each categorical column.

        ``columns`` defaults to the first 6 object/category columns. Bar
        labels give each value's share of all rows (missing rows included
        in the denominator).
        """
        if columns is None:
            columns = self.df.select_dtypes(include=['object', 'category']).columns[:6]
        columns = list(columns)
        if not columns:
            print("No categorical columns to plot")
            return

        n_cols = 3
        n_rows = (len(columns) + n_cols - 1) // n_cols
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows),
                                 squeeze=False)
        axes = axes.ravel()
        total = len(self.df)

        for ax, col in zip(axes, columns):
            value_counts = self.df[col].value_counts().head(10)
            positions = range(len(value_counts))
            bars = ax.bar(positions, value_counts.values)
            ax.set_xticks(positions)
            ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
            ax.set_ylabel('Count')
            ax.set_title(f'Distribution of {col}')
            # Add percentage labels
            for bar, count in zip(bars, value_counts.values):
                percentage = count / total * 100
                ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                        f'{percentage:.1f}%', ha='center', va='bottom', fontsize=8)

        # Hide empty subplots
        for ax in axes[len(columns):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.savefig('categorical_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()

    def plot_temporal_analysis(self, date_column, value_column):
        """Plot the mean of ``value_column`` by day, hour, weekday and month.

        ``date_column`` is parsed with ``pd.to_datetime``. Only the two
        columns involved are touched; the frame itself is not copied.
        """
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        dates = pd.to_datetime(self.df[date_column])
        values = self.df[value_column]

        # Daily pattern
        daily = values.groupby(dates.dt.date).mean()
        axes[0, 0].plot(daily.index, daily.values, alpha=0.7)
        axes[0, 0].set_title(f'Daily {value_column}')
        axes[0, 0].tick_params(axis='x', rotation=45)

        # Hourly pattern
        hourly = values.groupby(dates.dt.hour).mean()
        axes[0, 1].bar(hourly.index, hourly.values)
        axes[0, 1].set_title(f'Hourly {value_column}')
        axes[0, 1].set_xlabel('Hour')

        # Day of week pattern. reindex guarantees 7 entries (NaN for
        # weekdays absent from the data) to match the 7 fixed bar positions.
        daily_pattern = values.groupby(dates.dt.dayofweek).mean().reindex(range(7))
        axes[1, 0].bar(range(7), daily_pattern.values)
        axes[1, 0].set_xticks(range(7))
        axes[1, 0].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
        axes[1, 0].set_title(f'Day of Week Pattern')

        # Monthly pattern
        monthly = values.groupby(dates.dt.month).mean()
        axes[1, 1].plot(monthly.index, monthly.values, marker='o')
        axes[1, 1].set_xticks(range(1, 13))
        axes[1, 1].set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
        axes[1, 1].set_title(f'Monthly {value_column}')

        plt.tight_layout()
        plt.savefig('temporal_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()
3. Outlier Detection Methods
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
class OutlierDetector:
    """Multiple outlier detection methods (univariate and multivariate)."""

    def __init__(self, df):
        """Keep a reference to ``df`` and start with an empty summary."""
        self.df = df
        self.outlier_summary = {}

    def detect_univariate_outliers(self, column, method='iqr'):
        """Detect outliers in a single column.

        Parameters:
            column: name of a numeric column; nulls are dropped first.
            method: one of
                'iqr'             - outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
                'zscore'          - |z| > 3
                'modified_zscore' - |0.6745 * (x - median) / MAD| > 3.5
                'percentile'      - outside the 1st..99th percentile

        Returns:
            dict with 'method', 'n_outliers', 'pct_outliers' (of non-null
            values; 0.0 for an all-null column), 'lower_bound' and
            'upper_bound' (only set for 'iqr'), and 'outlier_values' (the
            first 10 outliers).

        Raises:
            ValueError: if ``method`` is not one of the names above.
        """
        data = self.df[column].dropna()
        lower_bound = upper_bound = None

        if method == 'iqr':
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            outliers = data[(data < lower_bound) | (data > upper_bound)]
        elif method == 'zscore':
            if data.empty:
                outliers = data
            else:
                z_scores = np.abs(np.asarray(stats.zscore(data)))
                outliers = data[z_scores > 3]
        elif method == 'modified_zscore':
            median = data.median()
            abs_dev = (data - median).abs()
            mad = abs_dev.median()
            if mad > 0:
                scores = 0.6745 * abs_dev / mad
            else:
                # MAD is 0 when more than half the values equal the median;
                # dividing by it would flag every other value. Fall back to
                # the mean absolute deviation with its matching constant.
                mean_ad = abs_dev.mean()
                if mean_ad > 0:
                    scores = abs_dev / (1.253314 * mean_ad)
                else:
                    scores = abs_dev * 0.0
            outliers = data[scores > 3.5]
        elif method == 'percentile':
            lower = data.quantile(0.01)
            upper = data.quantile(0.99)
            outliers = data[(data < lower) | (data > upper)]
        else:
            raise ValueError(
                f"Unknown univariate outlier method: {method!r}; expected "
                "'iqr', 'zscore', 'modified_zscore' or 'percentile'"
            )

        return {
            'method': method,
            'n_outliers': len(outliers),
            'pct_outliers': len(outliers) / len(data) * 100 if len(data) else 0.0,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'outlier_values': outliers.values[:10]  # Sample
        }

    def detect_multivariate_outliers(self, columns, method='isolation_forest',
                                     contamination=0.1):
        """Detect outliers using several columns jointly.

        Parameters:
            columns: numeric column names; rows with any null are dropped.
            method: 'isolation_forest', 'dbscan' (noise points after
                standard scaling, eps=0.5, min_samples=5) or 'mahalanobis'
                (distance above the 95th percentile of all distances).
            contamination: expected outlier share, used by
                'isolation_forest' only.

        Returns:
            dict with 'method', 'n_outliers', 'pct_outliers' and
            'outlier_indices' (the first 100 index labels).

        Raises:
            ValueError: if ``method`` is not one of the names above.
        """
        data = self.df[columns].dropna()

        if method == 'isolation_forest':
            clf = IsolationForest(contamination=contamination, random_state=42)
            outlier_labels = clf.fit_predict(data)
            outliers = data[outlier_labels == -1]
        elif method == 'dbscan':
            data_scaled = StandardScaler().fit_transform(data)
            outlier_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(data_scaled)
            outliers = data[outlier_labels == -1]
        elif method == 'mahalanobis':
            if len(data) < 2:
                # A covariance matrix needs at least two rows.
                outliers = data.iloc[0:0]
            else:
                values = data.to_numpy(dtype=float)
                centered = values - values.mean(axis=0)
                # pinv tolerates a singular covariance (e.g. collinear columns).
                cov_inv = np.linalg.pinv(np.atleast_2d(data.cov().to_numpy()))
                # Row-wise quadratic form d^2 = x^T * C^-1 * x, all rows at once.
                squared = np.einsum('ij,jk,ik->i', centered, cov_inv, centered)
                distances = np.sqrt(np.clip(squared, 0.0, None))
                threshold = np.percentile(distances, 95)
                outliers = data[distances > threshold]
        else:
            raise ValueError(
                f"Unknown multivariate outlier method: {method!r}; expected "
                "'isolation_forest', 'dbscan' or 'mahalanobis'"
            )

        return {
            'method': method,
            'n_outliers': len(outliers),
            'pct_outliers': len(outliers) / len(data) * 100 if len(data) else 0.0,
            'outlier_indices': outliers.index.tolist()[:100]
        }

    def comprehensive_outlier_report(self, columns=None):
        """Generate a per-column outlier report plus one multivariate pass.

        For each column (default: all numeric columns) stores the IQR and
        z-score results and basic moments. When the frame has at least two
        numeric columns, an isolation-forest result over the first five is
        stored under the 'multivariate' key. The report is also kept in
        ``self.outlier_summary``.
        """
        if columns is None:
            columns = self.df.select_dtypes(include=[np.number]).columns

        report = {}
        for col in columns:
            print(f"\nAnalyzing {col}...")
            series = self.df[col]
            # Univariate analysis
            report[col] = {
                'iqr': self.detect_univariate_outliers(col, 'iqr'),
                'zscore': self.detect_univariate_outliers(col, 'zscore'),
                'statistics': {
                    'mean': series.mean(),
                    'std': series.std(),
                    'skewness': series.skew(),
                    'kurtosis': series.kurtosis()
                }
            }

        # Multivariate analysis
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) >= 2:
            print("\nRunning multivariate outlier detection...")
            report['multivariate'] = self.detect_multivariate_outliers(
                numeric_cols[:5],  # Limit to first 5 for performance
                'isolation_forest'
            )

        self.outlier_summary = report
        return report

    def visualize_outliers(self, column):
        """Draw box plot, histogram with IQR fences, Q-Q plot and |z| histogram.

        The figure is saved as ``outliers_<column>.png`` and then shown.
        """
        # Imported here because this snippet's import block does not provide it.
        import matplotlib.pyplot as plt

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        data = self.df[column].dropna()

        # Box plot
        axes[0, 0].boxplot(data)
        axes[0, 0].set_title(f'Box Plot of {column}')
        axes[0, 0].set_ylabel('Value')

        # Histogram with outlier regions
        axes[0, 1].hist(data, bins=50, alpha=0.7, color='blue', edgecolor='black')
        Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        axes[0, 1].axvline(lower, color='red', linestyle='--', label=f'Lower: {lower:.2f}')
        axes[0, 1].axvline(upper, color='red', linestyle='--', label=f'Upper: {upper:.2f}')
        axes[0, 1].set_title(f'Histogram with Outlier Boundaries')
        axes[0, 1].legend()

        # QQ plot
        stats.probplot(data, dist="norm", plot=axes[1, 0])
        axes[1, 0].set_title(f'Q-Q Plot')

        # Z-score distribution
        z_scores = np.abs(stats.zscore(data))
        axes[1, 1].hist(z_scores, bins=50, alpha=0.7, color='green', edgecolor='black')
        axes[1, 1].axvline(3, color='red', linestyle='--', label='|z| > 3')
        axes[1, 1].set_title(f'Z-Score Distribution')
        axes[1, 1].set_xlabel('|Z-Score|')
        axes[1, 1].legend()

        plt.tight_layout()
        plt.savefig(f'outliers_{column}.png', dpi=150, bbox_inches='tight')
        plt.show()
💡
Pro Tip: Outliers aren't always errors. Investigate them — they might represent important edge cases, fraud, or new patterns worth understanding.
4. Automated Quality Checks
class DataQualityChecker:
    """Automated data quality validation.

    Each ``check_*`` method appends a dict with 'type', 'severity' and
    'message' to ``self.violations`` for every problem it finds;
    ``generate_report`` summarises them. A check that names a column the
    frame does not have records a critical violation and is skipped.
    """

    # rule text -> (Series comparison that marks a violation, symbol shown)
    _CONSISTENCY_RULES = {
        'col1 <= col2': ('gt', '>'),
        'col1 < col2': ('ge', '>='),
        'col1 >= col2': ('lt', '<'),
        'col1 > col2': ('le', '<='),
    }

    def __init__(self, df, schema=None):
        """Store the frame and an optional schema dict (defaults to empty)."""
        self.df = df
        self.schema = schema or {}
        self.violations = []

    def _add(self, kind, severity, message):
        """Append one violation record."""
        self.violations.append({
            'type': kind,
            'severity': severity,
            'message': message
        })

    def _has_column(self, column):
        """Return True if ``column`` exists; otherwise record a critical violation."""
        if column in self.df.columns:
            return True
        self._add('completeness', 'critical',
                  f'Column {column} not found; check skipped')
        return False

    def check_completeness(self, required_columns):
        """Check for required columns"""
        missing_cols = set(required_columns) - set(self.df.columns)
        if missing_cols:
            self._add('completeness', 'critical',
                      f'Missing required columns: {missing_cols}')

    def check_uniqueness(self, column):
        """Check for duplicate values"""
        if not self._has_column(column):
            return
        duplicates = self.df[column].duplicated().sum()
        if duplicates > 0:
            self._add('uniqueness', 'warning',
                      f'Column {column} has {duplicates} duplicate values')

    def check_validity(self, column, constraints):
        """Check value constraints.

        ``constraints`` may contain 'min', 'max' and/or 'allowed_values'.
        Missing values never count as violations in any of the three
        checks; missingness is a separate concern.
        """
        if not self._has_column(column):
            return
        series = self.df[column]

        if 'min' in constraints:
            violations = series < constraints['min']
            if violations.any():
                self._add('validity', 'error',
                          f'{column} has {violations.sum()} values below minimum {constraints["min"]}')
        if 'max' in constraints:
            violations = series > constraints['max']
            if violations.any():
                self._add('validity', 'error',
                          f'{column} has {violations.sum()} values above maximum {constraints["max"]}')
        if 'allowed_values' in constraints:
            # notna() keeps nulls out, matching the min/max comparisons
            # (which are False for NaN).
            violations = series.notna() & ~series.isin(constraints['allowed_values'])
            if violations.any():
                self._add('validity', 'error',
                          f'{column} has {violations.sum()} values not in allowed set')

    def check_timeliness(self, date_column, max_age_days=365):
        """Check if the newest value in ``date_column`` is recent enough.

        Works for naive and timezone-aware datetime columns. A column that
        is not datetime-typed yields a warning instead of being silently
        skipped; a column with no valid dates is skipped.
        """
        if not self._has_column(date_column):
            return
        series = self.df[date_column]
        if not pd.api.types.is_datetime64_any_dtype(series):
            self._add('timeliness', 'warning',
                      f'{date_column} is not a datetime column; timeliness not checked')
            return

        max_date = series.max()
        if pd.isna(max_date):
            return
        # Match "now" to the column's timezone; mixing naive and aware
        # timestamps raises TypeError.
        now = pd.Timestamp.now(tz=max_date.tz)
        age_days = (now - max_date).days
        if age_days > max_age_days:
            self._add('timeliness', 'warning',
                      f'{date_column} data is {age_days} days old (max: {max_age_days})')

    def check_consistency(self, column_pairs):
        """Check consistency between related columns.

        ``column_pairs`` is an iterable of ``(col1, col2, rule)`` where rule
        is 'col1 <= col2', 'col1 < col2', 'col1 >= col2' or 'col1 > col2'.

        Raises:
            ValueError: for any other rule text.
        """
        for col1, col2, rule in column_pairs:
            if rule not in self._CONSISTENCY_RULES:
                raise ValueError(f'Unsupported consistency rule: {rule!r}')
            if not (self._has_column(col1) and self._has_column(col2)):
                continue
            comparison, symbol = self._CONSISTENCY_RULES[rule]
            violations = getattr(self.df[col1], comparison)(self.df[col2])
            if violations.any():
                self._add('consistency', 'error',
                          f'{col1} {symbol} {col2} in {violations.sum()} rows')

    def generate_report(self):
        """Print and return a summary of all recorded violations.

        Note: 'total_checks' holds the number of violations, not the number
        of checks run; the key name is kept for backward compatibility.
        """
        by_severity = {'critical': 0, 'error': 0, 'warning': 0}
        for v in self.violations:
            if v['severity'] in by_severity:
                by_severity[v['severity']] += 1

        report = {
            'total_checks': len(self.violations),
            'critical': by_severity['critical'],
            'errors': by_severity['error'],
            'warnings': by_severity['warning'],
            'violations': self.violations
        }

        print("Data Quality Report")
        print("=" * 50)
        print(f"Total violations: {report['total_checks']}")
        print(f"Critical: {report['critical']}")
        print(f"Errors: {report['errors']}")
        print(f"Warnings: {report['warnings']}")
        if self.violations:
            print("\nViolations:")
            for v in self.violations:
                print(f" [{v['severity'].upper()}] {v['message']}")
        return report
# Example usage
# A tiny synthetic frame so the example runs on its own; replace it with
# your real data. It contains one duplicate user_id and one impossible age
# so the report has something to show.
sample_df = pd.DataFrame({
    'user_id': [1, 2, 3, 3],
    'age': [25, 41, 150, 33],
    'income': [52000, 61000, 48000, 75000],
    'signup_date': pd.to_datetime(
        ['2020-01-15', '2020-03-02', '2020-06-20', '2020-09-05']
    ),
})

checker = DataQualityChecker(sample_df)
checker.check_completeness(['user_id', 'age', 'income'])
checker.check_uniqueness('user_id')
checker.check_validity('age', {'min': 0, 'max': 120})
# Flags the data as stale when the newest signup is over two years old.
checker.check_timeliness('signup_date', max_age_days=365*2)
report = checker.generate_report()
5. Common Follow-Up Questions
Follow-up 1: How do you handle different types of missing data?
# Types of missing data and handling strategies
def analyze_missing_type(series):
    """Summarise how much of ``series`` is missing.

    The function only counts missing values; it does not classify the
    missingness mechanism. The three mechanisms to reason about are:

    - MCAR (Missing Completely at Random): missingness is independent of
      both observed and unobserved data (Little's test is the usual check).
    - MAR (Missing at Random): missingness depends on observed data but
      not on unobserved data.
    - MNAR (Missing Not at Random): missingness depends on unobserved data.

    Returns a dict with 'total_missing', 'pct_missing' and a 'pattern'
    hint suggesting the next manual step.
    """
    is_missing = series.isnull()

    summary = {}
    summary['total_missing'] = is_missing.sum()
    # The mean of a boolean mask is the fraction of True entries.
    summary['pct_missing'] = is_missing.mean() * 100
    # Simple heuristic: correlate the mask with other variables by hand.
    summary['pattern'] = 'Check correlation with other variables'
    return summary
# Imputation strategies
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
def impute_missing_data(df, strategy='advanced'):
    """Return a copy of ``df`` with numeric missing values imputed.

    Parameters:
        df: input frame; it is not modified.
        strategy:
            'simple'   - per-column median (SimpleImputer)
            'advanced' - iterative, MICE-style imputation (IterativeImputer)
            'knn'      - mean of the 5 nearest neighbours (KNNImputer)

    Only numeric columns with at least one observed value are imputed.
    Non-numeric columns and entirely-missing numeric columns are returned
    unchanged (the imputers drop all-missing columns from their output,
    which would otherwise break the column assignment).

    Raises:
        ValueError: if ``strategy`` is not one of the names above.
    """
    if strategy not in ('simple', 'advanced', 'knn'):
        raise ValueError(
            f"Unknown imputation strategy: {strategy!r}; expected "
            "'simple', 'advanced' or 'knn'"
        )

    df_imputed = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    usable_cols = [col for col in numeric_cols if df[col].notna().any()]
    if not usable_cols:
        # Nothing the imputers could be fitted on.
        return df_imputed

    if strategy == 'simple':
        # Mean/median/mode imputation
        imputer = SimpleImputer(strategy='median')
    elif strategy == 'advanced':
        # Multiple imputation using MICE
        imputer = IterativeImputer(max_iter=10, random_state=42)
    else:
        # KNN imputation
        imputer = KNNImputer(n_neighbors=5)

    df_imputed[usable_cols] = imputer.fit_transform(df[usable_cols])
    return df_imputed
Follow-up 2: How do you handle high-cardinality categorical variables?
# High cardinality handling techniques
def handle_high_cardinality(df, column, method='target_encoding',
                            target=None, n_categories=100,
                            min_count=10, n_hash_features=32):
    """Handle high cardinality categorical variables.

    Parameters:
        df: input frame. For 'frequency_encoding', 'target_encoding' and
            'grouping' the new column is added to ``df`` in place and
            ``df`` is returned; 'hash_encoding' returns a new frame.
        column: categorical column to encode.
        method:
            'frequency_encoding' - share of rows per category, in
                ``<column>_freq``
            'target_encoding'    - mean of ``target`` per category, in
                ``<column>_target_enc``. Computed on the full frame, so fit
                it on training data only to avoid target leakage.
            'hash_encoding'      - ``n_hash_features`` hashed columns named
                ``<column>_hash_<i>``
            'grouping'           - categories seen fewer than ``min_count``
                times become 'Other', in ``<column>_grouped``
        target: target column name; required for 'target_encoding'.
        n_categories: columns with at most this many distinct values are
            returned untouched.
        min_count: rarity threshold for 'grouping'.
        n_hash_features: number of output columns for 'hash_encoding'.

    Raises:
        ValueError: if the column needs encoding and ``method`` is unknown,
            or 'target_encoding' is requested without ``target``.
    """
    if df[column].nunique() <= n_categories:
        return df  # No need to reduce

    if method == 'frequency_encoding':
        freq_map = df[column].value_counts(normalize=True).to_dict()
        df[f'{column}_freq'] = df[column].map(freq_map)
    elif method == 'target_encoding':
        if target is None:
            raise ValueError("method='target_encoding' requires `target`")
        # Mean encoding
        means = df.groupby(column)[target].mean()
        df[f'{column}_target_enc'] = df[column].map(means)
    elif method == 'hash_encoding':
        from sklearn.feature_extraction import FeatureHasher
        hasher = FeatureHasher(n_features=n_hash_features, input_type='string')
        # One single-token sample per row.
        hashed = hasher.transform([value] for value in df[column].astype(str))
        # Reuse df's index: concat aligns on it, and a fresh RangeIndex
        # would scatter the hashed rows whenever df is not 0..n-1 indexed.
        df_hashed = pd.DataFrame(
            hashed.toarray(),
            columns=[f'{column}_hash_{i}' for i in range(n_hash_features)],
            index=df.index,
        )
        df = pd.concat([df, df_hashed], axis=1)
    elif method == 'grouping':
        # Group rare categories
        value_counts = df[column].value_counts()
        rare_categories = value_counts[value_counts < min_count].index
        # Object dtype so 'Other' can be written into categorical columns.
        values = df[column].astype(object)
        df[f'{column}_grouped'] = values.where(~values.isin(rare_categories), 'Other')
    else:
        raise ValueError(
            f"Unknown method: {method!r}; expected 'frequency_encoding', "
            "'target_encoding', 'hash_encoding' or 'grouping'"
        )
    return df
Company-Specific Tips
ℹ️
Google Tips:
- Google values systematic, reproducible EDA workflows
- Be prepared to discuss automated data quality pipelines
- Know how to handle time series data in EDA
- Understand the importance of data lineage
Meta Tips:
- Meta focuses on scalable EDA for large datasets
- Know how to use Spark for distributed EDA
- Be comfortable with sampling strategies for exploration
- Understand privacy-preserving EDA techniques
Quiz Section
Related Topics
- Data Cleaning — Handling quality issues found in EDA
- Feature Engineering — Creating features from clean data
- Statistical Analysis — Formal analysis after EDA
- Visualization Best Practices — Advanced visualization techniques