🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Servicesℹ️ About✉️ ContactView Pricing Plansfrom $10

Project 1: EDA on Real Dataset

Module 6: EDA Project🟢 Free Lesson

Advertisement

Project 1: EDA on Real Dataset

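EDA Workflow Steps: 1. Load (CSV/SQL) → 2. Clean (nulls) → 3. Explore (stats) → 4. Visualize (charts) → 5. Act (insights)

\text{Mean: } \bar{x} = \frac{1}{n}\sum_{i=1}^{n} x_i, \quad \text{Std: } s = \sqrt{\frac{\sum_{i=1}^{n}(x_i - \bar{x})^2}{n-1}}, \quad \text{Corr: } r = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i - \bar{x})^2 \, \sum_{i=1}^{n}(y_i - \bar{y})^2}}

\text{IQR} = Q_3 - Q_1, \quad \text{Outlier: } x < Q_1 - 1.5 \cdot \text{IQR} \ \text{ or } \ x > Q_3 + 1.5 \cdot \text{IQR}

\text{Skewness} = \frac{E[(X-\mu)^3]}{\sigma^3}, \quad \text{Kurtosis (excess)} = \frac{E[(X-\mu)^4]}{\sigma^4} - 3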

This project guides you through a complete Exploratory Data Analysis workflow using a real-world dataset.

EDA Workflow

Project Setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Read the raw CSV into a DataFrame
df = pd.read_csv('dataset.csv')

# First look at the data: dimensions, column dtypes, a sample of rows,
# and summary statistics, printed in that order
for report in (
    f"Shape: {df.shape}",
    f"\nColumn types:\n{df.dtypes}",
    f"\nFirst 5 rows:\n{df.head()}",
    f"\nBasic statistics:\n{df.describe()}",
):
    print(report)

Data Cleaning

\text{Missing \%} = \frac{\text{count of nulls}}{\text{total rows}} \times 100, \quad \text{Duplicate \%} = \frac{\text{count of duplicates}}{\text{total rows}} \times 100
# Count nulls per column, then express each count as a percentage of all rows
missing = df.isna().sum()
missing_percent = missing / len(df) * 100

# Summary table with the worst-affected columns listed first
missing_df = pd.DataFrame({'Missing': missing, 'Percent': missing_percent})
missing_df = missing_df.sort_values('Percent', ascending=False)

# Handle missing values
def handle_missing(df):
    """Return a copy of ``df`` with missing values imputed.

    Numerical columns are filled with their column median. Object and
    categorical columns are filled with their most frequent value (the
    first mode when there are ties).

    The input frame is left untouched so the raw data stays available for
    comparison after cleaning.

    Args:
        df: DataFrame to impute.

    Returns:
        A new DataFrame with the same columns. Columns that contain no
        non-null values at all are returned unchanged, because there is no
        median or mode to fill them with.
    """
    # Work on a copy: assigning into the caller's frame would silently
    # overwrite the raw data.
    filled = df.copy()

    # Numerical columns: fill with median
    num_cols = filled.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0:
        filled[num_cols] = filled[num_cols].fillna(filled[num_cols].median())

    # Categorical columns: fill with mode
    cat_cols = filled.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        modes = filled[col].mode()
        if modes.empty:
            # Every value is null, so there is nothing to impute with.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

df_clean = handle_missing(df)

# Report the number of repeated rows, keep only the first occurrence of
# each row, then report again to confirm none remain
print(f"Duplicates before: {df_clean.duplicated().sum()}")
df_clean = df_clean[~df_clean.duplicated()]
print(f"Duplicates after: {df_clean.duplicated().sum()}")

Univariate Analysis

def univariate_analysis(df, column):
    """Plot and summarise the distribution of a single column.

    Numeric columns get a histogram, a boxplot and a normal QQ plot.
    All other columns (including booleans) get a bar chart and a pie chart
    of value counts. Missing values are excluded from the plots; the
    printed ``describe()`` output is computed on the column as given.

    Args:
        df: DataFrame containing the column.
        column: Label of the column to analyse.

    Returns:
        None. The figure is shown with ``plt.show()`` and the summary
        statistics are printed.
    """
    series = df[column]
    # Check the dtype family rather than two exact dtype names, so that
    # int32/float32/nullable numeric columns are treated as numeric too.
    # Booleans are better described by counts than by a histogram.
    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    if is_numeric:
        # Drop NaNs up front so they cannot reach the histogram, boxplot
        # or QQ plot.
        values = series.dropna().to_numpy(dtype=float)

        axes[0].hist(values, bins=30, edgecolor='black', alpha=0.7)
        axes[0].set_title(f'{column} Distribution')

        axes[1].boxplot(values)
        axes[1].set_title(f'{column} Boxplot')

        # QQ plot
        stats.probplot(values, dist="norm", plot=axes[2])
        axes[2].set_title(f'{column} QQ Plot')
    else:
        # Categorical
        value_counts = series.value_counts()
        # String labels keep the bar and pie charts consistent for
        # boolean or mixed-type categories.
        labels = value_counts.index.astype(str)

        axes[0].bar(labels, value_counts.values)
        axes[0].set_title(f'{column} Counts')
        axes[0].tick_params(axis='x', rotation=45)

        # Pie chart
        axes[1].pie(value_counts.values, labels=labels, autopct='%1.1f%%')
        axes[1].set_title(f'{column} Distribution')

        # Only two charts are drawn here; hide the unused third panel
        # instead of showing an empty set of axes.
        axes[2].axis('off')

    plt.tight_layout()
    plt.show()

    print(f"\nStatistics for {column}:")
    print(series.describe())

# Run the univariate report once per column; iterating a DataFrame
# yields its column labels
for col in df_clean:
    univariate_analysis(df_clean, col)

Bivariate Analysis

# Correlation matrix
# Correlation is only defined for numeric data. Select the numeric columns
# explicitly: since pandas 2.0, DataFrame.corr() raises on non-numeric
# columns instead of silently dropping them.
numeric_df = df_clean.select_dtypes(include=[np.number])
plt.figure(figsize=(12, 8))
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Scatter plot matrix
# 'target_column' is a placeholder for the dataset's label column. Colour
# by it only when it exists, so the plot still renders on other datasets.
hue_col = 'target_column' if 'target_column' in df_clean.columns else None
sns.pairplot(df_clean, hue=hue_col)
plt.show()

# Categorical vs Numerical
def cat_vs_num_analysis(df, cat_col, num_col):
    """Compare a numeric column across the levels of a categorical column.

    Draws one boxplot of ``num_col`` per category of ``cat_col``, then runs
    a rank-based test for a difference between the groups: Mann-Whitney U
    when there are exactly two groups, Kruskal-Wallis otherwise.

    Args:
        df: DataFrame containing both columns.
        cat_col: Label of the grouping (categorical) column.
        num_col: Label of the numeric column to compare.

    Returns:
        None. The plot is shown and the p-value (or the reason the test was
        skipped) is printed.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=cat_col, y=num_col, data=df)
    plt.title(f'{num_col} by {cat_col}')
    plt.xticks(rotation=45)
    plt.show()

    # Statistical test
    # Keep only groups that still have observations after dropping NaNs;
    # both tests reject empty samples.
    samples = (group[num_col].dropna() for _, group in df.groupby(cat_col))
    groups = [sample for sample in samples if not sample.empty]

    if len(groups) < 2:
        print("Statistical test skipped: need at least two non-empty groups")
        return

    try:
        if len(groups) == 2:
            # State the alternative explicitly: older SciPy releases did
            # not default to a two-sided p-value here.
            _, p_value = stats.mannwhitneyu(*groups, alternative='two-sided')
        else:
            _, p_value = stats.kruskal(*groups)
    except ValueError as exc:
        # Some SciPy versions raise when all values are identical.
        print(f"Statistical test skipped: {exc}")
        return

    print(f"Statistical test p-value: {p_value:.4f}")

Visualization Best Practices

# Global plotting defaults: seaborn's white grid theme and a 12x8 inch
# default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Time series visualization
def plot_time_series(df, date_col, value_col):
    """Draw ``value_col`` against ``date_col`` as a line chart with small point markers.

    Rows are plotted in the order they appear in ``df``. The figure is
    displayed with ``plt.show()`` and nothing is returned.
    """
    plt.figure(figsize=(14, 6))

    x_values = df[date_col]
    y_values = df[value_col]
    plt.plot(x_values, y_values, marker='o', markersize=3)

    # Title and axis labelling
    plt.title(f'{value_col} Over Time')
    plt.xlabel('Date')
    plt.ylabel(value_col)

    # Tilt the x tick labels by 45 degrees
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Distribution comparison
def compare_distributions(df, col, group_col):
    """Overlay one semi-transparent histogram of ``col`` per group.

    Args:
        df: DataFrame containing both columns.
        col: Label of the numeric column whose distribution is plotted.
        group_col: Label of the column that defines the groups.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # groupby skips rows whose group key is missing, and sort=False keeps
    # the groups in order of first appearance.
    for group, subset in df.groupby(group_col, sort=False)[col]:
        # Missing values cannot be binned, so remove them first.
        values = subset.dropna()
        if values.empty:
            # Nothing to draw for this group; avoid an empty legend entry.
            continue
        ax.hist(values, alpha=0.5, label=group, bins=30)

    ax.set_title(f'{col} Distribution by {group_col}')
    ax.legend()
    plt.show()

Key Takeaways

  1. Always start with data shape and types
  2. Handle missing values systematically
  3. Use multiple visualization types for comprehensive understanding
  4. Document findings at each step
  5. Generate actionable insights from patterns

Premium Content

Project 1: EDA on Real Dataset

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
💼Interview Prep
📜Certificates
🤝Community Access

Already a member? Log in

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement