🔍 Duplicate Detection

Amazon & Google Interview Deep Dive

🏢 Amazon🏢 Google⚡ Difficulty: Medium-Hard⏱️ 35 min

📋 Interview Question

ℹ️🔴 Amazon/Google Interview Question

"Given a customers table with potential duplicates (different emails, slightly different names), write queries to: 1) Find exact duplicates, 2) Find fuzzy duplicates (similar names/emails), 3) Merge duplicates keeping the best record, 4) Prevent future duplicates."

Companies: Amazon, Google | Difficulty: Medium-Hard | Time: 35 minutes

📊 Setup: Customers with Duplicates

CREATE TABLE customers (
    customer_id SERIAL PRIMARY KEY,
    first_name VARCHAR(100),
    last_name VARCHAR(100),
    email VARCHAR(200),
    phone VARCHAR(50),
    city VARCHAR(100),
    created_at TIMESTAMP DEFAULT NOW(),
    updated_at TIMESTAMP DEFAULT NOW()
);

-- Insert data with duplicates
INSERT INTO customers (first_name, last_name, email, phone, city, created_at) VALUES
('John', 'Smith', 'john.smith@email.com', '555-1234', 'New York', '2023-01-01'),
('Jon', 'Smith', 'j.smith@email.com', '555-1234', 'New York', '2023-02-15'),
('Jonathan', 'Smith', 'johnsmith@email.com', '555-5678', 'New York', '2023-03-20'),
('Jane', 'Doe', 'jane.doe@email.com', '555-9012', 'Chicago', '2023-01-05'),
('Janet', 'Doe', 'jane.d@email.com', '555-9012', 'Chicago', '2023-04-10'),
('Bob', 'Johnson', 'bob.j@email.com', '555-3456', 'Houston', '2023-02-01'),
('Robert', 'Johnson', 'robert.j@email.com', '555-3456', 'Houston', '2023-05-15'),
('Alice', 'Williams', 'alice.w@email.com', '555-7890', 'Phoenix', '2023-01-10'),
('Alicia', 'Williams', 'alice.williams@email.com', '555-1111', 'Phoenix', '2023-06-01');

🔢 Part 1: Finding Exact Duplicates

-- Find exact duplicates based on specific columns
SELECT
    email,
    phone,
    COUNT(*) AS duplicate_count,
    ARRAY_AGG(customer_id) AS customer_ids
FROM customers
GROUP BY email, phone
HAVING COUNT(*) > 1;

-- Find duplicates by name (ignoring case)
SELECT
    LOWER(first_name) AS first_name,
    LOWER(last_name) AS last_name,
    COUNT(*) AS count,
    ARRAY_AGG(customer_id) AS ids
FROM customers
GROUP BY LOWER(first_name), LOWER(last_name)
HAVING COUNT(*) > 1;

-- Find all rows that are duplicates
WITH duplicate_groups AS (
    SELECT
        *,
        COUNT(*) OVER (
            PARTITION BY LOWER(first_name), LOWER(last_name), email
        ) AS group_count
    FROM customers
)
SELECT *
FROM duplicate_groups
WHERE group_count > 1
ORDER BY LOWER(last_name), LOWER(first_name), created_at;

🌫️ Part 2: Finding Fuzzy Duplicates

ℹ️🔍 Fuzzy Matching

Fuzzy matching finds similar but not identical records using:

Levenshtein distance (edit distance)
Trigram similarity
Soundex/Metaphone (phonetic matching)

Using Trigram Similarity

-- Enable pg_trgm extension
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Find similar names using trigram similarity
SELECT
    a.customer_id AS id1,
    a.first_name || ' ' || a.last_name AS name1,
    b.customer_id AS id2,
    b.first_name || ' ' || b.last_name AS name2,
    similarity(
        a.first_name || ' ' || a.last_name,
        b.first_name || ' ' || b.last_name
    ) AS name_similarity
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND similarity(
    a.first_name || ' ' || a.last_name,
    b.first_name || ' ' || b.last_name
) > 0.3
ORDER BY name_similarity DESC;

-- Find similar emails
SELECT
    a.customer_id,
    a.email AS email1,
    b.customer_id,
    b.email AS email2,
    similarity(a.email, b.email) AS email_similarity
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND similarity(a.email, b.email) > 0.5;

Using Levenshtein Distance

-- Enable fuzzystrmatch extension
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;

-- Find names with small edit distance
SELECT
    a.customer_id,
    a.first_name AS name1,
    b.customer_id,
    b.first_name AS name2,
    levenshtein(LOWER(a.first_name), LOWER(b.first_name)) AS edit_distance
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND levenshtein(LOWER(a.first_name), LOWER(b.first_name)) <= 2
ORDER BY edit_distance;

-- Combined similarity score
SELECT
    a.customer_id,
    a.first_name || ' ' || a.last_name AS name1,
    a.email AS email1,
    b.customer_id,
    b.first_name || ' ' || b.last_name AS name2,
    b.email AS email2,
    (
        similarity(a.first_name, b.first_name) +
        similarity(a.last_name, b.last_name) +
        similarity(a.email, b.email)
    ) / 3 AS combined_similarity
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND (
    similarity(a.first_name, b.first_name) > 0.3
    OR similarity(a.email, b.email) > 0.5
    OR (a.phone = b.phone AND a.phone IS NOT NULL)
)
ORDER BY combined_similarity DESC;

🧹 Part 3: Removing Duplicates

Keep First Record

-- Delete duplicates, keeping the record with earliest created_at
WITH ranked_duplicates AS (
    SELECT
        customer_id,
        ROW_NUMBER() OVER (
            PARTITION BY LOWER(first_name), LOWER(last_name), email
            ORDER BY created_at
        ) AS rn
    FROM customers
)
DELETE FROM customers
WHERE customer_id IN (
    SELECT customer_id
    FROM ranked_duplicates
    WHERE rn > 1
);

Keep Most Complete Record

-- Delete duplicates, keeping the record with most complete data
WITH completeness_score AS (
    SELECT
        customer_id,
        (
            CASE WHEN first_name IS NOT NULL THEN 1 ELSE 0 END +
            CASE WHEN last_name IS NOT NULL THEN 1 ELSE 0 END +
            CASE WHEN email IS NOT NULL THEN 1 ELSE 0 END +
            CASE WHEN phone IS NOT NULL THEN 1 ELSE 0 END +
            CASE WHEN city IS NOT NULL THEN 1 ELSE 0 END
        ) AS score,
        ROW_NUMBER() OVER (
            PARTITION BY LOWER(first_name), LOWER(last_name)
            ORDER BY
                -- Most complete first
                (
                    CASE WHEN first_name IS NOT NULL THEN 1 ELSE 0 END +
                    CASE WHEN last_name IS NOT NULL THEN 1 ELSE 0 END +
                    CASE WHEN email IS NOT NULL THEN 1 ELSE 0 END +
                    CASE WHEN phone IS NOT NULL THEN 1 ELSE 0 END +
                    CASE WHEN city IS NOT NULL THEN 1 ELSE 0 END
                ) DESC,
                -- Then most recent
                created_at DESC
        ) AS rn
    FROM customers
)
DELETE FROM customers
WHERE customer_id IN (
    SELECT customer_id
    FROM completeness_score
    WHERE rn > 1
);

Merge Duplicate Data

-- Create merged records from duplicates
CREATE TABLE customers_merged AS
WITH ranked AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY LOWER(first_name), LOWER(last_name)
            ORDER BY created_at
        ) AS rn,
        COUNT(*) OVER (
            PARTITION BY LOWER(first_name), LOWER(last_name)
        ) AS total_count
    FROM customers
),
merged AS (
    SELECT
        MIN(customer_id) AS customer_id,
        -- Keep most common values
        MODE() WITHIN GROUP (ORDER BY first_name) AS first_name,
        MODE() WITHIN GROUP (ORDER BY last_name) AS last_name,
        MAX(email) AS email,  -- Or use most recent
        MAX(phone) AS phone,
        MAX(city) AS city,
        MIN(created_at) AS created_at,
        MAX(updated_at) AS updated_at,
        total_count AS duplicate_count
    FROM ranked
    GROUP BY LOWER(first_name), LOWER(last_name), total_count
)
SELECT * FROM merged;

🛡️ Part 4: Preventing Future Duplicates

Unique Constraints

-- Add unique constraint on email
ALTER TABLE customers
ADD CONSTRAINT unique_email UNIQUE (email);

-- Partial unique index (for active records only)
CREATE UNIQUE INDEX idx_unique_active_email
ON customers (email)
WHERE is_active = true;

-- Unique constraint on multiple columns
ALTER TABLE customers
ADD CONSTRAINT unique_name_email
UNIQUE (LOWER(first_name), LOWER(last_name), email);

Upsert (INSERT ON CONFLICT)

-- Insert or update on conflict
INSERT INTO customers (first_name, last_name, email, phone, city)
VALUES ('John', 'Smith', 'john.smith@email.com', '555-1234', 'New York')
ON CONFLICT (email)
DO UPDATE SET
    first_name = EXCLUDED.first_name,
    last_name = EXCLUDED.last_name,
    phone = EXCLUDED.phone,
    city = EXCLUDED.city,
    updated_at = NOW();

-- Insert or ignore on conflict
INSERT INTO customers (first_name, last_name, email, phone, city)
VALUES ('John', 'Smith', 'john.smith@email.com', '555-1234', 'New York')
ON CONFLICT (email)
DO NOTHING;

Trigger for Duplicate Prevention

-- Trigger to prevent duplicates
CREATE OR REPLACE FUNCTION prevent_duplicate_email()
RETURNS TRIGGER AS $$
BEGIN
    IF EXISTS (
        SELECT 1 FROM customers
        WHERE LOWER(email) = LOWER(NEW.email)
        AND customer_id != NEW.customer_id
    ) THEN
        RAISE EXCEPTION 'Email % already exists', NEW.email;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER trg_prevent_duplicate_email
BEFORE INSERT OR UPDATE ON customers
FOR EACH ROW
EXECUTE FUNCTION prevent_duplicate_email();

🎯 Quiz Section

🏆 Best Practices for Interviews

💡✅ Deduplication Best Practices

1. Define Duplicate Criteria First:

-- What makes two records "duplicates"?
-- Same email? Same name? Same phone? Similar combination?

2. Use Staging Tables:

-- Process duplicates in a staging table first
CREATE TABLE customers_staging AS SELECT * FROM customers;

-- Identify and merge duplicates
-- Then replace original table

3. Preserve Original IDs:

-- Keep track of original IDs during merge
CREATE TABLE duplicate_mapping (
    original_id INT,
    merged_into_id INT,
    merge_date TIMESTAMP DEFAULT NOW()
);

4. Validate After Deduplication:

-- Verify no data was lost
SELECT COUNT(*) FROM customers_before
UNION ALL
SELECT COUNT(*) FROM customers_after;

5. Create Audit Trail:

-- Log all deduplication actions
CREATE TABLE dedup_audit (
    audit_id SERIAL PRIMARY KEY,
    table_name VARCHAR(100),
    action VARCHAR(50),
    records_affected INT,
    performed_at TIMESTAMP DEFAULT NOW()
);

⚠️⚠️ Common Pitfalls

Case sensitivity: 'John' != 'john' unless using LOWER()
Whitespace: 'John Smith' != 'John Smith'
NULL handling: NULL != NULL in comparisons
Performance: Cross joins for fuzzy matching are O(n²)
False positives: Fuzzy matching may flag non-duplicates

Topic: Finding and Removing Duplicates in SQL for FAANG Interviews