π Duplicate Detection
Amazon & Google Interview Deep Dive
π’ Amazonπ’ Googleβ‘ Difficulty: Medium-Hardβ±οΈ 35 min
π Interview Question
βΉοΈπ΄ Amazon/Google Interview Question
"Given a customers table with potential duplicates (different emails, slightly different names), write queries to: 1) Find exact duplicates, 2) Find fuzzy duplicates (similar names/emails), 3) Merge duplicates keeping the best record, 4) Prevent future duplicates."
Companies: Amazon, Google | Difficulty: Medium-Hard | Time: 35 minutes
π Setup: Customers with Duplicates
CREATE TABLE customers (
customer_id SERIAL PRIMARY KEY,
first_name VARCHAR(100),
last_name VARCHAR(100),
email VARCHAR(200),
phone VARCHAR(50),
city VARCHAR(100),
created_at TIMESTAMP DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
-- Insert data with duplicates
INSERT INTO customers (first_name, last_name, email, phone, city, created_at) VALUES
('John', 'Smith', 'john.smith@email.com', '555-1234', 'New York', '2023-01-01'),
('Jon', 'Smith', 'j.smith@email.com', '555-1234', 'New York', '2023-02-15'),
('Jonathan', 'Smith', 'johnsmith@email.com', '555-5678', 'New York', '2023-03-20'),
('Jane', 'Doe', 'jane.doe@email.com', '555-9012', 'Chicago', '2023-01-05'),
('Janet', 'Doe', 'jane.d@email.com', '555-9012', 'Chicago', '2023-04-10'),
('Bob', 'Johnson', 'bob.j@email.com', '555-3456', 'Houston', '2023-02-01'),
('Robert', 'Johnson', 'robert.j@email.com', '555-3456', 'Houston', '2023-05-15'),
('Alice', 'Williams', 'alice.w@email.com', '555-7890', 'Phoenix', '2023-01-10'),
('Alicia', 'Williams', 'alice.williams@email.com', '555-1111', 'Phoenix', '2023-06-01');
π’ Part 1: Finding Exact Duplicates
-- Find exact duplicates based on specific columns
SELECT
email,
phone,
COUNT(*) AS duplicate_count,
ARRAY_AGG(customer_id) AS customer_ids
FROM customers
GROUP BY email, phone
HAVING COUNT(*) > 1;
-- Find duplicates by name (ignoring case)
SELECT
LOWER(first_name) AS first_name,
LOWER(last_name) AS last_name,
COUNT(*) AS count,
ARRAY_AGG(customer_id) AS ids
FROM customers
GROUP BY LOWER(first_name), LOWER(last_name)
HAVING COUNT(*) > 1;
-- Find all rows that are duplicates
WITH duplicate_groups AS (
SELECT
*,
COUNT(*) OVER (
PARTITION BY LOWER(first_name), LOWER(last_name), email
) AS group_count
FROM customers
)
SELECT *
FROM duplicate_groups
WHERE group_count > 1
ORDER BY LOWER(last_name), LOWER(first_name), created_at;
π«οΈ Part 2: Finding Fuzzy Duplicates
βΉοΈπ Fuzzy Matching
Fuzzy matching finds similar but not identical records using:
- Levenshtein distance (edit distance)
- Trigram similarity
- Soundex/Metaphone (phonetic matching)
Using Trigram Similarity
-- Enable pg_trgm extension
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- Find similar names using trigram similarity
SELECT
a.customer_id AS id1,
a.first_name || ' ' || a.last_name AS name1,
b.customer_id AS id2,
b.first_name || ' ' || b.last_name AS name2,
similarity(
a.first_name || ' ' || a.last_name,
b.first_name || ' ' || b.last_name
) AS name_similarity
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND similarity(
a.first_name || ' ' || a.last_name,
b.first_name || ' ' || b.last_name
) > 0.3
ORDER BY name_similarity DESC;
-- Find similar emails
SELECT
a.customer_id,
a.email AS email1,
b.customer_id,
b.email AS email2,
similarity(a.email, b.email) AS email_similarity
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND similarity(a.email, b.email) > 0.5;
Using Levenshtein Distance
-- Enable fuzzystrmatch extension
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
-- Find names with small edit distance
SELECT
a.customer_id,
a.first_name AS name1,
b.customer_id,
b.first_name AS name2,
levenshtein(LOWER(a.first_name), LOWER(b.first_name)) AS edit_distance
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND levenshtein(LOWER(a.first_name), LOWER(b.first_name)) <= 2
ORDER BY edit_distance;
-- Combined similarity score
SELECT
a.customer_id,
a.first_name || ' ' || a.last_name AS name1,
a.email AS email1,
b.customer_id,
b.first_name || ' ' || b.last_name AS name2,
b.email AS email2,
(
similarity(a.first_name, b.first_name) +
similarity(a.last_name, b.last_name) +
similarity(a.email, b.email)
) / 3 AS combined_similarity
FROM customers a
CROSS JOIN customers b
WHERE a.customer_id < b.customer_id
AND (
similarity(a.first_name, b.first_name) > 0.3
OR similarity(a.email, b.email) > 0.5
OR (a.phone = b.phone AND a.phone IS NOT NULL)
)
ORDER BY combined_similarity DESC;
π§Ή Part 3: Removing Duplicates
Keep First Record
-- Delete duplicates, keeping the record with earliest created_at
WITH ranked_duplicates AS (
SELECT
customer_id,
ROW_NUMBER() OVER (
PARTITION BY LOWER(first_name), LOWER(last_name), email
ORDER BY created_at
) AS rn
FROM customers
)
DELETE FROM customers
WHERE customer_id IN (
SELECT customer_id
FROM ranked_duplicates
WHERE rn > 1
);
Keep Most Complete Record
-- Delete duplicates, keeping the record with most complete data
WITH completeness_score AS (
SELECT
customer_id,
(
CASE WHEN first_name IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN last_name IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN email IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN phone IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN city IS NOT NULL THEN 1 ELSE 0 END
) AS score,
ROW_NUMBER() OVER (
PARTITION BY LOWER(first_name), LOWER(last_name)
ORDER BY
-- Most complete first
(
CASE WHEN first_name IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN last_name IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN email IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN phone IS NOT NULL THEN 1 ELSE 0 END +
CASE WHEN city IS NOT NULL THEN 1 ELSE 0 END
) DESC,
-- Then most recent
created_at DESC
) AS rn
FROM customers
)
DELETE FROM customers
WHERE customer_id IN (
SELECT customer_id
FROM completeness_score
WHERE rn > 1
);
Merge Duplicate Data
-- Create merged records from duplicates
CREATE TABLE customers_merged AS
WITH ranked AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY LOWER(first_name), LOWER(last_name)
ORDER BY created_at
) AS rn,
COUNT(*) OVER (
PARTITION BY LOWER(first_name), LOWER(last_name)
) AS total_count
FROM customers
),
merged AS (
SELECT
MIN(customer_id) AS customer_id,
-- Keep most common values
MODE() WITHIN GROUP (ORDER BY first_name) AS first_name,
MODE() WITHIN GROUP (ORDER BY last_name) AS last_name,
MAX(email) AS email, -- Or use most recent
MAX(phone) AS phone,
MAX(city) AS city,
MIN(created_at) AS created_at,
MAX(updated_at) AS updated_at,
total_count AS duplicate_count
FROM ranked
GROUP BY LOWER(first_name), LOWER(last_name), total_count
)
SELECT * FROM merged;
π‘οΈ Part 4: Preventing Future Duplicates
Unique Constraints
-- Add unique constraint on email
ALTER TABLE customers
ADD CONSTRAINT unique_email UNIQUE (email);
-- Partial unique index (for active records only)
CREATE UNIQUE INDEX idx_unique_active_email
ON customers (email)
WHERE is_active = true;
-- Unique constraint on multiple columns
ALTER TABLE customers
ADD CONSTRAINT unique_name_email
UNIQUE (LOWER(first_name), LOWER(last_name), email);
Upsert (INSERT ON CONFLICT)
-- Insert or update on conflict
INSERT INTO customers (first_name, last_name, email, phone, city)
VALUES ('John', 'Smith', 'john.smith@email.com', '555-1234', 'New York')
ON CONFLICT (email)
DO UPDATE SET
first_name = EXCLUDED.first_name,
last_name = EXCLUDED.last_name,
phone = EXCLUDED.phone,
city = EXCLUDED.city,
updated_at = NOW();
-- Insert or ignore on conflict
INSERT INTO customers (first_name, last_name, email, phone, city)
VALUES ('John', 'Smith', 'john.smith@email.com', '555-1234', 'New York')
ON CONFLICT (email)
DO NOTHING;
Trigger for Duplicate Prevention
-- Trigger to prevent duplicates
CREATE OR REPLACE FUNCTION prevent_duplicate_email()
RETURNS TRIGGER AS $$
BEGIN
IF EXISTS (
SELECT 1 FROM customers
WHERE LOWER(email) = LOWER(NEW.email)
AND customer_id != NEW.customer_id
) THEN
RAISE EXCEPTION 'Email % already exists', NEW.email;
END IF;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trg_prevent_duplicate_email
BEFORE INSERT OR UPDATE ON customers
FOR EACH ROW
EXECUTE FUNCTION prevent_duplicate_email();
π― Quiz Section
π Best Practices for Interviews
π‘β Deduplication Best Practices
1. Define Duplicate Criteria First:
-- What makes two records "duplicates"?
-- Same email? Same name? Same phone? Similar combination?
2. Use Staging Tables:
-- Process duplicates in a staging table first
CREATE TABLE customers_staging AS SELECT * FROM customers;
-- Identify and merge duplicates
-- Then replace original table
3. Preserve Original IDs:
-- Keep track of original IDs during merge
CREATE TABLE duplicate_mapping (
original_id INT,
merged_into_id INT,
merge_date TIMESTAMP DEFAULT NOW()
);
4. Validate After Deduplication:
-- Verify no data was lost
SELECT COUNT(*) FROM customers_before
UNION ALL
SELECT COUNT(*) FROM customers_after;
5. Create Audit Trail:
-- Log all deduplication actions
CREATE TABLE dedup_audit (
audit_id SERIAL PRIMARY KEY,
table_name VARCHAR(100),
action VARCHAR(50),
records_affected INT,
performed_at TIMESTAMP DEFAULT NOW()
);
β οΈβ οΈ Common Pitfalls
- Case sensitivity: 'John' != 'john' unless using LOWER()
- Whitespace: 'John Smith' != 'John Smith'
- NULL handling: NULL != NULL in comparisons
- Performance: Cross joins for fuzzy matching are O(nΒ²)
- False positives: Fuzzy matching may flag non-duplicates