πŸŽ‰ 75% of content is free forever β€” Unlock Premium from $10/mo β†’
CW
Search courses…
πŸ’Ό Servicesℹ️ Aboutβœ‰οΈ ContactView Pricing Plansfrom $10

Topic: Monitoring, Alerts & Observability

Snowflake AdvancedMonitoring⭐ Premium

Advertisement

Snowflake Advanced Β· Interview Prep

Monitoring, Alerts & Observability

Difficulty: Hard Β· Commonly asked at Amazon, Meta, Google

Interview Question

"Design a comprehensive monitoring system for a Snowflake deployment. What metrics would you track, how would you set up alerts, and how would you integrate with external monitoring tools?"

ℹ️

Companies Asking This: Amazon (L6 Data Engineer), Meta (Data Platform Engineer), Google (Senior Cloud Engineer), Microsoft (Principal Data Architect)


Monitoring Architecture

Snowflake AccountAccount UsageWarehouse MeteringQuery HistoryTask HistoryData CollectionINFORMATION_SCHEMAACCOUNT_USAGEACCESS_HISTORYExternal IntegrationDatadogPagerDutySlackCustom Webhook

Key Metrics to Track

1. Query Performance

-- Slow queries (> 30 seconds)
SELECT 
    query_id,
    query_text,
    user_name,
    warehouse_name,
    total_elapsed_time_ms / 1000 AS execution_seconds,
    bytes_scanned / (1024*1024*1024) AS gb_scanned,
    start_time
FROM snowflake.account_usage.query_history
WHERE query_type = 'SELECT'
  AND total_elapsed_time_ms > 30000
  AND start_time >= DATEADD(hour, -24, CURRENT_TIMESTAMP())
ORDER BY total_elapsed_time_ms DESC;

-- Query performance trends
SELECT 
    DATE_TRUNC('hour', start_time) AS hour,
    COUNT(*) AS query_count,
    AVG(total_elapsed_time_ms) / 1000 AS avg_seconds,
    MAX(total_elapsed_time_ms) / 1000 AS max_seconds,
    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_elapsed_time_ms) / 1000 AS p95_seconds
FROM snowflake.account_usage.query_history
WHERE query_type = 'SELECT'
  AND start_time >= DATEADD(day, -7, CURRENT_TIMESTAMP())
GROUP BY 1
ORDER BY 1;

2. Warehouse Utilization

-- Warehouse credit consumption
SELECT 
    warehouse_name,
    warehouse_size,
    SUM(credits_used) AS total_credits,
    SUM(credits_used) * 10 AS estimated_cost_usd,
    AVG(credits_used) AS avg_credits_per_hour
FROM snowflake.account_usage.warehouse_metering_history
WHERE start_time >= DATEADD(day, -7, CURRENT_TIMESTAMP())
GROUP BY 1, 2
ORDER BY 3 DESC;

-- Warehouse queue depth
SELECT 
    warehouse_name,
    AVG(queries_queued) AS avg_queue_depth,
    MAX(queries_queued) AS max_queue_depth,
    AVG(queries_executing) AS avg_concurrent_queries
FROM snowflake.account_usage.warehouse_load_history
WHERE start_time >= DATEADD(day, -7, CURRENT_TIMESTAMP())
GROUP BY 1
HAVING AVG(queries_queued) > 5
ORDER BY 2 DESC;

-- Warehouse scaling events
SELECT 
    warehouse_name,
    event_timestamp,
    event_type,
    cluster_number,
    new_cluster_count
FROM snowflake.account_usage.warehouse_events_history
WHERE event_timestamp >= DATEADD(day, -1, CURRENT_TIMESTAMP())
  AND event_type LIKE '%CLUSTER%'
ORDER BY event_timestamp DESC;

3. Storage & Costs

-- Storage by database
SELECT 
    table_database,
    SUM(bytes) / (1024*1024*1024*1024) AS storage_tb,
    SUM(bytes) / (1024*1024*1024*1024) * 23 AS monthly_storage_cost_usd
FROM information_schema.tables
GROUP BY 1
ORDER BY 2 DESC;

-- Time travel storage impact
SELECT 
    table_name,
    retention_time,
    bytes / (1024*1024*1024) AS current_gb,
    bytes * retention_time / (1024*1024*1024) / 30 AS estimated_total_gb,
    bytes * retention_time / (1024*1024*1024) / 30 * 23 / 30 AS daily_cost_usd
FROM information_schema.tables
WHERE retention_time > 7
ORDER BY 5 DESC;

Alerting Strategy

Create Alert Procedure

-- 1. Create alert table
CREATE TABLE monitoring_alerts (
    alert_id NUMBER AUTOINCREMENT,
    alert_type VARCHAR(50),
    severity VARCHAR(20),
    message VARCHAR(2000),
    metric_value NUMBER,
    threshold NUMBER,
    created_at TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),
    acknowledged BOOLEAN DEFAULT FALSE
);

-- 2. Slow query alert
CREATE OR REPLACE PROCEDURE check_slow_queries()
RETURNS VARCHAR
LANGUAGE SQL
AS
$$
DECLARE
    slow_count NUMBER;
    threshold_ms NUMBER := 30000;  -- 30 seconds
BEGIN
    SELECT COUNT(*) INTO slow_count
    FROM snowflake.account_usage.query_history
    WHERE query_type = 'SELECT'
      AND total_elapsed_time_ms > threshold_ms
      AND start_time >= DATEADD(hour, -1, CURRENT_TIMESTAMP());
    
    IF (slow_count > 10) THEN
        INSERT INTO monitoring_alerts (alert_type, severity, message, metric_value, threshold)
        VALUES (
            'SLOW_QUERIES',
            'WARNING',
            slow_count || ' queries exceeded ' || threshold_ms/1000 || 's in the last hour',
            slow_count,
            10
        );
        
        RETURN 'ALERT: ' || slow_count || ' slow queries detected';
    END IF;
    
    RETURN 'OK: No slow query alerts';
END;
$$;

-- 3. Credit consumption alert
CREATE OR REPLACE PROCEDURE check_credit_usage()
RETURNS VARCHAR
LANGUAGE SQL
AS
$$
DECLARE
    daily_credits NUMBER;
    monthly_credits NUMBER;
    daily_threshold NUMBER := 500;
    monthly_threshold NUMBER := 10000;
BEGIN
    -- Check daily usage
    SELECT SUM(credits_used) INTO daily_credits
    FROM snowflake.account_usage.warehouse_metering_history
    WHERE start_time >= DATE_TRUNC('day', CURRENT_TIMESTAMP());
    
    -- Check monthly usage
    SELECT SUM(credits_used) INTO monthly_credits
    FROM snowflake.account_usage.warehouse_metering_history
    WHERE start_time >= DATE_TRUNC('month', CURRENT_TIMESTAMP());
    
    IF (daily_credits > daily_threshold) THEN
        INSERT INTO monitoring_alerts (alert_type, severity, message, metric_value, threshold)
        VALUES (
            'CREDIT_USAGE_DAILY',
            'CRITICAL',
            'Daily credit usage (' || daily_credits || ') exceeds threshold (' || daily_threshold || ')',
            daily_credits,
            daily_threshold
        );
        
        RETURN 'ALERT: Daily credit usage critical';
    END IF;
    
    IF (monthly_credits > monthly_threshold) THEN
        INSERT INTO monitoring_alerts (alert_type, severity, message, metric_value, threshold)
        VALUES (
            'CREDIT_USAGE_MONTHLY',
            'WARNING',
            'Monthly credit usage (' || monthly_credits || ') approaching threshold (' || monthly_threshold || ')',
            monthly_credits,
            monthly_threshold
        );
        
        RETURN 'ALERT: Monthly credit usage warning';
    END IF;
    
    RETURN 'OK: Credit usage normal';
END;
$$;

-- 4. Schedule monitoring checks
CREATE OR REPLACE TASK monitor_slow_queries
    WAREHOUSE = admin_wh
    SCHEDULE = 'USING CRON 0 * * * * UTC'  -- Every hour
AS
    CALL check_slow_queries();

CREATE OR REPLACE TASK monitor_credit_usage
    WAREHOUSE = admin_wh
    SCHEDULE = 'USING CRON 0 0 * * * UTC'  -- Daily
AS
    CALL check_credit_usage();

Real-World Scenario: Amazon

Question: "How would you integrate Snowflake monitoring with external tools like Datadog or PagerDuty?"

External Integration Pattern

-- 1. Create external function for webhook alerts
CREATE OR REPLACE EXTERNAL FUNCTION send_alert(message VARCHAR)
    RETURNS VARCHAR
    HTTP_METHOD = 'POST'
    URL = 'https://hooks.pagerduty.com/your-webhook'
    AUTHORIZATION_TYPE = 'CUSTOM'
    AUTHENTICATION_ROLE = 'API_ROLE'
    API_CONTEXT = 'integration';

-- 2. Create monitoring view for external tools
CREATE OR REPLACE VIEW monitoring_dashboard AS
SELECT 
    'queries' AS metric_type,
    COUNT(*) AS value,
    'Queries in last hour' AS description
FROM snowflake.account_usage.query_history
WHERE start_time >= DATEADD(hour, -1, CURRENT_TIMESTAMP())

UNION ALL

SELECT 
    'slow_queries' AS metric_type,
    COUNT(*) AS value,
    'Slow queries (>30s) in last hour' AS description
FROM snowflake.account_usage.query_history
WHERE total_elapsed_time_ms > 30000
  AND start_time >= DATEADD(hour, -1, CURRENT_TIMESTAMP())

UNION ALL

SELECT 
    'credits_used' AS metric_type,
    SUM(credits_used) AS value,
    'Credits used today' AS description
FROM snowflake.account_usage.warehouse_metering_history
WHERE start_time >= DATE_TRUNC('day', CURRENT_TIMESTAMP())

UNION ALL

SELECT 
    'storage_tb' AS metric_type,
    SUM(bytes) / (1024*1024*1024*1024) AS value,
    'Total storage in TB' AS description
FROM information_schema.tables;

-- 3. Create alert rule table
CREATE TABLE alert_rules (
    rule_id NUMBER AUTOINCREMENT,
    metric_name VARCHAR(100),
    threshold NUMBER,
    severity VARCHAR(20),
    notification_channel VARCHAR(50),
    enabled BOOLEAN DEFAULT TRUE
);

INSERT INTO alert_rules (metric_name, threshold, severity, notification_channel)
VALUES 
    ('slow_queries', 10, 'WARNING', 'slack'),
    ('slow_queries', 50, 'CRITICAL', 'pagerduty'),
    ('credits_used_daily', 500, 'WARNING', 'slack'),
    ('credits_used_daily', 1000, 'CRITICAL', 'pagerduty');

Best Practices

MetricThresholdAlert Level
Query time> 30sWARNING
Query time> 300sCRITICAL
Queue depth> 10WARNING
Queue depth> 50CRITICAL
Daily credits> 500WARNING
Daily credits> 1000CRITICAL
Storage growth> 10% monthlyWARNING

⚠️

Monitoring Anti-Patterns:

  1. No alerts β€” Issues go undetected until users complain
  2. Too many alerts β€” Alert fatigue causes real issues to be ignored
  3. No baseline β€” Can't detect anomalies without baseline metrics
  4. No retention β€” Historical data needed for trend analysis
  5. Manual monitoring β€” Automate with scheduled tasks

Advertisement