--- name: db-performance-watchlist description: Defines database performance monitoring strategy with slow query detection, resource usage alerts, query execution thresholds, and automated alerting. Use for "database monitoring", "performance alerts", "slow queries", or "DB metrics". --- # DB Performance Watchlist Monitor database performance and prevent regressions. ## Key Performance Metrics ```typescript // performance-metrics.ts export interface DBMetrics { // Query Performance slowQueries: { threshold: number; // ms count: number; queries: SlowQuery[]; }; // Connection Pool connections: { active: number; idle: number; total: number; maxConnections: number; utilizationPercent: number; }; // Resource Usage resources: { cpuPercent: number; memoryPercent: number; diskUsagePercent: number; iops: number; }; // Query Statistics queryStats: { selectsPerSecond: number; insertsPerSecond: number; updatesPerSecond: number; deletesPerSecond: number; }; // Cache Performance cache: { hitRate: number; // % size: number; // MB evictions: number; }; // Index Usage indexes: { unusedIndexes: string[]; missingIndexes: string[]; }; } interface SlowQuery { query: string; duration: number; calls: number; avgDuration: number; table: string; } ``` ## Slow Query Detection ```typescript // scripts/detect-slow-queries.ts async function detectSlowQueries(thresholdMs: number = 100) { // Enable slow query logging (PostgreSQL) await prisma.$executeRaw` ALTER DATABASE mydb SET log_min_duration_statement = ${thresholdMs}; `; // Query pg_stat_statements for slow queries const slowQueries = await prisma.$queryRaw` SELECT query, calls, total_exec_time / 1000 as total_time_ms, mean_exec_time / 1000 as avg_time_ms, max_exec_time / 1000 as max_time_ms, (total_exec_time / sum(total_exec_time) OVER()) * 100 as percent_of_total FROM pg_stat_statements WHERE mean_exec_time > ${thresholdMs} ORDER BY mean_exec_time DESC LIMIT 20 `; console.log("🐌 Slow Queries Detected:\n"); slowQueries.forEach((q, i) => { console.log(`${i + 1}. ${q.query.substring(0, 80)}...`); console.log(` Calls: ${q.calls}`); console.log(` Avg: ${q.avg_time_ms.toFixed(2)}ms`); console.log(` Max: ${q.max_time_ms.toFixed(2)}ms`); console.log(` % of total time: ${q.percent_of_total.toFixed(1)}%\n`); }); return slowQueries; } ``` ## Connection Pool Monitoring ```typescript async function monitorConnectionPool() { const stats = await prisma.$queryRaw` SELECT sum(numbackends) as total_connections, sum(CASE WHEN state = 'active' THEN 1 ELSE 0 END) as active, sum(CASE WHEN state = 'idle' THEN 1 ELSE 0 END) as idle, max_connections FROM pg_stat_database CROSS JOIN (SELECT setting::int as max_connections FROM pg_settings WHERE name = 'max_connections') WHERE datname = current_database() GROUP BY max_connections `; const { total_connections, active, idle, max_connections } = stats[0]; const utilization = (total_connections / max_connections) * 100; console.log("šŸ”Œ Connection Pool Status:"); console.log( ` Total: ${total_connections}/${max_connections} (${utilization.toFixed( 1 )}%)` ); console.log(` Active: ${active}`); console.log(` Idle: ${idle}`); // Alert if > 80% utilization if (utilization > 80) { console.warn("āš ļø Connection pool >80% utilized!"); await sendAlert({ title: "High connection pool usage", message: `${utilization.toFixed(1)}% of connections in use`, }); } } ``` ## Resource Monitoring ```typescript async function monitorResources() { // CPU Usage const cpuStats = await prisma.$queryRaw` SELECT (sum(total_exec_time) / (extract(epoch from (now() - stats_reset)) * 1000 * 100)) as cpu_percent FROM pg_stat_statements, pg_stat_database WHERE datname = current_database() `; // Memory Usage const memStats = await prisma.$queryRaw` SELECT pg_size_pretty(pg_database_size(current_database())) as db_size, pg_size_pretty(sum(pg_relation_size(schemaname||'.'||tablename))) as tables_size FROM pg_tables WHERE schemaname = 'public' `; // Cache Hit Rate const cacheStats = await prisma.$queryRaw` SELECT sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) * 100 as cache_hit_rate FROM pg_statio_user_tables `; console.log("šŸ“Š Resource Usage:"); console.log(` CPU: ${cpuStats[0].cpu_percent.toFixed(1)}%`); console.log(` Database Size: ${memStats[0].db_size}`); console.log(` Cache Hit Rate: ${cacheStats[0].cache_hit_rate.toFixed(1)}%`); // Alert if cache hit rate < 90% if (cacheStats[0].cache_hit_rate < 90) { console.warn("āš ļø Cache hit rate below 90%!"); await sendAlert({ title: "Low cache hit rate", message: `Cache hit rate: ${cacheStats[0].cache_hit_rate.toFixed(1)}%`, }); } } ``` ## Index Usage Analysis ```typescript async function analyzeIndexUsage() { // Find unused indexes const unusedIndexes = await prisma.$queryRaw` SELECT schemaname, tablename, indexname, idx_scan FROM pg_stat_user_indexes WHERE idx_scan = 0 AND indexname NOT LIKE '%_pkey' ORDER BY pg_relation_size(indexrelid) DESC `; console.log("šŸ—‚ļø Unused Indexes:\n"); unusedIndexes.forEach((idx) => { console.log(` ${idx.tablename}.${idx.indexname} (0 scans)`); }); // Find missing indexes (sequential scans on large tables) const missingIndexes = await prisma.$queryRaw` SELECT schemaname, tablename, seq_scan, seq_tup_read, idx_scan, n_live_tup FROM pg_stat_user_tables WHERE seq_scan > 1000 AND n_live_tup > 10000 ORDER BY seq_scan * n_live_tup DESC LIMIT 10 `; console.log("\nšŸ“‰ Tables with High Sequential Scans:\n"); missingIndexes.forEach((table) => { console.log(` ${table.tablename}:`); console.log(` Sequential scans: ${table.seq_scan}`); console.log(` Rows: ${table.n_live_tup}`); console.log(` Index scans: ${table.idx_scan}`); }); } ``` ## Alert Thresholds ```typescript const ALERT_THRESHOLDS = { slowQuery: { avgDuration: 500, // ms maxDuration: 2000, // ms callsPerMinute: 100, }, connections: { utilizationWarning: 70, // % utilizationCritical: 85, // % }, resources: { cpuWarning: 70, // % cpuCritical: 85, // % memoryWarning: 80, // % memoryCritical: 90, // % diskWarning: 75, // % diskCritical: 85, // % }, cache: { hitRateWarning: 90, // % hitRateCritical: 80, // % }, queryRate: { maxSelectsPerSecond: 10000, maxWritesPerSecond: 1000, }, }; async function checkThresholds() { const metrics = await gatherMetrics(); // Check slow queries if (metrics.slowQueries.count > 10) { await sendAlert({ level: "warning", title: "Slow queries detected", message: `${metrics.slowQueries.count} queries exceeding ${ALERT_THRESHOLDS.slowQuery.avgDuration}ms`, }); } // Check connection pool if ( metrics.connections.utilizationPercent > ALERT_THRESHOLDS.connections.utilizationCritical ) { await sendAlert({ level: "critical", title: "Connection pool critical", message: `${metrics.connections.utilizationPercent.toFixed( 1 )}% utilization`, }); } // Check cache hit rate if (metrics.cache.hitRate < ALERT_THRESHOLDS.cache.hitRateCritical) { await sendAlert({ level: "critical", title: "Cache hit rate critical", message: `${metrics.cache.hitRate.toFixed(1)}% hit rate`, }); } } ``` ## Monitoring Dashboard ```typescript // Generate monitoring report async function generatePerformanceReport() { console.log("šŸ“Š Database Performance Report\n"); console.log("=".repeat(50) + "\n"); // Slow queries const slowQueries = await detectSlowQueries(100); console.log(`Slow Queries (>100ms): ${slowQueries.length}\n`); // Connection pool await monitorConnectionPool(); console.log(); // Resources await monitorResources(); console.log(); // Index usage await analyzeIndexUsage(); console.log(); // Query rates const queryStats = await prisma.$queryRaw` SELECT sum(xact_commit + xact_rollback) as transactions, sum(tup_returned) as rows_read, sum(tup_inserted) as rows_inserted, sum(tup_updated) as rows_updated, sum(tup_deleted) as rows_deleted FROM pg_stat_database WHERE datname = current_database() `; console.log("šŸ“ˆ Query Statistics:"); console.log(` Transactions: ${queryStats[0].transactions}`); console.log(` Rows read: ${queryStats[0].rows_read}`); console.log(` Rows inserted: ${queryStats[0].rows_inserted}`); console.log(` Rows updated: ${queryStats[0].rows_updated}`); console.log(` Rows deleted: ${queryStats[0].rows_deleted}`); } ``` ## Automated Monitoring Script ```typescript // scripts/monitor-db.ts import cron from "node-cron"; // Run every 5 minutes cron.schedule("*/5 * * * *", async () => { await checkThresholds(); }); // Generate report every hour cron.schedule("0 * * * *", async () => { await generatePerformanceReport(); }); // Analyze indexes weekly cron.schedule("0 0 * * 0", async () => { await analyzeIndexUsage(); }); ``` ## Grafana Dashboard Queries ```sql -- Query latency over time SELECT bucket, AVG(mean_exec_time) as avg_latency, MAX(max_exec_time) as max_latency, SUM(calls) as total_calls FROM pg_stat_statements WHERE query NOT LIKE '%pg_stat_statements%' GROUP BY time_bucket('5 minutes', queryid) ORDER BY bucket; -- Connection count over time SELECT now() as time, count(*) as total, count(*) FILTER (WHERE state = 'active') as active, count(*) FILTER (WHERE state = 'idle') as idle FROM pg_stat_activity; -- Cache hit rate SELECT sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) * 100 as cache_hit_rate FROM pg_statio_user_tables; ``` ## Best Practices 1. **Monitor continuously**: Don't wait for problems 2. **Set appropriate thresholds**: Based on your SLAs 3. **Alert on trends**: Not just absolute values 4. **Review regularly**: Weekly performance reviews 5. **Automate everything**: No manual checks 6. **Document baselines**: Know what's normal 7. **Test alerts**: Ensure they work ## Output Checklist - [ ] Slow query detection configured - [ ] Connection pool monitoring - [ ] Resource usage tracking - [ ] Cache hit rate monitoring - [ ] Index usage analysis - [ ] Alert thresholds defined - [ ] Monitoring dashboard setup - [ ] Automated checks scheduled - [ ] Grafana/alerting integration - [ ] Performance baseline documented