--- name: data-analysis-sql description: SQL for data analysis with exploratory analysis, advanced aggregations, statistical functions, outlier detection, and business insights. 50+ real-world analytics queries. sasmp_version: "1.3.0" bonded_agent: 05-data-analyst bond_type: PRIMARY_BOND --- # SQL for Data Analysis ## Exploratory Data Analysis (EDA) ### Data Profiling ```sql -- Understand data structure and quality SELECT COUNT(*) as record_count FROM employees; SELECT COUNT(DISTINCT department) as unique_departments FROM employees; SELECT COUNT(*) - COUNT(email) as missing_emails FROM employees; -- Column value distribution SELECT salary, COUNT(*) as frequency FROM employees GROUP BY salary ORDER BY frequency DESC; -- Missing data analysis SELECT COUNT(*) as total_records, COUNT(phone) as non_null_phone, COUNT(*) - COUNT(phone) as missing_phone, ROUND(100.0 * (COUNT(*) - COUNT(phone)) / COUNT(*), 2) as missing_percentage FROM employees; -- Data type and range checks SELECT MIN(salary) as min_salary, MAX(salary) as max_salary, ROUND(AVG(salary), 2) as avg_salary, ROUND(STDDEV(salary), 2) as salary_stddev FROM employees; ``` ### Distribution Analysis ```sql -- Value frequency distribution SELECT department, COUNT(*) as emp_count, ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) as percentage FROM employees GROUP BY department ORDER BY emp_count DESC; -- Salary ranges and distribution SELECT CASE WHEN salary < 50000 THEN 'Under 50K' WHEN salary < 75000 THEN '50K-75K' WHEN salary < 100000 THEN '75K-100K' ELSE '100K+' END as salary_range, COUNT(*) as emp_count, MIN(salary) as min_sal, MAX(salary) as max_sal, ROUND(AVG(salary), 2) as avg_sal FROM employees GROUP BY salary_range ORDER BY MIN(salary); -- Distribution visualization data SELECT salary, COUNT(*) as frequency, ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) as pct, RPAD('*', COUNT(*) / 10, '*') as bar_chart FROM employees GROUP BY salary ORDER BY salary; ``` ## Statistical Analysis ### Summary Statistics ```sql -- Comprehensive statistics by group SELECT department, COUNT(*) as count, ROUND(AVG(salary), 2) as mean_salary, ROUND(MIN(salary), 2) as min_salary, ROUND(MAX(salary), 2) as max_salary, ROUND(STDDEV(salary), 2) as stddev_salary, ROUND(AVG(ABS(salary - (SELECT AVG(salary) FROM employees WHERE department = e.department))), 2) as avg_deviation FROM employees e GROUP BY department ORDER BY mean_salary DESC; -- Percentile analysis SELECT department, ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary), 2) as q1, ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY salary), 2) as median, ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary), 2) as q3, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary), 2) as p95 FROM employees GROUP BY department; ``` ### Outlier Detection ```sql -- Find outliers using standard deviation SELECT emp_id, first_name, salary, ROUND(AVG(salary) OVER (), 2) as avg_salary, ROUND(STDDEV(salary) OVER (), 2) as stddev_salary, ROUND(ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0), 2) as z_score FROM employees HAVING ABS(salary - AVG(salary) OVER ()) / NULLIF(STDDEV(salary) OVER (), 0) > 3 ORDER BY z_score DESC; -- IQR method for outliers WITH salary_stats AS ( SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) as q1, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) as q3 FROM employees ) SELECT emp_id, salary, CASE WHEN salary < (SELECT q1 FROM salary_stats) - 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats)) OR salary > (SELECT q3 FROM salary_stats) + 1.5 * ((SELECT q3 FROM salary_stats) - (SELECT q1 FROM salary_stats)) THEN 'Outlier' ELSE 'Normal' END as outlier_status FROM employees; ``` ## Comparative Analysis ### Period-over-Period Comparison ```sql -- Year-over-year sales comparison SELECT EXTRACT(QUARTER FROM order_date) as quarter, EXTRACT(YEAR FROM order_date) as year, ROUND(SUM(amount), 2) as total_sales, ROUND(LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as prev_period, ROUND(SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_change, ROUND(100.0 * (SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date))) / LAG(SUM(amount)) OVER (ORDER BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date)), 2) as yoy_pct_change FROM orders GROUP BY EXTRACT(YEAR FROM order_date), EXTRACT(QUARTER FROM order_date) ORDER BY year, quarter; ``` ### Cohort Analysis ```sql -- User cohort analysis WITH user_cohorts AS ( SELECT DATE_TRUNC('month', first_order_date)::DATE as cohort_month, user_id, DATE_TRUNC('month', order_date)::DATE as order_month FROM users u LEFT JOIN orders o ON u.id = o.user_id ) SELECT cohort_month, DATE_PART('month', order_month - cohort_month) / 1 as months_since_cohort, COUNT(DISTINCT user_id) as users, ROUND(100.0 * COUNT(DISTINCT user_id) / (SELECT COUNT(DISTINCT user_id) FROM user_cohorts WHERE order_month = cohort_month), 2) as retention_rate FROM user_cohorts WHERE order_month >= cohort_month GROUP BY cohort_month, months_since_cohort ORDER BY cohort_month, months_since_cohort; ``` ## Correlation & Relationship Analysis ```sql -- Correlation between variables WITH salary_data AS ( SELECT years_experience, salary, AVG(salary) OVER () as avg_salary, AVG(years_experience) OVER () as avg_experience, STDDEV(salary) OVER () as stddev_salary, STDDEV(years_experience) OVER () as stddev_experience FROM employees ) SELECT ROUND( SUM((years_experience - avg_experience) * (salary - avg_salary)) / (COUNT(*) * stddev_salary * stddev_experience), 4 ) as correlation FROM salary_data; -- Segment analysis SELECT CASE WHEN years_experience < 2 THEN 'Junior' WHEN years_experience < 5 THEN 'Mid-level' WHEN years_experience < 10 THEN 'Senior' ELSE 'Expert' END as experience_level, COUNT(*) as count, ROUND(AVG(salary), 2) as avg_salary, ROUND(AVG(performance_rating), 2) as avg_rating FROM employees GROUP BY experience_level ORDER BY COUNT(*) DESC; ``` ## Data Quality Validation ```sql -- Check for invalid values SELECT CASE WHEN salary < 0 THEN 'Negative salary' WHEN salary > 1000000 THEN 'Unusually high salary' WHEN email NOT LIKE '%@%' THEN 'Invalid email' WHEN hire_date > CURRENT_DATE THEN 'Future hire date' WHEN years_experience > 70 THEN 'Impossible experience' ELSE NULL END as data_quality_issue, COUNT(*) as count FROM employees WHERE salary < 0 OR salary > 1000000 OR email NOT LIKE '%@%' OR hire_date > CURRENT_DATE OR years_experience > 70 GROUP BY data_quality_issue; -- Duplicate detection SELECT email, COUNT(*) as occurrence_count, STRING_AGG(DISTINCT emp_id::text, ', ') as emp_ids FROM employees WHERE email IS NOT NULL GROUP BY email HAVING COUNT(*) > 1 ORDER BY occurrence_count DESC; ``` ## Trend Analysis ```sql -- Moving average SELECT order_date, amount, ROUND(AVG(amount) OVER ( ORDER BY order_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW ), 2) as moving_avg_7day, ROUND(AVG(amount) OVER ( ORDER BY order_date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW ), 2) as moving_avg_30day FROM daily_orders ORDER BY order_date; -- Growth rate SELECT DATE_TRUNC('month', order_date)::DATE as month, ROUND(SUM(amount), 2) as monthly_revenue, ROUND((SUM(amount) - LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date))) / LAG(SUM(amount)) OVER (ORDER BY DATE_TRUNC('month', order_date)) * 100, 2) as growth_rate_pct FROM orders GROUP BY DATE_TRUNC('month', order_date) ORDER BY month; ``` ## Next Steps Learn advanced SQL concepts and optimization techniques in the `advanced-sql` skill.