--- name: data-analysis description: Patterns for data loading, exploration, and statistical analysis --- # Data Analysis Patterns ## When to Use Load this skill when working with datasets that require exploration, cleaning, and statistical analysis. ## Data Loading ```python print("[DATA] Loading dataset") df = pd.read_csv("data.csv") print(f"[SHAPE] {df.shape[0]} rows, {df.shape[1]} columns") print(f"[DTYPE] {dict(df.dtypes)}") print(f"[MISSING] {df.isnull().sum().to_dict()}") ``` ## Exploratory Data Analysis (EDA) ### Descriptive Statistics ```python print("[STAT] Descriptive statistics:") print(df.describe()) print(f"[RANGE] {col}: {df[col].min()} to {df[col].max()}") ``` ### Distribution Analysis ```python print("[ANALYSIS] Checking distribution normality") from scipy import stats stat, p_value = stats.shapiro(df[col]) print(f"[STAT] Shapiro-Wilk p-value: {p_value:.4f}") ``` ### Correlation Analysis ```python print("[CORR] Correlation matrix:") print(df.corr()) ``` ## Statistical Tests ### T-Test ```python from scipy.stats import ttest_ind stat, p = ttest_ind(group1, group2) print(f"[STAT] T-test: t={stat:.3f}, p={p:.4f}") ``` ### ANOVA ```python from scipy.stats import f_oneway stat, p = f_oneway(group1, group2, group3) print(f"[STAT] ANOVA: F={stat:.3f}, p={p:.4f}") ``` ## Confidence Interval Patterns ### Parametric CI for Means ```python import numpy as np from scipy import stats def mean_ci(data, confidence=0.95): """Calculate parametric confidence interval for mean.""" n = len(data) mean = np.mean(data) se = stats.sem(data) # Standard error of mean h = se * stats.t.ppf((1 + confidence) / 2, n - 1) return mean, mean - h, mean + h mean, ci_low, ci_high = mean_ci(df[col]) print(f"[STAT:estimate] mean = {mean:.3f}") print(f"[STAT:ci] 95% CI [{ci_low:.3f}, {ci_high:.3f}]") ``` ### Bootstrap CI for Medians/Complex Statistics ```python import numpy as np def bootstrap_ci(data, stat_func=np.median, n_bootstrap=10000, confidence=0.95): """Calculate bootstrap confidence interval for any statistic.""" boot_stats = [] n = len(data) for _ in range(n_bootstrap): sample = np.random.choice(data, size=n, replace=True) boot_stats.append(stat_func(sample)) alpha = 1 - confidence ci_low = np.percentile(boot_stats, 100 * alpha / 2) ci_high = np.percentile(boot_stats, 100 * (1 - alpha / 2)) return stat_func(data), ci_low, ci_high median, ci_low, ci_high = bootstrap_ci(df[col], stat_func=np.median) print(f"[STAT:estimate] median = {median:.3f}") print(f"[STAT:ci] 95% Bootstrap CI [{ci_low:.3f}, {ci_high:.3f}]") ``` ### Wilson CI for Proportions ```python from scipy import stats def wilson_ci(successes, trials, confidence=0.95): """Calculate Wilson score interval for proportions (better for small n).""" p = successes / trials z = stats.norm.ppf((1 + confidence) / 2) denominator = 1 + z**2 / trials center = (p + z**2 / (2 * trials)) / denominator spread = z * np.sqrt((p * (1 - p) + z**2 / (4 * trials)) / trials) / denominator return p, center - spread, center + spread prop, ci_low, ci_high = wilson_ci(successes=45, trials=100) print(f"[STAT:estimate] proportion = {prop:.3f}") print(f"[STAT:ci] 95% Wilson CI [{ci_low:.3f}, {ci_high:.3f}]") ``` ## Effect Size Calculation ### Cohen's d for Group Comparisons ```python import numpy as np def cohens_d(group1, group2): """Calculate Cohen's d effect size for two independent groups.""" n1, n2 = len(group1), len(group2) var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1) # Pooled standard deviation pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)) d = (np.mean(group1) - np.mean(group2)) / pooled_std # Interpretation magnitude = "small" if abs(d) < 0.5 else "medium" if abs(d) < 0.8 else "large" return d, magnitude d, magnitude = cohens_d(treatment, control) print(f"[STAT:effect_size] Cohen's d = {d:.3f} ({magnitude})") ``` ### r² for Correlations ```python from scipy import stats def correlation_r2(x, y): """Calculate Pearson r and r² with interpretation.""" r, p = stats.pearsonr(x, y) r2 = r ** 2 # Interpretation (based on Cohen's guidelines for r) magnitude = "small" if abs(r) < 0.3 else "medium" if abs(r) < 0.5 else "large" return r, r2, p, magnitude r, r2, p, magnitude = correlation_r2(df[x_col], df[y_col]) print(f"[STAT:estimate] r = {r:.3f}") print(f"[STAT:effect_size] r² = {r2:.3f} ({magnitude} effect, {r2*100:.1f}% variance explained)") print(f"[STAT:p_value] p = {p:.4f}") ``` ### Cliff's Delta for Non-Parametric Comparisons ```python import numpy as np def cliffs_delta(group1, group2): """Calculate Cliff's delta (non-parametric effect size).""" n1, n2 = len(group1), len(group2) # Count dominance more = sum(1 for x in group1 for y in group2 if x > y) less = sum(1 for x in group1 for y in group2 if x < y) delta = (more - less) / (n1 * n2) # Interpretation (Romano et al., 2006) abs_d = abs(delta) magnitude = "negligible" if abs_d < 0.147 else "small" if abs_d < 0.33 else "medium" if abs_d < 0.474 else "large" return delta, magnitude delta, magnitude = cliffs_delta(treatment, control) print(f"[STAT:effect_size] Cliff's delta = {delta:.3f} ({magnitude})") ``` ## Assumption Checking ### Normality: Shapiro-Wilk and Q-Q Plot ```python from scipy import stats import matplotlib.pyplot as plt def check_normality(data, col_name="variable", alpha=0.05): """Check normality assumption with Shapiro-Wilk test and Q-Q plot.""" # Shapiro-Wilk test (best for n < 5000) stat, p = stats.shapiro(data) is_normal = p > alpha print(f"[CHECK:normality] Shapiro-Wilk W={stat:.4f}, p={p:.4f}") print(f"[CHECK:normality] {'PASS' if is_normal else 'FAIL'}: Data {'is' if is_normal else 'is NOT'} normally distributed (α={alpha})") # Q-Q plot for visual inspection fig, ax = plt.subplots(figsize=(6, 6)) stats.probplot(data, dist="norm", plot=ax) ax.set_title(f"Q-Q Plot: {col_name}") plt.savefig(f"reports/figures/qq_plot_{col_name}.png", dpi=150, bbox_inches="tight") plt.close() return is_normal, stat, p is_normal, stat, p = check_normality(df[col], col_name=col) ``` ### Homogeneity of Variance: Levene's Test ```python from scipy import stats def check_homogeneity(*groups, alpha=0.05): """Check homogeneity of variance (homoscedasticity) with Levene's test.""" stat, p = stats.levene(*groups) is_homogeneous = p > alpha print(f"[CHECK:homogeneity] Levene's W={stat:.4f}, p={p:.4f}") print(f"[CHECK:homogeneity] {'PASS' if is_homogeneous else 'FAIL'}: Variances {'are' if is_homogeneous else 'are NOT'} equal (α={alpha})") if not is_homogeneous: print("[CHECK:homogeneity] Recommendation: Use Welch's t-test instead of Student's t-test") return is_homogeneous, stat, p is_homogeneous, stat, p = check_homogeneity(group1, group2) ``` ### Independence: Durbin-Watson Test (for Regression Residuals) ```python from statsmodels.stats.stattools import durbin_watson def check_independence(residuals): """Check independence of residuals with Durbin-Watson test.""" dw_stat = durbin_watson(residuals) # Interpretation: DW ≈ 2 means no autocorrelation # DW < 1.5 suggests positive autocorrelation # DW > 2.5 suggests negative autocorrelation if dw_stat < 1.5: status = "FAIL - positive autocorrelation detected" elif dw_stat > 2.5: status = "FAIL - negative autocorrelation detected" else: status = "PASS - no significant autocorrelation" print(f"[CHECK:independence] Durbin-Watson = {dw_stat:.3f}") print(f"[CHECK:independence] {status}") return dw_stat, status dw_stat, status = check_independence(model.resid) ``` ## Robust Alternatives ### Welch's t-test (Instead of Student's t-test) ```python from scipy import stats def welchs_ttest(group1, group2, alpha=0.05): """ Welch's t-test - DEFAULT choice for comparing two groups. Does NOT assume equal variances (more robust than Student's t-test). """ stat, p = stats.ttest_ind(group1, group2, equal_var=False) # equal_var=False for Welch's print(f"[DECISION] Using Welch's t-test: Does not assume equal variances") print(f"[STAT:estimate] t-statistic = {stat:.3f}") print(f"[STAT:p_value] p = {p:.4f}") # Effect size from numpy import sqrt, var, mean n1, n2 = len(group1), len(group2) pooled_std = sqrt(((n1-1)*var(group1, ddof=1) + (n2-1)*var(group2, ddof=1)) / (n1+n2-2)) d = (mean(group1) - mean(group2)) / pooled_std magnitude = "small" if abs(d) < 0.5 else "medium" if abs(d) < 0.8 else "large" print(f"[STAT:effect_size] Cohen's d = {d:.3f} ({magnitude})") return stat, p, d t_stat, p_value, effect_size = welchs_ttest(treatment, control) ``` ### Mann-Whitney U Test (for Non-Normal Data) ```python from scipy import stats import numpy as np def mann_whitney_test(group1, group2, alpha=0.05): """ Mann-Whitney U test - Non-parametric alternative to t-test. Use when normality assumption is violated. """ stat, p = stats.mannwhitneyu(group1, group2, alternative='two-sided') print(f"[DECISION] Using Mann-Whitney U: Non-parametric, does not assume normality") print(f"[STAT:estimate] U-statistic = {stat:.3f}") print(f"[STAT:p_value] p = {p:.4f}") # Effect size: Cliff's delta (appropriate for non-parametric) n1, n2 = len(group1), len(group2) more = sum(1 for x in group1 for y in group2 if x > y) less = sum(1 for x in group1 for y in group2 if x < y) delta = (more - less) / (n1 * n2) magnitude = "negligible" if abs(delta) < 0.147 else "small" if abs(delta) < 0.33 else "medium" if abs(delta) < 0.474 else "large" print(f"[STAT:effect_size] Cliff's delta = {delta:.3f} ({magnitude})") return stat, p, delta u_stat, p_value, effect_size = mann_whitney_test(treatment, control) ``` ### Permutation Test (for Complex Designs) ```python import numpy as np def permutation_test(group1, group2, n_permutations=10000, stat_func=None): """ Permutation test - Most robust, makes minimal assumptions. Use when parametric assumptions are violated or for complex statistics. """ if stat_func is None: stat_func = lambda x, y: np.mean(x) - np.mean(y) observed = stat_func(group1, group2) combined = np.concatenate([group1, group2]) n1 = len(group1) # Generate permutation distribution perm_stats = [] for _ in range(n_permutations): np.random.shuffle(combined) perm_stat = stat_func(combined[:n1], combined[n1:]) perm_stats.append(perm_stat) # Two-tailed p-value p_value = np.mean(np.abs(perm_stats) >= np.abs(observed)) print(f"[DECISION] Using permutation test: Assumption-free, {n_permutations} permutations") print(f"[STAT:estimate] Observed difference = {observed:.4f}") print(f"[STAT:p_value] p = {p_value:.4f} (permutation-based)") # Bootstrap CI for the observed statistic boot_diffs = [] for _ in range(n_permutations): b1 = np.random.choice(group1, size=len(group1), replace=True) b2 = np.random.choice(group2, size=len(group2), replace=True) boot_diffs.append(stat_func(b1, b2)) ci_low, ci_high = np.percentile(boot_diffs, [2.5, 97.5]) print(f"[STAT:ci] 95% Bootstrap CI [{ci_low:.4f}, {ci_high:.4f}]") return observed, p_value, ci_low, ci_high obs, p_val, ci_low, ci_high = permutation_test(treatment, control) ``` ## Memory Management ```python print(f"[MEMORY] DataFrame size: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") # Clean up del large_df import gc; gc.collect() ```