# Exploratory script: normal-model checks and residual diagnostics for the
# finches data set, followed by a simple regression on the built-in state data.
# Plots are produced by top-level ggplot expressions; numeric results are
# produced by top-level expressions such as `(5 - m) / s` and `summary(fit)`.

library(ggplot2)
library(dplyr)

# Finches: normal model for beak width ----

df <- read.csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/finches.csv")

# Sample mean and standard deviation used as the normal parameters below.
m <- mean(df$beakwidth)
s <- sd(df$beakwidth)

# Density-scaled histogram with the Normal(m, s) density drawn on top.
ggplot(data = df) +
  geom_histogram(aes(x = beakwidth, y = after_stat(density)), bins = 11) +
  geom_function(fun = \(x) dnorm(x, m, s))

# Deviations of each beak width from the sample mean.
dfr <- data.frame(
  r = df$beakwidth - m
)

ggplot(data = dfr) +
  geom_histogram(aes(x = r, y = after_stat(density)), bins = 11)

# z-score of a beak width of 5, then the percentage of Normal(m, s) below it.
(5 - m) / s
pnorm((5 - m) / s) * 100

# question about different cutoffs for outliers
# Quartiles of the standard normal and the resulting 1.5 * IQR fences.
q1 <- qnorm(0.25)
q3 <- qnorm(0.75)
iqr <- q3 - q1
q1 - 1.5 * iqr
q3 + 1.5 * iqr

# ANOVA ----

ggplot(data = df) +
  geom_jitter(aes(island, beakwidth), width = 0.2)

fit <- lm(beakwidth ~ island, data = df)
summary(fit)

dfr <- data.frame(
  # r = residuals(fit)
  r = rstandard(fit)
)

# Histogram of the standardized residuals from the island model.
ggplot(data = dfr) +
  geom_histogram(aes(r), bins = 11)

# Simple Linear Regression ----

ggplot(data = df, aes(taillength, beakwidth)) +
  geom_point() +
  geom_smooth(method = "lm")

fit <- lm(beakwidth ~ taillength, data = df)
summary(fit)

dfr <- data.frame(
  r = rstandard(fit)
)

ggplot(data = dfr) +
  geom_histogram(aes(r), bins = 11)

# Attach the standardized residuals to the data and flag |r| > 3 as "out".
# NOTE(review): this assumes lm() dropped no rows, so that dfr$r has one
# value per row of df -- confirm the data has no missing values in
# beakwidth / taillength.
df <- df |>
  mutate(
    r = dfr$r,
    outlier = ifelse(abs(r) > 3, "out", "normal")
  )

# NOTE(review): color = outlier is set in ggplot(), so geom_smooth() inherits
# it and fits a separate line per outlier group -- confirm that is intended
# rather than a single overall regression line.
ggplot(data = df, aes(taillength, beakwidth, color = outlier)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# State data ----

data(state)
state <- as.data.frame(state.x77)

fit <- lm(Income ~ Illiteracy, data = state)
summary(fit)

dfr <- data.frame(
  r = rstandard(fit)
)

ggplot(data = dfr) +
  geom_histogram(aes(r), bins = 11)

# Keep the standardized residuals and fitted values next to the raw data.
state <- state |>
  mutate(
    r = dfr$r,
    yhat = predict(fit)
  )

ggplot(data = state, aes(Illiteracy, Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Standardized residuals against fitted values.
ggplot(data = state) +
  geom_point(aes(yhat, r))