\[Y = f(X) + \epsilon\]
\[Y = \beta_0 + \beta_{1}X_1\]
\[y = \beta_0 + \beta_1 * x\]
lm()
sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)
## (Intercept) x
## 4.220822 2.051533
str(lm())
str(sim1_mod)
## List of 12
## $ coefficients : Named num [1:2] 4.22 2.05
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "x"
## $ residuals : Named num [1:30] -2.072 1.238 -4.147 0.665 1.919 ...
## ..- attr(*, "names")= chr [1:30] "1" "2" "3" "4" ...
## $ effects : Named num [1:30] -84.92 32.275 -4.13 0.761 2.015 ...
## ..- attr(*, "names")= chr [1:30] "(Intercept)" "x" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:30] 6.27 6.27 6.27 8.32 8.32 ...
## ..- attr(*, "names")= chr [1:30] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:30, 1:2] -5.477 0.183 0.183 0.183 0.183 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:30] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "x"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.18 1.24
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 28
## $ xlevels : Named list()
## $ call : language lm(formula = y ~ x, data = sim1)
## $ terms :Classes 'terms', 'formula' language y ~ x
## .. ..- attr(*, "variables")= language list(y, x)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "y" "x"
## .. .. .. ..$ : chr "x"
## .. ..- attr(*, "term.labels")= chr "x"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(y, x)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
## $ model :'data.frame': 30 obs. of 2 variables:
## ..$ y: num [1:30] 4.2 7.51 2.13 8.99 10.24 ...
## ..$ x: int [1:30] 1 1 1 2 2 2 3 3 3 4 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language y ~ x
## .. .. ..- attr(*, "variables")= language list(y, x)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "y" "x"
## .. .. .. .. ..$ : chr "x"
## .. .. ..- attr(*, "term.labels")= chr "x"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(y, x)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
## - attr(*, "class")= chr "lm"
gapminder
model## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414
## # ... with 1,694 more rows
gapminder %>%
ggplot(aes(year, lifeExp, group = country)) +
geom_line(alpha = 1/3)
gapminder
modelgapminder_mod <- lm(lifeExp ~ year, data = gapminder)
summary(gapminder_mod)
##
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.949 -9.651 1.697 10.335 22.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -585.65219 32.31396 -18.12 <2e-16 ***
## year 0.32590 0.01632 19.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared: 0.1898, Adjusted R-squared: 0.1893
## F-statistic: 398.6 on 1 and 1702 DF, p-value: < 2.2e-16
grid <- gapminder %>%
data_grid(year, country)
grid
## # A tibble: 1,704 x 2
## year country
## <int> <fctr>
## 1 1952 Afghanistan
## 2 1952 Albania
## 3 1952 Algeria
## 4 1952 Angola
## 5 1952 Argentina
## 6 1952 Australia
## 7 1952 Austria
## 8 1952 Bahrain
## 9 1952 Bangladesh
## 10 1952 Belgium
## # ... with 1,694 more rows
grid <- grid %>%
add_predictions(gapminder_mod)
grid
## # A tibble: 1,704 x 3
## year country pred
## <int> <fctr> <dbl>
## 1 1952 Afghanistan 50.51208
## 2 1952 Albania 50.51208
## 3 1952 Algeria 50.51208
## 4 1952 Angola 50.51208
## 5 1952 Argentina 50.51208
## 6 1952 Australia 50.51208
## 7 1952 Austria 50.51208
## 8 1952 Bahrain 50.51208
## 9 1952 Bangladesh 50.51208
## 10 1952 Belgium 50.51208
## # ... with 1,694 more rows
ggplot(gapminder, aes(year, group = country)) +
geom_line(aes(y = lifeExp), alpha = .2) +
geom_line(aes(y = pred), data = grid, color = "red", size = 1)
tidy()
tidy(gapminder_mod)
## term estimate std.error statistic p.value
## 1 (Intercept) -585.6521874 32.31396452 -18.12381 2.897807e-67
## 2 year 0.3259038 0.01632369 19.96509 7.546795e-80
tidy(gapminder_mod) %>%
str()
## 'data.frame': 2 obs. of 5 variables:
## $ term : chr "(Intercept)" "year"
## $ estimate : num -585.652 0.326
## $ std.error: num 32.314 0.0163
## $ statistic: num -18.1 20
## $ p.value : num 2.90e-67 7.55e-80
augment()
augment(gapminder_mod) %>%
as_tibble()
## # A tibble: 1,704 x 9
## lifeExp year .fitted .se.fit .resid .hat .sigma
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28.801 1952 50.51208 0.5299963 -21.71108 0.0020765619 11.62203
## 2 30.332 1957 52.14160 0.4629044 -21.80960 0.0015840967 11.62193
## 3 31.997 1962 53.77112 0.4012330 -21.77412 0.0011901244 11.62197
## 4 34.020 1967 55.40064 0.3478771 -21.38064 0.0008946453 11.62241
## 5 36.088 1972 57.03016 0.3072006 -20.94216 0.0006976591 11.62288
## 6 38.438 1977 58.65968 0.2846912 -20.22168 0.0005991661 11.62363
## 7 39.854 1982 60.28920 0.2846912 -20.43520 0.0005991661 11.62341
## 8 40.822 1987 61.91872 0.3072006 -21.09672 0.0006976591 11.62271
## 9 41.674 1992 63.54824 0.3478771 -21.87424 0.0008946453 11.62187
## 10 41.763 1997 65.17776 0.4012330 -23.41476 0.0011901244 11.62010
## # ... with 1,694 more rows, and 2 more variables: .cooksd <dbl>,
## # .std.resid <dbl>
glance()
glance(gapminder_mod)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.1897571 0.1892811 11.63055 398.6047 7.546795e-80 2 -6597.866
## AIC BIC deviance df.residual
## 1 13201.73 13218.05 230229.2 1702
map()
and nested data framesby_country <- gapminder %>%
group_by(country, continent) %>%
nest()
by_country
## # A tibble: 142 x 3
## country continent data
## <fctr> <fctr> <list>
## 1 Afghanistan Asia <tibble [12 x 4]>
## 2 Albania Europe <tibble [12 x 4]>
## 3 Algeria Africa <tibble [12 x 4]>
## 4 Angola Africa <tibble [12 x 4]>
## 5 Argentina Americas <tibble [12 x 4]>
## 6 Australia Oceania <tibble [12 x 4]>
## 7 Austria Europe <tibble [12 x 4]>
## 8 Bahrain Asia <tibble [12 x 4]>
## 9 Bangladesh Asia <tibble [12 x 4]>
## 10 Belgium Europe <tibble [12 x 4]>
## # ... with 132 more rows
map()
and nested data framesby_country$data[[1]]
## # A tibble: 12 x 4
## year lifeExp pop gdpPercap
## <int> <dbl> <int> <dbl>
## 1 1952 28.801 8425333 779.4453
## 2 1957 30.332 9240934 820.8530
## 3 1962 31.997 10267083 853.1007
## 4 1967 34.020 11537966 836.1971
## 5 1972 36.088 13079460 739.9811
## 6 1977 38.438 14880372 786.1134
## 7 1982 39.854 12881816 978.0114
## 8 1987 40.822 13867957 852.3959
## 9 1992 41.674 16317921 649.3414
## 10 1997 41.763 22227415 635.3414
## 11 2002 42.129 25268405 726.7341
## 12 2007 43.828 31889923 974.5803
map()
and nested data framescountry_model <- function(df) {
lm(lifeExp ~ year, data = df)
}
by_country <- by_country %>%
mutate(model = map(data, country_model))
by_country
## # A tibble: 142 x 4
## country continent data model
## <fctr> <fctr> <list> <list>
## 1 Afghanistan Asia <tibble [12 x 4]> <S3: lm>
## 2 Albania Europe <tibble [12 x 4]> <S3: lm>
## 3 Algeria Africa <tibble [12 x 4]> <S3: lm>
## 4 Angola Africa <tibble [12 x 4]> <S3: lm>
## 5 Argentina Americas <tibble [12 x 4]> <S3: lm>
## 6 Australia Oceania <tibble [12 x 4]> <S3: lm>
## 7 Austria Europe <tibble [12 x 4]> <S3: lm>
## 8 Bahrain Asia <tibble [12 x 4]> <S3: lm>
## 9 Bangladesh Asia <tibble [12 x 4]> <S3: lm>
## 10 Belgium Europe <tibble [12 x 4]> <S3: lm>
## # ... with 132 more rows
by_country %>%
filter(continent == "Europe")
## # A tibble: 30 x 4
## country continent data model
## <fctr> <fctr> <list> <list>
## 1 Albania Europe <tibble [12 x 4]> <S3: lm>
## 2 Austria Europe <tibble [12 x 4]> <S3: lm>
## 3 Belgium Europe <tibble [12 x 4]> <S3: lm>
## 4 Bosnia and Herzegovina Europe <tibble [12 x 4]> <S3: lm>
## 5 Bulgaria Europe <tibble [12 x 4]> <S3: lm>
## 6 Croatia Europe <tibble [12 x 4]> <S3: lm>
## 7 Czech Republic Europe <tibble [12 x 4]> <S3: lm>
## 8 Denmark Europe <tibble [12 x 4]> <S3: lm>
## 9 Finland Europe <tibble [12 x 4]> <S3: lm>
## 10 France Europe <tibble [12 x 4]> <S3: lm>
## # ... with 20 more rows
by_country <- by_country %>%
mutate(resids = map2(data, model, add_residuals))
by_country
## # A tibble: 142 x 5
## country continent data model resids
## <fctr> <fctr> <list> <list> <list>
## 1 Afghanistan Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 2 Albania Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 3 Algeria Africa <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 4 Angola Africa <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 5 Argentina Americas <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 6 Australia Oceania <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 7 Austria Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 8 Bahrain Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 9 Bangladesh Asia <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## 10 Belgium Europe <tibble [12 x 4]> <S3: lm> <tibble [12 x 5]>
## # ... with 132 more rows
resids <- unnest(by_country, resids)
resids
## # A tibble: 1,704 x 7
## country continent year lifeExp pop gdpPercap resid
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453 -1.10629487
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530 -0.95193823
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007 -0.66358159
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971 -0.01722494
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811 0.67413170
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134 1.64748834
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114 1.68684499
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959 1.27820163
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414 0.75355828
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414 -0.53408508
## # ... with 1,694 more rows
resids %>%
ggplot(aes(year, resid)) +
geom_line(aes(group = country), alpha = 1 / 3) +
geom_smooth(se = FALSE)
scorecard
library(titanic)
titanic <- titanic_train %>%
as_tibble()
str(titanic)
## Classes 'tbl_df', 'tbl' and 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
\[P(\text{survival} = \text{Yes} | \text{age})\]
survive_age <- glm(Survived ~ Age, data = titanic, family = binomial)
summary(survive_age)
##
## Call:
## glm(formula = Survived ~ Age, family = binomial, data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1488 -1.0361 -0.9544 1.3159 1.5908
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05672 0.17358 -0.327 0.7438
## Age -0.01096 0.00533 -2.057 0.0397 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 960.23 on 712 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 964.23
##
## Number of Fisher Scoring iterations: 4
##
## Call:
## glm(formula = Survived ~ Age + Sex, family = binomial, data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7405 -0.6885 -0.6558 0.7533 1.8989
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.277273 0.230169 5.549 2.87e-08 ***
## Age -0.005426 0.006310 -0.860 0.39
## Sexmale -2.465920 0.185384 -13.302 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 749.96 on 711 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 755.96
##
## Number of Fisher Scoring iterations: 4
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender}\]
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender} + \beta_{3}(\text{age} \times \text{gender})\]
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender}\]
\[f = \beta_{0} + \beta_{1}\text{age} + \beta_{2}\text{gender} + \beta_{3}(\text{age} \times \text{gender})\]
age_accuracy <- titanic %>%
add_predictions(survive_age) %>%
mutate(pred = logit2prob(pred),
pred = as.numeric(pred > .5))
mean(age_accuracy$Survived != age_accuracy$pred, na.rm = TRUE)
## [1] 0.4061625
x_accuracy <- titanic %>%
add_predictions(survive_age_woman_x) %>%
mutate(pred = logit2prob(pred),
pred = as.numeric(pred > .5))
mean(x_accuracy$Survived != x_accuracy$pred, na.rm = TRUE)
## [1] 0.219888