---
title: "Topic 1. Introduction to Analytics"
author: "EW (Jed) Frees"
date: "16 November 2024"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Example 2.3.1. Under- and Over-Fitting. From Loss Data Analytics

```{r eval = FALSE}
# Simulate a response that depends only on a two-level factor, then compare
# three linear models by k-fold cross-validated RMSE and by AIC:
#   model0: intercept only, model1: two-level factor, model2: six-level factor.
# NOTE(review): TableGen1() and ColWidth5 are not defined in this part of the
# document; presumably they come from a helper script -- confirm before
# switching this chunk to eval = TRUE.

# Generate the Data
rmse <- Metrics::rmse
n <- 100
set.seed(1234)
u <- sample(6, n, replace = TRUE)
# x1 is "1" when u is 4 or 5 and "0" otherwise; x2 keeps all six levels of u
x1 <- as.factor((u == 4) + (u == 5))
x2 <- as.factor(u)
# Only x1 enters the mean of y; the remainder is standard normal noise
y <- 1 * (x1 == 1) + rnorm(n, sd = 1)
xyData <- data.frame(x1, x2, y)
n <- nrow(xyData)

set.seed(123)
# Number of folds
k <- 5
# Randomly permute the row indices and deal them out into k folds
splt <- split(sample(n), seq_len(k))

# One row per fold, one column per model
Rmse.mat <- matrix(0, nrow = k, ncol = 3)
AIC.mat <- Rmse.mat

for (i in seq_len(k)) {
  test.id <- splt[[i]]
  test <- xyData[test.id, ]
  train <- xyData[-test.id, ]

  model0 <- lm(y ~ 1, data = train)
  model1 <- lm(y ~ x1, data = train)
  model2 <- lm(y ~ x2, data = train)

  # Out-of-sample error on the held-out fold
  Rmse.mat[i, 1] <- rmse(test$y, predict(model0, test))
  Rmse.mat[i, 2] <- rmse(test$y, predict(model1, test))
  Rmse.mat[i, 3] <- rmse(test$y, predict(model2, test))

  # AIC of each model as fitted on the training folds
  AIC.mat[i, 1] <- AIC(model0)
  AIC.mat[i, 2] <- AIC(model1)
  AIC.mat[i, 3] <- AIC(model2)
}

# Per-fold RMSE rows, followed by the average RMSE and the average AIC
OutMat <- rbind(
  round(Rmse.mat, digits = 3),
  round(colMeans(Rmse.mat), digits = 3),
  round(colMeans(AIC.mat), digits = 3)
)
colnames(OutMat) <- c("Community Rating", "Two Levels", "Six Levels")
# Fold labels follow k so the row names always match the number of rows
row.names(OutMat) <- c(
  paste("Rmse - Fold", seq_len(k)),
  "Rmse - Average",
  "AIC - Average"
)

TableGen1(
  TableData = OutMat,
  TextTitle = "Under- and Over-Fitting of Models",
  Align = "c",
  Digits = 3,
  ColumnSpec = 1:3,
  ColWidth = ColWidth5
)
```

# From Chapter 3 of Regression Modeling with Actuarial and Financial Applications

```{r eval = FALSE}
# Term <- read.csv("CSVData/TermLife.csv", header = TRUE)
Term <- read.csv(file = "https://instruction.bus.wisc.edu/jfrees/jfreesbooks/Regression%20Modeling/BookWebDec2010/CSVData/TermLife.csv", header = TRUE)

# Select the subset of the data corresponding to term purchase
Term2 <- subset(Term, FACE > 0)

# Table 3.1 Summary statistics

#' Column-wise summary statistics
#'
#' Computes the mean, median, standard deviation, minimum and maximum of
#' every column, ignoring missing values.
#'
#' @param Xymat A data frame (or list) of numeric columns.
#' @return A matrix with one row per column of `Xymat` and the columns
#'   `meanSummary`, `medSummary`, `sdSummary`, `minSummary`, `maxSummary`.
BookSummStats <- function(Xymat) {
  # vapply() guarantees one numeric value per column, whatever the input
  summarise_cols <- function(stat) {
    vapply(Xymat, stat, numeric(1), na.rm = TRUE)
  }
  meanSummary <- summarise_cols(mean)
  sdSummary <- summarise_cols(sd)
  minSummary <- summarise_cols(min)
  maxSummary <- summarise_cols(max)
  medSummary <- summarise_cols(median)
  cbind(meanSummary, medSummary, sdSummary, minSummary, maxSummary)
}

LNFACE <- log(Term2$FACE)
LNINCOME <- log(Term2$INCOME)
Xymat <- data.frame(
  cbind(
    Term2$FACE, Term2$INCOME, Term2$EDUCATION,
    Term2$NUMHH, LNFACE, LNINCOME
  )
)
tableMat <- BookSummStats(Xymat)
colnames(tableMat) <- c(
  "Media", "Mediana", "Desviación Estándar", "Mínimo", "Máximo"
)
rownames(tableMat) <- c(
  "FACE", "INCOME", "EDUCATION", "NUMHH", "LNFACE", "LNINCOME"
)

tableMat1 <- tableMat
tableMat1[3:6, ] <- round(tableMat1[3:6, ], digits = 3)
# Assigning the formatted strings turns the whole matrix into character, so
# this must come after the numeric rounding of rows 3 to 6
tableMat1[1:2, ] <- format(round(tableMat[1:2, ], digits = 0), big.mark = ",")

# NOTE(review): TableGen1() and ColWidth5 are not defined in this part of the
# document -- confirm where they come from
TableGen1(
  TableData = tableMat1,
  TextTitle = "Estadísticas Descriptivas del Seguro de Vida Temporal",
  Align = "r",
  Digits = 3,
  ColumnSpec = 1:5,
  ColWidth = ColWidth5
)
```

```{r eval = FALSE}
# This chunk uses `Term` without reading it. To work from a local copy,
# uncomment the read.csv below and point it to your local data folder.
# Term <- read.csv("CSVData/TermLife.csv", header = TRUE)

# Select the subset of the data corresponding to term purchase
Term2 <- subset(Term, FACE > 0)
Term2$LNFACE <- log(Term2$FACE)
Term2$LNINCOME <- log(Term2$INCOME)

# Figure 3.1
# Two panels side by side; keep the previous settings so they can be restored
old_par <- par(mfrow = c(1, 2), cex = 1.1, mar = c(4.1, 4, 1.5, 1))

# Left panel: raw scale, with both axes drawn by hand and labelled in millions
plot(
  Term2$INCOME, Term2$FACE,
  ylab = "", las = 1, yaxt = "n", xaxt = "n",
  xlab = "INCOME (en Millones)"
)
mtext("FACE (en Millones)", side = 2, at = 15200000, las = 1, cex = 1.1, adj = .4)
axis(
  2,
  at = seq(0, 14000000, 2000000),
  labels = c("0", "2", "4", "6", "8", "10", "12", "14"),
  las = 1
)
axis(
  1,
  at = seq(0, 11000000, 1000000),
  labels = c("0", "1", "2", "3 ", "4", "5", "6", "7", "8", "9", "10", "11")
)

# Right panel: the same relationship on the log-log scale
plot(Term2$LNINCOME, Term2$LNFACE, ylab = "", xlab = "LNINCOME", las = 1)
mtext("LNFACE", side = 2, at = 16.8, las = 1, cex = 1.1, adj = 1.1)

par(old_par)
```

# From Chapter 3 Multiple Linear Regression of the Online Tutorial

```{r eval = FALSE}
library(psych)

# Note the different case of the variables, e.g; income vs INCOME, ...
# Term <- read.csv("CSVData\\term_life.csv", header = TRUE)
Term <- read.csv("https://assets.datacamp.com/production/repositories/2610/datasets/efc64bc2d78cf6b48ad2c3f5e31800cb773de261/term_life.csv", header = TRUE)

# Pick the subset of the data corresponding to term purchase
Term1 <- subset(Term, subset = face > 0)
str(Term1)
head(Term1)

# Keep only the variables to be summarised
Term2 <- Term1[, c("education", "face", "income", "logface", "logincome", "numhh")]
# options(scipen = 100, digits = 4)
head(Term2)
describe(Term2) # [, c(3, 4, 8, 5, 9, 2)]
```