# Faking it as a Data Scientist, Hands-on # ============================================================================== # Loading libraries, etc. # ------------------------------------------------------------------------------ library(datasets) library(ggplot2) library(gridExtra) theme_set(theme_bw(12)) # Clustering - Textbook example # ============================================================================== # Exploring data # ------------------------------------------------------------------------------ head(iris) # ?iris ggplot( iris, aes( x = Petal.Length, y =Petal.Width ) ) + geom_point() ggplot( iris, aes( x = Petal.Length, y = Petal.Width, color = Species ) ) + geom_point() + scale_colour_manual( values = c('red','black','green') ) # Running k-means clustering # Feel free to play with the arguments # ------------------------------------------------------------------------------ # controlling any randomisation set.seed(1) iris$Cluster = NULL irisCluster <- kmeans( x = iris[, 3:4], # assume we already these are the important variables centers = 3 # assume we already know how many clusters to split it in ) # K-means clustering results exploration # ------------------------------------------------------------------------------ iris$Cluster = as.character(irisCluster$cluster) ggplot( iris, aes( x = Petal.Length, y = Petal.Width, color = Species ) ) + geom_point() + scale_colour_manual( values = c('red','black','green') ) + facet_grid(Cluster~Species) table(irisCluster$cluster, iris$Species) # Cleaning # ------------------------------------------------------------------------------ rm(list = ls()) # Clustering - More realistic data set # ============================================================================== # Exploring data # ------------------------------------------------------------------------------ head(attitude) # ?attitude plot(attitude) lPlots = lapply( colnames(attitude), function(cColumnName) { ggplot(attitude) + geom_density(aes_string(x = cColumnName)) } ) do.call('grid.arrange', lPlots) # Figure out the best number of clusters # --------------------------------------------------------------------------- dfnbrclusters = data.frame( wss = ( nrow( attitude ) - 1 ) * sum( apply ( attitude, 2, var ) ), NbrClusters = 1 ) for (i in 2:15) { dfnbrclusters = rbind( dfnbrclusters, data.frame( NbrClusters = i, wss = sum( kmeans( attitude, centers = i )$withinss ) ) ) } # Find an elbow. # Incremental gain after elbow is little. ggplot( dfnbrclusters, aes(x = NbrClusters, y = wss) ) + geom_line() + geom_point() # Running k-means clustering # Feel free to play with the arguments # ------------------------------------------------------------------------------ # controlling any randomisation set.seed(1) # clustering attitude$Cluster = NULL # Centers = 6 from the elbow attitudeCluster <- kmeans( x = attitude, centers = 6 ) # K-means clustering results exploration # ------------------------------------------------------------------------------ attitude$Cluster = as.character(attitudeCluster$cluster) lClusterPlots = lapply( setdiff(colnames(attitude), 'Cluster'), function(cColumnName) { ggplot(attitude) + geom_boxplot( aes_string( x = 'Cluster', y = cColumnName, group = 'Cluster' ) ) } ) do.call( 'grid.arrange', lClusterPlots ) # Running hierarchical clustering # ------------------------------------------------------------------------------ attitude$Cluster = NULL attitudeCluster = hclust(dist(attitude)) # Hierarchical clustering results exploration # ------------------------------------------------------------------------------ attitude$Cluster = as.character(cutree(attitudeCluster, k = 6)) lClusterPlots = lapply( setdiff(colnames(attitude), 'Cluster'), function(cColumnName) { ggplot(attitude) + geom_boxplot( aes_string( x = 'Cluster', y = cColumnName, group = 'Cluster' ) ) } ) do.call( 'grid.arrange', lClusterPlots ) # Cleaning # ------------------------------------------------------------------------------ rm(list = ls()) # Other types # ------------------------------------------------------------------------------ # Fuzzy # Density # Overlapping # Classification - example # ============================================================================== # Exploring data # ------------------------------------------------------------------------------ head(infert) # ?infert plot(infert) # Running logistic regression # Feel free to play with the arguments # ------------------------------------------------------------------------------ # cautionary removal of prediction column in case you rerun it # and mistakenly to get the prediction being used to predict infert$prediction = NULL infert$predictionprobability = NULL infertClassification <- glm( case ~ ., data = infert, family = binomial() ) summary(infertClassification) # note that only induced and spontaneous are significant infertClassification <- glm( case ~ induced+spontaneous, data = infert, family = binomial() ) summary(infertClassification) # Logistic regression results exploration # ------------------------------------------------------------------------------ infert$predictionprobability = predict( infertClassification, newdata = infert, type = 'response' ) infert$prediction = infert$predictionprobability > 0.5 ggplot(infert) + geom_point( aes( x = predictionprobability, y = factor(case) ), position = position_jitter(height = 0.01) ) table( infert$prediction, infert$case ) # oveall accuracy isn't great # but what if the problem statement was fraud detection? 149/204 fraudulent # transactions could be detected at the inconvenience of 16/44 genuine customers # Looking for the famous S shape # ------------------------------------------------------------------------------ # Making some fake data # ------------------------------------------------------------------------------ # Spontaneous will be broken into more than just 0, 1, 2 so that we have # better resolution on the curve even thought you can't have 0.5 spontaneous # abortions. dtFakeDataset = data.frame( expand.grid( 0:2, (0:60)/30 ) ) colnames(dtFakeDataset) = c('induced','spontaneous') head(dtFakeDataset) # predicting probabilities for the fake data # ------------------------------------------------------------------------------ dtFakeDataset$predictionprobability = predict( infertClassification, newdata = dtFakeDataset, type = 'response' ) # s shape plot # ------------------------------------------------------------------------------ ggplot( dtFakeDataset[dtFakeDataset$induced == 0.00,] ) + geom_line( aes( x = spontaneous, y = predictionprobability ) ) # fail # making faker data to get the s shape # ------------------------------------------------------------------------------ dtFakeDataset = data.frame( expand.grid( 0:2, (-600:600)/30 ) ) colnames(dtFakeDataset) = c('induced','spontaneous') head(dtFakeDataset) # predicting probabilities for the faker data # ------------------------------------------------------------------------------ dtFakeDataset$predictionprobability = predict( infertClassification, newdata = infert, type = 'response' ) # s shape plot # ------------------------------------------------------------------------------ ggplot( dtFakeDataset ) + geom_line( aes( x = spontaneous, y = predictionprobability ) ) # Cleaning # ------------------------------------------------------------------------------ rm(list = ls()) # Regression - example # Disclaimer: Not a meteorologist. The formulation may not makes sense. # ============================================================================== # Exploring data # ------------------------------------------------------------------------------ head(airquality) # ?airquality plot( airquality[, c('Ozone','Solar.R','Wind','Temp') ] ) # Running linear regression # Feel free to play with the arguments # ------------------------------------------------------------------------------ lmAirquality = lm( Ozone ~ Solar.R + Wind + Temp, data = airquality ) summary(lmAirquality) # what are the significant variables? # Linear regression results exploration # ------------------------------------------------------------------------------ airquality$PredictionOzone = predict( lmAirquality, newdata = airquality ) airquality$ErrorOzone = airquality$PredictionOzone - airquality$Ozone plotPredictions1 = ggplot() + geom_point( data = airquality, aes( x = Ozone, y = PredictionOzone ) ) + geom_abline( slope = 1, intercept = 0 ) + geom_vline(xintercept = 0) + geom_hline(yintercept = 0) + coord_fixed() plotPredictions1 plotWindError = ggplot() + geom_point( data = airquality, aes( x = Wind, y = ErrorOzone ) ) + geom_hline(yintercept = 0) plotTempError = ggplot() + geom_point( data = airquality, aes( x = Temp, y = ErrorOzone ) ) + geom_hline(yintercept = 0) plotSolar.RError = ggplot() + geom_point( data = airquality, aes( x = Solar.R, y = ErrorOzone ) ) + geom_hline(yintercept = 0) plotOzoneError = ggplot() + geom_point( data = airquality, aes( x = Ozone, y = ErrorOzone ) ) + geom_hline(yintercept = 0) grid.arrange( plotWindError, plotTempError, plotSolar.RError, plotOzoneError, ncol = 2 ) # errors from a good linear regression should be uniformly spread # around 0 across all values of the variable. There should be no # pattern against any of the variables. # Whatever pattern is observed, try to include it in the regression # Therefore, running linear regression - second pass # Feel free to play with the arguments # ------------------------------------------------------------------------------ lmAirquality = lm( Ozone ~ Solar.R + Wind + Temp + I( Temp * Temp ) + I( Wind * Wind ), data = airquality ) summary(lmAirquality) # Linear regression second pass results exploration # ------------------------------------------------------------------------------ airquality$PredictionOzone = predict( lmAirquality, newdata = airquality ) airquality$ErrorOzone = airquality$PredictionOzone - airquality$Ozone plotPredictions1 + geom_point( data = airquality, aes( x = Ozone, y = PredictionOzone, color = '2' ) ) grid.arrange( plotWindError + geom_point( data = airquality, aes( x = Wind, y = ErrorOzone, color = '2' ) ), plotTempError + geom_point( data = airquality, aes( x = Temp, y = ErrorOzone, color = '2' ) ), plotSolar.RError + geom_point( data = airquality, aes( x = Solar.R, y = ErrorOzone, color = '2' ) ), plotOzoneError + geom_point( data = airquality, aes( x = Ozone, y = ErrorOzone, color = '2' ) ), ncol = 2 )