### ----------------------------- ### Hui Lin ### @gossip_rabbit ### ### http://scientistcafe.com ### ----------------------------- ## Data Preprocessing ## ----------------------------- sim.dat <- read.csv("https://raw.githubusercontent.com/happyrabbit/DataScientistR/master/Data/SegData.csv ") summary(sim.dat) # set problematic values as missings sim.dat$age[which(sim.dat$age>100)]<-NA sim.dat$store_exp[which(sim.dat$store_exp<0)]<-NA # see the results summary(subset(sim.dat,select=c("age","income"))) ## Missing Values ## ----------------------------- # save the result as another object demo_imp<-impute(sim.dat,method="median/mode") # check the first 5 columns, there is no missing values in other columns summary(demo_imp[,1:5]) imp<-preProcess(sim.dat,method="medianImpute") demo_imp2<-predict(imp,sim.dat) summary(demo_imp2[,1:5]) ## Missing Values: K-nearest neighbors ## ----------------------------- imp<-preProcess(sim.dat,method="knnImpute",k=5) # need to use predict() to get KNN result demo_imp<-predict(imp,sim.dat) ## Solve the problem # find factor columns imp<-preProcess(sim.dat,method="knnImpute",k=5) idx<-which(lapply(sim.dat,class)=="factor") demo_imp<-predict(imp,sim.dat[,-idx]) summary(demo_imp[,1:3]) ## Missing Values: Bagging Tree ## ----------------------------- imp<-preProcess(sim.dat,method="bagImpute") demo_imp<-predict(imp,sim.dat) summary(demo_imp[,1:5]) ## Centering and Scaling ## ----------------------------- # DIY income<-sim.dat$income # calculate the mean of income mux<-mean(income,na.rm=T) # calculate the standard deviation of income sdx<-sd(income,na.rm=T) # centering tr1<-income-mux # scaling tr2<-tr1/sdx # preProcess() sdat<-subset(sim.dat,select=c("age","income")) # set the "method" option trans<-preProcess(sdat,method=c("center","scale")) # use predict() function to get the final result transformed<-predict(trans,sdat) ## Resolve Skewness ## ----------------------------- describe(sim.dat) # select the two columns and save them as dat_bc dat_bc<-subset(sim.dat,select=c("store_trans","online_trans")) (trans<-preProcess(dat_bc,method=c("BoxCox"))) # Use predict() to get the transformed result transformed<-predict(trans,dat_bc) # Check before and after par(mfrow=c(1,2),oma=c(2,2,2,2)) hist(dat_bc$store_trans,main="Before Transformation",xlab="store_trans") hist(transformed$store_trans,main="After Transformation",xlab="store_trans") ## Resolve Outliers ## ----------------------------- ## Z-score and modified Z-score # calculate median of the absolute dispersion for income ymad<-mad(na.omit(sdat$income)) # calculate z-score zs<-(sdat$income-mean(na.omit(sdat$income)))/ymad # count the number of outliers sum(na.omit(zs>3.5)) ## Spatial Sign Transformation # KNN imputation sdat<-sim.dat[,c("income","age")] imp<-preProcess(sdat,method=c("knnImpute"),k=5) sdat<-predict(imp,sdat) transformed <- spatialSign(sdat) transformed <- as.data.frame(transformed) par(mfrow=c(1,2),oma=c(2,2,2,2)) plot(income ~ age,data = sdat,col="blue",main="Before") plot(income ~ age,data = transformed,col="blue",main="After") ## Collinearity ## ----------------------------- # corrplot() # select non-survey numerical variables sdat<-subset(sim.dat,select=c("age","income","store_exp","online_exp","store_trans","online_trans" )) # use bagging imputation here imp<-preProcess(sdat,method="bagImpute") sdat<-predict(imp,sdat) # get the correlation matrix correlation<-cor(sdat) # plot par(oma=c(2,2,2,2)) corrplot.mixed(correlation,order="hclust",tl.pos="lt",upper="ellipse") # The `findCorrelation()` function in package `caret` will apply the above algorithm. (highCorr<-findCorrelation(cor(sdat),cutoff=.75)) # delete highly correlated columns sdat<-sdat[-highCorr] # check the new correlation matrix cor(sdat) ## Sparse Variables ## ----------------------------- # make a copy zero_demo<-sim.dat # add two sparse variable # zero1 only has one unique value # zero2 is a vector with the first element 1 and the rest are 0s zero_demo$zero1<-rep(1,nrow(zero_demo)) zero_demo$zero2<-c(1,rep(0,nrow(zero_demo)-1)) nearZeroVar(zero_demo,freqCut = 95/5, uniqueCut = 10) ## Re-encode Dummy Variables ## ----------------------------- # `class.ind()` from `nnet` package dumVar<-class.ind(sim.dat$gender) head(dumVar) # `dummyVars()` from `caret` dumMod<-dummyVars(~gender+house+income, data=sim.dat, # use "origional variable name + level" as new name levelsOnly=F) head(predict(dumMod,sim.dat)) # the function can create interaction term dumMod<-dummyVars(~gender+house+income+income:gender, data=sim.dat, levelsOnly=F) head(predict(dumMod,sim.dat))