### ----------------------------- ### Hui Lin ### @gossip_rabbit ### ### http://scientistcafe.com ### ----------------------------- ## Data Wrangling ## ----------------------------- # Should have loaded it # library(readr) ## Read and write data: readr package read_csv("2015,2016,2017 1,2,3 4,5,6") sim.dat <- read_csv("https://raw.githubusercontent.com/happyrabbit/DataScientistR/master/Data/SegData.csv ") head(sim.dat) dat=read_csv("2015,2016,2017 100,200,300 canola,soybean,corn") print(dat) ## Baby example 2 dat=read_csv("# I will never let you know that # my favorite food is carrot Date,Food,Mood Monday,carrot,happy Tuesday,carrot,happy Wednesday,carrot,happy Thursday,carrot,happy Friday,carrot,happy Saturday,carrot,extremely happy Sunday,carrot,extremely happy", skip = 2) print(dat) ## Baby example 3 dat=read_csv("Saturday,carrot,extremely happy Sunday,carrot,extremely happy", col_names=FALSE) print(dat) ## `read_csv2()` # Read semicolon separated files dat=read_csv2("Saturday; carrot; extremely happy \n Sunday; carrot; extremely happy", col_names=FALSE) print(dat) # Read tab delimited files dat=read_tsv("every\tman\tis\ta\tpoet\twhen\the\tis\tin\tlove\n", col_names = FALSE) print(dat) # Assign separating character dat=read_delim("THE|UNBEARABLE|RANDOMNESS|OF|LIFE\n", delim = "|", col_names = FALSE) print(dat) # Substitute for missing value dat=read_csv("Q1,Q2,Q3 5,4,99",na="99") print(dat) ## data.table # library(data.table) # covert the existing data frame to data table dt <- data.table(sim.dat) class(dt) # Calculate mean for counts of online transactions: dt[, mean(online_trans)] # Calculate mean by group: dt[ , mean(online_trans), by = gender] # group by more than one variables: dt[ , mean(online_trans), by = .(gender, house)] # Assign column names for aggregated variables: dt[ , .(avg = mean(online_trans)), by = .(gender, house)] # select row # select rows with age<20 and income > 80000 dt[age < 20 & income > 80000] # select the first two rows dt[1:2] # select column # select column “age” but return it as a vector # the argument for row is empty so the result will return all observations ans <- dt[, age] head(ans) # To return `data.table` object, put column names in `list()` # Select age and online_exp columns and return as a data.table instead ans <- dt[, list(age, online_exp)] head(ans) # you can also put column names in `.()` ans <- dt[, .(age, online_exp)] # head(ans) # To select all columns from “age” to “income” ans <- dt[, age:income, with = FALSE] head(ans,2) # Delete columns using `-` or `!` # delete columns from age to online_exp ans <- dt[, -(age:online_exp), with = FALSE] ans <- dt[, !(age:online_exp), with = FALSE] # tabulation # row count dt[, .N] # Count by groups: # counts by gender dt[, .N, by= gender] # for those younger than 30, count by gender dt[age < 30, .(count=.N), by= gender] # Order table: # get records with the highest 5 online expense: head(dt[order(-online_exp)],5) # Since data table keep some characters of data frame, they share some operations dt[order(-online_exp)][1:5] # order the table by more than one variable dt[order(gender, -online_exp)][1:5] # `apply()`, `lapply()` and `sapply()` in base R ## simulate a matrix x <- cbind(x1 =1:8, x2 = c(4:1, 2:5)) dimnames(x)[[1]] <- letters[1:8] apply(x, 2, mean) col.sums <- apply(x, 2, sum) row.sums <- apply(x, 1, sum) # apply other functions: ma <- matrix(c(1:4, 1, 6:8), nrow = 2) ma apply(ma, 1, table) #--> a list of length 2 apply(ma, 1, stats::quantile) # 5 x n matrix with rownames # Tricky example ## Example with different lengths for each call z <- array(1:24, dim = 2:4) zseq <- apply(z, 1:2, function(x) seq_len(max(x))) zseq ## a 2 x 3 matrix typeof(zseq) ## list dim(zseq) ## 2 3 zseq[1,] apply(z, 3, function(x) seq_len(max(x))) # Read data sim.dat<-read.csv("https://raw.githubusercontent.com/happyrabbit/DataScientistR/master/Data/SegData.csv") # Get numerical variables sdat<-sim.dat[,!lapply(sim.dat,class)=="factor"] ## Try the following code with apply() function ## apply(sim.dat,2,class) ## What is the problem? apply(sdat, MARGIN=2,function(x) mean(na.omit(x))) apply(sdat, MARGIN=2,function(x) sd(na.omit(x))) lapply(sdat, function(x) sd(na.omit(x))) sapply(sdat, function(x) sd(na.omit(x))) sapply(sdat, function(x) sd(na.omit(x)), simplify = FALSE) # dplyr - display tbl_df(sim.dat) glimpse(sim.dat) # dplyr - subset filter(sim.dat, income >300000) %>% tbl_df() # Pipe operator # piping with . "Hello World" %>% substring(7, 11) %>% grepl("Wo", .) # without pipe ave_exp <- filter( summarise( group_by( filter( sim.dat, !is.na(income) ), segment ), ave_online_exp = mean(online_exp), n = n() ), n > 200 ) # with pipe avg_exp <- sim.dat %>% filter(!is.na(income)) %>% group_by(segment) %>% summarise( ave_online_exp = mean(online_exp), n = n() ) %>% filter(n > 200) # Subset - select rows dplyr::distinct(sim.dat) # random sample rows dplyr::sample_frac(sim.dat, 0.5, replace = TRUE) dplyr::sample_n(sim.dat, 10, replace = TRUE) # select rows by position # It is equivalent to `sim.dat[10:15,]` dplyr::slice(sim.dat, 10:15) # select the order top n entries dplyr::top_n(sim.dat,2,income) # Subset - select columns # select by column name dplyr::select(sim.dat,income,age,store_exp) # select columns whose name contains a character string dplyr::select(sim.dat, contains("_")) # select columns whose name ends with a character string # similar there is "starts_with" dplyr::select(sim.dat, ends_with("e")) # select columns Q1,Q2,Q3,Q4 and Q5 select(sim.dat, num_range("Q", 1:5)) # select columns whose names are in a group of names dplyr::select(sim.dat, one_of(c("age", "income"))) # select columns between age and online_exp dplyr::select(sim.dat, age:online_exp) # select all columns except for age dplyr::select(sim.dat, -age) # Summarize dplyr::summarise(sim.dat, avg_online = mean(online_trans)) # apply function anyNA() to each column # you can also assign a function vector such as: c("anyNA","is.factor") dplyr::summarise_each(sim.dat, funs_(c("anyNA"))) sim.dat %>% group_by(segment) %>% summarise_each(funs_(c("anyNA"))) # Create new variable # `mutate()`: compute and append one or more new columns: dplyr::mutate(sim.dat, total_exp = store_exp + online_exp) # min_rank=rank(ties.method = "min") # mutate_each() means apply function to each column dplyr::mutate_each(sim.dat, funs(min_rank)) # `transmute()`: delete the original columns and only keep the new ones dplyr::transmute(sim.dat, total_exp = store_exp + online_exp) ## Merge (x<-data.frame(cbind(ID=c("A","B","C"),x1=c(1,2,3)))) (y<-data.frame(cbind(ID=c("B","C","D"),y1=c(T,T,F)))) # join to the left # keep all rows in x left_join(x,y,by="ID") # get rows matched in both data sets inner_join(x,y,by="ID") # get rows in either data set full_join(x,y,by="ID") # filter out rows in x that can be matched in y # it doesn't bring in any values from y semi_join(x,y,by="ID") # the opposite of semi_join() # it gets rows in x that cannot be matched in y # it doesn't bring in any values from y anti_join(x,y,by="ID") ## Tidy and Reshape Data # Take a baby subset of our exemplary clothes consumers data to illustrate: (sdat<-sim.dat[1:5,1:6]) # `reshape2` example (mdat <- melt(sdat, measure.vars=c("store_exp","online_exp"), variable.name = "Channel", value.name = "Expense")) # compare the online and in store expense between male and female based on the house ownership dcast(mdat, house + gender ~ Channel, sum) ## `tidyr` package # practice functions we learnt before sdat<-sim.dat[1:5,]%>% dplyr::select(age,gender,store_exp,store_trans) sdat %>% tbl_df() # `gather()`: Analogous to `melt()` in `reshape2` msdat<-tidyr::gather(sdat,"variable","value",store_exp,store_trans) msdat %>% tbl_df() # pipe version sdat%>%gather("variable","value",store_exp,store_trans) # melt version melt(sdat, measure.vars=c("store_exp","store_trans"), variable.name = "variable", value.name = "value") # spread msdat %>% spread(variable,value) ## `separate()` and `unite()` # You can use `sep=` # By default, it is "`_`" sepdat<- msdat %>% separate(variable,c("Source","Type")) sepdat %>% tbl_df() sepdat %>% unite("variable",Source,Type,sep="_") sepdat %>% unite("variable",Source,Type,sep="_")