## session initialization ##--------------------------------------------------------- library(datadr) library(parallel) library(trelliscope) # create a datadr backend with a 3 core cluster options(defaultLocalDiskControl = localDiskControl(makeCluster(3))) # set basepath to directory containing taxiOneWeek.csv basepath <- "__SET_PATH__" # initialize a vdb connection for trelliscope displays # the first time you do this it will ask you to confirm vdbConn(file.path(basepath, "vdb"), autoYes = TRUE) ## read in the data ##--------------------------------------------------------- # first read in a subset to get a feel for the data test <- read.csv(file.path(basepath, "taxiOneWeek.csv"), nrow = 10, stringsAsFactors = FALSE) str(test) # build a transformation function that will convert date columns # and get rid of coordinates outside of NYC trans <- function(x) { require(lubridate) x$pickup_datetime <- fast_strptime(x$pickup_datetime, format = "%Y-%m-%d %H:%M:%S", tz = "EST") x$dropoff_datetime <- fast_strptime(x$dropoff_datetime, format = "%Y-%m-%d %H:%M:%S", tz = "EST") # Set coordinates outside of NYC bounding box to NA to filter outliers. nw <- list(lat = 40.917577, lon = -74.259090) se <- list(lat = 40.477399, lon = -73.700272) ind <- which(x$dropoff_longitude < nw$lon | x$dropoff_longitude > se$lon) x$dropoff_longitude[ind] <- NA ind <- which(x$pickup_longitude < nw$lon | x$pickup_longitude > se$lon) x$pickup_longitude[ind] <- NA ind <- which(x$dropoff_latitude < se$lat | x$dropoff_latitude > nw$lat) x$dropoff_latitude[ind] <- NA ind <- which(x$pickup_latitude < se$lat | x$pickup_latitude > nw$lat) x$pickup_latitude[ind] <- NA x } # test this transformation on a subset and make sure it works str(trans(test)) # now read the whole file in as a ddf taxi <- drRead.csv(file.path(basepath, "taxiOneWeek.csv"), output = localDiskConn(file.path(basepath, "taxiddf")), postTransFn = trans, rowsPerBlock = 300000, stringsAsFactors = FALSE) # update the attributes taxi <- updateAttributes(taxi) # look at what has been created on disk list.files(file.path(basepath, "taxiddf")) # if you need to get taxi back in a future session # you don't need to read it in again - just reconnect taxi <- ddf(localDiskConn(file.path(basepath, "taxiddf"))) ## summaries ##--------------------------------------------------------- # We can get a feel for the data by looking at the summaries that are already available from calling `updateAttributes()` and also from computing additional summaries. # look at summary summary(taxi) # plot medallion frequency mft <- summary(taxi)$medallion$freqTable plot(sort(mft$Freq), ylab = "Medallion frequency", main = "Frequency of occurrance of medallions for top and bottom 5000") abline(v = 5000, lty = 2, col = "gray") ## Plot the hack license use frequency hft <- summary(taxi)$hack_license$freqTable plot(hft$Freq, ylab = "Hack license frequency", main ="Frequncy of use for hack licenses for top and bottom 5000") abline(v = 5000, lty = 2, col = "gray") ## Compute a list of the quantiles of some of the variables varName <- c("passenger_count", "trip_time_in_secs", "trip_distance", "total_amount", "tip_amount") quantList <- lapply(varName, function(var) drQuantile(taxi, var = var, tails = 1)) ## Create and test a function to create the quantile plot quant.plot <- function(quant, var.name, ylim){ p1 <- ggplot(quant, aes(fval, q)) + geom_point() + ggtitle(paste("Quantile plot of", var.name)) + ylab(var.name) + xlab("Proportion") p2 <- ggplot(subset(quant, q < ylim), aes(fval, q)) + geom_point() + ggtitle(paste("Quantile plot of", var.name)) + ylab(var.name) + xlab("Proportion") grid.arrange(p1, p2, ncol = 2) } require(ggplot2) require(gridExtra) quant.plot(quantList[[1]], "passenger_count", 10) # plot the quantile information, with limits on the y axis. ylims <- c(10, 3000, 25, 100, 40) Map(quant.plot, quantList, varName, ylims) # drill down on tips by payment type tip_quant <- drQuantile(taxi, var = "tip_amount", by = "payment_type", tails = 1) ggplot(subset(tip_quant, q < 20), aes(fval, q, group = payment_type)) + geom_point(aes(colour = factor(payment_type))) + ggtitle(paste("Quantile plot of tip amount")) + ylab("Tip amount") + xlab("Proportion") # create hexbin plot of taxi data. library(hexbin) library(trelliscope) pickup_latlon <- drHexbin(taxi, "pickup_longitude", "pickup_latitude", xbins = 300, shape = 1.4) plot(pickup_latlon, trans = log, inv = exp, style = "centroids", xlab = "longitude", ylab = "latitude", legend = FALSE) # look at the relationship between fare and distance traveled trns <- function(x) log2(x + 1) amount_vs_dist <- drHexbin(taxi, "trip_distance", "total_amount", xbins = 150, shape = 1, xTransFn = trns, yTransFn = trns) plot(amount_vs_dist, trans = log, inv = exp, style = "colorscale", colramp = LinOCS, xlab = "Distance (log2 miles)", ylab = "Total Amount Paid (log2 dollars)") # plot with another color scale plot(pickup_latlon, trans = log, inv = exp, style = "colorscale", colramp = LinOCS, xlab = "longitude", ylab = "latitude") # zooming in on Manhattan xRange <- c(-74.03, -73.92) yRange <- c(40.7, 40.82) tmp <- addTransform(taxi, function(x) { subset(x, pickup_longitude < -73.92 & pickup_longitude > -74.03 & pickup_latitude > 40.7 & pickup_latitude < 40.82) }) pickup_latlon_manhat <- drHexbin(tmp, "pickup_longitude", "pickup_latitude", xbins = 300, shape = 1.4, xRange = xRange, yRange = yRange) # plot with different color scales plot(pickup_latlon_manhat, trans = log, inv = exp, style = "centroids", xlab = "longitude", ylab = "latitude", legend = FALSE) plot(pickup_latlon_manhat, trans = log, inv = exp, style = "colorscale", xlab = "longitude", ylab = "latitude", colramp = LinOCS) ## division by distance ##--------------------------------------------------------- # break the data up by an approximation of the distance travelled # and investigate some properties of the data at different distances. # use geosphere package to compute 'as-the-crow-flies' distance library(geosphere) # define breakpoints for categorizating distance dist_breaks <- c(seq(0, 1, by = 0.1), seq(1.25, 5, by = 0.25), 6:10, 12, 15, 20, 25) # add transformation that computes distance taxi_dist <- taxi %>% addTransform(function(x) { # get log distance in miles between pickup and dropoff x$distance <- distHaversine( x[,c("pickup_longitude", "pickup_latitude")], x[,c("dropoff_longitude", "dropoff_latitude")], r = 3963.1676) x <- subset(x, !is.na(x$distance)) # break distance into categories x$distance_cat <- cut(x$distance, breaks = dist_breaks, include.lowest = TRUE) x$distance_index <- cut(x$distance, breaks = dist_breaks, include.lowest = TRUE, labels = FALSE) x }) names(taxi_dist) # look at the distribution of our new categorical variable distAgg <- drAggregate(~ distance_cat, data = taxi_dist) distAgg # divide the ddf by the distance category and payment type taxi_by_dist <- divide(taxi_dist, by = c("distance_index", "payment_type"), output = localDiskConn(file.path(basepath, "taxi_by_dist")), update = TRUE, overwrite = TRUE) ## investigating Properties by Distance ##--------------------------------------------------------- # look for differences in tip percentages by distance category # first add a transformation that computes this meanTipPrct <- function(x) mean((x$tip_amount * 100) / x$fare_amount, na.rm = TRUE) tipPrctByDist <- addTransform(taxi_by_dist, meanTipPrct) # look at a subset tipPrctByDist[[8]]$value # recombine into a data frame tipPrctByDistComb <- recombine(tipPrctByDist, combine = combRbind) tipPrctByDistComb # plot result xyplot(val ~ distance_index, groups = payment_type, data = tipPrctByDistComb, auto.key = list(space = "right"), aspect = 1.5, type = c("p", "g")) # now look at tolls paid by distance category meanToll <- function(x) mean(x$tolls_amount, na.rm = TRUE) meanTollByDist <- addTransform(taxi_by_dist, meanToll) # look at a subset meanTollByDist[[8]]$value # recombine into a data frame meanTollByDistComb <- recombine(meanTollByDist, combine = combRbind) meanTollByDistComb # plot result xyplot(val ~ distance_index, groups = payment_type, data = meanTollByDistComb, auto.key = list(space = "right"), aspect = 1, type = c("p", "g")) ## trelliscope displays ##--------------------------------------------------------- # create a plot of the total fare vs. distance with toll amount removed no_toll <- function(x) { x$no_toll <- x$total_amount - x$tolls_amount x } # test the function on a subset head(no_toll(taxi[[1]]$value)) # add the transformation taxiNoToll <- addTransform(taxi, no_toll) # look at the result taxiNoToll # look at the name of the result names(taxiNoToll) # look at the relationship between fare without tolls and distance traveled trns <- function(x) log2(x + 1) amount_vs_dist <- drHexbin(taxiNoToll, "trip_distance", "no_toll", xbins = 150, shape = 1, xTransFn = trns, yTransFn = trns, xRange = c(0, 100), yRange = c(0, 650)) plot(amount_vs_dist, trans = log, inv = exp, style = "colorscale", colramp = LinOCS, xlab = "Distance (log2 miles)", ylab = "Amount Paid Less Tolls (log2 dollars)") # look at just the fare without tolls or tip amount_vs_dist <- drHexbin(taxiNoToll, "trip_distance", "fare_amount", xbins = 150, shape = 1, xTransFn = trns, yTransFn = trns, xRange = c(0, 100), yRange = c(0, 650)) plot(amount_vs_dist, trans = log, inv = exp, style = "colorscale", colramp = LinOCS, xlab = "Distance (log2 miles)", ylab = "Fare paid, no tip or tolls (log2 dollars)") # zooming in on Manhattan xRange <- c(-74.03, -73.92) yRange <- c(40.7, 40.82) tmp <- addTransform(taxi, function(x) { subset(x, pickup_longitude < -73.92 & pickup_longitude > -74.03 & pickup_latitude > 40.7 & pickup_latitude < 40.82) }) # divide the ddf by the distance-rate_code categories. taxi_by_dist_code <- divide(taxi_dist, by = c("distance_index", "rate_code"), output = localDiskConn(file.path(basepath, "taxi_by_dist_code"), autoYes = TRUE, reset = TRUE), update = TRUE) names(taxi_by_dist_code) # a panel function for the data divided by distance and rate code xRange <- c(-74.03, -73.92) yRange <- c(40.7, 40.82) distCodePanel <- function(x) { ggplot(x, aes(pickup_longitude, pickup_latitude)) + stat_binhex(bins = 300, rm.na = TRUE) + xlim(xRange) + ylim(yRange) } distCodePanel(taxi_by_dist_code[[1]]$value) # create a cognositics function taxiCog <- function(x) { list( meanFare = cogMean(x$fare_amount), meanToll = cogMean(x$tolls_amount), meanTipPercent = cogMean((x$tip_amount *100) / x$fare_amount), count = nrow(x) ) } makeDisplay(taxi_by_dist_code, name = "Lat and Lon of pickup for trips", desc = "Lat and Lon of pickup location for trips", panelFn = distCodePanel, width = 400, height = 400, cogFn = taxiCog, lims = list(x = "same") ) # remove trips with cash payments which don't record tips diskConn <- localDiskConn(file.path(basepath, "taxi_by_dist_code2"), autoYes = TRUE, reset = TRUE) taxi_by_dist_code2 <- drLapply(taxi_by_dist_code, function(x) x$payment_type != "CSH", combine = combDdo, output = diskConn) updateAttributes(taxi_by_dist_code2) # look at tip percentage vs time of the day, distance and code distCodeTimePanel <- function(x) { x$hour <- as.factor(substring(strftime(x$pickup_datetime, format = "%Y-%m-%d %H:%M:%S"), 12, 13)) x$tipPct <- (x$tip_amount * 100) / x$fare_amount ggplot(x, aes(hour, tipPct)) + geom_boxplot(na.rm = TRUE) + ylim(0, 100) } distCodeTimePanel(taxi_by_dist_code[[8]]$value) makeDisplay(taxi_by_dist_code, name = "Tip percent by time of day", desc = "Tip percent by time of day", panelFn = distCodeTimePanel, width = 400, height = 400, cogFn = taxiCog)