--- title: "HW6" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ## Clean Data ```{r} library(sf) library(dplyr) library(ggplot2) pluto = st_read("/data/nyc_parking/pluto_manhattan/", stringsAsFactors = FALSE, quiet = TRUE) plot(st_geometry(st_centroid(pluto)), pch=16, cex=0.1, col=adjustcolor("black", alpha.f = 0.1)) pluto_df = pluto %>% st_centroid() %>% st_coordinates() %>% cbind(data.frame(Address=pluto$Address), .) %>% mutate(Address = tolower(Address)) plot(pluto_df[,2:3], pch=16, cex=0.1, col=adjustcolor("black", alpha.f = 0.1), asp=1) load("/data/nyc_parking/NYParkingViolations.Rdata") nyc_df = nyc %>% select(Violation.Precinct, House.Number, Street.Name) %>% mutate(Address = paste(House.Number, Street.Name)) %>% select(Precinct = Violation.Precinct, Address) %>% mutate(Address = tolower(Address)) full = inner_join(nyc_df, pluto_df) %>% filter(Precinct >=1 & Precinct <= 34) # Manhattan Precincts ggplot(full, aes(x=X, y=Y, color=as.factor(Precinct))) + geom_point(size=0.1) ``` ## Modeling ### Setup ```{r} nybb = st_read("/data/nyc_parking/nybb/") manh = nybb %>% filter(BoroName == "Manhattan") library(raster) st_bbox(manh) rast_extent = extent(c(-74.04773,-73.90665,40.68292,40.87904)) r = raster(rast_extent, nrow=300, ncol=100) manh_r = rasterize(as(manh,"Spatial"), r) manh_cells = which(!is.na(manh_r[])) manh_xy = xyFromCell(manh_r, manh_cells) pred_xy = data.frame(manh_xy) %>% setNames(c("X","Y")) plot(manh_xy, pch=16, cex=0.1) ``` ### Logistic Regression ```{r} full_log = full %>% mutate(precinct1 = (Precinct == 1)) g = glm(precinct1 ~ poly(X,2)*poly(Y,2), family=binomial, data=full_log) r_logistic = r r_logistic[manh_cells] = predict(g, newdata=pred_xy, type="response") full_log_p1 = full_log %>% filter(precinct1 == TRUE) plot(r_logistic) #points(full_log_p1$X, full_log_p1$Y, pch=16, cex=0.5) ``` ### Multiple Logistic Regressions ```{r} precincts = sort(unique(full$Precinct)) n_precincts = length(precincts) probs = matrix(NA, ncol=n_precincts, nrow=nrow(pred_xy)) for(i in seq_along(precincts)) { cat("Precinct",precincts[i],"\n") tmp = full %>% mutate(p = (Precinct == precincts[i])) g = glm(p ~ poly(X,2)*poly(Y,2), family=binomial, data=tmp) probs[,i] = predict(g, newdata=pred_xy, type="response") } precinct_index = apply(probs, 1, which.max) r_multi_log = r r_multi_log[manh_cells] = as.character(precincts[precinct_index]) plot(r_multi_log) ``` ### Multinomial Regression ```{r} library(nnet) full_mn = full %>% mutate(z = as.factor(Precinct)) m = multinom(z ~ X + Y + X:Y + I(X^2) + I(Y^2), data=full_mn) pred_m = predict(m, newdata=pred_xy) r_multi = r r_multi[manh_cells] = as.character(pred_m) plot(r_multi) ``` ### xgboost ```{r} library(xgboost) precincts = factor(full$Precinct) %>% levels() y = (factor(full$Precinct) %>% as.integer()) - 1L x = full %>% dplyr::select(X, Y) %>% as.matrix() m = xgboost(data=x, label=y, nthread=4, nround=20, objective="multi:softmax", num_class=length(precincts)) xg_pred = predict(m, newdata=as.matrix(pred_xy)) pred_label = precincts[xg_pred+1] r_xg = r r_xg[manh_cells] = as.character(pred_label) plot(r_xg) ``` ## Scoring / Predict ```{r} source("polygonizer.R") poly = polygonizer(r_xg) st_write(poly, dsn = "precincts.json") ```