---
title: "Lab-03 Identifying Neighborhood Clusters"
output:
html_document:
theme: readable
highlight: tango
toc: true
self_contained: false
number_sections: false
css: textbook.css
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=T, fig.width=10, fig.height=6, warning=F, message=F )
```
# Packages
```{r}
library( geojsonio ) # read shapefiles
library( sp ) # work with shapefiles
library( sf ) # work with shapefiles - simple features format
library( mclust ) # cluster analysis
library( tmap ) # theme maps
library( ggplot2 ) # graphing
library( ggthemes )
library( dplyr )
library( pander )
```
## Data Source
This exercise uses Census data from the 2012 American Communities Survey made available through the [Diversity and Disparities Project](https://s4.ad.brown.edu/projects/diversity/Researcher/Bridging.htm).
**DATA DICTIONARY**
```{r, echo=F}
data.dictionary <-
structure(list(LABEL = c("tractid", "pnhwht12", "pnhblk12", "phisp12",
"pntv12", "pfb12", "polang12", "phs12", "pcol12", "punemp12",
"pflabf12", "pprof12", "pmanuf12", "pvet12", "psemp12", "hinc12",
"incpc12", "ppov12", "pown12", "pvac12", "pmulti12", "mrent12",
"mhmval12", "p30old12", "p10yrs12", "p18und12", "p60up12", "p75up12",
"pmar12", "pwds12", "pfhh12"), VARIABLE = c("GEOID", "Percent white, non-Hispanic",
"Percent black, non-Hispanic", "Percent Hispanic", "Percent Native American race",
"Percent foreign born", "Percent speaking other language at home, age 5 plus",
"Percent with high school degree or less", "Percent with 4-year college degree or more",
"Percent unemployed", "Percent female labor force participation",
"Percent professional employees", "Percent manufacturing employees",
"Percent veteran", "Percent self-employed", "Median HH income, total",
"Per capita income", "Percent in poverty, total", "Percent owner-occupied units",
"Percent vacant units", "Percent multi-family units", "Median rent",
"Median home value", "Percent structures more than 30 years old",
"Percent HH in neighborhood 10 years or less", "Percent 17 and under, total",
"Percent 60 and older, total", "Percent 75 and older, total",
"Percent currently married, not separated", "Percent widowed, divorced and separated",
"Percent female-headed families with children")), class = "data.frame", row.names = c(NA,
-31L))
data.dictionary %>% pander()
```
# Load Phoenix Shapefile
We are using a [Dorling Cartogram](https://github.com/sjewo/cartogram) representation of Census tracts to remove bias.
The steps to create the cartogram are described [HERE](https://github.com/DS4PS/cpp-529-master/tree/master/data).
```{r}
# dorling cartogram of Phoenix Census Tracts
github.url <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/phx_dorling.geojson"
phx <- geojson_read( x=github.url, what="sp" )
plot( phx )
```
We can do better than that. Let's take a look at the [**tmap** package](http://127.0.0.1:30972/library/tmap/doc/tmap-getstarted.html) in R.
```{r, fig.width=10, fig.height=8}
# library( tmap )
phx <- spTransform( phx, CRS("+init=epsg:3395") )
bb <- st_bbox( c( xmin = -12519146, xmax = -12421368,
ymax = 3965924, ymin = 3899074 ),
crs = st_crs("+init=epsg:3395"))
tm_shape( phx, bbox=bb ) +
tm_polygons( col="MHHI", n=10, style="quantile", palette="Spectral" ) +
tm_layout( "PHX Dorling Cartogram", title.position=c("right","top") )
```
```{r, fig.width=10, fig.height=8}
tmap_mode("view")
tm_basemap( "HikeBike.HillShading" ) +
tm_shape( phx, bbox=bb ) +
tm_polygons( col="phisp12", n=7, style="quantile", palette="-inferno" )
```
```{r, fig.width=10, fig.height=8}
tm_basemap( "Stamen.Watercolor" ) +
tm_shape( phx, bbox=bb ) +
tm_polygons( col="MHHI", n=7, style="quantile", palette="RdYlBu" ) +
tm_legend( show=FALSE )
```
Check out some [useful guides](https://www.datanovia.com/en/blog/top-r-color-palettes-to-know-for-great-data-visualization/) or launch the palette explorer for an easy way to find great color schemes for your data.
```{r, eval=F}
tmaptools::palette_explorer()
```
## Census Variables
This shapefile comes pre-loaded with the Census data from your first lab.
```{r}
names( phx )
```
We are going to extract the data from the shapefile and save it as a separate data frame so we can use it for analysis independent of the shapefile.
```{r}
d1 <- phx@data
head( d1[,1:6] ) %>% pander()
```
## Prepare Data for Clustering
We transform all of the variables to z scorse so they are on the same scale while clustering. This ensures that each census variable has equal weight. Z-scores typically range from about -3 to +3 with a mean of zero.
```{r}
keep.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12",
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12",
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12",
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12",
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")
d2 <- select( d1, keep.these )
d3 <- apply( d2, 2, scale )
head( d3[,1:6] ) %>% pander()
```
## Perform Cluster Analysis
For more details on cluster analysis visit the [**mclust** tutorial](https://cran.r-project.org/web/packages/mclust/vignettes/mclust.html).
```{r}
# library( mclust )
set.seed( 1234 )
fit <- Mclust( d3 )
phx$cluster <- as.factor( fit$classification )
summary( fit )
```
Some visuals of model fit statistics (doesn't work well with a lot of variables):
```{r, eval=F}
plot( fit, what = "classification" )
```
```{r, fig.width=10, fig.height=8}
# dropping two tracts that cover the legend
phx2 <- phx[ !(phx$GEOID %in% c("04013723308","04013723304")) , ]
tmap_mode("plot")
tmap_style("cobalt")
tm_shape( phx2, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
```
# Identifying Neighborhood Clusters
Ok, so we have a pretty map now, but what does it mean? Let's look at our clusters.
We can split our data up by the 8 distinct clusters and look at characteristics of each.
Let's start by looking at how one of our 30 census variables varies by group:
```{r}
# library( ggplot2 )
# library( ggthemes )
# table( d2$cluster ) # number of tracts in each group
d2$cluster <- d2$cluster <- as.factor( paste0("GROUP-",fit$classification) )
ggplot( d2, aes( x=phisp12 ) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( "Percent Hispanic" ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
```
We can see that some groups (neighborhood clusters) have a high proportion of Hispanic residents, while others have almost none. That is a good sign - it means the clustering appears to be meaningful in the sense that it identifies differences between the groups.
Let's look at one more.
```{r}
ggplot( d2, aes( x=pcol12) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( "Percent with a College Degree" ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
```
We can see that lots of clusters have between 20 and 40 percent with college degrees. Group 1 has an extremely high proportion, and Group 7 has an extremely low proportion.
```{r, fig.height=10, fig.width=8, eval=F}
df.pct <- sapply( d2, ntile, 100 )
d3 <- as.data.frame( df.pct )
d3$cluster <- as.factor( paste0("GROUP-",fit$classification) )
stats <-
d3 %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:8 )
t <- t[-1,]
for( i in 1:8 )
{
z <- t[,i]
plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab="Percentile", ylab="",
main=paste("GROUP",i) )
abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}
```
```{r, fig.width=10, fig.height=8, echo=F}
tmap_mode("plot")
tmap_style("cobalt")
tm_shape( phx2, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
```
```{r, fig.height=10, fig.width=8, echo=F}
df.pct <- sapply( d2, ntile, 100 )
d3 <- as.data.frame( df.pct )
d3$cluster <- as.factor( paste0("GROUP-",fit$classification) )
stats <-
d3 %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:8 )
t <- t[-1,]
for( i in 1:4 )
{
z <- t[,i]
plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab=" Percentile",
ylab="", col.lab="gray40",
main=paste("GROUP",i) )
rect( xleft=0, ybottom=0, xright=20, ytop=31, col=gray(0.75,0.5), border="gray80" )
rect( xleft=40, ybottom=0, xright=60, ytop=31, col=gray(0.75,0.5), border="gray80" )
rect( xleft=80, ybottom=0, xright=100, ytop=31, col=gray(0.75,0.5), border="gray80" )
# abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray70" )
segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}
```
```{r, fig.width=10, fig.height=8, echo=F}
tmap_mode("plot")
tmap_style("cobalt")
tm_shape( phx2, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
```
```{r, fig.height=10, fig.width=8, echo=F}
df.pct <- sapply( d2, ntile, 100 )
d3 <- as.data.frame( df.pct )
d3$cluster <- as.factor( paste0("GROUP-",fit$classification) )
stats <-
d3 %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
t <- data.frame( t(stats), stringsAsFactors=F )
names(t) <- paste0( "GROUP.", 1:8 )
t <- t[-1,]
for( i in 5:8 )
{
z <- t[,i]
plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab=" Percentile",
ylab="", col.lab="gray40",
main=paste("GROUP",i) )
rect( xleft=0, ybottom=0, xright=20, ytop=31, col=gray(0.75,0.5), border="gray80" )
rect( xleft=40, ybottom=0, xright=60, ytop=31, col=gray(0.75,0.5), border="gray80" )
rect( xleft=80, ybottom=0, xright=100, ytop=31, col=gray(0.75,0.5), border="gray80" )
abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.85, pos=2 )
points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}
```
```{r, fig.width=10, fig.height=8, echo=F}
tmap_mode("plot")
tmap_style("cobalt")
tm_shape( phx2, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
```
You are now ready to identify meaningful labels for your clusters.
For the YellowDig discussion this week propose labels for Groups 1-8 based upon these characteristics. A good label will be catchy and descriptive of the population within the neighborhood cluster.
-------
# Submission Instructions
After you have completed your lab submit via Canvas. Login to the ASU portal at and navigate to the assignments tab in the course repository. Upload your RMD and your HTML files to the appropriate lab submission link. Or else use the link from the Schedule page.
Remember to name your files according to the convention: **Lab-##-LastName.xxx**
-------
# Auxiliary Functions
### Create Census Var Levels by Clustering Groups
Note these produce plots reporting percentiles for each census variable, not percents.
```{r, fig.height=16}
# list variables for clustering
use.these <- c("pnhwht12", "pnhblk12", "phisp12", "pntv12", "pfb12", "polang12",
"phs12", "pcol12", "punemp12", "pflabf12", "pprof12", "pmanuf12",
"pvet12", "psemp12", "hinc12", "incpc12", "ppov12", "pown12",
"pvac12", "pmulti12", "mrent12", "mhmval12", "p30old12", "p10yrs12",
"p18und12", "p60up12", "p75up12", "pmar12", "pwds12", "pfhh12")
df <- phx@data # extract data frame from spatial sp object
model.dat <- select( df, use.these ) # select census variables for clustering
model.dat.scaled <- apply( model.dat, 2, scale ) # transform raw measures into z scores
# library( mclust )
fit <- Mclust( model.dat.scaled )
group.ids <- as.character( fit$classification )
# data.dictionary must have "LABEL" for var name and "VARIABLE" for description
view_cluster_data <- function( model.dat, group.ids, data.dictionary )
{
num.groups <- length( unique( as.character( group.ids ) ) )
num.vars <- ncol( model.dat )
data.dictionary <- data.dictionary[ data.dictionary$LABEL %in% use.these , ]
df.ntile <- sapply( model.dat, ntile, 100 )
df.ntile <- as.data.frame( df.ntile )
df.ntile$cluster <- as.factor( paste0("GROUP-", group.ids ) )
stats <-
df.ntile %>%
group_by( cluster ) %>%
summarise_each( funs(mean) )
df.stats <- data.frame( t(stats), stringsAsFactors=F )
names(df.stats) <- paste0( "GROUP.", 1:num.groups )
df.stats <- df.stats[-1,]
for( i in 1:num.groups )
{
z <- df.stats[,i]
print({
plot( rep(1,num.vars), 1:num.vars, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab="Percentile",
ylab="", col.lab="gray40",
main=paste( "GROUP", i ) )
rect( xleft=0, ybottom=0, xright=20, ytop=(num.vars+1), col=gray(0.75,0.5), border="gray80" )
rect( xleft=40, ybottom=0, xright=60, ytop=(num.vars+1), col=gray(0.75,0.5), border="gray80" )
rect( xleft=80, ybottom=0, xright=100, ytop=(num.vars+1), col=gray(0.75,0.5), border="gray80" )
abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
segments( y0=1:num.vars, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:num.vars, data.dictionary$VARIABLE, cex=0.85, pos=2 )
points( z, 1:num.vars, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}) # end print
} # end loop
}
par( mfrow=c(4,2) )
view_cluster_data( model.dat=model.dat,
group.ids=group.ids,
data.dictionary=data.dictionary )
```
### Helper Plots
```{r, eval=F}
pdf( "interpretting-clusters.pdf" )
tmap_style("cobalt")
tm_shape( phx2, bbox=bb ) +
tm_polygons( col="cluster", palette="Accent" )
for( i in 1:8 )
{
z <- t[,i]
plot( rep(1,30), 1:30, bty="n", xlim=c(-75,100),
type="n", xaxt="n", yaxt="n",
xlab="Percentile", ylab="",
main=paste("GROUP",i) )
abline( v=seq(0,100,25), lty=3, lwd=1.5, col="gray90" )
segments( y0=1:30, x0=0, x1=100, col="gray70", lwd=2 )
text( -0.2, 1:30, data.dictionary$VARIABLE[-1], cex=0.65, pos=2 )
# points( q50, 1:30, pch="|", cex=0.8, col="gray" )
points( z, 1:30, pch=19, col="firebrick", cex=1.5 )
axis( side=1, at=c(0,50,100), col.axis="gray", col="gray" )
}
dev.off()
```
```{r, eval=F}
d2 <- as.data.frame( d2 )
d2$cluster <- as.factor( paste0("GROUP-",fit$classification) )
pdf( "cluster-density-plots.pdf" )
these <- names(d2)
these <- these[ -length(these) ]
for( i in these )
{
graph.label <-
data.dictionary$VARIABLE[ data.dictionary$LABEL == i ] %>%
as.character()
p <-
ggplot( d2, aes( x=get(i) ) ) +
geom_density( alpha = 0.5, fill="blue" ) + # xlim( -3, 3 ) +
xlab( graph.label ) + facet_wrap( ~ cluster, nrow=2 ) + theme_minimal()
print( p )
}
dev.off()
```
### DATA DICTIONARY
This exercise uses Census data from the 2012 American Communities Survey made available through the [Diversity and Disparities Project](https://s4.ad.brown.edu/projects/diversity/Researcher/Bridging.htm).
```{r}
dd.URL <- "https://raw.githubusercontent.com/DS4PS/cpp-529-master/master/data/data-dictionary.csv"
data.dictionary <- read.csv( dd.URL, stringsAsFactors=F )
data.dictionary %>% pander()
```