---
title: "Hospital Data - GGPLOT 9-28-2020"
author: "Rob Wells"
date: "9/28/2020"
output: html_document
---


#rio
```{r}
library(tidyverse)
#install.packages("rio") 
```
#rio handles more than two dozen formats including tab-separated data (with the extension .tsv), 
#JSON, Stata, and fixed-width format data (.fwf).


```{r}
test <- rio::import('https://raw.githubusercontent.com/Arkansascovid/Main/master/hospital_master.csv')
head(test)
```
# Number columns
```{r}
ncol(test)
```
# Number rows


#YOUR TURN

## Retrieve Arkansas Covid Data for Hospitals

- **Download "Daily Hospital Data - Manual Input" from Google Sheet**
https://docs.google.com/spreadsheets/d/1ikblX8tikM59ma1AftkqgGbyeZkXB6DuBtwMsVeoGYw/edit#gid=701586163

- **Get the File Path on Your Hard Drive**
    
    1) Find the File and RIGHT CLICK on it (or Control + Click).
        A menu appears. 
    2) Continue holding down CONTROL. 
    3) Now Hold down CONTROL + OPTION at same time. 
    Copy "Daily State Data - Manual Input" as Path. 
    It will look like this:
    /Users/rswells/Downloads/Daily State Data - Manual Input.xlsx
    4) Paste that path below

Wells Notes - Don't Run this
```{r}
# hospital <- rio::import('https://raw.githubusercontent.com/Arkansascovid/Main/master/hospital_master.csv')
# hospital <- hospital [ -c(7:14) ]
# head(hospital)
```

```{r}
hospital <- rio::import("LATEST VERSION OF THE HOSPITAL SPREADSHEET", which = "Hospital-TWEET SLIDE")
```

#An example of downloading data from your hard drive
#This loads in "Daily State Data - Manual Input" from Google Sheet

- **Clean Names**
```{r}

hospital <- janitor::clean_names(hospital)
```

##Check it out - Look at the data types.
```{r}
 
#chr stands for character vectors, or strings.
#int stands for integers.
#dbl stands for doubles, or real numbers.
#dttm stands for date-times (a date + a time).
glimpse(hospital)
```


- **Cleaning: Convert data from List variables to numeric**

```{r include=FALSE}

# Parse the text dates into Date objects
hospital$date <- as.Date(hospital$date)
# Coerce columns 3 through 6 to numeric, one column at a time
hospital[3:6] <- lapply(hospital[3:6], function(column) as.numeric(column))

```

```{r}
# Drop the junk first column, keeping every other column
hospital <- hospital[-1]
```

```{r}
# Give columns 2 through 5 their reporting names
names(hospital)[2:5] <- c(
  "Hospitalized",
  "Vent",
  "Ever_Hospitalized",
  "Ever_on_a_Vent"
)
```

```{r include=FALSE}
#Sort the rows by date, newest first (the lead() math further down relies on this order)
hospital <- hospital %>%
  arrange(desc(date))

```

```{r}
#Check on our cleaned table
head(hospital)
```


- **Check the date is correct**

```{r}
# Reference date for the report: the current system date
today <- Sys.Date()

# If today's data has not been posted yet, comment out the line above and
# run this one instead so the report is anchored to the previous day
#today <- Sys.Date()-1

# The day before the reference date
yesterday <- today - 1

```


#Bring in master file and join with hospital

```{r}
master2 <- rio::import('https://raw.githubusercontent.com/Arkansascovid/Main/master/master_file.csv')
master2$mydate <- as.Date(master2$mydate)
head(master2)
```

#Joining selected columns from master file
```{r}
# State-wide rows only, keeping the date and the two active-case counts
main_hospital <- select(
  filter(master2, county_nam == "Arkansas_all_counties"),
  mydate, confirmed_active, active_cases
)
# Attach the case counts to every hospital row; hospital$date pairs with mydate
main_hospital2 <- hospital %>%
  left_join(main_hospital, by = c("date" = "mydate"))
glimpse(main_hospital2)
```

#Hospital Math

```{r}
# Day-over-day hospital metrics.
# The rows are sorted newest-first, so lead() returns the previous day's value
# for each row; the oldest row has no previous day and gets NA.
hospital1 <- main_hospital2 %>%
  mutate(
    New_Admits = Ever_Hospitalized - lead(Ever_Hospitalized),
    Hosp_Change_from_Yesterday = Hospitalized - lead(Hospitalized),
    # Admissions not reflected in the census change left via discharge or death
    New_Discharges_Deaths = New_Admits - Hosp_Change_from_Yesterday,
    Pct_Vent = (Vent / Hospitalized) * 100,
    # lead() has no na.rm argument; passing one is an error in current dplyr
    New_on_Vent = Ever_on_a_Vent - lead(Ever_on_a_Vent),
    Pct_Hospitalized = (Hospitalized / active_cases) * 100
  )

#hospital1 <- hospital1 %>% 
#  filter(date ==(today))
```

```{r}
head(hospital1)
```


```{r}
write.csv(hospital1, "hospital1_test.csv")
```


#Basic graphs
```{r}
plot(hospital1$New_Admits)
```
```{r}
hist(hospital1$New_Admits)
```

```{r}
boxplot(hospital1$New_Admits)
```
```{r}
barplot(hospital1$New_Admits)
```

```{r}
barplot(sort(hospital1$New_Admits,decreasing = TRUE))
```

```{r}
#basic ggplot2 -
ggplot(hospital1, aes(x = date, y = New_Admits, color = New_Admits)) +
  geom_line()
```


#Slide on hospitalized
```{r}
# Bar chart of current hospitalizations by date, shaded by the count itself
ggplot(
  hospital1,
  aes(x = date, y = Hospitalized, label = Hospitalized, fill = Hospitalized)
) +
  geom_col(aes(fill = Hospitalized), position = "dodge") +
  labs(
    title = "Hospitalizations in Arkansas",
    subtitle = "ADH Data for Sept. 27, 2020",
    caption = "Graphic by ArkansasCovid.com",
    y = "Hospitalized",
    x = "Date"
  )

# Save the plot just built as a 9 x 6 inch, 400 dpi PNG
ggsave(filename = "test.png", device = "png", width = 9, height = 6, dpi = 400)

```

#Slide on vents
```{r}
# Bar chart of patients on ventilators by date, shaded by the count itself
ggplot(
  hospital1,
  aes(x = date, y = Vent, label = Vent, fill = Vent)
) +
  geom_col(aes(fill = Vent), position = "dodge") +
  labs(
    title = "Vents in Arkansas",
    subtitle = "ADH Data for Sept. 27, 2020",
    caption = "Graphic by ArkansasCovid.com",
    y = "Vent",
    x = "Date"
  )

# Save the plot just built as a 9 x 6 inch, 400 dpi PNG
# NOTE(review): this is the same file name the hospitalized slide writes to,
# so whichever chunk runs last overwrites the other - confirm that is intended
ggsave(filename = "test.png", device = "png", width = 9, height = 6, dpi = 400)

```

#Advanced Hospital Slide

```{r}
# Keep only the rows dated yesterday or later
df <- hospital1 %>% 
    filter(date>=(yesterday))

# Date <- df$mydate
# New_Cases <- df$New_Cases_Today
# Deaths <- df$deaths

library(reshape2)

# Long format: one row per date and metric, so the two metrics can be drawn
# as side-by-side bars
df2 <- melt(df[,c("date", "New_Admits", "New_on_Vent")], id.vars = 1)

ggplot(df2,aes(x = date, y = value, label = value, fill= variable)) + 
  #geom_bar(aes(fill = variable), stat = "identity", position = "dodge") +
  geom_col(position = position_dodge2(width = 0.9, preserve = "single"), show.legend = TRUE) +
  geom_text(position = position_dodge2(width = 0.9, preserve = "single"), vjust=-1, hjust=+1) +
  # Make sure the axis covers 0-20 without capping it: a hard
  # limits = c(0, 20) removes every bar whose value is above 20
  expand_limits(y = c(0, 20)) + 
  labs(title = "Admits and Vents in Arkansas ", 
       subtitle = "ADH Data for Sept. 27, 2020",
       caption = "Graphic by ArkansasCovid.com",
       y="Amount",
       x="Date")
# Save the plot just built as a 9 x 6 inch, 400 dpi PNG
ggsave("test2.png",device = "png",width=9,height=6, dpi=400)
```


#--------------------------------------------------------------------#
#REVIEW 
#--------------------------------------------------------------------#

# vignettes: Learn about packages and commands
```{r}
browseVignettes("tidyverse")
```

```{r}
??tidyverse
```
#Converting character strings into numeric
#What is the character type? Glimpse function
```{r}
# NOTE(review): hospital_master1 is not created anywhere in the visible part of
# this file - confirm which data frame (e.g. hospital1) this was meant to inspect
glimpse(hospital_master1)
```

#chr stands for character vectors, or strings.
#int stands for integers.
#dbl stands for doubles, or real numbers.
#dttm stands for date-times (a date + a time).

#Convert numbers to "numeric" data
#We want to turn all columns after HMC2 into numeric
#HMC2 is Column #10


# Part 2: Math on State Data   

- **Now We Do The Math**
- **Previous Date Calculations**

Create Temporary Table Two Days' Worth of Data
Sort Alphabetically and Run Calculations
You should get a df with 150 observations (two days' worth of data)


--**Check the dates are correct and that the data is filtered for today, yesterday**

```{r}
# Keep the rows dated yesterday or later (two days of data), newest first
# NOTE(review): `state` is not created in the visible part of this file
twodays <- arrange(
  filter(state, Date >= yesterday),
  desc(Date)
)
```

```{r}
head(twodays) 
```


-**The Today-Yesterday Calculations**

    This calculates:
    CHECKED Column K New Cases Today=Positive - (yesterday) Positive) J2-J3 
    CHECKED Column N Recovered Since Yesterday: Recovered - (yesterday) Recovered =M2-M3)
    CHECKED Column P New Deaths Today: = Total Deaths-(yesterday(Total Deaths) O2-O3)
    
    (FIELD IS IN STATE-HOSPITALS)
    CHECKED Column R New Tests Dashboard: = Number tested-(YESTERDAY) Number tested Q2-Q3)
    
    CHECKED Column Z (hosp_change_from_yesterday: Hospitals -(yesterday) Hospitals
    
    CHECKED Column AB (new_admits = ever_hospitalized-(yesterday) ever_hospitalized)
    CHECKED Column AC (new_discharges_deaths = new admits - HOSPITAL CHANGE)
    CHECKED Column AD (pct_hospitalized = hospitalized / current infections) * 100
    CHECKED Column AG (new_on_vent = ever_on_a_vent - (yesterday) ever_on_a_vent
    CHECKED Column AH (pct_vent = vent / hospitalized) * 100                  

```{r}
# Day-over-day deltas and hospital percentages for the state rows.
# twodays is sorted newest-first, so lead() is the prior day's value.
temp <- mutate(
  twodays,
  New_Cases_Today = Positive - lead(Positive),
  Recovered_Since_Yesterday = Recovered - lead(Recovered),
  New_Deaths_Today = Total_Deaths - lead(Total_Deaths),
  #New_Tests_Dashboard = Number_Tested - lead(Number_Tested),
  Hosp_Change_from_Yesterday = Hospitalized - lead(Hospitalized),
  New_Admits = Ever_Hospitalized - lead(Ever_Hospitalized),
  New_Discharges_Deaths = New_Admits - Hosp_Change_from_Yesterday,
  Pct_Hospitalized = (Hospitalized / Current_Infections) * 100,
  New_on_Vent = Ever_on_a_Vent - lead(Ever_on_a_Vent),
  Pct_Vent = (Vent / Hospitalized) * 100
)
```

```{r}
glimpse(temp)
```

-**More Percentage Calculations**

    CHECKED Column E (Cases/population = (positive / population) *100
    CHECKED ***MAYBE ELIMINATE*** Column F (Tested/population = (Number tested / population) *100
    CHECKED Column G (New Cases Today per 10k Population = (New Cases Today / population) *10000
    CHECKED Column H (Active Cases per 10k = (current_infections / population) *10000
    CHECKED Column T (% Positive Cumulative = (positive / Number tested) *100
    CHECKED Column U (% Positive New to Dashboard = New Cases Today / New Tests Dashboard) * 100
    CHECKED Column V (Closed = Recovered + Total Deaths)
    CHECKED Column W (% Deaths vs. Recoveries = Total Deaths / Closed) * 100
    CHECKED Column X (% Recoveries vs. Deaths = Recovered / Closed) * 100

```{r}
# Population-scaled rates, closed-case totals, and the state-wide label
temp <- mutate(
  temp,
  Cases_Population = (Positive / Population) * 100,
  #Tested_Population = (Number_Tested / Population) * 100,
  New_Cases_Today_10k_Pop = (New_Cases_Today / Population) * 10000,
  Active_Cases_10k_Pop = (Current_Infections / Population) * 10000,
  #Pct_Positive_Cumulative = (Positive / Number_Tested) * 100,
  #Pct_Positive_New_to_Dashboard = (New_Cases_Today / New_Tests_Dashboard) * 100,
  Closed = Recovered + Total_Deaths,
  Pct_Deaths_vs_Recoveries = (Total_Deaths / Closed) * 100,
  Pct_Recoveries_vs_Deaths = (Recovered / Closed) * 100,
  County = "Arkansas_all_counties"
)

# Print the result for a visual check
temp
```

```{r}
glimpse(temp)
```

- **Align names in order of covid table**
#Cut Tested_Population, Number_Tested,New_Tests_Dashboard,Pct_Positive_Cumulative, Pct_Positive_New_to_Dashboard
```{r}
# Put the columns in the order used by the covid table.
# A stray empty argument (a doubled comma after Cases_Population) was removed.
temp <- temp %>%
  select(
    County, Date, Population, Cases_Population,
    New_Cases_Today_10k_Pop, Active_Cases_10k_Pop, Current_Infections,
    Positive, New_Cases_Today, Recovered, Recovered_Since_Yesterday,
    Total_Deaths, New_Deaths_Today, Closed,
    Pct_Deaths_vs_Recoveries, Pct_Recoveries_vs_Deaths,
    Hospitalized, Hosp_Change_from_Yesterday, Ever_Hospitalized,
    New_Admits, New_Discharges_Deaths, Pct_Hospitalized,
    Vent, Ever_on_a_Vent, New_on_Vent, Pct_Vent
  )

names(temp)
```


# Part 3: Joining and Archiving   

-**Update the main sheet, archiving**
```{r}
# Standalone copy of the calculated rows dated after yesterday (today's data)
TODAY <- filter(temp, Date > yesterday)
glimpse(TODAY)
```

-**Import Whole Table**
#IMPORTANT - CHECK THIS TABLE HAS YESTERDAY'S DATA
```{r}
wholetable <- rio::import("https://raw.githubusercontent.com/Arkansascovid/Main/master/state_hospitals_part1.csv")

```

-**Fix Date, Eliminate V1 index**
```{r}
# Parse the Date column from text into Date objects
wholetable$Date <- as.Date(wholetable$Date)
# Drop the first column (presumably the row-number column write.csv adds)
wholetable <- wholetable[-1]
#wholetable <- wholetable[ -c(27:33) ]

#If you need to eliminate today's data, run this
#wholetable <- wholetable %>% 
#  filter(Date < today)
```


```{r}
names(TODAY)
```

```{r}
glimpse(wholetable)
```

-**Join with Wholetable**
```{r}
# Append today's rows to the archive, then sort newest-first
wholetable <- arrange(
  rbind(wholetable, TODAY),
  desc(Date)
)
```


```{r}
head(wholetable)
```

-**You've finished Part 1 State Calculations**


# Part 2: Build Simple Charts and Tables with Data

-**Main Numbers for the Day**

-**The goal is to build code to create this table automatically**
 
    Here is a quick look at today’s COVID-19 numbers:  
    66,804 Positive Cases (+398 today) 
    60,668 Recoveries (+748 today) 
    940 Total Deaths (+12 today) 
    *5,196 Total Active Cases Today 
    All charts will be updated on the website later this afternoon with cou @maryhennigan_ 

```{r}
# Headline columns for yesterday and today
DailyUpdate <- filter(
  select(wholetable, Date, Positive, Current_Infections, Total_Deaths, New_Admits),
  Date >= yesterday
)
DailyUpdate
```


```{r}
# Bar chart of cumulative deaths for the days in DailyUpdate, with the count
# printed above each bar
DailyUpdate %>% 
  ggplot(aes(x = Date, y = Total_Deaths, fill=Total_Deaths)) + 
  geom_bar(stat = "identity", show.legend = FALSE) +
  # Make sure the axis covers 0-1200 without capping it: a hard
  # limits = c(0, 1200) removes every bar once the total passes 1200
  expand_limits(y = c(0, 1200)) +
  geom_text(aes(label = Total_Deaths), vjust = -1.1, size = 5) +
  #coord_flip() +    #this makes it a horizontal bar chart instead of vertical
  labs(title = "Today's Death Trends", 
       subtitle = "Source: ADH ",
       caption = "Graphic by Wells",
       y="Total Deaths",
       x="Date")
```

#Now we will refine it further


# Upload This to Google Drive: state_hospitals_part1.csv
-**Change File Name to Today's Date**

```{r}
write.csv(wholetable, "state_hospitals_part1.csv")
```

- **Loading and basic file management**

    Bringing in data
    Data Frames
    Extracting interesting details
    Cleaning the data
    Reshaping the format
    Manipulating the data
    Exporting  
    Add a column with a math conversion  

#--------------------------------------------------------------------#
# Loading Data from Scratch
#--------------------------------------------------------------------#

#Loading data
#RSQlite - read data from a database
#xlsx - read in Excel spreadsheets

#Import Income data from US Census
#INCOME IN THE PAST 12 MONTHS (IN 2017 INFLATION-ADJUSTED DOLLARS) 
#2013-2017 American Community Survey 5-Year Estimates. S1901. All Arkansas Counties

https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_17_5YR_S1901&prodType=table

#Load Data
```{r}
ArkCo_Income_2017 <- rio::import("Data/ArkCo_Income_2017.csv")
```

#Look at the table
```{r}
View(ArkCo_Income_2017)
```
# How many rows?  
```{r}
nrow(ArkCo_Income_2017)
```
# How many columns?
```{r}
ncol(ArkCo_Income_2017)
```
#Install dplyr or tibble for the glimpse function if you haven't already
#library (tibble)
    
#Check data types
```{r}
glimpse(ArkCo_Income_2017)
```

#What is the issue? (Don't read ahead and spoil the fun)

#Delete First Row Headers
#Reimport the data and skip the first row
#read.csv(.... , skip=1)

```{r}
ArkCo_Income_2017 <- rio::import("Data/ArkCo_Income_2017.csv", skip=1)
View(ArkCo_Income_2017)
```
#Clean Headers - Janitor package
```{r}
library(janitor)
```
# Clean up column names to they are R friendly
```{r}
ArkCo_Income_2017 <- janitor::clean_names(ArkCo_Income_2017)
View(ArkCo_Income_2017)
```     

# Still need to fix column names
```{r}
colnames(ArkCo_Income_2017)
```
#You can do it one at a time
#Column 4 households_estimate_total renamed to household_income
```{r}
colnames(ArkCo_Income_2017)[4] <- "household_income"
colnames(ArkCo_Income_2017)
```
#change it back
```{r}
colnames(ArkCo_Income_2017)[4] <- "households_estimate_total"
colnames(ArkCo_Income_2017)
```
#------------------------------------------#
#Rename a whole slug of columns at once!
#So the following is a *little intense*
#------------------------------------------#

#Use setnames from the data.table package; it will work on data.frames or data.tables
#Example
#library(data.table)
#setnames(d, old = c('a','d'), new = c('anew','dnew'))
#d


#We are changing all of the old column names to new ones
#That's 19 column names we are changing.

#New Names
```{r}
library(data.table)

# Current column names, in table order
census_names <- c(
  'id',
  'id2',
  'geography',
  'households_estimate_total',
  'households_estimate_less_than_10_000',
  'households_estimate_10_000_to_14_999',
  'households_estimate_15_000_to_24_999',
  'households_estimate_25_000_to_34_999',
  'households_estimate_35_000_to_49_999',
  'households_estimate_50_000_to_74_999',
  'households_estimate_75_000_to_99_999',
  'households_estimate_100_000_to_149_999',
  'households_estimate_150_000_to_199_999',
  'households_estimate_200_000_or_more',
  'households_estimate_median_income_dollars',
  'households_estimate_mean_income_dollars',
  'households_estimate_percent_allocated_household_income_in_the_past_12_months',
  'households_estimate_percent_allocated_family_income_in_the_past_12_months',
  'households_estimate_percent_allocated_nonfamily_income_in_the_past_12_months'
)

# Replacement names, matched to census_names by position
# (the first four are left as they are)
short_names <- c(
  'id',
  'id2',
  'geography',
  'households_estimate_total',
  'less10_000',
  '10k_to_14_999',
  '15k_to_24_999',
  '25k_to_34_999',
  '35k_to_49_999',
  '50k_to_74_999',
  '75k_to_99_999',
  '100k_to_149_999',
  '150k_to_199_999',
  '200k_plus',
  'median_income',
  'mean_income',
  'pct_allocated_household_income',
  'pct_allocated_family_income',
  'pct_allocated_nonfamily_income'
)

data.table::setnames(ArkCo_Income_2017, old = census_names, new = short_names)

View(ArkCo_Income_2017)
```
#Manipulating data
#dplyr - fast data work
#stringr - work with strings

#Data Management
#mutate - Create new column(s) in the data, or change existing column(s).
#mutate() adds new variables and preserves existing;
# Newly created variables are available immediately

#An example:
```{r}
mtcars <- as.data.frame(mtcars)
View(mtcars)
```

```{r}
mtcars2 <- mtcars %>% as_tibble() %>% mutate(
  cyl2 = cyl * 2,
  cyl4 = cyl2 * 2
)
```

# window functions are useful for grouped mutates
```{r}
mtcars %>%
  group_by(cyl) %>%
  mutate(rank = min_rank(desc(mpg)))
```

#Use mutate to add together the percentages of low-wage households
```{r}
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(Low_Wage_Households = rowSums(.[5:7]))
```
#Export data 
Export the output to a CSV or Excel file, e.g. with write.csv (for Excel, use a package function such as rio::export)
```{r}
write.csv(ArkCo_Income_2017,"ArkCo_Income_2017.csv") 
```


#Exercises
# 1) Create a column for working class households: $25,000 to $50,000
# 2) Create a column for middle class households: $50,000 to $150,000
# 3) Create a column for upper income households: More than $150,000
# 4) Using these percentages, create new columns for low-wage, working class, middle class, and upper income 
# and calculate the actual number of people in each income group
# This will require looking at the table data structure, so go to the census.gov link provided above


#Answers
# 1) Create a column for working class households: $25,000 to $50,000
```{r}
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(WorkingClass = rowSums(.[8:9]))
```

# 2) Create a column for middle class households: $50,000 to $150,000
```{r}
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(MiddleClass = rowSums(.[10:12]))
```

# 3) Create a column for upper income households: More than $150,000
```{r}
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(UpperIncome = rowSums(.[13:14]))
```

# 4) Using these percentages, create new columns for low-wage, working class, middle class, and upper income 
# and calculate the actual number of people in each income group
# This will require looking at the table data structure, so go to the census.gov link provided above

#Copied this as a test
#ArkCensus$Pct2017 <- ((ArkCensus$x2017-ArkCensus$x2016)/(ArkCensus$x2016))

```{r}
ArkCo_Income_2017$LowWagePop <- ((ArkCo_Income_2017$households_estimate_total*ArkCo_Income_2017$Low_Wage_Households)/100)
```


```{r}
ArkCo_Income_2017$WorkingClassPop <- ((ArkCo_Income_2017$households_estimate_total*ArkCo_Income_2017$WorkingClass)/100)

ArkCo_Income_2017$MiddleClassPop <- ((ArkCo_Income_2017$households_estimate_total*ArkCo_Income_2017$MiddleClass)/100)

ArkCo_Income_2017$UpperIncomePop <- ((ArkCo_Income_2017$households_estimate_total*ArkCo_Income_2017$UpperIncome)/100)
```

#For amusement, see if they all add up
```{r}
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(SumPop = rowSums(.[24:27]))
```

#Eyeball the two columns, household_estimate_total and our SumPop
#df1 <- select(AR2016ALL, V4:V8, V10:20)
```{r}
PopCheck <- select(ArkCo_Income_2017, households_estimate_total, SumPop) 
```       
#which ones varied the most?

```{r}
PopCheck$variance <- (ArkCo_Income_2017$households_estimate_total- ArkCo_Income_2017$SumPop) 
```
#nerdy checking individual
```{r}
# Sum the ten individual income-bracket columns (columns 5 to 14) for each row.
# The leading "+" characters were console continuation prompts pasted in with
# the code; they are removed so each step is an ordinary pipe stage.
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(SumIndivdPct = rowSums(.[5:14]))
```
#more sum groups
```{r}
ArkCo_Income_2017 <- ArkCo_Income_2017 %>%
  replace(is.na(.), 0) %>%
  mutate(SumGroupPct = rowSums(.[20:23]))
```

```{r}
PopCheck <- select(ArkCo_Income_2017, households_estimate_total, SumPop, SumIndivdPct, SumGroupPct) 
```

#Other tools

#rename - Rename column(s).  
#bind_rows - Merge two data frames into one, combining data from columns with the same name.


#Other data cleaning tricks
#Change column to number format (first you have to strip out the $)  
--The $ is a special character  
-- earnings$TOTAL.EARNINGS <- gsub("\\$", "", earnings$TOTAL.EARNINGS) 


#Quick Data Viz
#Basic graphs
```{r}
plot(ArkCo_Income_2017$median_income)
```

```{r}
hist(ArkCo_Income_2017$median_income)  
```

```{r}
boxplot(ArkCo_Income_2017$median_income)
```

```{r}
barplot(ArkCo_Income_2017$median_income)
```

```{r}
barplot(sort(ArkCo_Income_2017$median_income, decreasing = TRUE))
```


#More Census Exercises
--Census Data: Examine median household income; income by women-led households; income by latino-based households; income by black-led households. Compare to state and national averages. Data dictionary required

- **The course GitHub Page**
> [**Here it is**](https://github.com/profrobwells/Data-Analysis-Class-Jour-405v-5003){target="_blank"} 

- **See Data folder**   
   
    Click USArk_Counties_Poverty_ACS_16_5YR_DP03_Jan_24.xlsxAOC.csv   
    "View raw"   
    Ctrl + click (or right click) - Save As - Census.csv   
    Create R Markdown File
    Import into R


![](Images/ImportingDataTip.jpg)

#--------------------------------------------------------------------#
#More Advanced Section from Machlis Book, Ch. 4
#--------------------------------------------------------------------#


#get data for tutorial
```{r}
download.file("http://bit.ly/BostonSnowfallCSV", "BostonWinterSnowfalls.csv")
```

#load into memory
```{r}
snowdata <- rio::import("BostonWinterSnowfalls.csv")
```


#Data Cleaning install own function in my own rmiscutils package 
#turns “character strings” -- numbers with commas back into numbers
```{r}
pacman::p_load_gh("smach/rmiscutils")
```

#more software
```{r}
# Install the helper packages only when they are missing, so re-running or
# knitting this chunk does not reinstall them every time
needed_pkgs <- c("remotes", "githubinstall", "htmltab")
missing_pkgs <- needed_pkgs[
  !vapply(needed_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# rmiscutils is installed from GitHub; skip it when it is already present
if (!requireNamespace("rmiscutils", quietly = TRUE)) {
  githubinstall::gh_install_packages("rmiscutils")
}
library(htmltab)
```

```{r}
citytable <- htmltab(
  "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population", 
  which = 5)
colnames(citytable)
```

```{r}
library(rmiscutils)
# Turn the comma-formatted population strings into numbers
# NOTE(review): the readr example further down calls this column `2017 estimate`
# (with a space); check colnames(citytable) and use whichever name matches
citytable$PopEst2017 <- number_with_commas(citytable$`2017estimate`)
```
#parsing numbers with readr
#After installing readr, you could generate numbers from the 
#2017 estimate column with readr:
  
```{r}  
citytable$PopEst2017 <- readr::parse_number(citytable$`2017 estimate`)
```


--------------------------------------------------------------------   

- **Common Problems**   

    "What the hell? I converted population to numeric and the calculations come out as NA values!
      This is driving me insane! What is going on?"

**Answer**:   
One of the obnoxious things about R is it considers commas as text. So it will show 720 as a number but 2,810 as not a number for calculations because it has a friggin comma.   

**Never fear.** There is a solution. Run the find and replace function, called gsub   

Example:
Crimedata$Population <- gsub(",", "", Crimedata$Population)   

Translation:   
Crimedata$Population   
-- is the population column in your crime dataset     
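gsub(",", "", Crimedata$Population)   
-- finds every comma in the Crimedata$Population column and replaces it with nothing,   
and the <- dumps the results back into the Crimedata$Population column.   
The result is still text, so finish with Crimedata$Population <- as.numeric(Crimedata$Population) before doing any math.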

Fancy!   

**Question**:
"How do I get rid of the last row that only has text in the table that I just imported?"   

**Answer**:
Get rid of row using base R commands   
Crimedata <- Crimedata[-c(187), ]    

Translation:     
Crimedata[-c(187), ]    looks for row #187, which has this garbage text, and gives it the big minus sign, which eliminates it.    
Crimedata <-  dumps this slimmed down table back into your table and so you are good to go.   


 **--30--**