##################################################
## installing & loading packages
##################################################
## install.packages('tidyverse') ## you have to do this once to install the package on each computer
## library() stops with an error if the package is missing (require() would only return FALSE)
library(tidyverse) ## you have to do this each R session to load the package

##################################################
## vectors
##################################################
3 + 3
1:100 ## [1] and other [numbers] that R prints out are indices of vector elements
x <- rnorm(100) ## create 100 standard normal random numbers
x
length(x[x > 0]) ## select the vector of elements of x that are >0, then return the length of that vector

##################################################
## data.frames
##################################################
## creating fake data in base R
ncat <- 4 ## imagine a variable that has 4 categories (groups A, B, C, D)
numPerCat <- 30 ## and you sample 30 individuals per category
nr <- numPerCat * ncat ## this is how many observations you'll have (i.e. Number of Rows)
myData <- data.frame(
  x = 1:nr, ## first column of myData will be the sequence of 1:nr
  y = rnorm(nr), ## create random std normal numbers of the same length
  cat = rep(LETTERS[1:ncat], numPerCat) ## group them into groups A, B, C, D
)
myData
myData[myData$y > 0, ] ## look at myData for rows where y > 0

##################################################
## tidyverse
## Playing with the same data in the tidyverse
myTibble <- as_tibble(myData) ## in the tidyverse, we use tibbles instead of data.frames
myTibble
filter(myTibble, y > 0) ## select rows where y>0

##################################################
## object classes tell us what an object is.
## This affects how R handles it
class(myData)
class(myTibble)

##################################################
## subsetting myTibble
rowsToShow <- c(54, 99, 33, 27, 100, 100, 100)
slice(myTibble, rowsToShow) ## tidyverse version
myData[rowsToShow, ] ## base R version

## divide the data into categories, then select rows where y is not an NA, then look at correlation
## of y with itself (Stephanie's question)
group_by(myTibble, cat) %>%
  filter(!is.na(y)) %>%
  summarize(cor(y, y))

boxplot(myTibble$y ~ myTibble$cat) ## create a boxplot with base R
boxplot(y ~ cat, data = myTibble) ## a more succinct version of the last line
?boxplot
ggplot(data = myTibble, mapping = aes(cat, y)) + geom_boxplot() ## boxplot with ggplot2 (tidyverse)

##################################################
## data input
###############
## absolute path
yts <- read_csv('~/Downloads/Youth_Tobacco_Survey__YTS__Data.csv')

###############
## relative path
## note: setwd() changes the working directory for the rest of the session
setwd('~/Downloads')
yts <- read_csv('Youth_Tobacco_Survey__YTS__Data.csv')

## look at the data
View(yts)

###############
## Exercise 1:
## Make a boxplot of the % of men of all races who are currently smokers by state in 2015
###############
names(yts) ## look at names of the data set
nrow(yts) ## how many rows are there?
## look at how many rows we have after filtering
filter(yts, MeasureDesc == "Smoking Status") %>% nrow()
filter(yts, MeasureDesc == "Smoking Status" & Response == "Current") %>% nrow()
filter(yts, MeasureDesc == "Smoking Status" & Response == "Current" & YEAR == 2015) %>% nrow()
filter(yts, MeasureDesc == "Smoking Status" & Response == "Current" & YEAR == 2015 & Gender == "Male") %>% nrow()

## Save the new filtered data subset
yts2 <- filter(yts, MeasureDesc == "Smoking Status" & Response == "Current" & YEAR == 2015 & Gender == "Male")
View(yts2)
boxplot(Data_Value ~ LocationAbbr, data = yts2) ## boxplot base R
ggplot(data = yts2, mapping = aes(LocationAbbr, Data_Value)) + geom_boxplot() ## boxplot ggplot2
###############

##################################################
## data output
##################################################
## save new filtered data to a CSV file that you can open in spreadsheet software or elsewhere
write.csv(yts2, 'male current smokers by state.csv')

## looking in the current directory
dir()
dir(full.names = TRUE)
getwd()

##################################################
## Save one R object
save(yts, file = 'yts.Rda')
## Save two R objects
save(yts, yts2, file = 'yts.Rda')
## Remove all R objects
rm(list = ls())
ls() ## list all objects in R environment/workspace
whatsLoaded <- load(file = 'yts.Rda') ## load saved data in
whatsLoaded ## get names of saved data

##################################################
## the Example function for learning R
example(mean) ## you can use example in R to see example code for how stuff works

##################################################
## Control Flow
##################################################
## if/else
if (-5 > 0) {
  print("I'm bad at math")
} else {
  print("I'm pretty good at math")
}
ifelse(-5 > 0, "I'm bad at math", "I'm pretty good at math")
args(ifelse)
?ifelse

##################################################
## Logical indices and character strings
currFilenms <- dir()
currFilenms ## load filenames into a character
## string vector
?grepl ## pattern finding in character strings
fileToLoad <- currFilenms[grepl('csv', currFilenms)] ## which of those file names have CSV in it?
fileToLoad
## now load the file; read.csv() needs a single file name, so only load
## when exactly one file name matched
if (length(fileToLoad) == 1) {
  mydat <- read.csv(fileToLoad)
} else {
  message("expected exactly one file with csv in its name, found ", length(fileToLoad))
}

letters ## alphabet
letters[grep('o', letters)] ## return the element of letters that has o in it (grep gives its numeric index)
letters[grepl('o', letters)] ## same element, selected with grepl's logical vector saying whether each alphabet letter has o in it
which(grepl('o', letters)) ## which of the true/false vector is true (numeric element)

## for loops

## plotting by category in ggplot

##
## create a data frame that has 6 columns, 5 rows, where 2 of the columns are numeric, and 4 of them are categories (of your choice)