--- title: "R Notebook" output: html_document: df_print: paged editor_options: chunk_output_type: console --- # Data managements with dplyr ## filter ```{r} require(readxl) diab <-read_excel("diabetes.xls") head(diab, n = 4) summary(diab) diab <- diab %>% mutate_if(is.character, as.factor) summary(diabetes) require(tidyverse) #require(dplyr) diab_filt <- filter(diab, tabac=="No fumador", edat>=50) # and & - or | head(diab_filt, n = 4) summary(diab_filt) #diab$tabac <- as.factor(diab$tabac) ##**Warning!** by default alphabetic order is used when creating factor levels. rm(diabetes) #Drop dataset from env ``` ## select ```{r} diab_small <- select(diab, mort, edat, tabac, sbp) head(diab_small, n = 4) diab_small_consec <- select(diab, mort:bmi) # Select consecutive variables diab_small_noconsec <- select(diab, !(mort:bmi)) # Non-select consecutive variables ``` ## sort ```{r} diab_sort <-arrange(diab,desc(mort), edat) head(diab_sort, n = 4) ``` ## mutate ```{r} diab_mut <- mutate(diab,edatcat =cut(edat, breaks =c(0,40,50,60,70,120)),highbmi = bmi> mean(bmi)) tail(diab_mut, n = 4) summary(diab_mut) table(diab_mut$edatcat, diab_mut$highbmi) diab_mut_2 <- mutate(diab,edatcat =cut(edat, breaks =c(0,40,50,60,70,120), include.lowest=TRUE, labels=c("0-40","41-50","51-60", "61-70", "71-120"))) ``` ## E1 Find all individual that: 1. Had a sbp higher than 160 (filter()) ```{r} diab_sbp <- filter(diab, sbp>160) summary(diab_sbp) ``` 2. Had a sbp higher than 160 or tabac was 'Fumador' ```{r} diab_sbp_tabac <- filter(diab, sbp>160 | tabac=='Fumador') summary(diab_sbp_tabac) ``` - What happens if you include the name of a variable multiple times in a select() call? ```{r} diab_mselect <- select(diab, sbp, sbp, dbp) summary(diab_mselect);names(diab_mselect) ``` 3. Sort individual to find the most 'tempsviu'. (arrange() ) ```{r} diab_ltempsviu <- arrange(diab, tempsviu) head(diab_ltempsviu, 4) ``` # The pipe operator ```{r} diab40 <-filter(diab, mort=="Vivo"&edat<40) diab40_small <-select(diab40, edat, dbp) head(diab40_small,n = 4) ``` ```{r} diab40_small <- diab %>% filter(mort=="Vivo"&edat<40) %>% select(edat, dbp) head(diab40_small,n = 4) ``` ## E2 Replicate the last exercice using 'pipes': ```{r} df <- filter(diab,sbp > 160 | tabac == "Fumador") dfs <- select(df, tempsviu ,bmi,sbp,sbp) dfsa <- arrange(dfs, desc(tempsviu)) ``` ```{r} diab_pipes <- diab %>% filter(sbp > 160 | tabac == "Fumador") %>% select(tempsviu ,bmi,sbp,sbp) %>% arrange(desc(tempsviu)) ``` # Merging datasets ```{r} band_members band_instruments ``` ```{r} bind_rows(band_members, band_instruments) ## PROBLEMES! bind_cols(band_members, band_instruments) ``` ## mutating ```{r} band_members %>% inner_join(band_instruments, by = "name") ``` ```{r} band_members %>% left_join(band_instruments) ``` ##E3 What happens if you run these lines? ```{r} band_members %>% right_join(band_instruments) band_members %>% full_join(band_instruments) ``` #More ```{r} diab %>% group_by(mort) %>% summarise(mean(sbp)) ```