## R
## Minimalistic introduction
## NOTE: Everything after # is ignored by the R interpreter

## Forbes billionaires
## https://www.forbes.com/billionaires/

## Vectors (think columns in spreadsheets)
## character vector (vector of strings): names
milioner <- c(
  "Jeff Bezos", "Bill Gates", "Bernard Arnault", "Warren Buffett",
  "Larry Ellison", "Amancio Ortega", "Mark Zuckerberg", "Jim Walton",
  "Alice Walton", "Rob Walton"
)

## numeric vector: majatek (wealth)
majatek <- c(113, 98, 76, 67.5, 59, 55.1, 54.7, 54.6, 54.4, 54.1)

## another numeric vector: wiek (age)
wiek <- c(56, 64, 71, 89, 75, 84, 35, 71, 70, 75)

## character vector of (fake) dates: urodzony (born)
urodzony <- c(
  "1964-01-01", "1956-01-01", "1949-01-01", "1931-01-01", "1945-01-01",
  "1936-01-01", "1985-01-01", "1949-01-01", "1950-01-01", "1945-01-01"
)

## country ISO codes
kraj <- c("US", "US", "FR", "US", "US", "ES", "US", "US", "US", "US")

## branch/sector (F&I = finance and investment)
branza <- c(
  "Technology", "Technology", "Retail", "F&I", "Technology",
  "Retail", "Technology", "Retail", "Retail", "Retail"
)

## Data frame = fundamental R data structure (think spreadsheet)
## data frame = list of named vectors; column names are taken from the
## names of the vectors passed in
forbes <- data.frame(milioner, majatek, urodzony, wiek, kraj, branza)

## str(ucture) = inspect the structure of any R object (incl. data frames)
str(forbes)

## print the whole frame (not particularly useful)
forbes

## Print a single column (more useful)
## df$column = access a column of df
forbes$majatek

## access columns using numbers
## columns are numbered from 1
forbes[, 3]

## useful functions
## Print the first 13 rows (this frame has only 10 rows, so all are shown)
head(forbes, n = 13)

## Print some rows from the bottom of the data frame
tail(forbes)

## Number of rows in a data frame
nrow(forbes)

##
## Basic (descriptive) statistics
## mean of column majatek (wealth) from data frame forbes
mean(forbes$majatek)

## some summary statistics
summary(forbes$wiek)

## standard deviation
sd(forbes$wiek)

## minimum value
min(forbes$wiek)

## Simple plot
## plot with default chart (R decides based on the data to plot)
## Guess the plot type :-)
plot(forbes$wiek)
plot(forbes$wiek, forbes$majatek)

## histogram
hist(forbes$wiek)

##
## In practice data are not typed in manually as above, but loaded from
## files or URLs.
## As an example, the file FB2020.csv contains data on Forbes billionaires.
## Import data into a data frame (function read.csv):
forbes <- read.csv(
  "FB2020.csv",
  dec = ".",
  sep = ";",
  header = TRUE,
  na.strings = "NA"
)
## dec = decimal point character
## sep = cell separator character (';')
## header = whether the file starts with a row of variable names
##          (TRUE) or not (FALSE)
## na.strings = how missing values are encoded (here as 'NA')

## examine structure
str(forbes)

## indexing
## third column
w <- forbes[, 3]
## first row
p <- forbes[1, ]
p

## columns can also be selected by name
w <- forbes$worth
billionares <- forbes[, "name"]
## or:
billionares <- forbes$name

## Basic statistics again
summary(forbes$worth)

## the result of summary() can be stored and indexed like a named vector
forbes.summary <- summary(forbes$worth)
str(forbes.summary)
forbes.summary[1]

## Extract element 'Median'
forbes.summary["Median"]
forbes.median <- forbes.summary["Median"]
forbes.median

## na.rm = TRUE drops missing values before computing the mean
forbes.mean <- mean(forbes$age, na.rm = TRUE)

## Printing results
print(forbes.mean)

## Formatted print
sprintf("%.2f", forbes.mean)

## or (cat = concatenate)
cat("Median:", forbes.median)

summary(forbes)

## The distribution is extremely skewed
## frequency of each distinct value of worth
forbes.table <- table(forbes$worth)
length(forbes.table)
forbes.table

## group worth into intervals of width 10 and count values per interval
cut(forbes$worth, breaks = seq(0, 120, by = 10))
qq <- table(cut(forbes$worth, breaks = seq(0, 120, by = 10)))

## qq already holds the counts per interval, so draw them as bars;
## hist(qq) would instead build a histogram of the counts themselves
barplot(qq)

## Core R can be extended by attaching libraries. Some
## libraries are very useful.
## Installing a library is very easy:
## install.packages("library")

## Filtering/selecting/modifying data frames with dplyr/tidyverse
## Filtering rows
library("dplyr")
## install.packages("dplyr")
## installation is automatic (upon confirmation) in RStudio

## filter all billionaires who are non-US:
nonus.forbes <- filter(forbes, country != "US")
nonus.forbes

## Modification operations can be chained into one sequence with the
## %>% operator
## Example: filter some rows %>% select some columns:
nonus.forbes.worth <- filter(forbes, country != "US") %>%
  select(worth)

## Compute total wealth:
sum(nonus.forbes.worth)

## Print all countries without repetitions
select(forbes, country) %>%
  unique()

## How many countries:
select(forbes, country) %>%
  unique() %>%
  nrow()

## alternative syntax:
forbes %>%
  select(country) %>%
  unique() %>%
  nrow()

## Grouping is useful for summarization
## Example: compute total wealth and number of billionaires by country
by.country <- forbes %>%
  group_by(country) %>% ## group by country
  summarise(t = sum(worth), n = n()) ## summarise (in groups)

## Print results
by.country

## #####################################################################
## Graphics
## default chart for a vector of numbers
## #####################################################################
plot(forbes$worth)

## boxplot
boxplot(forbes$worth)

## histogram (appearance can be tuned, e.g. with the col= and breaks=
## arguments)
hist(forbes$worth)

## boxplot of worth for every branch
boxplot(worth ~ branch, data = forbes)

## number of distinct branches
select(forbes, branch) %>%
  unique() %>%
  nrow()

## mutate = create new variables
## here: overwrite branch, keeping two branches under short codes and
## mapping every other branch to "Other"
forbes.x <- mutate(
  forbes,
  branch = case_when(
    branch == "Technology" ~ "IT",
    branch == "Fashion & Retail" ~ "FR",
    TRUE ~ "Other"
  )
)
forbes.x

boxplot(worth ~ branch, data = forbes.x)

## basic plot for two vectors (XY-plot)
plot(forbes.x$age, forbes.x$worth)

## Much better quality graphics with ggplot2
## (must be attached before qplot() can be called)
library("ggplot2")

## quick-plot (default charts are generated based on data types)
## NOTE(review): qplot() is deprecated in recent ggplot2 releases;
## consider ggplot() + geom_point() when updating this tutorial
qplot(data = forbes.x, age, worth, color = branch)
qplot(data = forbes.x, age, worth, facets = . ~ branch)