# WEBSCRAPING ---- # 1.0 LIBRARIES ---- library(tidyverse) # Main Package - Loads dplyr, purrr library(rvest) # HTML Hacking & Web Scraping library(xopen) # Quickly opening URLs library(jsonlite) # converts JSON files to R objects library(glue) # concatenate strings library(stringi) # character string/text processing # 2.0 COLLECT PRODUCT FAMILIES ---- url_home <- "https://www.canyon.com/en-de" xopen(url_home) # Open links directly from RStudio to inspect them # Read in the HTML for the entire webpage html_home <- read_html(url_home) # Web scrape the ids for the families bike_family_tbl <- html_home %>% # Get the nodes for the families ... html_nodes(css = ".js-navigationDrawer__list--secondary") %>% # ...and extract the infromation of the id attribute html_attr('id') %>% # Remove the product families Gear and Outlet and Woman # (because the female bikes are also listed with the others) discard(.p = ~stringr::str_detect(.x,"WMN|WOMEN|GEAR|OUTLET")) %>% # Convert vector to tibble enframe(name = "position", value = "family_class") %>% # Add a hashtag so we can get nodes of the categories by id (#) mutate( family_id = str_glue("#{family_class}") ) bike_family_tbl # 3.0 COLLECT PRODUCT CATEGORIES ---- # Combine all Ids to one string so that we will get all nodes at once (seperated by the OR operator ",") family_id_css <- bike_family_tbl %>% .[["family_id"]] %>% stringr::str_c(collapse = ", ") # Extract the urls of the entries from the href attribute bike_category_tbl <- html_home %>% # Select nodes by the ids html_nodes(css = family_id_css) %>% # Going further down the tree and select nodes by class # Selecting two classes makes it specific enough html_nodes(css = ".navigationListSecondary__listItem .js-ridestyles") %>% html_attr('href') %>% # Convert vector to tibble enframe(name = "position", value = "subdirectory") %>% # Add the domain, because we will get only the subdirectories mutate( url = str_glue("https://www.canyon.com{subdirectory}") ) %>% # Some categories are listed multiple times. # We only need unique values distinct(url) # 4.0 COLLECT BIKE DATA ---- # 4.1 Get URL for each bike of the Product categories # select first bike category url bike_category_url <- bike_category_tbl$url[1] # Alternatives for selecting values # bike_url <- bike_category_tbl %$% url %>% .[1] # bike_url <- bike_category_tbl %>% pull(url) %>% .[1] # bike_url <- deframe(bike_category_tbl[1,]) # bike_url <- bike_category_tbl %>% first %>% first xopen(bike_category_url) # Get the URLs for the bikes of the first category html_bike_category <- read_html(bike_category_url) bike_url_tbl <- html_bike_category %>% # Get the 'a' nodes, which are hierarchally underneath # the class productTile__contentWrapper html_nodes(css = ".productTile__contentWrapper > a") %>% html_attr("href") %>% # Remove the query parameters of the URL (everything after the '?') str_remove(pattern = "\\?.*") %>% # Convert vector to tibble enframe(name = "position", value = "url") # 4.2 Extract the descriptions (since we have retrieved the data already) bike_desc_tbl <- html_bike_category %>% # Get the nodes in the meta tag where the attribute itemprop equals description html_nodes('.productTile__productSummaryLeft > meta[itemprop="description"]') %>% # Extract the content of the attribute content html_attr("content") %>% # Convert vector to tibble enframe(name = "position", value = "description") # 4.3 Get even more data from JSON files bike_json_tbl <- html_bike_category %>% html_nodes(css = '.productGrid__listItem.xlt-producttile > div') %>% html_attr("data-gtm-impression") %>% # Convert the JSON format to dataframe # map runs that function on each element of the list map(fromJSON) %>% # need JSON ### need lists # Extract relevant information of the nested list map(purrr::pluck, 2, "impressions") %>% # Need purrr and expl above # Set "not defined" and emtpy fields to NA (will be easier to work with) map(na_if, "not defined") %>% map(na_if, "") %>% # The class of the col dimension56 varies between numeric and char. # This converts this column in each list to numeric map(~mutate_at(.,"dimension56", as.numeric)) %>% # Stack all lists together bind_rows() %>% # Convert to tibble so that we have the same data format as_tibble() %>% # Add consecutive numbers so that we can bind all data together # You could have also just use bind_cols() rowid_to_column(var='position') %>% left_join(bike_desc_tbl) %>% left_join(bike_url_tbl) # 4.4 Wrap it into a function get_bike_data <- function(url) { html_bike_category <- read_html(url) # Get the URLs bike_url_tbl <- html_bike_category %>% html_nodes(css = ".productTile__contentWrapper > a") %>% html_attr("href") %>% str_remove(pattern = "\\?.*") %>% enframe(name = "position", value = "url") # Get the descriptions bike_desc_tbl <- html_bike_category %>% html_nodes(css = '.productTile__productSummaryLeft > meta[itemprop="description"]') %>% html_attr("content") %>% enframe(name = "position", value = "description") # Get JSON data bike_json_tbl <- html_bike_category %>% html_nodes(css = '.productGrid__listItem.xlt-producttile > div') %>% html_attr("data-gtm-impression") %>% map(fromJSON) %>% # need JSON ### need lists map(purrr::pluck, 2, "impressions") %>% map(na_if, "not defined") %>% map(na_if, "") %>% map(~mutate_at(.,"dimension56", as.numeric)) %>% map(~mutate_at(.,"price", as.numeric)) %>% bind_rows() %>% as_tibble() %>% rowid_to_column(var='position') %>% left_join(bike_desc_tbl) %>% left_join(bike_url_tbl) } # Run the function with the first url to check if it is working bike_category_url <- bike_category_tbl$url[1] bike_data_tbl <- get_bike_data(url = bike_category_url) bike_data_tbl # 4.4.1 Map the function against all urls # Extract the urls as a character vector bike_category_url_vec <- bike_category_tbl %>% pull(url) # Run the function with every url as an argument bike_data_lst <- map(bike_category_url_vec, get_bike_data) # Merge the list into a tibble bike_data_tbl <- bind_rows(bike_data_lst) saveRDS(bike_data_tbl, "bike_data_tbl.rds") # 4.4.1 Alternative with a for loop # Create an empty tibble, that we can populate bike_data_tbl <- tibble() # Loop through all urls for (i in seq_along(bike_category_tbl$url)) { bike_category_url <- bike_category_tbl$url[i] bike_data_tbl <- bind_rows(bike_data_tbl, get_bike_data(bike_category_url)) # Wait between each request to reduce the load on the server # Otherwise we could get blocked Sys.sleep(5) # print the progress print(i) } # We have some duplicates bike_data_cleaned_tbl %>% group_by(id) %>% filter(n()>1) %>% View() # Filter non Canyon bikes (based on id length) and add an empty column for the colors bike_data_cleaned_tbl <- bike_data_tbl %>% filter(nchar(.$id) == 4) %>% distinct(id, .keep_all = T) # 5.0 Get all color variations for each bike ---- # Extract all bike urls bike_url_vec <- bike_data_cleaned_tbl %>% pull(url) # Create function to get the variations get_colors <- function(url) { color_variations <- url %>% read_html() %>% # Get all 'script nodes' and convert to char html_nodes(css = "script") %>% as.character() %>% # Select the node, that contains 'window.deptsfra' str_subset(pattern = "window.deptsfra") %>% # remove the chars that do not belong to the json # 1. replace at the beginning everything until the first "{" with "" gsub("^[^\\{]+", "", .) %>% # 2. replace at the end everything after the last "}" with "" gsub("[^\\}]+$", "", .) %>% # Convert from json to an r object and pick the relevant values fromJSON() %>% purrr::pluck("productDetail", "variationAttributes", "values", 1, "value") %>% # Paste all results into one string str_c(collapse = ";") } # Run the function over all urls and add result to bike_data_cleaned_tbl # This will take a long time (~ 20-30 minutes) because we have to iterate over many bikes bike_data_colors_tbl <- bike_data_cleaned_tbl %>% mutate(color_variations = map(bike_url_vec, get_colors)) saveRDS(bike_data_colors_tbl, "bike_data_colors_tbl.rds") # 6.0 Create the urls for each variation ---- bike_data_wrangled_tbl <- bike_data_colors_tbl %>% # Remove unnecessary columns select(name, id, category, dimension50, dimension63, price, metric4, description, url, color_variations) %>% # Rename some columns rename(year = dimension50) %>% rename(price_dollar = price) %>% rename(price_euro = metric4) %>% # Make the data tyidy # Use str_count(test$color_variations, ";") %>% max() # 2 --> max 3 color variants separate(col = color_variations, into = c("color1","color2","color3"), sep = ";", convert = T) %>% pivot_longer(cols = c("color1", "color2", "color3"), names_to = "color", values_drop_na = T) %>% select(-color) %>% rename(color = value) %>% # Merge url and query parameters for the colors mutate(url_color = glue("{url}?dwvar_{id}_pv_rahmenfarbe={color}")) %>% # Use library(stringi) to replace the last dash (HTLM format of a dash) mutate(url_color = ifelse(grepl(pattern = "/", .$color), stringi::stri_replace_last_fixed(.$url_color, "/", "%2F"), .$url_color)) # 7.0 Stock availability for each frame size ---- get_sizes <- function(url) { json <- url %>% read_html() %>% # Get all 'script nodes' and convert to char html_nodes(css = "script") %>% as.character() %>% # Select the node, that contains 'window.deptsfra' str_subset(pattern = "window.deptsfra") %>% # remove the chars that do not belong to the json # 1. replace at the beginning everything until the first "{" with "" gsub("^[^\\{]+", "", .) %>% # 2. replace at the end everything after the last "}" with "" gsub("[^\\}]+$", "", .) %>% # Convert from json to an r object and pick the relevant values fromJSON(flatten = T) %>% purrr::pluck("productDetail", "variationAttributes", "values", 2) %>% # select(id, value, available, availability)# %>% select(id, value, availability.onlyXLeftNumber) %>% # Rename rename(id_size = id) %>% rename(size = value) %>% rename(stock_availability = availability.onlyXLeftNumber) %>% # Conver to tibble as_tibble() } # Pull url vector bike_url_color_vec <- bike_data_wrangled_tbl %>% pull(url_color) # Map bike_data_sizes_tbl <- bike_data_wrangled_tbl %>% mutate(size = map(bike_url_color_vec, get_sizes)) # Unnest bike_data_sizes_tbl <- bike_data_sizes_tbl %>% select(-url) %>% unnest(size) saveRDS(bike_data_sizes_tbl, "bike_data_sizes_tbl.rds")