#' Parse an Amazon review-list HTML page into a data frame.
#'
#' Extracts one row per review from a parsed review page using CSS selectors,
#' and optionally downloads each reviewer's member-reviews page to add
#' reviewer-level statistics.
#'
#' Side effects: installs `pacman` if it is missing and loads/attaches the
#' packages listed below (installing them when absent). When `reviewer` is
#' TRUE one HTTP request is made per review author.
#'
#' @param doc A parsed HTML document (as returned by `rvest::read_html()`) of
#'   an Amazon review-list page.
#' @param reviewer If TRUE (default), fetch every reviewer's profile page and
#'   append the `rver_*` columns; if FALSE, return the `author` column instead
#'   and make no requests.
#' @param delay Approximate pause in seconds used for throttling. A uniform
#'   jitter in [-1, 1] is added and the result is floored at 0. The pause is
#'   applied before each reviewer-page request, or once before returning when
#'   `reviewer` is FALSE. Negative values trigger a warning and are treated
#'   as 0.
#' @return A data frame with columns `title`, `date`, `ver.purchase` (1/0),
#'   `format`, `stars`, `comments`, `helpful`, plus either `author`
#'   (`reviewer = FALSE`) or `rver_url`, `rver_avgrating_10`, `rver_numrev`,
#'   `rver_numhelpful`, `rver_rank` (`reviewer = TRUE`).
amazon_scraper <- function(doc, reviewer = TRUE, delay = 0) {
  if (!"pacman" %in% installed.packages()[, "Package"]) install.packages("pacman")
  pacman::p_load_gh("trinker/sentimentr")
  pacman::p_load(RCurl, XML, dplyr, stringr, rvest, audio)

  if (delay < 0) {
    warning("delay was less than 0: set to 0")
    delay <- 0
  }

  # Sleep for roughly `delay` seconds (jittered, never negative); no-op when
  # delay is 0.
  pause <- function() {
    if (delay > 0) Sys.sleep(max(0, delay + runif(1, -1, 1)))
  }

  # Strip leading and trailing whitespace.
  trim <- function(x) gsub("^\\s+|\\s+$", "", x)

  title <- doc %>%
    html_nodes("#cm_cr-review_list .a-color-base") %>%
    html_text()

  author <- doc %>%
    html_nodes(".review-byline .author") %>%
    html_text()

  # Keep only the text that follows "on " in the review-date node.
  date <- doc %>%
    html_nodes("#cm_cr-review_list .review-date") %>%
    html_text() %>%
    gsub(".*on ", "", .)

  # The same node carries both the "Verified Purchase" badge and the
  # product-format text, so read it once and derive both columns from it.
  review_data <- doc %>%
    html_nodes(".review-data.a-spacing-mini") %>%
    html_text()
  ver.purchase <- as.numeric(grepl("Verified Purchase", review_data))
  format <- gsub("Color: |\\|.*|Verified.*", "", review_data)

  # First digit of the rating text.
  stars <- doc %>%
    html_nodes("#cm_cr-review_list .review-rating") %>%
    html_text() %>%
    str_extract("\\d") %>%
    as.numeric()

  comments <- doc %>%
    html_nodes("#cm_cr-review_list .review-text") %>%
    html_text()

  # Vote text spells a single vote as "One"; map it to 1 before converting.
  helpful <- doc %>%
    html_nodes(".cr-vote-buttons .a-color-secondary") %>%
    html_text() %>%
    str_extract("[:digit:]+|One") %>%
    gsub("One", "1", .) %>%
    as.numeric()

  if (isTRUE(as.logical(reviewer))) {
    # Turn each author's profile link into a member-reviews URL.
    rver_id <- doc %>%
      html_nodes(".review-byline .author") %>%
      html_attr("href") %>%
      gsub("/ref=cm_cr_othr_d_pdp\\?ie=UTF8", "", .) %>%
      gsub("/gp/pdp/profile/", "", .)
    # paste0() with a zero-length vector would yield the bare prefix and
    # trigger a bogus request, so handle "no authors" explicitly.
    rver_url <- if (length(rver_id) > 0) {
      paste0("https://www.amazon.com/gp/cdp/member-reviews/", rver_id)
    } else {
      character(0)
    }

    # Download every profile exactly once, throttled by `delay`.
    rver_pages <- lapply(rver_url, function(u) {
      pause()
      read_html(u)
    })

    # Mean star rating over the review images listed on the profile page.
    rver_avgrating_10 <- vapply(rver_pages, function(page) {
      page %>%
        html_nodes(".small span img") %>%
        html_attr("title") %>%
        gsub("out of.*|stars", "", .) %>%
        as.numeric() %>%
        mean(na.rm = TRUE)
    }, numeric(1), USE.NAMES = FALSE)

    # Always a list with one character vector per reviewer (never simplified
    # to a matrix), so x[1] / x[2] below index nodes of a single profile.
    rver_prof <- lapply(rver_pages, function(page) {
      page %>%
        html_nodes("div.small, td td td .tiny") %>%
        html_text()
    })

    rver_numrev <- vapply(rver_prof, function(x) {
      as.numeric(gsub("\n Customer Reviews: |\n", "", x[1]))
    }, numeric(1), USE.NAMES = FALSE)

    rver_numhelpful <- vapply(rver_prof, function(x) {
      gsub(".*Helpful Votes:|\n", "", x[2]) %>%
        trim() %>%
        as.numeric()
    }, numeric(1), USE.NAMES = FALSE)

    # Punctuation (e.g. "#" and thousands separators) is stripped with base
    # gsub so the rank parses as a number without needing the tm package.
    rver_rank <- vapply(rver_prof, function(x) {
      gsub(".*Top Reviewer Ranking:|Helpful Votes:.*|\n", "", x[2]) %>%
        gsub("[[:punct:]]+", "", .) %>%
        trim() %>%
        as.numeric()
    }, numeric(1), USE.NAMES = FALSE)

    df <- data.frame(
      title, date, ver.purchase, format, stars, comments, helpful,
      rver_url, rver_avgrating_10, rver_numrev, rver_numhelpful, rver_rank,
      stringsAsFactors = FALSE
    )
  } else {
    # No requests are made here; pause once so callers that loop over pages
    # are still throttled by `delay`.
    pause()
    df <- data.frame(
      title, author, date, ver.purchase, format, stars, comments, helpful,
      stringsAsFactors = FALSE
    )
  }

  df
}