#' @title Scraping Google Scholar #' @author Shea Fyffe sfyffe@@gmail.com #' @description Wrapper function that collect data from a Google Scholar Search #' @seealso \link{https://scholar.google.com/robots.txt} For preventing scraper from being caught. #' @param ... String to be passed to Google Scholar search. #' Use \code{min =} and \code{max = } to restirict number of articles returned. #' Use \code{year = } as a single-element vector to pull articles newer than the given year. #' Use \code{year = } as a double-element vector to pull articles between two dates. #' \code{year[1]} sets signifies 'after date' and \code{year[2]} 'before date'. #' Use \code{article =} to search within a particular article. *note* this should be the article #' given by Google (e.g., usually a string of digits), define this as a character. #' @details \code{max = } will default to 250 if note explicitly defined. #' \code{article = } will use search terms within the article defined. #' @examples #' \dontrun{ #' ## Searches for 'Engagement' AND 'organizational behavior' and limits return to 250 #' res <- build_search("Engagement", "organizational behavior", max = 250) #' } #' #' \dontrun{ #' ## Searches for articles newer that 2010 with the words 'Job satistfaction' and limits return to 200 starts at 100 #' res <- build_search("Job satisfaction", year = 2010, min = 100, max = 200) #' } #' #' \dontrun{ #' ## Searches for articles newer that 2010 but earlier than 2015 with the words 'Job satistfaction' and limits return to 200 starts at 100 #' res <- build_search("Job satisfaction", year = c(2010, 2015), min = 100, max = 200) #' } #' #' \dontrun{ #' ## Searches for articles WITHIN article 10572144307387769361 newer that 2010 but earlier than 2015 with the words 'Job satistfaction' #' ## and limits return to 200 starts at 100 #' res <- build_search("Job satisfaction", article = "10572144307387769361", year = c(2010, 2015), min = 100, max = 200) #' } #' @export #' TODO(clean up arguments passed to ...) scrape_scholar <- function(...) { .tmp <- paste(c(...)) if (any(sapply(.tmp, function(x) grepl("^https://", x)))) { .url <- .tmp } else { .url <- build_search(...) } .out <- list() for (i in seq_along(.url)) { .out[[i]] <- try(scrape(.url[i])) if ((inherits(.out[[i]], "try-error") || is.na(.out[[i]])) & i > 1) { .i <- i - 1L .out <- .out[seq(.i)] } else if ((inherits(.out[[i]], "try-error") || is.na(.out[[i]])) & i == 1) { stop( sprintf( "ERROR: %s open:%s and check for CAPTCHA and/or mispellings", .out[[i]], .url[i] ) ) } } .out <- do.call("rbind", .out) return(.out) } #' @title Scrape Child Function #' @description This function actually does the scraping of Google Scholar #' @export #' @import curl rvest xml2 httr NULL scrape <- function(url, user_agent = NULL, verbose = FALSE) { if (is.null(user_agent)) { user_agent <- sample(.use_age(), 1) } mproxy <- .get_proxy() .mpr <- sample(seq(nrow(mproxy)), 1L) # set a random user_agent and a timeout .cfg <- list( httr::user_agent(user_agent), httr::timeout(3L), httr::use_proxy( url = mproxy[.mpr, 1], port = mproxy[.mpr, 2] ) ) t0 <- Sys.time() my_page <- tryCatch( xml2::read_html(httr::content(httr::GET(url, .cfg), as = "text")), error = function(err) { stop(print(err)) } ) t1 <- Sys.time() .cap <- rvest::html_text(rvest::html_nodes(xml2::xml_child(my_page, 2), "script")[1]) if (length(.cap) != 0L) { if (grepl("gs_captcha_cb()", .cap)) { stop( "Oh no! Google thinks you're a robot... Take a break, go make human friends, and try again later." ) } } if (verbose) { return(xml2::html_structure(my_page)) } else if (is.na(my_page)) { on.exit(closeAllConnections()) return(invisible(my_page)) } else { .out <- parse_page(my_page) } .pause(as.numeric(t1 - t0)) on.exit(closeAllConnections()) return(.out) } #' @title Build Search #' @author Shea Fyffe sfyffe@@gmail.com #' @description Helper function for scraping Google Scholar #' @param ... String to be passed to Google Scholar search. #' Use \code{min =} and \code{max = } to restirict number of articles returned. #' Use \code{year = } as a single-element vector to pull articles newer than the given year. #' Use \code{year = } as a double-element vector to pull articles between two dates. #' \code{year[1]} sets signifies 'after date' and \code{year[2]} 'before date'. #' Use \code{article =} to search within a particular article. *note* this should be the article #' given by Google (e.g., usually a string of digits), define this as a character. #' @details \code{max = } will default to 250 if note explicitly defined. #' \code{article = } will use search terms within the article defined. #' @examples #' \dontrun{ #' ## Searches for 'Engagement' AND 'organizational behavior' and limits return to 250 #' res <- build_search("Engagement", "organizational behavior", max = 250) #' } #' #' \dontrun{ #' ## Searches for articles newer that 2010 with the words 'Job satistfaction' and limits return to 200 starts at 100 #' res <- build_search("Job satisfaction", year = 2010, min = 100, max = 200) #' } #' #' \dontrun{ #' ## Searches for articles newer that 2010 but earlier than 2015 with the words 'Job satistfaction' and limits return to 200 starts at 100 #' res <- build_search("Job satisfaction", year = c(2010, 2015), min = 100, max = 200) #' } #' #' \dontrun{ #' ## Searches for articles WITHIN article 10572144307387769361 newer that 2010 but earlier than 2015 with the words 'Job satistfaction' #' ## and limits return to 200 starts at 100 #' res <- build_search("Job satisfaction", article = "10572144307387769361", year = c(2010, 2015), min = 100, max = 200) #' } #' @seealso \code{scrape_scholar} #' @export build_search <- function(...) { .search <- list(...) if ("year" %in% names(.search)) { .year <- .search[["year"]] .search[["year"]] <- NULL } if ("max" %in% names(.search)) { .max <- .search[["max"]] if (as.numeric(.max) %% 10 != 0) { stop("Please verify max is divisible by 10") } .search[["max"]] <- NULL } if ("min" %in% names(.search)) { .min <- .search[["min"]] if (as.numeric(.min) %% 10 != 0) { stop("Please verify min is divisible by 10") } .search[["min"]] <- NULL } if ("article" %in% names(.search)) { .cites <- .search[["article"]] .search[["article"]] <- NULL } if (exists(".cites")) { .base <- .get_base(.cites) } else { .base <- .get_base(NULL) } .args <- c(exists(".year"), exists(".max"), exists(".min")) if (is.recursive(.search)) { .search <- unlist(.search) } .search <- sapply(.search, utils::URLencode) .search <- paste(.search, collapse = "+") if (nchar(.search) > 255L) { stop("search string too many characters, please shorten") } if (all(.args)) { .max <- seq(as.numeric(.min), as.numeric(.max), by = 10) if (length(.year) == 2L) { .add <- sprintf( "start=%d&as_ylo=%s&as_yhi=%s&q=%s&btnG=", .max, .year[1], .year[2], .search ) } else { .add <- sprintf("start=%d&as_ylo=%s&q=%s&btnG=", .max, .year, .search) } } else if (.args[1] & !.args[2] & !.args[3]) { .max <- seq(0, 250, by = 10) if (length(.year) == 2L) { .add <- sprintf( "start=%d&as_ylo=%s&as_yhi=%s&q=%s&btnG=", .max, .year[1], .year[2], .search ) } else { .add <- sprintf("start=%d&as_ylo=%s&q=%s&btnG=", .max, .year, .search) } } else if (!.args[1] & (.args[2] & .args[3])) { .max <- seq(as.numeric(.min), as.numeric(.max), by = 10) .add <- sprintf("start=%d&q=%s&btnG=", .max, .search) } else if (!.args[1] & .args[2] & !.args[3]) { .max <- seq(0, as.numeric(.max), by = 10) .add <- sprintf("start=%d&q=%s&btnG=", .max, .search) } else if (!.args[1] & !.args[2] & .args[3]) { .max <- seq(as.numeric(.min), as.numeric(.min) + 250, by = 10) .add <- sprintf("start=%d&q=%s&btnG=", .max, .search) } else if (.args[1] & .args[2] & !.args[3]) { .max <- seq(0, as.numeric(.max), by = 10) if (length(.year) == 2L) { .add <- sprintf( "start=%d&as_ylo=%s&as_yhi=%s&q=%s&btnG=", .max, .year[1], .year[2], .search ) } else { .add <- sprintf("start=%d&as_ylo=%s&q=%s&btnG=", .max, .year, .search) } } else if (.args[1] & !.args[2] & .args[3]) { .max <- seq(as.numeric(.min), as.numeric(.min) + 250, by = 10) if (length(.year) == 2L) { .add <- sprintf( "start=%d&as_ylo=%s&as_yhi=%s&q=%s&btnG=", .max, .year[1], .year[2], .search ) } else { .add <- sprintf( "start=%d&hl=en&as_vis=1?&as_sdt=0,47&as_ylo=%s&q=%s&btnG=", .max, .year, .search ) } } else { .max <- seq(0, 250, by = 10) .add <- sprintf("start=%d&q=%s&btnG=", .max, .search) } .base <- paste0(.base, .add) return(.base) } #' @title Count Results #' @details just a helper to get the number of results from a search .count_results <- function(url) { url <- xml2::read_html(x = url) .res <- rvest::html_nodes(url, ".gs_ab_mdw") .res <- rvest::html_text(.res)[2] .res <- gsub(",", "", sub("About (.*?) results.*", "\\1", .res)) return(as.numeric(.res)) } #' @title Parse web page #' @details Helper function that finds if articles are missing nodes (i.e., Abstracts, links, etc.). #' Also cleans data. #' @export #' @import rvest xml2 parse_page <- function(.html) { .html <- rvest::html_nodes(.html, "div.gs_ri") .html <- do.call(rbind, lapply(.html, function(x) { Title <- tryCatch( xml2::xml_text(rvest::xml_node(x, ".gs_rt")), error = function(err) { NA } ) Meta <- tryCatch( xml2::xml_text(rvest::xml_node(x, ".gs_a")), error = function(err) { NA } ) Abstract <- tryCatch( xml2::xml_text(rvest::xml_node(x, ".gs_rs")), error = function(err) { NA } ) Link <- tryCatch( rvest::html_attr(rvest::xml_node(x, "h3 a"), "href"), error = function(err) { NA } ) x <- data.frame(Title, Meta, Abstract, Link, stringsAsFactors = FALSE) return(x) })) if (is.null(.html)) { return(NA) } else { .html <- .parsing_helper(.html) if (nrow(.html) > 1L) { .html <- data.frame(apply(.html, 2, function(x) parsing_helper(x)), stringsAsFactors = FALSE ) } else { .html <- sapply(.html, parsing_helper) } .html <- .html[c("Title", "Author", "Journal", "Year", "Link", "Abstract")] return(.html) } } parsing_helper <- function(.vec) { .vec <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", .vec, perl = TRUE) return(.vec) } .parsing_helper <- function(.df) { .meta <- strsplit(.df$Meta, "\\s-\\s") .meta <- lapply(.meta, .f) .meta <- do.call("rbind", .meta) .df$Title <- gsub("\\[[^\\]]*\\]", "", .df$Title, perl = TRUE) .df$Abstract <- gsub("[^[:alnum:][:blank:]+?&/\\-]", " ", .df$Abstract, perl = TRUE) .df$Abstract <- gsub("^Page\\s\\d+.", "", .df$Abstract) .df$Abstract <- gsub("- |^\\s+|\\s+$", "", .df$Abstract) .df$Abstract <- tolower(.df$Abstract) .df$Meta <- NULL .df <- cbind(.df, .meta) return(.df) } #' @title Extract Author, Journal, and Year #' @details Helper function that extracts Author, Journal, and Year from a single node .f <- function(.vec) { if (length(.vec) == 3 || length(.vec) == 2) { Author <- gsub("\\s-.*", "", .vec[1]) if (grepl(",", .vec[2])) { Date <- gsub(".*(\\d{4})", "\\1", .vec[2]) Journal <- gsub("(.+?)(\\,.*)", "\\1", .vec[2]) } else if (grepl("^\\d{4}$|^\\s\\d{4}$", .vec[2])) { Date <- gsub(".*(\\d{4})", "\\1", .vec[2]) Journal <- NA } else if (!grepl("^\\d{4}$|^\\s\\d{4}$", .vec[2])) { Date <- NA Journal <- .vec[2] } else { Date <- NA Journal <- NA } } else if (length(.vec == 1)) { if (grepl("([A-Z]+ )", .vec[1])) { Author <- .vec[1] Date <- NA Journal <- NA } else if (grepl("^\\d{4}$|^\\s\\d{4}$", .vec[1])) { Author <- NA Date <- gsub(".*(\\d{4})", "\\1", .vec[1]) Journal <- NA } else if (!grepl("\\.", .vec[1])) { Author <- NA Date <- NA Journal <- .vec[1] } else { Author <- NA Date <- NA Journal <- NA } } .res <- data.frame( Author = Author, Journal = Journal, Year = Date, stringsAsFactors = FALSE ) return(.res) } #' @title Randomize User-Agent Helper #' @details Function that randomly selects user-agents to make the bot seem 'human' #' @export .use_age <- function() { .out <- c( # Chrome "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", # Firefox "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)", "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)", "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)" ) return(.out) } #' @title Pause Helper #' @details Function that randomly pauses after scraping a web page to make the bot seem 'human'. #' to increase scraping speed, lower sampling time. .pause <- function(x) { .wait <- sample(seq(1, 2, .25), 1) * (1 / (1 / 2.5)) .wait <- 10 * (.wait * x) message(sprintf("waiting for %.3f seconds...to prevent being blocked", .wait)) Sys.sleep(.wait) # pause to let connection work closeAllConnections() gc() } #' @title Base Url Helper #' @details Builds a base-url and determines if search should be global (i.e., across all articles) #' or local (i.e., within article cited by) .get_base <- function(.cites = NULL) { if (!is.null(.cites)) { .base <- sprintf( "https://scholar.google.com/scholar?&hl=en&as_vis=1?&as_sdt=1,47&cites=%s&scipsc=1&", .cites ) } else { .base <- "https://scholar.google.com/scholar?&hl=en&as_vis=1?&as_sdt=1,47&" } return(.base) } #' @title Randomize Proxy Helper #' @details Function that randomly selects proxy string to make the bot seem 'human' #' #' @return A dataframe of proxy information #' @export #' .get_proxy <- function() { if ("mproxy" %in% ls(envir = parent.frame())) { return(get("mproxy", envir = parent.frame())) } else { my_page <- tryCatch( xml2::read_html(x = "https://free-proxy-list.net/"), error = function(err) { stop(print(err)) } ) .proxy <- rvest::html_table(my_page)[[1L]] .proxy <- .proxy[.proxy[, 7] == "yes", ] if (nrow(.proxy[.proxy[, 3] == "US", ]) > 1) { .proxy <- .proxy[.proxy[, 3] == "US", c(1, 2)] } else { .proxy <- .proxy[, c(1, 2)] } return(.proxy) } } #' @title Get count for results returned from a search string #' @return a numeric scalar #' @export results_count <- function(...) { .res <- build_search(...)[1L] .res <- .count_results(.res) return(.res) }