#! /usr/bin/env Rscript
# Time-stamp: <2019-12-17 11:25:00 christophe@pallier.org>

# Download datasets described by a json file using 'dafter' syntax
# (see https://github.com/vinzeebreak/dafter/)

# `require()` is kept on purpose: it only warns when a package is missing, so
# sourcing this file still succeeds when e.g. varhandle (only needed by
# can.be.numeric) is not installed.
require("rjson")
require("tools") # Required for md5sum
require("varhandle")

###############################
########### Encoding ##########
###############################

# Tell whether the vector `x` can be "safely" converted to numeric.
# It counts the cells that check.numeric() reports as non numeric and returns
# TRUE when there are none (NAs being considered numeric).
can.be.numeric <- function(x) {
  stopifnot(is.atomic(x) || is.list(x)) # check if x is a vector
  non_numeric_count <- sum(!check.numeric(x))
  non_numeric_count == 0
}

# Convert every column of the data frame `df` to UTF-8, then turn the columns
# that only contain numeric-looking cells (ignoring whitespace) into numeric
# columns; the other columns become character. Column names are trimmed.
# Returns the converted data frame.
#
# NOTE(review): html_encoding_guess() is not provided by any package loaded in
# this file (presumably rvest, attached by the caller) -- confirm.
# NOTE(review): column names are not re-encoded. The original code attempted
# `Encoding(colnames(df)[colnames(df) == col]) <- "UTF-8"` with `col` being an
# integer index, which could only ever match ASCII digit names and therefore
# never had any effect; it has been dropped.
fix.encoding <- function(df) {
  n_cols <- ncol(df)
  if (n_cols == 0) {
    return(df)
  }

  # `collapse` (not `sep`) is needed to join the cells of the first column
  # into the single string from which the encoding is guessed.
  sample_text <- paste(df[, 1], collapse = " ")
  # first cell of the first column of the result, presumably the best guess
  source_encoding <- html_encoding_guess(sample_text)[[1]][1]
  if (grepl("UTF-16BE", source_encoding)) {
    source_encoding <- "latin1"
  }

  for (col in seq_len(n_cols)) {
    df[, col] <- iconv(df[, col], from = source_encoding, to = "UTF-8")
  }

  for (col in seq_len(n_cols)) {
    converted <- df[, col]
    # Remove spaces to handle numeric columns
    compact <- gsub("[[:space:]]", "", converted)
    if (can.be.numeric(compact)) {
      df[, col] <- as.numeric(compact)
    } else {
      df[, col] <- as.character(converted)
    }
  }

  colnames(df) <- trimws(colnames(df))
  df
}

###############################
###############################

# Return the path of the local folder where to put datasets (created if
# needed). Resolution order: $OPENLEXICON_DATASETS, then
# $XDG_DATA_HOME/openlexicon_datasets, then ~/openlexicon_datasets.
get_data.home <- function() {
  data_home <- Sys.getenv("OPENLEXICON_DATASETS")
  if (data_home == "") {
    xdg_data_home <- Sys.getenv("XDG_DATA_HOME")
    base_dir <- if (xdg_data_home == "") path.expand("~") else xdg_data_home
    data_home <- file.path(base_dir, "openlexicon_datasets")
  }
  dir.create(data_home, showWarnings = FALSE, recursive = TRUE)
  data_home
}

# Return the path of the folder where to put the logs of `app_name` (created
# if needed). $SHINY_LOG is used as is when set; otherwise the folder is
# ~/shiny_log/<app_name> locally and /var/shiny_log/<app_name> in production.
get_log.home <- function(app_name) {
  log_home <- Sys.getenv("SHINY_LOG")
  if (log_home == "") {
    # TRUE if used locally, FALSE in production (SHINY_PORT is then set)
    is_local <- Sys.getenv("SHINY_PORT") == ""
    log_root <- if (is_local) path.expand("~") else "/var"
    log_home <- file.path(log_root, "shiny_log", app_name)
  }
  dir.create(log_home, showWarnings = FALSE, recursive = TRUE)
  log_home
}

# Extract the information fields from the json file at `json_url`.
# Returns a list with the elements description, readme, website, language
# (first tag), id_lang, mandatory_columns and column_names.
get_info_from_json <- function(json_url) {
  print(paste("Parsing", json_url))
  json_data <- fromJSON(file = json_url)
  list(
    description = json_data$description,
    readme = json_data$readme,
    website = json_data$website,
    language = json_data$tags[1],
    id_lang = json_data$id_lang,
    mandatory_columns = json_data$mandatory_columns,
    column_names = json_data$column_names
  )
}

# TRUE when the md5 checksum of the file `path` equals `expected`.
# Never returns NA: a missing/unreadable file or a missing checksum counts as
# a mismatch. The hexadecimal digests are compared case-insensitively.
.md5_matches <- function(path, expected) {
  if (!is.character(expected) || length(expected) != 1L || is.na(expected)) {
    return(FALSE)
  }
  actual <- unname(tools::md5sum(path))
  !is.na(actual) && tolower(actual) == tolower(expected)
}

# Download (only if needed) the dataset `filename` listed in the json
# description file `json_url`.
# Returns the path to the local version of the dataset, or NULL (with a
# warning) when the checksum of the downloaded file does not match or when
# `filename` is not listed in the json file.
# Example:
# lexique = readRDS(get_dataset_from_json("http://www.lexique.org/databases/_json/Lexique383.json", "Lexique383.rds"))
get_dataset_from_json <- function(json_url, filename) {
  destname <- file.path(get_data.home(), filename)
  json_data <- fromJSON(file = json_url)

  for (u in json_data$urls) {
    if (is.null(u$url) || basename(u$url) != filename) {
      next
    }

    # (re)download when there is no local copy or when it is out of date
    if (!file.exists(destname) || !.md5_matches(destname, u$md5sum)) {
      download.file(u$url, destname, mode = "wb")
    }

    if (!.md5_matches(destname, u$md5sum)) {
      warning(
        "Something is wrong: the md5sums don't match. Either the upstream files are inconsistent or someone is messing with your internet connection."
      )
      return(NULL)
    }
    return(destname)
  }

  warning("No file named '", filename, "' is listed in ", json_url)
  NULL
}

# Known datasets: name -> c(url of the json description, name of the rds file)
locations <- list(
  Anagrammes = c("http://www.lexique.org/databases/_json/anagrammes.json", "Anagrammes.rds"),
  AoA32 = c("http://www.lexique.org/databases/_json/AoA-32lang.json", "AoA32lang.rds"),
  AoA_FamConcept_1225 = c("http://www.lexique.org/databases/_json/AoA_FamConcept_1225.json", "AoA_FamConcept_1225.rds"),
  AoA_FreqSub_1493 = c("http://www.lexique.org/databases/_json/AoA_FreqSub_1493.json", "AoA_FreqSub_1493.rds"),
  Assoc_366 = c("http://www.lexique.org/databases/_json/Assoc_366.json", "Assoc_366.rds"),
  Assoc_520 = c("http://www.lexique.org/databases/_json/Assoc_520.json", "Assoc_520.rds"),
  # Concr_ContextAv_ValEmo_Arous_1659=c("http://www.lexique.org/databases/_json/Concr_ContextAv_ValEmo_Arous_1659.json", "Concr_ContextAv_ValEmo_Arous_1659/Concr_ContextAv_ValEmo_Arous_1659.rds"),
  # Concr_Imag_FreqSub_Valemo_866=c("http://www.lexique.org/databases/_json/Concr_Imag_FreqSub_Valemo_866.json", "Concr_Imag_FreqSub_Valemo_866/Concr_Imag_FreqSub_Valemo_866.rds"),
  #XX=c("http://www.lexique.org/databases/_json/FrenchLexiconProject-words.json", "flp-words.rds"),
  FreqSub_Adulte_Senior_660 = c("http://www.lexique.org/databases/_json/FreqSub_Adulte_Senior_660.json", "FreqSub_Adulte_Senior_660.rds"),
  FreqSub_Imag_1916 = c("http://www.lexique.org/databases/_json/FreqSub_Imag_1916.json", "FreqSub_Imag_1916.rds"),
  FreqSub_Imag_3600 = c("http://www.lexique.org/databases/_json/FreqSub_Imag_3600.json", "FreqSub_Imag_3600.rds"),
  Imag_1493 = c("http://www.lexique.org/databases/_json/Imag_1493.json", "Imag_1493.rds"),
  #XX=c("http://www.lexique.org/databases/_json/Lexique382.json", "Lexique382.rds"),
  Lexique3 = c("http://www.lexique.org/databases/_json/Lexique383.json", "Lexique383.rds"),
  # LexiqueInfraGP=c("http://www.lexique.org/databases/_json/LexiqueInfra-Graphèmes-Phonèmes.json", "Lexique.Infra.Corresp.Graphème.Phonème.rds"),
  # XX=c("http://www.lexique.org/databases/_json/Lexique-Infra-Stats-Infra.json", ""),
  # XX=c("http://www.lexique.org/databases/_json/Manulex.json", ""),
  Megalex_auditory = c("http://www.lexique.org/databases/_json/Megalex-auditory.json", "Megalex-auditory.rds"),
  Megalex_visual = c("http://www.lexique.org/databases/_json/Megalex-visual.json", "Megalex-visual.rds"),
  SemantiQc_auditory = c("http://www.lexique.org/databases/_json/SemantiQc_auditory.json", "SemantiQc_auditory.rds"),
  SemantiQc_familiarity_concept = c("http://www.lexique.org/databases/_json/SemantiQc_familiarity_concept.json", "SemantiQc_familiarity_concept.rds"),
  SemantiQc_visual = c("http://www.lexique.org/databases/_json/SemantiQc_visual.json", "SemantiQc_visual.rds"),
  SensoryExp_1659 = c("http://www.lexique.org/databases/_json/SensoryExp_1659.json", "SensoryExp_1659.rds"),
  #XX=c("http://www.lexique.org/databases/_json/SUBTLEX-US-corpus.json", "SUBTLEX-US-corpus.rds"),
  SubtlexUS = c("http://www.lexique.org/databases/_json/SUBTLEX-US.json", "SUBTLEXus.rds"),
  Valemo_Adultes_604 = c("http://www.lexique.org/databases/_json/Valemo_Adultes_604.json", "Valemo_Adultes_604.rds"),
  ValEmo_Arous_1286 = c("http://www.lexique.org/databases/_json/ValEmo_Arous_1286.json", "ValEmo_Arous_1286.rds"),
  ValEmo_Arous_Imag_835 = c("http://www.lexique.org/databases/_json/ValEmo_Arous_Imag_835.json", "ValEmo_Arous_Imag_835.rds"),
  Valemo_Enfants_600 = c("http://www.lexique.org/databases/_json/Valemo_Enfants_600.json", "Valemo_Enfants_600.rds"),
  Voisins = c("http://www.lexique.org/databases/_json/Voisins.json", "Voisins.rds"),
  WorldLex_EN = c("http://www.lexique.org/databases/_json/WorldLex-English.json", "WorldLex_EN.rds"),
  WorldLex_FR = c("http://www.lexique.org/databases/_json/WorldLex-French.json", "WorldLex_FR.rds"),
  WorldLex_AF = c("http://www.lexique.org/databases/_json/WorldLex-Afrikaans.json", "WorldLex-Afrikaans.rds"),
  WorldLex_ALB = c("http://www.lexique.org/databases/_json/WorldLex-Albanian.json", "WorldLex-Albanian.rds"),
  WorldLex_AMH = c("http://www.lexique.org/databases/_json/WorldLex-Amharic.json", "WorldLex-Amharic.rds"),
  WorldLex_ARA = c("http://www.lexique.org/databases/_json/WorldLex-Arabic.json", "WorldLex-Arabic.rds"),
  WorldLex_ARM = c("http://www.lexique.org/databases/_json/WorldLex-Armenian.json", "WorldLex-Armenian.rds"),
  WorldLex_AZE = c("http://www.lexique.org/databases/_json/WorldLex-Azeri.json", "WorldLex-Azeri.rds"),
  WorldLex_BEN = c("http://www.lexique.org/databases/_json/WorldLex-Bengali.json", "WorldLex-Bengali.rds"),
  WorldLex_BOS = c("http://www.lexique.org/databases/_json/WorldLex-Bosnian.json", "WorldLex-Bosnian.rds"),
  WorldLex_CAT = c("http://www.lexique.org/databases/_json/WorldLex-Catalan.json", "WorldLex-Catalan.rds"),
  WorldLex_CHI = c("http://www.lexique.org/databases/_json/WorldLex-Chinese-Simplified.json", "WorldLex-Chinese-Simplified.rds"),
  WorldLex_CRO = c("http://www.lexique.org/databases/_json/WorldLex-Croatian.json", "WorldLex-Croatian.rds"),
  WorldLex_CZE = c("http://www.lexique.org/databases/_json/WorldLex-Czech.json", "WorldLex-Czech.rds"),
  WorldLex_DK = c("http://www.lexique.org/databases/_json/WorldLex-Danish.json", "WorldLex-Danish.rds"),
  WorldLex_NL = c("http://www.lexique.org/databases/_json/WorldLex-Dutch.json", "WorldLex-Dutch.rds"),
  WorldLex_EST = c("http://www.lexique.org/databases/_json/WorldLex-Estonian.json", "WorldLex-Estonian.rds"),
  WorldLex_FI = c("http://www.lexique.org/databases/_json/WorldLex-Finnish.json", "WorldLex-Finnish.rds"),
  WorldLex_GE = c("http://www.lexique.org/databases/_json/WorldLex-Georgian.json", "WorldLex-Georgian.rds"),
  WorldLex_DE = c("http://www.lexique.org/databases/_json/WorldLex-German.json", "WorldLex-German.rds"),
  WorldLex_GRE = c("http://www.lexique.org/databases/_json/WorldLex-Greek.json", "WorldLex-Greek.rds"),
  WorldLex_GL = c("http://www.lexique.org/databases/_json/WorldLex-Greenlandic.json", "WorldLex-Greenlandic.rds"),
  WorldLex_GU = c("http://www.lexique.org/databases/_json/WorldLex-Gujarati.json", "WorldLex-Gujarati.rds"),
  WorldLex_HE = c("http://www.lexique.org/databases/_json/WorldLex-Hebrew.json", "WorldLex-Hebrew.rds"),
  WorldLex_HI = c("http://www.lexique.org/databases/_json/WorldLex-Hindi.json", "WorldLex-Hindi.rds"),
  WorldLex_HU = c("http://www.lexique.org/databases/_json/WorldLex-Hungarian.json", "WorldLex-Hungarian.rds"),
  WorldLex_ICE = c("http://www.lexique.org/databases/_json/WorldLex-Icelandic.json", "WorldLex-Icelandic.rds"),
  WorldLex_ID = c("http://www.lexique.org/databases/_json/WorldLex-Indonesian.json", "WorldLex-Indonesian.rds"),
  WorldLex_ITA = c("http://www.lexique.org/databases/_json/WorldLex-Italian.json", "WorldLex-Italian.rds"),
  WorldLex_JAP = c("http://www.lexique.org/databases/_json/WorldLex-Japanese.json", "WorldLex-Japanese.rds"),
  WorldLex_KN = c("http://www.lexique.org/databases/_json/WorldLex-Kannada.json", "WorldLex-Kannada.rds"),
  WorldLex_KZ = c("http://www.lexique.org/databases/_json/WorldLex-Kazakh.json", "WorldLex-Kazakh.rds"),
  WorldLex_KHM = c("http://www.lexique.org/databases/_json/WorldLex-Khmer.json", "WorldLex-Khmer.rds"),
  WorldLex_KR = c("http://www.lexique.org/databases/_json/WorldLex-Korean.json", "WorldLex-Korean.rds"),
  WorldLex_LV = c("http://www.lexique.org/databases/_json/WorldLex-Latvian.json", "WorldLex-Latvian.rds"),
  WorldLex_LIT = c("http://www.lexique.org/databases/_json/WorldLex-Lithuanian.json", "WorldLex-Lithuanian.rds"),
  WorldLex_MK = c("http://www.lexique.org/databases/_json/WorldLex-Macedonian.json", "WorldLex-Macedonian.rds"),
  WorldLex_ML = c("http://www.lexique.org/databases/_json/WorldLex-Malayalam.json", "WorldLex-Malayalam.rds"),
  WorldLex_MY = c("http://www.lexique.org/databases/_json/WorldLex-Malaysian.json", "WorldLex-Malaysian.rds"),
  WorldLex_MN = c("http://www.lexique.org/databases/_json/WorldLex-Mongolian.json", "WorldLex-Mongolian.rds"),
  WorldLex_NEP = c("http://www.lexique.org/databases/_json/WorldLex-Nepali.json", "WorldLex-Nepali.rds"),
  WorldLex_NOB = c("http://www.lexique.org/databases/_json/WorldLex-Norwegian.json", "WorldLex-Norwegian.rds"),
  WorldLex_PER = c("http://www.lexique.org/databases/_json/WorldLex-Persian.json", "WorldLex-Persian.rds"),
  WorldLex_PL = c("http://www.lexique.org/databases/_json/WorldLex-Polish.json", "WorldLex-Polish.rds"),
  WorldLex_POR_BR = c("http://www.lexique.org/databases/_json/WorldLex-Portuguese-Brazil.json", "WorldLex-Portuguese-Brazil.rds"),
  WorldLex_POR_EU = c("http://www.lexique.org/databases/_json/WorldLex-Portuguese-Europe.json", "WorldLex-Portuguese-Europe.rds"),
  WorldLex_PAN = c("http://www.lexique.org/databases/_json/WorldLex-Punjabi.json", "WorldLex-Punjabi.rds"),
  WorldLex_RO = c("http://www.lexique.org/databases/_json/WorldLex-Romanian.json", "WorldLex-Romanian.rds"),
  WorldLex_RU = c("http://www.lexique.org/databases/_json/WorldLex-Russian.json", "WorldLex-Russian.rds"),
  WorldLex_SER = c("http://www.lexique.org/databases/_json/WorldLex-Serbian.json", "WorldLex-Serbian.rds"),
  WorldLex_SIN = c("http://www.lexique.org/databases/_json/WorldLex-Sinhala.json", "WorldLex-Sinhala.rds"),
  WorldLex_SK = c("http://www.lexique.org/databases/_json/WorldLex-Slovak.json", "WorldLex-Slovak.rds"),
  WorldLex_SL = c("http://www.lexique.org/databases/_json/WorldLex-Slovenian.json", "WorldLex-Slovenian.rds"),
  WorldLex_ES_SA = c("http://www.lexique.org/databases/_json/WorldLex-Spanish-Latin-America.json", "WorldLex-Spanish-Latin-America.rds"),
  WorldLex_ES = c("http://www.lexique.org/databases/_json/WorldLex-Spanish-Spain.json", "WorldLex-Spanish-Spain.rds"),
  WorldLex_SWA = c("http://www.lexique.org/databases/_json/WorldLex-Swahili.json", "WorldLex-Swahili.rds"),
  WorldLex_SWE = c("http://www.lexique.org/databases/_json/WorldLex-Swedish.json", "WorldLex-Swedish.rds"),
  WorldLex_TGL = c("http://www.lexique.org/databases/_json/WorldLex-Tagalog.json", "WorldLex-Tagalog.rds"),
  WorldLex_TA = c("http://www.lexique.org/databases/_json/WorldLex-Tamil.json", "WorldLex-Tamil.rds"),
  WorldLex_TEL = c("http://www.lexique.org/databases/_json/WorldLex-Telugu.json", "WorldLex-Telugu.rds"),
  WorldLex_TUR = c("http://www.lexique.org/databases/_json/WorldLex-Turkish.json", "WorldLex-Turkish.rds"),
  WorldLex_UK = c("http://www.lexique.org/databases/_json/WorldLex-Ukrainian.json", "WorldLex-Ukrainian.rds"),
  WorldLex_URD = c("http://www.lexique.org/databases/_json/WorldLex-Urdu.json", "WorldLex-Urdu.rds"),
  WorldLex_UZ = c("http://www.lexique.org/databases/_json/WorldLex-Uzbek.json", "WorldLex-Uzbek.rds"),
  WorldLex_VIE = c("http://www.lexique.org/databases/_json/WorldLex-Vietnamese.json", "WorldLex-Vietnamese.rds"),
  WorldLex_CY = c("http://www.lexique.org/databases/_json/WorldLex-Welsh.json", "WorldLex-Welsh.rds")
)

# Return the local locations of the datasets listed in `listofdatasets`
# (downloading them from the internet if needed), as a list named after the
# datasets. `locations` maps each dataset name to c(json url, rds file name).
# Unknown dataset names raise an error. A dataset that could not be fetched is
# reported with a warning and left out of the result (it used to break the
# naming of the result with a length-mismatch error).
# Example:
# get_datasets(c('Lexique3', 'Voisins', 'Anagrammes'), locations)
get_datasets <- function(listofdatasets, locations) {
  unknown <- setdiff(listofdatasets, names(locations))
  if (length(unknown) > 0) {
    stop("Unknown dataset(s): ", paste(unknown, collapse = ", "), call. = FALSE)
  }

  paths <- lapply(listofdatasets, function(name) {
    get_dataset_from_json(locations[[name]][1], locations[[name]][2])
  })
  names(paths) <- listofdatasets

  failed <- vapply(paths, is.null, logical(1))
  if (any(failed)) {
    warning(
      "Could not fetch dataset(s): ",
      paste(names(paths)[failed], collapse = ", "),
      call. = FALSE
    )
  }
  paths[!failed]
}

# Download all the datasets listed in `locations` (only if not already
# downloaded) and return their local locations, as get_datasets() does.
get_all_datasets <- function(locations) {
  get_datasets(names(locations), locations)
}

# Load in memory the rds files listed in `list_rds`, in global variables with
# the names matching names(list_rds). Progress is reported with message().
load_rds <- function(list_rds) {
  for (n in names(list_rds)) {
    message(paste('Loading ', n, ' from ', list_rds[[n]]))
    assign(n, readRDS(list_rds[[n]]), envir = .GlobalEnv)
  }
  invisible(NULL)
}

##########################################################################################
# Note: the following functions should be obsolete now that there is `get_datasets`

default_remote <- "https://raw.githubusercontent.com/chrplr/openlexicon/master/datasets-info/_json/"
lexique_remote <- "http://www.lexique.org/databases/_json"

# Read the dataset registered under `name` in the global `locations` table.
# The legacy helpers below go through it so that they always request the same
# json/rds pair as get_datasets().
.read_located_dataset <- function(name) {
  location <- locations[[name]]
  readRDS(get_dataset_from_json(location[1], location[2]))
}

get_lexique383_rds <- function() {
  .read_located_dataset("Lexique3")
}

get_worldlex.french_rds <- function() {
  .read_located_dataset("WorldLex_FR")
}

get_worldlex.english_rds <- function() {
  .read_located_dataset("WorldLex_EN")
}

# NOTE(review): this helper used to request "SUBTLEX-us.rds" whereas
# `locations` lists "SUBTLEXus.rds"; it now follows `locations` -- confirm
# against the published SUBTLEX-US.json.
get_subtlex.us_rds <- function() {
  .read_located_dataset("SubtlexUS")
}

# NOTE(review): this helper used to request "AoA-32lang.rds" whereas
# `locations` lists "AoA32lang.rds"; it now follows `locations` -- confirm
# against the published AoA-32lang.json.
get_aoa32_rds <- function() {
  .read_located_dataset("AoA32")
}

################################################################################
# Usage:
# source('https://raw.githubusercontent.com/chrplr/openlexicon/master/datasets-info/fetch_datasets.R')
# load_rds(get_datasets(c('Lexique3', 'Voisins'), locations))
# or
# lexique <- get_lexique383_rds()

## OBSOLETE:
## fetch_dataset <-
##   function(dataset_id,
##            location = default_remote,
##            filename = NULL,
##            format = NULL)
##     # download, only if needed, a dataset from openlexicon databases
##     # returns a list with information about the dataset and a list of local filenames containing the datatables):
##     ## list(name=dataset_id,
##     ##      datatables=tables,
##     ##      description=description,
##     ##      readme=readme,
##     ##      website=website)
##   {
##     destname <- ''
##     json_file <- paste(location, dataset_id, '.json', sep = "")
##     json_data <- fromJSON(file = json_file)
##     description <- json_data$description
##     readme <- json_data$readme
##     website <- json_data$website
##     language <- json_data$tag[1]
##     mandatory_columns <- json_data$mandatory_columns
##     tables = list()
##     for (u in json_data$urls)
##     {
##       fname <- basename(u$url)
##       if (!is.null(filename) && (filename != fname))
##         next # skip this file
##       if (!is.null(format) &&
##           tools::file_ext(fname) != format)
##         # check if format (extension) matches
##         next # skip this file
##       destname <- file.path(get_data.home(), fname)
##       warning(paste("Downloading in ", destname))
##       if (!file.exists(destname))
##       {
##         download.file(u$url, destname, mode = 'wb')
##         if (md5sum(destname) != u$md5sum)
##         {
##           warning(
##             "Something is wrong: the md5sums don't match. Either the upstream files are inconsistent or someone is messing with your internet connection."
##           )
##         } else
##         {
##           print(paste("File", destname, "downloaded without issue."))
##           tables <- append(tables, destname)
##         }
##       } else
##         # The local file exists
##       {
##         if (md5sum(destname) != u$md5sum) {
##           warning(
##             paste(
##               "the md5 sum of your local file",
##               destname,
##               md5sum(destname),
##               "doesn't match the distant version",
##               u$md5sum,
##               ". Aborting. Delete the local file if necessary"
##             )
##           )
##         }
##         else
##         {
##           warning(paste(
##             "You already have the file",
##             destname,
##             "which is up to date."
##           ))
##           tables <- append(tables, destname)
##         }
##       }
##     }
##     if (length(tables) == 0)
##     {
##       warning("could not find a file with a matching format")
##     }
##     list(
##       name = dataset_id,
##       datatables = tables,
##       description = description,
##       readme = readme,
##       language = language,
##       website = website,
##       mandatory_columns=mandatory_columns
##     )
##   }