---
title: "Who is Who: Matching taxonomy for mammalian IUCN data to other databases"
author: "Manuela Gonzalez Suarez and Luis D. Verde Arregoitia"
date: "13 February 2017"
output:
  html_document: default
  pdf_document: default
  word_document: default
---
```{r setup, include=FALSE}
## Set the knitr chunk option echo = TRUE as the default for all chunks
knitr::opts_chunk$set(echo = TRUE)
```
#### This script shows how to match the IUCN Red List status data with a species-level database. As an example we use the diet database EltonTraits, but the script could be modified relatively easily to match other databases.
#### EltonTraits reference: Wilman, H., Belmaker, J., Simpson, J., de la Rosa, C., Rivadeneira, M. M. and Jetz, W. (2014), EltonTraits 1.0: Species-level foraging attributes of the world's birds and mammals. Ecology, 95: 2027. doi:10.1890/13-1917.1
#### This blog entry shows the code and includes some additional explanations https://wordpress.com/post/ramblingsofanecologa.wordpress.com/498
#### FIRST: we start by loading the necessary R packages and retrieving the datasets. Using the R package taxize you can actually retrieve the IUCN Red List data directly, but you need a personal API key, so I am not showing that here. You can also download the latest version from the IUCN Red List (www.iucn.org). For this example I just uploaded a list obtained on the 5th January 2017, which is available on GitHub (https://github.com/ManuelaGonzalez/Who-is-Who/)
```##
## Necessary packages
library(taxize)
library(stringdist)
library(rvest)
library(stringi)
library(tidyr)
library(dplyr)
## IUCN Red List: read the mammal list stored in the GitHub repository,
## keeping text columns as character and stripping white space around fields
IUCN <- read.csv(
  file = "https://raw.githubusercontent.com/ManuelaGonzalez/Who-is-Who/master/Mammals_2017_01_05.csv",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
## Combine genus and species into the full "Genus species" name
IUCN$binomial <- paste(IUCN$Genus, IUCN$Species)
## Tidy-up synonyms list
## NOTE(review): the search patterns of the three substitutions below look as if
## they were lost (probably markup or special characters stripped from this
## file). The first one appears to replace a space with a space and the other
## two have an empty pattern and an empty replacement, so they do not alter the
## text. Restore the intended patterns from the original script before relying
## on this clean-up.
IUCN$Synonyms <- gsub(" ", " ", IUCN$Synonyms)
IUCN$Synonyms <- gsub("", "", IUCN$Synonyms)
IUCN$Synonyms <- gsub("", "", IUCN$Synonyms)
## EltonTraits is the example database that will be matched to the IUCN data.
## Read the tab-delimited mammal file with its header row.
EltonTraits_original <- read.table(
  file = "http://www.esapubs.org/archive/ecol/E095/178/MamFuncDat.txt",
  header = TRUE,
  sep = "\t",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
## The downloadable file contains some empty rows (blank scientific name);
## keep only the rows that carry a name
EltonTraits <- EltonTraits_original[EltonTraits_original$Scientific != "", ]
str(EltonTraits)
## Synonym database extracted from this website
## http://www.planet-mammiferes.org/drupal/en/node/20, using code written by
## Dr. Luis Verde
## scrape page
fullPage <- read_html("http://www.planet-mammiferes.org/drupal/en/node/37?taxon=1")
## extract node (identified using the selectorgadget browser extension)
## note that the scraped html is collapsed into a single string
synList <- fullPage %>% html_nodes("#main p") %>% toString()
## split the string at the HTML line-break tags, one piece per entry, and store
## the pieces as a one-column data frame (column V1) of character strings.
## The regex accepts <br>, <br/> and <br />, with or without trailing white
## space, so it does not depend on how the newline after the tag is encoded.
## NOTE(review): the literal pattern of the original script was lost; this
## assumes entries are separated by <br> tags, as the comment above describes.
## Confirm against the page source.
synListMat <- stri_split_regex(synList, "<br\\s*/?>\\s*") %>%
  stri_list2matrix() %>%
  as.data.frame(stringsAsFactors = FALSE)
## split each entry at "=" into columns V1..V9 (probably more than needed, just
## being cautious). Entries with fewer than nine pieces are padded with NA on
## the right; fill = "right" states that explicitly instead of raising the
## warning that previously had to be ignored.
synSepDF <- separate(synListMat, V1, into = paste0("V", 1:9), sep = "=", fill = "right")
## percentage of missing values per column, to see which columns are empty
## (base R replacement for the deprecated dplyr summarise_each()/funs() idiom)
colMeans(is.na(synSepDF)) * 100
## keep only the first five columns
synSepDF <- synSepDF %>% select(V1:V5)
## clean up the first column (species) by removing the boldface tags.
## stringi does not support empty search patterns (it warns and returns NA), so
## the tags have to be spelled out; both <b> and <strong> forms are removed.
## NOTE(review): the tag names were lost from the original pattern; confirm
## which boldface tag the page actually uses.
synSepDF$V1 <- stri_replace_all_regex(synSepDF$V1, "</?(b|strong)>", "")
## drop the rows that don't hold species names (cells that still contain an
## html tag, i.e. a literal "<"); a fixed search needs no regex escaping
synTable <- synSepDF %>% filter(!stri_detect_fixed(V1, "<"))
## give the columns meaningful names: accepted name plus up to four synonyms
synTable <- rename(synTable, species = V1, syn1 = V2, syn2 = V3, syn3 = V4, syn4 = V5)
## flag the taxonomic level from the number of words in the name:
## two words = species, three words = subspecies, anything else stays NA
synTable <- mutate(
  synTable,
  subsp = case_when(
    stri_count_words(species) == 2 ~ "species",
    stri_count_words(species) == 3 ~ "subspecies"
  )
)
## strip leading and trailing white space from every column
Mammal_planet_all <- mutate_all(synTable, stri_trim_both)
## prepare the table used in the matching loop: species-level rows only,
## without the last column that flags the taxonomic level
Mammal_planet <- Mammal_planet_all[which(Mammal_planet_all$subsp == "species"), 1:5]
## write to disk (optional)
# write.csv(synTable,file="mammalPLanet.csv")
```
#### SECOND: this is a loop to match species names from EltonTraits to IUCN. It is presented as a single loop with several steps that could be run separately (which could help when checking for mistakes).
```##
## Columns that store the outcome of the matching loop: the matched IUCN name
## (or a status label), candidate names needing a manual check, and the source
## of the match
EltonTraits$IUCN_binomial <- "no_match"
EltonTraits$IUCN_binomial_issues <- ""
EltonTraits$IUCN_binomial_source <- ""
## Starting value for the name holder used inside the loop
f1 <- "test"
## Maximum string distance accepted for partial matches made with 'amatch' from
## the library stringdist; raise or lower it to allow more or less variation in
## spelling
maxDist <- 5
## Main matching loop. For every row of EltonTraits it looks for the corresponding IUCN name in three steps
## (IUCN names and synonyms, then the Encyclopedia of Life, then the Mammal Planet synonym table) and writes
## the outcome to IUCN_binomial (a name, "no_match", "partial_match" or "multiple_matches"), candidate names
## to IUCN_binomial_issues and the origin of the match to IUCN_binomial_source.
## NOTE(review): 1:nrow() runs over c(1, 0) when the table has no rows; seq_len(nrow()) would skip the loop.
## NOTE(review): grep() is used throughout with the name as a regular expression, so it also matches names
## that merely contain the search string. Where grep() returns several positions and the result is assigned
## to a single cell, only the first name is kept and R warns about the length mismatch.
## NOTE(review): the if() conditions combine tests with the elementwise & and |. Where one side compares
## against IUCN$binomial[grep(...)] and grep() returns several positions, the condition has length > 1,
## which is an error in R >= 4.2.0.
for (i in 1: nrow(EltonTraits)){
##STEP 1: matches IUCN listed species name or synonym to names in my database of interest (in this example EltonTraits)
## name found among the accepted IUCN names
if (length(grep(EltonTraits$Scientific[i], IUCN$binomial))>0){
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(EltonTraits$Scientific[i], IUCN$binomial)]
EltonTraits$IUCN_binomial_source[i]="IUCN"}
## otherwise, name found in exactly one IUCN synonym list: use the accepted name of that row
if ((EltonTraits$IUCN_binomial[i]=="no_match") & (length(grep(EltonTraits$Scientific[i], IUCN$Synonyms))==1)) {
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(EltonTraits$Scientific[i], IUCN$Synonyms)]
EltonTraits$IUCN_binomial_source[i]="IUCN"}
## name found in several synonym lists: flag it and keep all candidate accepted names in the issues column
if ((EltonTraits$IUCN_binomial[i]=="no_match") & (length(grep(EltonTraits$Scientific[i], IUCN$Synonyms))>1)) {
EltonTraits$IUCN_binomial[i]="multiple_matches"
EltonTraits$IUCN_binomial_issues[i]=paste(IUCN$binomial[grep(EltonTraits$Scientific[i], IUCN$Synonyms)], collapse = ';')
EltonTraits$IUCN_binomial_source[i]="IUCN"}
##also try partial matches to account for possible mispellings and variants. Names would be listed as partial_match to ensure manual checking as partial matches may not be taxonomically correct.
## approximate match against the accepted names; the candidate goes to the issues column
if ((EltonTraits$IUCN_binomial[i]=="no_match") & (!is.na(amatch(EltonTraits$Scientific[i], IUCN$binomial, maxDist = maxDist)))) {
EltonTraits$IUCN_binomial[i]="partial_match"
EltonTraits$IUCN_binomial_issues[i]=paste(IUCN$binomial[amatch(EltonTraits$Scientific[i], IUCN$binomial, maxDist = maxDist)], collapse = ';')
EltonTraits$IUCN_binomial_source[i]="IUCN"}
## approximate match against the synonym strings; issues holds the accepted name followed by the matched synonym string
if ((EltonTraits$IUCN_binomial[i]=="no_match") & (!is.na(amatch(EltonTraits$Scientific[i], IUCN$Synonyms, maxDist = maxDist)))){
EltonTraits$IUCN_binomial[i]="partial_match"
EltonTraits$IUCN_binomial_issues[i]=paste(IUCN$binomial[amatch(EltonTraits$Scientific[i], IUCN$Synonyms, maxDist = maxDist)], paste(IUCN$Synonyms[amatch(EltonTraits$Scientific[i], IUCN$Synonyms, maxDist = maxDist)], collapse = ';'), sep = ";")
EltonTraits$IUCN_binomial_source[i]="IUCN"}
##STEP 2: matches names not recognized in step 1 with the EoL, Encyclopedia of Life database to identify additional synomyms and possible matches
if ((EltonTraits$IUCN_binomial[i]=="no_match") | (EltonTraits$IUCN_binomial[i]=="partial_match")) {
EOL_synonym <- eol_search(EltonTraits$Scientific[i])
## continue only if the first cell of the result is not NA
## NOTE(review): presumably NA is what eol_search returns when nothing is found - confirm
if (!is.na(EOL_synonym[[1]][1])){
for (j in 1:nrow(EOL_synonym)){
## f2 keeps the name handled in the previous iteration; f1 becomes the first two words of column 2 of row j
f2 <-f1
f1 <-trimws(paste(strsplit(EOL_synonym[j,2], " ")[[1]][1],strsplit(EOL_synonym[j,2], " ")[[1]][2], " ")) ##listed names are often repeated so this is to avoid rematching the same name
## only names that differ from the previous one are processed
if (f1!=f2){
## EOL name found among the accepted IUCN names
if (length(grep(f1, IUCN$binomial))>0) {
## an earlier partial match is replaced by this match and its issues entry is cleared
if (EltonTraits$IUCN_binomial[i]=="partial_match"){
EltonTraits$IUCN_binomial_issues[i]=""
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(f1, IUCN$binomial)]
EltonTraits$IUCN_binomial_source[i]="EOL-Taxize"}
## a different name was already stored: flag as multiple_matches and collect the names
if ((EltonTraits$IUCN_binomial[i]!="no_match") & (EltonTraits$IUCN_binomial[i]!=IUCN$binomial[grep(f1, IUCN$binomial)])) {
if (EltonTraits$IUCN_binomial_source[i]!="EOL-Taxize"){
EltonTraits$IUCN_binomial[i]="multiple_matches"
EltonTraits$IUCN_binomial_source[i]= paste(EltonTraits$IUCN_binomial_source[i], "EOL-Taxize", sep = ";")}
## NOTE(review): the next two if() blocks are not alternatives. When issues is empty the first one fills it,
## so the second one then also runs and appends the same names again. The first one may also paste the
## label "multiple_matches" (set just above) rather than the previously stored name.
if (EltonTraits$IUCN_binomial_issues[i]==""){
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial[i], paste(IUCN$binomial[grep(f1, IUCN$binomial)], collapse = ';'), sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"}
if (EltonTraits$IUCN_binomial_issues[i]!="") {
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial_issues[i], paste(IUCN$binomial[grep(f1, IUCN$binomial)],collapse = ';'), sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"}
}
## nothing stored yet: accept this match
if (EltonTraits$IUCN_binomial[i]=="no_match"){
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(f1, IUCN$binomial)]
EltonTraits$IUCN_binomial_source[i]="EOL-Taxize"}
}
## EOL name found in exactly one IUCN synonym list: same handling, using the accepted name of that row
if (length(grep(f1, IUCN$Synonyms))==1){
if (EltonTraits$IUCN_binomial[i]=="partial_match"){
EltonTraits$IUCN_binomial_issues[i]=""
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(f1, IUCN$Synonyms)]
EltonTraits$IUCN_binomial_source[i]="EOL-Taxize"}
if ((EltonTraits$IUCN_binomial[i]!="no_match") & (EltonTraits$IUCN_binomial[i]!=IUCN$binomial[grep(f1, IUCN$Synonyms)])) {
if (EltonTraits$IUCN_binomial_source[i]!="EOL-Taxize"){
EltonTraits$IUCN_binomial_source[i]= paste(EltonTraits$IUCN_binomial_source[i], "EOL-Taxize", sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"}
## NOTE(review): same pattern as above - both if() blocks run when issues starts out empty
if (EltonTraits$IUCN_binomial_issues[i]==""){
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial[i], paste(IUCN$binomial[grep(f1, IUCN$Synonyms)],collapse = ';'), sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"}
if (EltonTraits$IUCN_binomial_issues[i]!="") {
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial_issues[i], paste(IUCN$binomial[grep(f1, IUCN$Synonyms)],collapse = ';'), sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"}
}
if (EltonTraits$IUCN_binomial[i]=="no_match"){
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(f1, IUCN$Synonyms)]
EltonTraits$IUCN_binomial_source[i]="EOL-Taxize" }
}
## EOL name found in several synonym lists and nothing stored yet: flag and list the candidates
if ((length(grep(f1, IUCN$Synonyms))>1) & (EltonTraits$IUCN_binomial[i]=="no_match")){
EltonTraits$IUCN_binomial[i]="multiple_matches"
EltonTraits$IUCN_binomial_source[i]="EOL-Taxize"
EltonTraits$IUCN_binomial_issues[i]=paste(IUCN$binomial[grep(f1, IUCN$Synonyms)],collapse = ';')
}
## EOL name found in several synonym lists and something already stored: flag and append the candidates
## NOTE(review): this block also runs right after the previous one, because that one has just changed
## the value away from "no_match"; the candidates are then appended a second time.
if ((length(grep(f1, IUCN$Synonyms))>1) & (EltonTraits$IUCN_binomial[i]!="no_match")){
EltonTraits$IUCN_binomial[i]="multiple_matches"
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial_issues[i], paste(IUCN$binomial[grep(f1, IUCN$Synonyms)],collapse = ';'), sep = ";")
## NOTE(review): the value was set to "multiple_matches" two lines above, so the test below is always true
## and the source is always overwritten with "EOL-Taxize"; the "IUCN" test after it can therefore never be true.
if (EltonTraits$IUCN_binomial[i]!="partial_match"){
EltonTraits$IUCN_binomial_source[i]="EOL-Taxize"}
if (EltonTraits$IUCN_binomial_source[i]=="IUCN"){
EltonTraits$IUCN_binomial_source[i]="IUCN-partial_EOL-Taxize"}
}
}
}
}
}
##STEP 3: matches names not recognized in step 2 with the synomym list from website Mammal Planet
if ((EltonTraits$IUCN_binomial[i]=="no_match") | (EltonTraits$IUCN_binomial[i]=="partial_match")){
## look for the name in each column of the Mammal Planet table
for (m in 1:ncol(Mammal_planet)){
if (length(grep(EltonTraits$Scientific[i], Mammal_planet[,m]))>0){
## syno: the matching row(s) without column m, i.e. the alternative names listed for this taxon
syno = Mammal_planet[grep(EltonTraits$Scientific[i], Mammal_planet[,m]),-m]
for(z in 1:ncol(syno)){
## NOTE(review): syno[z] is a one-column data frame, not a character string. The test below and the
## grep() calls seem to rely on exactly one row having matched and on the cell not being NA
## (an NA cell makes the if() condition NA, which is an error) - confirm.
if ((syno[z])!="") {
## alternative name found among the accepted IUCN names
if (length(grep(syno[z],IUCN$binomial))>0) {
## an earlier partial match is replaced; the issues entry is kept, prefixed with "partial_IUCN_match"
if (EltonTraits$IUCN_binomial[i]=="partial_match") {
EltonTraits$IUCN_binomial_issues[i]=paste("partial_IUCN_match", EltonTraits$IUCN_binomial_issues[i],sep = ";")
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(syno[z],IUCN$binomial)]
EltonTraits$IUCN_binomial_source[i]="Mammal_Planet_Website"}
## a different name was already stored: record both names, then flag as multiple_matches
if ((EltonTraits$IUCN_binomial[i]!="no_match") & (EltonTraits$IUCN_binomial[i]!=IUCN$binomial[grep(syno[z],IUCN$binomial)])){
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial[i],paste(IUCN$binomial[grep(syno[z],IUCN$binomial)],collapse = ';'),sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"
if (EltonTraits$IUCN_binomial_source[i]!="Mammal_Planet_Website"){
EltonTraits$IUCN_binomial_source[i]=paste( EltonTraits$IUCN_binomial_source[i], "Mammal_Planet_Website",sep = ";")}
}
## nothing stored yet: accept this match
if (EltonTraits$IUCN_binomial[i]=="no_match"){
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(syno[z],IUCN$binomial)]
EltonTraits$IUCN_binomial_source[i]="Mammal_Planet_Website"}
}
## alternative name found in the IUCN synonym lists: same handling, using the accepted name of the matching row(s)
if (length(grep(syno[z],IUCN$Synonyms))>0) {
if (EltonTraits$IUCN_binomial[i]=="partial_match") {
EltonTraits$IUCN_binomial_issues[i]=paste("partial_IUCN_match", EltonTraits$IUCN_binomial_issues[i],sep = ";")
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(syno[z],IUCN$Synonyms)]
EltonTraits$IUCN_binomial_source[i]="Mammal_Planet_Website"}
if ((EltonTraits$IUCN_binomial[i]!="no_match") & (EltonTraits$IUCN_binomial[i]!=IUCN$binomial[grep(syno[z],IUCN$Synonyms)])) {
EltonTraits$IUCN_binomial_issues[i]=paste(EltonTraits$IUCN_binomial[i],paste(IUCN$binomial[grep(syno[z],IUCN$Synonyms)],collapse = ';'),sep = ";")
EltonTraits$IUCN_binomial[i]="multiple_matches"
if (EltonTraits$IUCN_binomial_source[i]!="Mammal_Planet_Website"){
EltonTraits$IUCN_binomial_source[i]=paste( EltonTraits$IUCN_binomial_source[i], "Mammal_Planet_Website",sep = ";")}}
if (EltonTraits$IUCN_binomial[i]=="no_match"){
EltonTraits$IUCN_binomial[i]=IUCN$binomial[grep(syno[z],IUCN$Synonyms)]
EltonTraits$IUCN_binomial_source[i]="Mammal_Planet_Website"}
}
}
}
}
}
}
}
```
##### THIRD: quick summary of issues. Some species may be only partially matched or matched to more than one name. In those cases we see no solution other than human intervention. That means you need to check those taxa and make a decision (assign one name, ignore these potentially confused species, write a paper to clarify the taxonomy...)
```##
## Number of names per problem type (partial or multiple matches) that still
## need a manual check
count(
  filter(EltonTraits, IUCN_binomial %in% c("partial_match", "multiple_matches")),
  IUCN_binomial
)
```
##### EXTRA BITS: for this particular database this additional step was not useful, but we have already written the code, so here it is in case you want to run it. We suggest adding it to the sequence above as step 4.
```##
## POSSIBLE STEP 4: matches names not recognized in step 3 in the loop above
## with the ITIS (Integrated Taxonomic Information System) database to identify
## additional synonyms and possible matches.
## For this particular example this step does not contribute any matches, but
## it may be worth exploring for different subsets of data. It could be
## inserted into the loop or run afterwards (as shown here).
##
## Outcome per row, written to the same columns as the main loop:
##   IUCN_binomial        - matched IUCN name, or "multiple_matches"
##   IUCN_binomial_issues - ";"-separated candidate names needing a manual check
##   IUCN_binomial_source - "itis-Taxize" when ITIS supplied the match
for (i in seq_len(nrow(EltonTraits))) {
  ## only names that are still unresolved are sent to ITIS
  if (EltonTraits$IUCN_binomial[i] %in% c("no_match", "partial_match")) {
    ITIS_synonym <- synonyms(EltonTraits$Scientific[i], db = "itis")
    itis_tab <- ITIS_synonym[[1]]
    ## Skip results that are not a usable table (no record found, no rows, or
    ## the "no syns found" marker).
    ## NOTE(review): column 3 (marker) and column 4 (synonym name) are the
    ## positions used by the original code; confirm them against the taxize
    ## version in use.
    has_synonyms <- is.data.frame(itis_tab) &&
      nrow(itis_tab) > 0 &&
      ncol(itis_tab) >= 4 &&
      !identical(as.character(itis_tab[[3]][1]), "no syns found")
    if (has_synonyms) {
      for (k in seq_len(nrow(itis_tab))) {
        itis_name <- as.character(itis_tab[[4]][k])
        if (is.na(itis_name) || !nzchar(itis_name)) {
          next
        }

        ## (a) ITIS synonym found among the accepted IUCN names.
        ## fixed = TRUE: the name is searched as plain text, so characters such
        ## as "(" or "." in a name are not read as regular-expression syntax.
        hit_names <- unique(IUCN$binomial[grep(itis_name, IUCN$binomial, fixed = TRUE)])
        if (length(hit_names) > 0) {
          current <- EltonTraits$IUCN_binomial[i]
          if (current == "partial_match") {
            ## promote the earlier partial match, keeping its candidates on record
            EltonTraits$IUCN_binomial_issues[i] <- paste(
              "partial_IUCN_match", EltonTraits$IUCN_binomial_issues[i], sep = ";"
            )
            EltonTraits$IUCN_binomial[i] <- hit_names[1]
            EltonTraits$IUCN_binomial_source[i] <- "itis-Taxize"
          } else if (current == "no_match") {
            EltonTraits$IUCN_binomial[i] <- hit_names[1]
            EltonTraits$IUCN_binomial_source[i] <- "itis-Taxize"
          } else if (current != hit_names[1]) {
            ## conflicting name: record the candidates BEFORE overwriting the
            ## stored name with the flag, so the earlier name is not lost
            earlier <- if (current == "multiple_matches") {
              EltonTraits$IUCN_binomial_issues[i]
            } else {
              current
            }
            EltonTraits$IUCN_binomial_issues[i] <- paste(
              earlier, paste(hit_names, collapse = ";"), sep = ";"
            )
            EltonTraits$IUCN_binomial[i] <- "multiple_matches"
            EltonTraits$IUCN_binomial_source[i] <- "itis-Taxize"
          }
        }

        ## (b) ITIS synonym found in the IUCN synonym lists: use the accepted
        ## name(s) of the matching rows
        syn_hit_names <- unique(IUCN$binomial[grep(itis_name, IUCN$Synonyms, fixed = TRUE)])
        if (length(syn_hit_names) > 0) {
          current <- EltonTraits$IUCN_binomial[i]
          if (current == "no_match") {
            EltonTraits$IUCN_binomial[i] <- syn_hit_names[1]
            EltonTraits$IUCN_binomial_source[i] <- "itis-Taxize"
          } else if (!all(syn_hit_names == current)) {
            ## something else is already stored and at least one candidate
            ## differs from it: flag for manual checking, keeping earlier
            ## candidates when the row was already flagged
            earlier <- if (current %in% c("multiple_matches", "partial_match")) {
              EltonTraits$IUCN_binomial_issues[i]
            } else {
              current
            }
            EltonTraits$IUCN_binomial_issues[i] <- paste(
              earlier, paste(syn_hit_names, collapse = ";"), sep = ";"
            )
            EltonTraits$IUCN_binomial[i] <- "multiple_matches"
            EltonTraits$IUCN_binomial_source[i] <- "itis-Taxize"
          }
        }
      }
    }
  }
}
```