Please contact cstarrmeredith@gmail.com for any additional information about running the ambiguous taxa program. This README is best viewed in raw form at https://raw.githubusercontent.com/christystarr/ambiguoustaxa/master/README.md, as the rendered version can introduce formatting errors in the code below.

############################# Part 1 #############################
# Instructions: set the working directory and put the 2 input files in it: "invertsinput.csv" and "lookup.csv".
#   Example files are in the "data" folder on github. Column headings must match the examples.
# Any taxonomic level that does not have a name associated with it must be "NA".
# All of the taxa in invertsinput.csv must be in the lookup table.
# Set method to "widespread" or "numerous"; set method_divide to "nodivide" or "divide".
# The final file is ambiguous_APTC_results.csv in the "all" folder, which has the original taxa (InvName),
#   the new taxa when ambiguous taxa are assigned at the site level (sitetaxa), and the new taxa when
#   ambiguous taxa are assigned at the dataset level (newtaxa) according to the APTC_SG method.
# To produce final datasets for the other common methods in Meredith et al., run the extra code in Part 2 (below).

# Instructions for APTC_SG method
setwd("C:/christy/examples")  # set working directory where the input files are located; columns must be the same as in the examples
method="numerous"             # options are "numerous" or "widespread" (see Meredith et al. 2019)
method_divide="divide"        # options are "divide" or "nodivide" (see Meredith et al. 2019)

library(doBy)
library(plyr)
library(devtools)
install_github("christystarr/ambiguoustaxa")
library(ambiguoustaxa)

inverts=read.table("invertsinput.csv",sep=",",header=TRUE)  # place the example files from github in your working directory or create your own invertsinput.csv and lookup.csv
lookup=read.table("lookup.csv",sep=",",header=TRUE)
setup(lookup)
missinglevels(lookupsetup)
siteresolve(inverts,lookupready,method_divide)              # "divide" or "nodivide": whether the parent taxa will be divided among children at the site or assigned to the most numerous child at the site
allresolve(lookupready,invertssite_obs,invertsnew,method)   # "numerous" or "widespread": whether ambiguous taxa will be assigned to the child taxon that is most abundant or the child taxon found at the most sites
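# Optional check (not part of the package): peek at the Part 1 output to confirm it ran.
# This is only an illustrative sketch; it assumes the working directory is still the one set above,
# that the "all" subfolder holds the result file described in the instructions, and that the file
# carries the InvName, sitetaxa, and newtaxa columns -- confirm the headers in your own copy before
# relying on them.
results_check=read.table(file.path("all","ambiguous_APTC_results.csv"),sep=",",header=TRUE)
str(results_check)    # column names and types
head(results_check)   # first few rows: original taxa vs. site-level and dataset-level assignments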
############################## Part 2 ##############################
# After running Part 1, simply select and run the entire code below; the final files with ambiguous taxa
#   resolved will be put into the working directory.
# Creates datasets in which ambiguous taxa are resolved in different ways: APTC_S, APTC_SG, APTC_SG1,
#   APTC_SG2, RPKC_S; puts these in the working directory.
# Also estimates variables describing the number of singletons and doubletons and the number of uniques
#   and duplicates; these will be in the global environment.
#####################################################################
setwd("all")
methods.b=read.table("ambiguous_APTC_results.csv",sep=",",header=TRUE)
if(method_divide=="divide") {
  methods.b=methods.b[,c(1,3,4,5,7,2)]
}
if (method_divide != "divide"){
  methods.b=methods.b[,c(1,3,4,5,7,2)]
}

## calculate number of singletons and doubletons in the original dataset
setwd("C:/christy/examples")
colnames(methods.b)=c("obs","newtaxa","InvEventFK","InvName","InvCount","sitetaxa")
invertsorig=read.table("invertsinput.csv",sep=",",header=TRUE)
origsum=summaryBy(InvCount~InvName,FUN=c(sum),data=invertsorig)
origsing=subset(origsum,origsum$InvCount.sum==1)
origdoub=subset(origsum,origsum$InvCount.sum==2)
singledoub=rbind(origsing,origdoub)
length(origsing[,1])   # number of singletons in the original dataset
length(origdoub[,1])   # number of doubletons in the original dataset

## calculate number of uniques and duplicates in the original dataset
origcount=summaryBy(InvCount~InvName+InvEventFK,FUN=c(sum),data=invertsorig)
origcount$InvCount.sum=ifelse(origcount$InvCount.sum>0,1,0)
origcount2=summaryBy(InvCount.sum~InvName,FUN=sum,data=origcount)
uniqueorig=subset(origcount2,origcount2$InvCount.sum.sum==1)
duporig=subset(origcount2,origcount2$InvCount.sum.sum==2)
uniqdup=rbind(uniqueorig,duporig)
length(uniqueorig[,1])   # number of uniques (taxa found at exactly one site)
length(duporig[,1])      # number of duplicates (taxa found at exactly two sites)

# prepare to summarize
amb_sing2=merge(methods.b,singledoub,by.x=c("newtaxa"),by.y=c("InvName"),all.x=TRUE,all.y=FALSE)
amb_sing3=merge(amb_sing2,uniqdup,by.x=c("newtaxa"),by.y=c("InvName"),all.x=TRUE,all.y=FALSE)
amb_sing3=amb_sing3[,c(2,3,5,4,6,1,7,8)]
colnames(amb_sing3)=c("obs","InvEventFK","InvCount","InvName","sitetaxa","newtaxa","sing_doub","uniq_dup")
amb_sing3$sing_doub[is.na(amb_sing3$sing_doub)] <- 0
amb_sing3$uniq_dup[is.na(amb_sing3$uniq_dup)] <- 0
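# At this point sing_doub and uniq_dup flag records whose dataset-level taxon (newtaxa) was a
# singleton/doubleton, or a unique/duplicate, in the original data (0 = not flagged); the APTC_SG1
# and APTC_SG2 steps below drop some of those records. The tabulation here is only an optional,
# illustrative check and is not required for the analysis.
table(flagged_sing_doub=amb_sing3$sing_doub>0,
      flagged_uniq_dup =amb_sing3$uniq_dup>0)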
if(method=="numerous"){

  ## create APTC_SG dataset
  APTC_SG=summaryBy(InvCount~InvEventFK+newtaxa,data=amb_sing3,FUN=sum)
  amb_sing3$newtaxa=as.character(amb_sing3$newtaxa)
  amb_sing3$InvName=as.character(amb_sing3$InvName)
  SG_singdoub=summaryBy(InvCount.sum~newtaxa,FUN=sum,data=APTC_SG)
  singSG=subset(SG_singdoub,SG_singdoub$InvCount.sum.sum==1)
  lsingSG=length(singSG[,1])
  doubSG=subset(SG_singdoub,SG_singdoub$InvCount.sum.sum>1 & SG_singdoub$InvCount.sum.sum<=2)
  ldoubSG=length(doubSG[,1])
  write.table(APTC_SG,"APTC_SG.csv",sep=",",row.names=FALSE)

  # create APTC_SG1 dataset
  amb_sing3.SG1=amb_sing3[!(amb_sing3$sing_doub!=0 & amb_sing3$InvName!=amb_sing3$newtaxa),]
  APTC_SG1=summaryBy(InvCount~InvEventFK+newtaxa,data=amb_sing3.SG1,FUN=sum)
  test=unique(APTC_SG1$newtaxa)
  APTC_SG1singdoub=summaryBy(InvCount.sum~newtaxa,FUN=sum,data=APTC_SG1)
  singSG1=subset(APTC_SG1singdoub,APTC_SG1singdoub$InvCount.sum.sum==1)
  lsingSG1=length(singSG1[,1])
  doubSG1=subset(APTC_SG1singdoub,APTC_SG1singdoub$InvCount.sum.sum>1 & APTC_SG1singdoub$InvCount.sum.sum<=2)
  ldoubSG1=length(doubSG1[,1])
  write.table(APTC_SG1,"APTC_SG1.csv",sep=",",row.names=FALSE)

  # create APTC_SG2 dataset
  amb_sing3.SG2=amb_sing3[!(amb_sing3$sing_doub!=0 & amb_sing3$newtaxa==amb_sing3$sitetaxa & amb_sing3$InvName!=amb_sing3$newtaxa),]
  APTC_SG2=summaryBy(InvCount~InvEventFK+newtaxa,data=amb_sing3.SG2,FUN=sum)
  test=unique(APTC_SG2$newtaxa)
  APTC_SG2singdoub=summaryBy(InvCount.sum~newtaxa,FUN=sum,data=APTC_SG2)
  singSG2=subset(APTC_SG2singdoub,APTC_SG2singdoub$InvCount.sum.sum==1)
  lsingSG2=length(singSG2[,1])
  doubSG2=subset(APTC_SG2singdoub,APTC_SG2singdoub$InvCount.sum.sum>1 & APTC_SG2singdoub$InvCount.sum.sum<=2)
  ldoubSG2=length(doubSG2[,1])
  write.table(APTC_SG2,"APTC_SG2.csv",sep=",",row.names=FALSE)

  ## create APTC_S dataset
  APTC_S=summaryBy(InvCount~InvEventFK+sitetaxa,FUN=sum,data=amb_sing3)
  APTC_singdoub=summaryBy(InvCount~sitetaxa,FUN=sum,data=amb_sing3)
  singAPTC_s=subset(APTC_singdoub,APTC_singdoub$InvCount.sum==1)
  doubAPTC_s=subset(APTC_singdoub,APTC_singdoub$InvCount.sum==2)
  l_singAPTC_s=length(singAPTC_s[,1])
  l_doubAPTC_s=length(doubAPTC_s[,1])
  write.table(APTC_S,"APTC_S.csv",sep=",",row.names=FALSE)

  ## create RPKC_s dataset
  amb_sing3$diff=ifelse(amb_sing3$sitetaxa!=amb_sing3$InvName,1,0)
  RPKC_s.a=subset(amb_sing3,amb_sing3$diff==0)
  RPKC_s=summaryBy(InvCount~InvEventFK+sitetaxa,FUN=sum,data=RPKC_s.a)
  RPKC_s_singdoub=summaryBy(InvCount.sum~sitetaxa,FUN=sum,data=RPKC_s)
  sing_RPKC_s=subset(RPKC_s_singdoub,RPKC_s_singdoub$InvCount.sum.sum==1)
  doub_RPKC_s=subset(RPKC_s_singdoub,RPKC_s_singdoub$InvCount.sum.sum==2)
  lsing_RPKCS=length(sing_RPKC_s[,1])
  ldoub_RPKCS=length(doub_RPKC_s[,1])
  write.table(RPKC_s,"RPKC_s.csv",sep=",",row.names=FALSE)
}

### if method = "widespread" instead of "numerous"
if (method != "numerous") {

  # create APTC_SG dataset
  SGcount=summaryBy(InvCount~newtaxa+InvEventFK,FUN=c(sum),data=amb_sing3)
  APTC_SG=SGcount
  SGcount$InvCount.sum=ifelse(SGcount$InvCount.sum>0,1,0)
  SGcount2=summaryBy(InvCount.sum~newtaxa,FUN=sum,data=SGcount)
  uniqueSG=subset(SGcount2,SGcount2$InvCount.sum.sum==1)
  lSG.uniques=length(uniqueSG[,1])
  dupSG=subset(SGcount2,SGcount2$InvCount.sum.sum>1 & SGcount2$InvCount.sum.sum<=2)
  lSG.dups=length(dupSG[,1])
  write.table(APTC_SG,"APTC_SG.csv",sep=",")
  #write.table(dupSG,"origdup.csv",sep=",")

  # create APTC_SG1 dataset
  amb_sing3$InvName=as.character(amb_sing3$InvName)
  amb_sing3$newtaxa=as.character(amb_sing3$newtaxa)
  amb_sing3.SG1.uniq=amb_sing3[!(amb_sing3$uniq_dup!=0 & amb_sing3$InvName!=amb_sing3$newtaxa),]
  APTC_SG1=amb_sing3.SG1.uniq
  APTC_SG1.uniq=summaryBy(InvCount~InvEventFK+newtaxa,data=amb_sing3.SG1.uniq,FUN=sum)
  #test=unique(APTC_SG1$newtaxa)
  APTC_SG1.uniq$InvCount.sum=ifelse(APTC_SG1.uniq$InvCount.sum>0,1,0)
  APTC_SG1.uniqdup=summaryBy(InvCount.sum~newtaxa,FUN=sum,data=APTC_SG1.uniq)
  SG1.uniques=subset(APTC_SG1.uniqdup,APTC_SG1.uniqdup$InvCount.sum.sum==1)
  SG1.dups=subset(APTC_SG1.uniqdup,APTC_SG1.uniqdup$InvCount.sum.sum>1 & APTC_SG1.uniqdup$InvCount.sum.sum<=2)
  lSG1.uniques=length(SG1.uniques[,1])
  lSG1.dups=length(SG1.dups[,1])
  write.table(APTC_SG1,"APTC_SG1.csv",sep=",")

  # create APTC_SG2 dataset
  amb_sing3.SG2=amb_sing3[!(amb_sing3$uniq_dup!=0 & amb_sing3$newtaxa!=amb_sing3$sitetaxa),]
  APTC_SG2=amb_sing3.SG2
  APTC_SG2=summaryBy(InvCount~InvEventFK+newtaxa,data=amb_sing3.SG2,FUN=sum)
  APTC_SG2.uniq=summaryBy(InvCount~InvEventFK+newtaxa,data=amb_sing3.SG2,FUN=sum)
  APTC_SG2.uniq$InvCount.sum=ifelse(APTC_SG2.uniq$InvCount.sum>0,1,0)
  APTC_SG2.uniqdup=summaryBy(InvCount.sum~newtaxa,FUN=sum,data=APTC_SG2.uniq)
  SG2.uniques=subset(APTC_SG2.uniqdup,APTC_SG2.uniqdup$InvCount.sum.sum==1)
  SG2.dups=subset(APTC_SG2.uniqdup,APTC_SG2.uniqdup$InvCount.sum.sum>1 & APTC_SG2.uniqdup$InvCount.sum.sum<=2)
  lSG2.uniques=length(SG2.uniques[,1])
  lSG2.dups=length(SG2.dups[,1])
  write.table(APTC_SG2,"APTC_SG2.csv",sep=",")
}
######################################
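# Optional wrap-up (illustrative only; not required): gather the counts created above into one table.
# Which objects exist depends on the method chosen at the top -- the "numerous" branch produces
# singleton/doubleton counts, the "widespread" branch produces unique/duplicate counts.
if (method == "numerous") {
  data.frame(dataset    = c("original","APTC_SG","APTC_SG1","APTC_SG2"),
             singletons = c(length(origsing[,1]), lsingSG, lsingSG1, lsingSG2),
             doubletons = c(length(origdoub[,1]), ldoubSG, ldoubSG1, ldoubSG2))
} else {
  data.frame(dataset    = c("original","APTC_SG","APTC_SG1","APTC_SG2"),
             uniques    = c(length(uniqueorig[,1]), lSG.uniques, lSG1.uniques, lSG2.uniques),
             duplicates = c(length(duporig[,1]), lSG.dups, lSG1.dups, lSG2.dups))
}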