#importing packages to be used with error handling
try:
    import pandas as pd
except:
    print("Required package pandas not available")
    exit()
try:
    import numpy as np
except:
    print("Required package numpy not available")
    exit()
try:
    import argparse
except:
    print("Required package argparse not available")
    exit()

#setting up expected arguments and help messages
parser = argparse.ArgumentParser(description="VALENCIA is a tool to classify vaginal microbial community into community state types")
#required arguments
required = parser.add_argument_group("Required arguments")
required.add_argument("-ref","--reference", help="Complete path to reference centroids file")
required.add_argument("-i", "--input", help="Path to input CSV file with the first column header 'sampleID', second column total read counts with header 'read_count', and remaining columns containing taxa read counts",required=True)
required.add_argument("-o","--output", help="Output csv file prefix")

#optional arguments
parser.add_argument("-p","--plot", default=None, help="File prefix for boxplots of similarty of each sample to assigned CST, if not provided, no plot will be generated")

#reading arguments into parser
args = parser.parse_args()

#removing the  warning that pandas has for when you do complicated things
pd.options.mode.chained_assignment = None  # default='warn'

#VALENCIA
###The purpose of this tool is to classify vaginal microbial communities into community state types (CSTs)
###in a standardized and repeatable fashion, clusters are based on the 13,000+ women data
###A total of 13 CSTs are considered: CST I-A, I-B, II, III-A, III-B, IV-A, IV-B, IV-C0, IV-C1, IV-C2, IV-C3, IV-C4, V 
####This script tests new samples based on similarity to already defined cluster centroids

#defining function to determine yue-clayton theta
def yue_distance(row, median):
    #creating a counting variable to index the median list
    taxon_count = 0
    #creating lists to iterativly store output
    median_times_obs = []
    median_minus_obs_sq = []    
    #looping through the row and calculating product and difference squared between row data and median data
    for taxon_abund in row:
        #calculate p * q
        median_times_obs.append(median[taxon_count]*taxon_abund)
        #calculate p-q squared
        median_minus_obs_sq.append((median[taxon_count]-taxon_abund)**2)
        taxon_count += 1    
    #calculate sum p* q
    product = np.nansum(median_times_obs)   
    #calculate sum p-q squared
    diff_sq = np.nansum(median_minus_obs_sq)
    #calculate yue_med_dist
    yue_med_dist = product / (diff_sq + product)
    #return the value of yue distance
    return yue_med_dist 

#### defining fuction to determine the penalized similarity score, not currently in use
def penalized_simil_score(row):

    if row['subCST'] == 'I-A':
        row = row.drop('I-B_sim')
    elif row['subCST'] == 'I-B':
        row = row.drop('I-A_sim')
    elif row['subCST'] == 'III-A':
        row = row.drop('III-B_sim')
    elif row['subCST'] == 'III-B':
        row = row.drop('III-A_sim')
    row = row.drop('subCST')
    similarity_scores = list(row)
    similarity_scores.sort()
    score_len = len(similarity_scores)
    penalized_score = similarity_scores[score_len-1] * (similarity_scores[score_len-1]-similarity_scores[score_len-2]) ** (1./2)

    return penalized_score

def combine_taxa(combined_data):
    
    #combining L. jensenii + L. mulieris for purpose of CST assignment
    combined_data['Lactobacillus_jensenii'] = combined_data['Lactobacillus_jensenii'] + combined_data['Lactobacillus_mulieris']
    combined_data = combined_data.drop(['Lactobacillus_mulieris'],axis=1)
    
    #combining together species where genus level typifies the CST 
    combined_data['g_Bifidobacterium'] = combined_data[[col for col in combined_data.columns if col.startswith('Bifidobacterium')]].sum(axis=1)
    combined_data['g_Streptococcus'] = combined_data[[col for col in combined_data.columns if col.startswith('Streptococcus')]].sum(axis=1)
    combined_data['g_Staphylococus'] = combined_data[[col for col in combined_data.columns if col.startswith('Staphylococcus')]].sum(axis=1)
    combined_data['g_Enterococcus'] = combined_data[[col for col in combined_data.columns if col.startswith('Enterococcus')]].sum(axis=1)
    drop_list = [col for col in combined_data.columns if col.startswith('Bifidobacterium')]
    drop_list = drop_list + [col for col in combined_data.columns if col.startswith('Streptococcus')]
    drop_list = drop_list + [col for col in combined_data.columns if col.startswith('Staphylococcus')]
    drop_list = drop_list + [col for col in combined_data.columns if col.startswith('Enterococcus')]
    combined_data = combined_data.drop(drop_list,axis=1)

    return combined_data

#list of subCSTs 
CSTs = ['I-A','I-B','II','III-A','III-B','IV-A','IV-B','IV-C0','IV-C1','IV-C2','IV-C3','IV-C4','V']

#reading in the input CST centroids
try:
    reference_centroids = pd.read_csv(args.reference,sep=',')
except:
    print('Please provide a valid path to the reference centroids using -r')
    exit()

try:
#reading in table of samples to be tested against the centroids
    sample_data_OG = pd.read_csv(args.input,sep=',')
except:
    print('Please provide a valid path to the input file using -i')
    exit()

#checking if first two columns have appropriate headers
if list(sample_data_OG.columns[0:2]) != ['sampleID','read_count']:
    print('Input file expected to be a CSV with first two column headers: sampleID,read_count')
    exit()

#forcing the sample data to have the same number of columns as the reference and in the same order 
combined_data = pd.concat([sample_data_OG,reference_centroids], ignore_index=True,sort=False)

#revising taxa names for purpose of CST assignment
combined_data = combine_taxa(combined_data)

sample_data = combined_data[:-13].fillna(0)
sample_data = sample_data.drop(['sub_CST'],axis=1)
reference_centroids = combined_data.tail(13).fillna(0)
reference_centroids = reference_centroids.drop(["sampleID","read_count"],axis=1).set_index('sub_CST')

#converting all of the read counts to relative abundance data and adding back the first two columns
sample_data_rel = sample_data[sample_data.columns[2:]].div(sample_data['read_count'],axis=0)
sample_data_rel = pd.concat([sample_data[sample_data.columns[0:2]],sample_data_rel],axis=1)

#loop measuring the similarity of each sample to each subCST centroid using yue + clayon theta
for CST in CSTs:

    sample_data_OG['%s_sim' %(CST)] = sample_data_rel.apply(lambda x : yue_distance(x[2:],reference_centroids.loc[CST]), axis=1)

#outputting the acquired data with the new variability measure
#identify for each sample, which subCST was most similar, then correcting name of subCST to remove _sim
sample_data_OG['subCST'] = sample_data_OG.iloc[:,-13:].idxmax(axis=1)
sample_data_OG['subCST'] = sample_data_OG['subCST'].str.replace('_sim',"")

#applying function to calculate a penalized score, not currently in use
#sample_data_OG['penalized_score'] = sample_data_OG.apply(lambda x : penalized_simil_score(x[-14:]), axis=1)

#store the similarity between each sample and its as=ssigned 
sample_data_OG['score'] = sample_data_OG.iloc[:,-14:-1].max(axis=1)
sample_data_OG[sample_data_OG['score']==0]['subCST']=='Unassigned'
#determine higher order CST assignment based on subCST assignment
sample_data_OG['CST'] = sample_data_OG['subCST'].replace({'I-A':'I','I-B':'I','III-A':'III','III-B':'III','IV-C0':'IV-C','IV-C1':'IV-C','IV-C2':'IV-C','IV-C3':'IV-C','IV-C4':'IV-C'})

#output the assignments in new CSV file
sample_data_OG.to_csv("%s.csv" %(args.output),index=None)

#plotting distributions of similarity scores for each subCST agaist that for the reference dataset
if args.plot != None:

    #loading matplotlib module
    try:    
        import matplotlib.pyplot as plt
    except:
        print("Required package matplotlib not available, either install or do not use --plot option")

    #defining figure size
    similarity_fig, similarity_axs = plt.subplots(1,1, figsize=(10,6), facecolor='w', edgecolor='k')
    similarity_fig.subplots_adjust(left=0.15,right=0.85,bottom=0.15,top=0.9,hspace = 0.4, wspace=0.4)
    
    #creating x axis location variables
    loc=0
    #creating blank list to store subCSTs that are actually present in the data
    CST_labels = list()
    
    #values for the average and stdev of the similarity scores of reference samples to their assigned CSTs
    ref_ave = {'I-A':0.995678,'I-B':0.858443,'II': 0.811734,'III-A':0.972983,'III-B':0.810466,'IV-A':0.718100,'IV-B':0.659592,'IV-C0':0.321432,'IV-C1':0.745991,'IV-C2':0.695203,'IV-C3':0.758735,'IV-C4':0.681502,'V':0.734973}
    ref_std = {'I-A':0.007615,'I-B':0.151048,'II':0.161987,'III-A':0.058698,'III-B':0.133990,'IV-A':0.150829,'IV-B':0.162577,'IV-C0':0.137558,'IV-C1':0.231891,'IV-C2':0.246322,'IV-C3':0.213653,'IV-C4':0.186462,'V':0.140276}

    #establishing the CST color scheme
    CST_color_scheme = {'I-A':'#ff6868','I-B':'#ffd4da','II':'#b4ff68','III-A':'#ffbc6b','III-B':'#e4a67b','IV-A':'#c1adec','IV-B':'#91a8ed',
                    'IV-C0':'#989898','IV-C1':'#ffc0cb','IV-C2':'#a8e5e5','IV-C3':'#9acc9a','IV-C4':'#800080','V':'#ffff71'}

    #plotting a boxplot, ref average and ref standard deviation for each subCST
    for CST in CSTs:
        
        #checking if subCST present in data
        if len(sample_data_OG[sample_data_OG['subCST']==CST].index) > 0:

            boxprops = dict(linewidth=1, color="k")
            medianprops = dict(linewidth=1,color="k")
            
            #plotting box plot of similarity distribution
            similarity_axs.boxplot(x=sample_data_OG[sample_data_OG['subCST'] == CST].score,positions=[loc-0.25],notch=True,widths=[0.25],patch_artist=True,boxprops=boxprops,medianprops=medianprops)
            #plotting the reference average and stdev
            similarity_axs.scatter(x=loc,y=ref_ave[CST],s=15,marker='d',c=CST_color_scheme[CST])
            similarity_axs.plot([loc,loc],[ref_ave[CST]+ref_std[CST],ref_ave[CST]-ref_std[CST]],c=CST_color_scheme[CST])
            CST_labels.append(CST)

            loc+=1
        
        else:
            continue

    #setting plot labels etc
    similarity_axs.set_xticklabels(CST_labels)
    similarity_axs.set_ylim(0,1)
    similarity_axs.set_xlabel("subCST")
    similarity_axs.set_ylabel("Similarity to assigned subCST")
    #saving plot
    similarity_fig.savefig("%s.pdf" %(args.plot))