#Z0096


# import from python libraries and modules
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr

# import visual tools
import matplotlib.pyplot as plt
import seaborn as sns

# import from created modules
from prepare import prepare_mvp, split_data


#################### Explore Data ####################


def get_tax_rates(use_csv=True):
    '''

    Compute the tax rate (tax_amount_usd / tax_value_usd) of every
    property returned by prepare_mvp, show overlaid log-scaled histograms
    of those rates for Los Angeles, Orange and Ventura counties, and
    return the county, amount, value and rate columns sorted by county.

    use_csv is passed straight through to prepare_mvp.

    '''

    # county label paired with its histogram colour, in drawing order
    county_colors = (
        ('Los Angeles', 'red'),
        ('Orange', 'green'),
        ('Ventura', 'blue'),
    )
    # identical bin edges for every county so the histograms line up
    rate_bins = np.linspace(0, 0.1, 50)

    # obtain prepared data and derive the per-property tax rate
    taxes = prepare_mvp(use_csv=use_csv)
    taxes['tax_rate'] = taxes['tax_amount_usd'] / taxes['tax_value_usd']
    kept_columns = ['county', 'tax_amount_usd', 'tax_value_usd', 'tax_rate']
    taxes = taxes[kept_columns].sort_values('county')

    # one translucent histogram per county on a single large figure
    plt.figure(figsize=(30, 15))
    for county, color in county_colors:
        county_rates = taxes.loc[taxes['county'] == county, 'tax_rate']
        plt.hist(county_rates, bins=rate_bins, color=color, alpha=0.25,
                 log=True, label=county)

    # labelling and legend
    plt.rcParams['legend.title_fontsize'] = 20
    plt.xlim(0, 0.1)
    plt.xlabel('Tax Rate')
    plt.title('Distributions of Tax Rates for Each County')
    plt.legend(title='County')
    plt.show()

    return taxes


def get_tax_rates_county(use_csv=True):
    '''

    Build a per-county summary of the prepared Zillow data.

    For every county in the data returned by prepare_mvp the result holds
    the mean tax amount, the mean tax value and the mean of the
    per-property tax rates (tax_amount_usd / tax_value_usd).

    use_csv is passed straight through to prepare_mvp.

    Returns a DataFrame indexed by 'county_name' with the columns
    'avg_tax_amount_usd', 'avg_tax_value_usd' and 'tax_rate'.

    '''

    # obtain prepared data to get tax rates
    df = prepare_mvp(use_csv=use_csv)
    # explicit column list so an empty input still yields a frame that
    # can be indexed by county_name
    summary_columns = ['county_name', 'avg_tax_amount_usd',
                       'avg_tax_value_usd', 'tax_rate']
    # collect one record per county; the frame is built once at the end
    # because DataFrame.append was removed in pandas 2.0 and copied the
    # whole frame on every call
    rows = []
    for county_name in df.county.sort_values().unique():
        # properties belonging to the current county
        subset = df[df.county == county_name]
        rows.append({
            'county_name': county_name,
            'avg_tax_amount_usd': subset.tax_amount_usd.mean(),
            'avg_tax_value_usd': subset.tax_value_usd.mean(),
            'tax_rate': (subset.tax_amount_usd / subset.tax_value_usd).mean()
            })
    df_taxes = pd.DataFrame(rows, columns=summary_columns)

    return df_taxes.set_index('county_name')


def get_tax_rate(use_csv=True):
    '''

    Prepare the per-property tax rate table and open a blank figure.

    use_csv is passed straight through to prepare_mvp. It was previously
    read without being a parameter, so every call raised NameError.

    NOTE(review): this function looks unfinished - nothing is drawn on
    the figure and nothing is returned. Confirm whether it should be
    completed or removed in favour of get_tax_rates.

    '''

    # obtain prepared data to calculate tax rates
    df = prepare_mvp(use_csv=use_csv)
    # convert into tax rate DataFrame
    df['tax_rate'] = df.tax_amount_usd / df.tax_value_usd
    df = df[['county', 'tax_amount_usd', 'tax_value_usd', 'tax_rate']].sort_values('county')
    # set figure dimensions for plot
    plt.figure(figsize=(30,15))


def explore_mvp(use_csv=True):
    '''

    Load the MVP prepared data, drop the 'county' and 'tax_amount_usd'
    columns, and hand the remainder to split_data with 'tax_value_usd'
    as the target.

    Returns a 7-tuple: the exploration DataFrame produced by split_data
    (per the original notes, the training portion of the data - confirm
    against split_data), followed by the X, y pairs for train, validate
    and test.

    use_csv is passed straight through to prepare_mvp.

    '''

    # columns that are not part of the MVP feature set
    non_mvp_columns = ['county', 'tax_amount_usd']
    prepared = prepare_mvp(use_csv=use_csv).drop(columns=non_mvp_columns)

    # split_data is expected to yield exactly seven objects
    splits = split_data(prepared, 'tax_value_usd')
    (explore_df,
     X_train, y_train,
     X_validate, y_validate,
     X_test, y_test) = splits

    return (explore_df,
            X_train, y_train,
            X_validate, y_validate,
            X_test, y_test)


#################### Visualize Data ####################


def plot_heat(df, target):
    '''

    Use seaborn to draw an annotated heatmap of the pairwise correlation
    coefficients returned by df.corr(), showing only the lower triangle
    so that no pair appears twice.

    df is the DataFrame whose columns are correlated. target is currently
    unused and is kept only so existing callers keep working.

    Shows the figure; returns nothing.

    '''

    n_vars = len(list(df))
    # Set up large figure size for easy legibility
    plt.figure(figsize=(n_vars + 5, n_vars + 1))
    # assign pd.corr() output to variable
    corr = df.corr()
    # boolean mask that hides the diagonal and the upper triangle; it is
    # built from the matrix shape rather than from the correlation values
    # so that a coefficient of exactly 0 is still hidden
    mask = np.triu(np.ones(corr.shape, dtype=bool), k=0)
    # define custom cmap for heatmap where the darker the reds the more
    # positive and vice versa for blues
    cmap = sns.diverging_palette(h_neg=220, h_pos=13, sep=25, as_cmap=True)
    # create graphic with zero centered cmap and annotations set to one
    # significant figure
    sns.heatmap(corr, cmap=cmap, center=0, annot=True, fmt=".1g", square=True,
                mask=mask, cbar_kws={
                                     'shrink':0.5,
                                     'aspect':50,
                                     'use_gridspec':False,
                                     'anchor':(-0.75,0.75)
                                      })
    # format xticks for improved legibility and clarity
    plt.xticks(ha='right', va='top', rotation=35, rotation_mode='anchor')
    plt.title('Correlation Heatmap')
    plt.show()


def plot_univariate(data, variable):
    '''

    Plot a single column of the passed DataFrame as an enhanced box plot
    (boxenplot) and a histogram with KDE, side by side.

    data is the DataFrame to read from and variable is the name of the
    column to plot. Both panels mark the median (pink) and the mean (red)
    with vertical lines, and the overall title reports the number of
    non-null observations.

    Shows the figure; returns nothing.

    '''

    # compute the reference statistics once; both panels draw them
    median = data[variable].median()
    mean = data[variable].mean()
    # set figure dimensions
    plt.figure(figsize=(30,8))
    # start subplot 1 for boxenplot
    plt.subplot(1, 2, 1)
    sns.boxenplot(x=variable, data=data)
    plt.axvline(median, color='pink')
    plt.axvline(mean, color='red')
    plt.xlabel('')
    plt.title('Enhanced Box Plot', fontsize=25)
    # start subplot 2 for histogram with KDE
    plt.subplot(1, 2, 2)
    sns.histplot(data=data, x=variable, element='step', kde=True, color='cyan',
                                line_kws={'linestyle':'dashdot', 'alpha':1})
    plt.axvline(median, color='pink')
    plt.axvline(mean, color='red')
    plt.xlabel('')
    plt.ylabel('')
    plt.title('Distribution', fontsize=20)
    # set layout and show plot
    plt.suptitle(f'{variable} $[n = {data[variable].count():,}]$', fontsize=25)
    plt.tight_layout()
    plt.show()


def plot_discrete_to_continous(data, discrete_var, continous_var, swarm_n=2000,
                                            r_type='pearson', random_state=19):
    '''

    Plot a discrete column against a continuous column as a boxenplot, a
    swarmplot and a regplot side by side, with the correlation
    coefficient in the overall title.

    data is the DataFrame to read from; discrete_var and continous_var
    are column names. The swarmplot is drawn from a random sample of at
    most swarm_n rows (seeded by random_state); the other two panels use
    every row. r_type selects the coefficient: 'pearson' (default) or
    'spearman'.

    Raises ValueError for any other r_type, before anything is drawn.
    Shows the figure; returns nothing.

    '''

    # choose coefficient, rejecting unknown names up front instead of
    # failing on an unbound variable after the plots have been drawn
    corr_funcs = {'pearson': pearsonr, 'spearman': spearmanr}
    if r_type not in corr_funcs:
        raise ValueError(
            f"r_type must be 'pearson' or 'spearman', got {r_type!r}")
    r = corr_funcs[r_type](data[discrete_var], data[continous_var])[0]
    # never ask for more rows than exist; DataFrame.sample raises when n
    # exceeds the population and replace is False
    swarm_data = data.sample(n=min(swarm_n, len(data)),
                             random_state=random_state)
    # set figure dimensions
    plt.figure(figsize=(30,10))
    # start subplot 1 for boxenplot
    plt.subplot(1, 3, 1)
    sns.boxenplot(x=discrete_var, y=continous_var, data=data)
    plt.xlabel('')
    plt.ylabel(f'{continous_var}', fontsize=20)
    # start subplot 2 for swarmplot
    plt.subplot(1, 3, 2)
    sns.swarmplot(x=discrete_var, y=continous_var, data=swarm_data)
    plt.xlabel(f'{discrete_var}', fontsize=20)
    plt.ylabel('')
    # start subplot 3 for regplot
    plt.subplot(1, 3, 3)
    sns.regplot(x=discrete_var, y=continous_var, data=data, marker='*',
                                                    line_kws={'color':'red'})
    plt.xlabel('')
    plt.ylabel('')
    # set title for graphic and output
    plt.suptitle(f'{discrete_var} to {continous_var} $[r = {r:.2f}]$',
                                                                fontsize=25)
    plt.tight_layout()
    plt.show()


def plot_joint(data, x, y, r_type='pearson'):
    '''

    Draw a seaborn regression joint plot of two columns with the
    correlation coefficient in the title.

    data is the DataFrame to read from; x and y are column names. r_type
    selects the coefficient: 'pearson' (default) or 'spearman'.

    Raises ValueError for any other r_type, before anything is drawn.
    Shows the figure; returns nothing.

    '''

    # choose coefficient, rejecting unknown names up front instead of
    # failing on an unbound variable after the plot has been drawn
    corr_funcs = {'pearson': pearsonr, 'spearman': spearmanr}
    if r_type not in corr_funcs:
        raise ValueError(
            f"r_type must be 'pearson' or 'spearman', got {r_type!r}")
    r = corr_funcs[r_type](data[x], data[y])[0]
    # plot jointplot of continuous variables; x, y and data are passed by
    # keyword because newer seaborn releases no longer accept them
    # positionally
    grid = sns.jointplot(x=x, y=y, data=data, kind='reg', height=10,
                         joint_kws={'marker':'+', 'line_kws':{'color':'red'}},
                         marginal_kws={'color':'cyan'})
    # label the joint axes explicitly; plt.xlabel/plt.ylabel would act on
    # whichever axes happens to be current
    grid.set_axis_labels(f'{x}', f'{y}')
    # set title of compared variables
    plt.suptitle(f'{x} to {y} $[r = {r:.2f}]$')
    plt.tight_layout()
    # show plot
    plt.show()


def corr_test(data, x, y, alpha=0.05, r_type='pearson'):
    '''

    Run a pearson or spearman correlation test on two columns and print
    the outcome.

    data is the DataFrame to read from; x and y are column names. alpha
    is the significance level the p-value is compared against. r_type
    selects the test: 'pearson' (default) or 'spearman'.

    Prints the r measurement followed by a statement on whether the null
    hypothesis of no correlation is rejected (p < alpha) or not. The
    statement speaks of a linear correlation for pearson and a monotonic
    correlation for spearman, matching what each test measures.

    Raises ValueError for any other r_type. Returns nothing.

    '''

    # test function and the kind of relationship it measures
    tests = {
        'pearson': (pearsonr, 'linear'),
        'spearman': (spearmanr, 'monotonic'),
    }
    if r_type not in tests:
        raise ValueError(
            f"r_type must be 'pearson' or 'spearman', got {r_type!r}")
    test_func, relation = tests[r_type]
    # obtain r, p values
    r, p = test_func(data[x], data[y])
    # print the coefficient header
    print(f'{r_type:>10} r = {r:.2g}\n'
          '+--------------------+')
    # choose the wording for the reject/fail statement
    if p < alpha:
        comparison, decision = 'less than', 'we must reject'
    else:
        comparison, decision = 'greater than', 'we fail to reject'
    # pieces are joined explicitly so the printed layout does not depend
    # on invisible whitespace inside a multi-line literal
    print(
        '\n'
        f'        Due to p-value {p:.2g} being {comparison} our significance '
        f'level of {alpha}, {decision} the null hypothesis \n'
        f'        that there is not a {relation} correlation between '
        f'"{x}" and "{y}."\n'
        '        '
    )