#Z0096

# import from python libraries and modules
import pandas as pd
import numpy as np
# import visualization tools
import matplotlib.pyplot as plt
# import modeling tools
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.preprocessing import MinMaxScaler


#################### Create & Test Models ####################


def train_model(X, y, model, model_name):
    '''
    Takes in X_train and y_train, a model object, and a model name,
    fits the model, and returns predictions and a dictionary containing
    the model's RMSE and R^2 scores on train
    '''

    # fit model to X_train
    model.fit(X, y)
    # predict on X_train
    predictions = model.predict(X)
    # get rmse and r^2 for model predictions on X
    rmse, r2 = get_metrics(y, predictions)
    performance_dict = {'model': model_name, 'RMSE': rmse, 'R^2': r2}

    return predictions, performance_dict


def model_testing(X, y, model, model_name):
    '''
    Takes in X and y for validate or test, a fitted model object, and a
    model name, and returns predictions and a dictionary containing the
    model's RMSE and R^2 scores on validate or test
    '''

    # obtain predictions on X
    predictions = model.predict(X)
    # get performance metrics and assign them to a dictionary
    rmse, r2 = get_metrics(y, predictions)
    performance_dict = {'model': model_name, 'RMSE': rmse, 'R^2': r2}

    return predictions, performance_dict


#################### Scale Data #########################


def minmax(X_train, X_validate, X_test, features_to_scale=None):
    '''
    Takes in X for train, validate, and test and an optional list of
    features to scale, scales all columns (or only the listed features)
    using a MinMaxScaler with default settings, and returns DataFrames
    with the scaled values recombined with any unscaled columns
    '''

    # if no list is passed, scale every column
    if features_to_scale is None:
        features_to_scale = list(X_train)
    # create scaler object and fit it to X_train only
    scaler = MinMaxScaler()
    scaler.fit(X_train[features_to_scale])
    # transform X_train and create new DataFrame for scaled data
    X_train_scaled = pd.DataFrame(scaler.transform(X_train[features_to_scale]),
                                  columns=features_to_scale, index=X_train.index)
    # combine scaled features with any unscaled features
    X_train_scaled = pd.concat((X_train.drop(columns=features_to_scale), X_train_scaled), axis=1)
    # transform X_validate and create new DataFrame for scaled data
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate[features_to_scale]),
                                     columns=features_to_scale, index=X_validate.index)
    # combine scaled features with any unscaled features
    X_validate_scaled = pd.concat((X_validate.drop(columns=features_to_scale), X_validate_scaled), axis=1)
    # transform X_test and create new DataFrame for scaled data
    X_test_scaled = pd.DataFrame(scaler.transform(X_test[features_to_scale]),
                                 columns=features_to_scale, index=X_test.index)
    # combine scaled features with any unscaled features
    X_test_scaled = pd.concat((X_test.drop(columns=features_to_scale), X_test_scaled), axis=1)

    return X_train_scaled, X_validate_scaled, X_test_scaled
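# A minimal usage sketch for the scaler above; the pre-split DataFrames and the
# 'bedrooms'/'bathrooms' column names are hypothetical, not project data:
#
#   X_train_scaled, X_validate_scaled, X_test_scaled = minmax(
#       X_train, X_validate, X_test, features_to_scale=['bedrooms', 'bathrooms'])
#
# Passing no list scales every column; the scaler is fit on train only, so
# validate and test are transformed without leaking their distributions.
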
#################### Explore Features ####################


def select_kbest(X, y, k=1, score_func=f_regression):
    '''
    Takes in X and y train and optional k and score_func arguments, and
    uses SelectKBest to return the k (default=1) best variables for
    predicting the target y
    '''

    # assign SelectKBest using f_regression and k=1 as defaults
    selector = SelectKBest(score_func=score_func, k=k)
    # fit selector to training set
    selector.fit(X, y)
    # assign and apply mask to DataFrame for column names
    mask = selector.get_support()
    top_k = X.columns[mask].to_list()

    return top_k


def select_rfe(X, y, n=1, model=LinearRegression(), rank=False):
    '''
    Takes in X and y train and optional n and model arguments, and uses
    RFE to return the n (default=1) best variables for predicting the
    target y; optionally prints the rank of every feature
    '''

    # assign RFE using LinearRegression and n=1 as defaults
    selector = RFE(estimator=model, n_features_to_select=n)
    # fit selector to training set
    selector.fit(X, y)
    # assign and apply mask to DataFrame for column names
    mask = selector.get_support()
    top_n = X.columns[mask].to_list()
    # if rank=True, print a DataFrame of features indexed by RFE ranking
    if rank == True:
        print(pd.DataFrame(X.columns, selector.ranking_, [f'n={n} RFE Rankings']).sort_index())

    return top_n


#################### Model Performance ####################


def get_metrics(true, predicted, display=False):
    '''
    Takes in the true and predicted values and returns the RMSE and R^2
    for the model performance
    '''

    # take the square root of MSE for RMSE
    rmse = mean_squared_error(true, predicted) ** 0.5
    # explained variance score is used here as the R^2 measure
    r2 = explained_variance_score(true, predicted)
    if display == True:
        print(f'Model RMSE: {rmse:.2g}')
        print(f'       R^2: {r2:.2g}')

    return rmse, r2


def plot_residuals(y_true, y_predicted):
    '''
    Takes in the true values and a DataFrame holding one to four columns
    of predictions, and returns a configured scatterplot of the residual
    errors of those predictions against the true values
    '''

    # only up to four prediction columns can be plotted
    if len(y_predicted.columns) > 4:
        return 'Can only plot up to four models\' predictions'
    # set figure dimensions
    plt.figure(figsize=(60, 40))
    plt.rcParams['legend.title_fontsize'] = 50
    # scatterplot for each of up to four prediction columns
    plt.scatter(y_true, (y_predicted.iloc[0:, 0] - y_true), alpha=1, color='cyan', s=250,
                label=y_predicted.iloc[0:, 0].name, edgecolors='black')
    if len(y_predicted.columns) > 1:
        plt.scatter(y_true, (y_predicted.iloc[0:, 1] - y_true), alpha=0.75, color='magenta', s=250,
                    label=y_predicted.iloc[0:, 1].name, edgecolors='black')
    if len(y_predicted.columns) > 2:
        plt.scatter(y_true, (y_predicted.iloc[0:, 2] - y_true), alpha=0.75, color='yellow', s=250,
                    label=y_predicted.iloc[0:, 2].name, edgecolors='black')
    if len(y_predicted.columns) > 3:
        plt.scatter(y_true, (y_predicted.iloc[0:, 3] - y_true), alpha=0.5, color='black', s=250,
                    label=y_predicted.iloc[0:, 3].name, edgecolors='white')
    # add zero line for ease of readability
    plt.axhline(label='', color='red', linewidth=5, linestyle='dashed', alpha=0.25)
    # model legend
    plt.legend(title='Models', loc=(0.025, 0.05), fontsize=50)
    # set labels and title
    plt.xlabel('\nTrue Value\n', fontsize=50)
    plt.ylabel('\nPredicted Value Error\n', fontsize=50)
    plt.title(f'\nPrediction Residuals of {y_true.name}\n', fontsize=50)
    # show plot
    plt.show()
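

# A runnable end-to-end sketch of how these helpers fit together. Everything
# below is an assumption for illustration only: the synthetic feature names
# ('feat_a', 'feat_b', 'feat_c'), the split sizes, and the choice of k=2 are
# not part of any project data or pipeline.
if __name__ == '__main__':

    from sklearn.model_selection import train_test_split

    # build a small synthetic regression dataset with hypothetical columns
    rng = np.random.default_rng(96)
    demo = pd.DataFrame(rng.normal(size=(500, 3)), columns=['feat_a', 'feat_b', 'feat_c'])
    demo['target'] = 3 * demo.feat_a - 2 * demo.feat_b + rng.normal(scale=0.5, size=500)

    # split into train, validate, and test
    X, y = demo.drop(columns='target'), demo.target
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=96)
    X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=96)

    # scale every feature, fitting the scaler on train only
    X_train_s, X_validate_s, X_test_s = minmax(X_train, X_validate, X_test)

    # keep the two strongest predictors according to SelectKBest
    top_features = select_kbest(X_train_s, y_train, k=2)

    # fit a linear model on train, then score the same fitted model on validate
    lm = LinearRegression()
    _, train_scores = train_model(X_train_s[top_features], y_train, lm, 'demo_lm')
    _, validate_scores = model_testing(X_validate_s[top_features], y_validate, lm, 'demo_lm')
    print(train_scores)
    print(validate_scores)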