import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import plotly.graph_objs as go
from dash.dependencies import Input, Output
import plotly.express as px
import numpy as np
from numpy import arange
from sklearn.linear_model import HuberRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
import dash_table

app = dash.Dash()
server = app.server
data = pd.read_csv('insurance.csv')

df = data.copy()
df2 = data.head(n=5) 
#Label Encoding the data (sex, smoker, and region variables)
object_df = data.select_dtypes(include=['object']).copy() 
#print(object_df.head())
#changing variables to 'category' type 
object_df["sex"] = object_df["sex"].astype('category')
object_df["smoker"] = object_df["smoker"].astype('category') 
object_df["region"] = object_df["region"].astype('category')

#assinging encoded variables using 'cat.codes' 
object_df["sex_binary"] = object_df["sex"].cat.codes 
object_df["smoker_binary"] = object_df["smoker"].cat.codes 
object_df["region_encoded"] = object_df["region"].cat.codes 

df["sex"] = object_df["sex_binary"] 
df["smoker"] = object_df["smoker_binary"] 
df["region"] = object_df["region_encoded"]

df3 = df.head(n=5) 

fig = px.scatter_matrix(data, dimensions=[
    'age', 'sex', 'bmi', 'children', 'smoker', 'charges'], labels = {col:" " for col in data.columns},color='region', size_max =5)
fig.update_traces(diagonal_visible = False)

fig2 = px.histogram(data,x = data['charges'], color = 'region')
# checking the first 10 occurences of the data
# print(data.head(n=10))



app.layout = html.Div([
    #setting the title 
    html.H1("Internship 2020 - The GreatFull Plate", style = {"textAlign": "center","width":"800px", "font-family":"Verdana"}, className = "container"), 
    html.H2("Author: Arcelio E. Perez", style = {"textAlign":"center", "width": "800px", "font-family":"Verdana"}, className = "container"),
    html.H4("*WEBSITE UNDER CONSTRUCTION*", style = {"textAlign":"center", "width": "800px", "font-family":"Verdana"}, className = "container"),
    #tabs 
    dcc.Tabs(id = "tabs", children = [
        dcc.Tab(label = "How To", children = [
            html.Div([
                html.H1("How To Run The Files?", style = {"textAlign": "center", "font-family":"Verdana"}), 
                dcc.Markdown('''
                ## Files to Download:  
                 
                Required files: [insurance.csv](https://raw.githubusercontent.com/arcelioeperez/dash-app/gh-pages/source/insurance.csv) | [requirements.txt](https://raw.githubusercontent.com/arcelioeperez/dash-app/main/assets/requirements.txt) | [app.py](https://raw.githubusercontent.com/arcelioeperez/dash-app/gh-pages/source/app.py).   
                All in one folder - including Makefile: [Files](https://github.com/arcelioeperez/dash-app/tree/gh-pages/source)  
                Optional - only the makefile: [Makefile](https://raw.githubusercontent.com/arcelioeperez/dash-app/gh-pages/source/makefile)  
                
                GitHub:  
                [GitHub Repository](https://github.com/arcelioeperez/dash-app/tree/gh-pages) | [GitHub Pages](https://arcelioeperez.github.io/dash-app/)

                ## Running on Windows:  

                ### Running with Make:  
                If you don't have Make installed you could install it by downloading it on this [website](http://gnuwin32.sourceforge.net/packages/make.htm).  
                You could also download and install 'Chocolatey', which is a package manager for Windows.

                **Please use a Unix-like terminal like Git Bash or Powershell - it makes running programs easier**  

                ```
                #installing Make with Chocolatey 
                choco install make
                ```  
                Running the *app.py* file with *make*:  
                ```
                #installing all the packages with one command
                make packages
                #then running the app.py file
                make app
                ```  
                ### If you don't want to use Make:
                ```
                #installing all the packages with requirements.txt
                pip install -r requirements.txt
                #then running app.py
                python app.py
                ```  
                >*After running the above commands - either with or without make - you must go the localhost link i.e. http://127.0.0.1:8050/*   

                ## Running on MacOS  

                ### Running with Make:  
                Open the terminal and check if you have *make* installed
                ```
                make --version
                ```  
                *If you don't have it installed, you can install it with Homebrew - a package manager for MacOS*

                ### Installing *make* with Brew
                ```
                brew install make
                ```  
                **The rest is similar to the Windows instructions.** 

                **Note: all the files must be in the same directory (folder) and all the packages must be installed prior to running `app.py`.**
                
                ### Links to the plotly and dash documentations:  
                1.[Plotly](https://plotly.com/)  
                2.[Dash](https://dash.plotly.com/)  
                3.[Kaggle Dataset- Medical Cost Personal Datasets by Miri Choi](https://www.kaggle.com/mirichoi0218/insurance)  

                ### Works Cited:  
                1.[Machine Learning Mastery - Random Forest](https://machinelearningmastery.com/random-forest-ensemble-in-python/)  
                2.[Machine Learning Mastery - Huber Regressor](https://machinelearningmastery.com/robust-regression-for-machine-learning-in-python/#:~:text=Regression%20is%20a%20modeling%20task,most%20successful%20being%20linear%20regression.)  
                3.[Scatter Plots](https://www.evl.uic.edu/aej/524/kyoung/Training-scatterplot.html)

                ### Books Recommended: 
                1.[Fooled by Randomness, Nassim Nicholas Taleb](https://www.amazon.com/Fooled-Randomness-Hidden-Markets-Incerto/dp/0812975219)
                
                ''', style={"font-family":"Verdana"}, className = "container", highlight_config={"theme":"dark"})
            ])
        ],style = {"font-family": "Verdana"}),
        dcc.Tab(label = "Data Pre-Processing", children = [
            html.Div([
                html.H1("Data Cleaning", style = {"textAlign":"center", "font-family":"Verdana"}),
                dcc.Markdown('''
                ### Opening the CSV file with Pandas:  
                ```python
                import pandas as pd 
                data = pd.read_csv("insurance.csv", delimiter = ",")
                #the delimiter parameter is optional, pandas could figure out that it is a comma
                #could also use the sep = "," parameter
                ```  

                ### Converting the 'object' varibles (i.e. those that are categorical) to numeric:  
                ```
                #converting data into a dataframe
                data = pd.DataFrame(data = data)

                #Label Encoding the data (sex, smoker, and region variables)
                object_df = data.select_dtypes(include=['object']).copy() 
                #print(object_df.head())
                #changing variables to 'category' type 
                object_df["sex"] = object_df["sex"].astype('category')
                object_df["smoker"] = object_df["smoker"].astype('category') 
                object_df["region"] = object_df["region"].astype('category')

                #assinging encoded variables using 'cat.codes' 
                object_df["sex_binary"] = object_df["sex"].cat.codes 
                object_df["smoker_binary"] = object_df["smoker"].cat.codes 
                object_df["region_encoded"] = object_df["region"].cat.codes

                #assigning colums to the data object 
                data["sex"] = object_df["sex_binary"] 
                data["smoker"] = object_df["smoker_binary"] 
                data["region"] = object_df["region_encoded"] 

                ```  

                ''', style = {"font-family":"Verdana"}, className = "container", highlight_config = {"theme":"dark"})
                ]),
            html.H3("Data With Categorical Variables", style = {"textAlign":"center", "font-family":"Verdana"}),
            dash_table.DataTable(id = "table",
                                 columns = [{"name":i, "id": i} for i in df2.columns ],
                                 data = df2.to_dict("records")),
            html.H3("Data With Numerical Variables", style = {"textAlign":"center","font-family":"Verdana"}),
            dash_table.DataTable(id = "table2",
                                 columns = [{"name":i, "id":i} for i in df3.columns],
                                 data = df3.to_dict("records")),
            dcc.Markdown(''' 
            ### Variables in the data:  
            
            Age: age of the person   
            BMI: body mass index    
            Sex: female (0) or male (1)    
            Children: number of children   
            Smoker: smoker (1) or non-smoker (0)    
            Region: southwest (3), southeast (2), northwest (1), northeast (0)  
            Charges: amount charged by the insurance  

            Data Exploration File: [exploratory_analysis.py](https://github.com/arcelioeperez/dash-app/raw/gh-pages/source/exploratory_analysis.py)  
            Basic Statistics of the data: [statistical_significance](https://github.com/arcelioeperez/dash-app/raw/gh-pages/source/statistical_significance.py) | [stats.txt](https://github.com/arcelioeperez/dash-app/raw/gh-pages/source/stats.txt)  

            ''', style = {"font-family":"Verdana"}, className = "container", highlight_config = {"theme":"dark"})
            ]),
                
        dcc.Tab(label = "Data Exploration", children = [
            html.Div([ 
                html.H1("Scatter Matrix",style = {"textAlign": "center", "font-family":"Verdana"}), 
                dcc.Graph(figure = fig), 
               
                dcc.Markdown(''' 
                ## Python code to generate this chart:  
                ```
                data = pd.read_csv('insurance.csv')
                fig = px.scatter_matrix(data, dimensions=[
                'age', 'sex', 'bmi', 'children', 'smoker', 'charges'], color='region')
                ```
                ##### *Where, 'px' is the 'plotly.express' module to plot the scatter matrix - this is the 'plotly' package.*  

                #### Installing packages and dependencies:    
                ``` 
                pip install dash  
                pip install plotly  
                pip install numpy  
                pip install pandas  
                pip install sklearn  
                ```  
                #### Importing files needed for the app.py program:  
                ``` 
                import dash
                import dash_core_components as dcc
                import dash_html_components as html
                import pandas as pd
                import plotly.graph_objs as go
                from dash.dependencies import Input, Output
                import plotly.express as px
                import numpy as np
                from numpy import arange
                from sklearn.linear_model import HuberRegressor 
                from sklearn.model_selection import cross_val_score
                from sklearn.model_selection import RepeatedKFold
                ```  
                #### Explanation of the Scatter Matrix:  
                A scatter matrix shows that 'all the pair-wise scatter plots of the variables' in a single picture. The diagonal is the variable against 
                itself - therefore, it will show a perfect correlation. For this chart, the diagonal was removed.  

                **As we see from this chart, the predictors that stand out the most are:** 

                Age, BMI (body mass index), and whether someone is a smoker, seem to be the predictors that explain the insurance charges.  
                However, as I will explain next, more analysis is needed in order to make a definitive conclusion. From the linear regression and random forest
                models- and their plots - we could see that this project is complex because it deals with predicting the charges that a person will have given their 
                age, sex, bmi, region, number of children, and if that person is a smoker.  

                One could argue that a 'basic model' would be intuition. For example, we know that if someone is obese, old, and is a smoker that person will have more
                health problems and therefore will probably be charged more. This basic model tells us a lot because we could then use linear regression, random
                forest or any other model to see if we are right in our analysis.  

                An important conclusion is that we are the ones programming and building the models. It doesn't make sense to conclude that we need machine learning techniques,
                Artificial Intelligence (AI), or a mathematical model for every single problem that we have. We sometimes need that basic model to act as a benchmark - sometimes
                that basic model could prove that a machine learning model would even complicate our analysis even more.  

                **How to deal with outliers?**  

                Dealing with outliers is not easy. I have seen many models online where people eliminate outliers in their data prepocessing steps. This could be the case, for example,
                if a professor is calculating grades. However, if one is dealing with sensitive information one would be making a mistake by eliminating the outliers from the analysis.
                When dealing with potential losses, profits, and health data one has to be extra careful with outliers because the magnitude of the consequences will be
                more than any benefit or insight that we could get from the data.  

                ''', style={"font-family":"Verdana"}, className = "container", highlight_config={"theme":"dark"}),
                html.H1("Histogram of Charges by Region", style = {"textAlign": "center", "font-family":"Verdana"}),
                dcc.Graph(figure = fig2)
                
            ], className = "container"),
        ],style = {"font-family": "Verdana"}),
        dcc.Tab(label = "Perfomance Metrics" , children = [
            html.H1("Linear Regression Metrics", style = {"textAlign": "center", "font-family":"Verdana"}),
            dcc.Dropdown(id='my-dropdown', 
            options=[{'label':'Age','value':'age'}, 
            {'label':'Sex','value':'sex'}, 
            {'label':'BMI','value':'bmi'}, 
            {'label':'Children','value':'children'},
            {'label':'Smoker','value':'smoker'},
            {'label':'Region','value':'region'}

            ], multi=True,value=['age'], 
            style={"display":"block", "margin-left":"auto", "margin-right":"auto", "width":"60%"}), 
            dcc.Graph(id="linear"), 
            dcc.Markdown(''' 
            ### Variables in this chart:  
            Age: age of the person  
            BMI: body mass index  
            Sex: female (0) or male (1)  
            Children: number of children  
            Smoker: smoker (1) or non-smoker (0)  
            Region: southwest (3), southeast (2), northwest (1), northeast (0)  

            ### Code to generate this chart

            ```py
            def update_graph(selected_dropdown):
        
                #dropdown = {'Age':'age', 'Sex':'sex', 'BMI':'bmi', 'Children':'children','Smoker':'smoker', 'Region':'region'}
                for i in selected_dropdown: 
                    if i == "age": 
                         dfx = df["age"]
                    elif i == "sex":
                        dfx = df["sex"]
                    elif i == "bmi":
                        dfx = df["bmi"]
                    elif i == "children":
                        dfx = df["children"]
                    elif i == "smoker":
                         dfx = df["smoker"]
                    else:
                        dfx = df["region"]
                    dfx = np.array(dfx) 
                    dfx = dfx.reshape(-1,1)
        
                    #results = evaluate_model(dfx, dataY, model)
                    #print("MAE (mean) and MAE (stdev): ", np.mean(results), np.std(results)) 
                    model = HuberRegressor() 
                    model.fit(dfx, df["charges"])
                    x_range = np.linspace(dfx.min(), dfx.max(), 100) 
                    y_range = model.predict(x_range.reshape(-1,1))
                    figure3 = px.scatter(data,x=df[f"{i}"], y=df["charges"])
                    figure3.add_traces(go.Scatter(x=x_range, y=y_range, name = "Regression Fit"))
                return figure3
            ```
            *Note: this code was inside the `@app.callback` function*  
            ''',style={"font-family":"Verdana"}, className = "container", highlight_config={"theme":"dark"}), 
            
            html.H1("Random Forest Model", style = {"textAlign": "center", "font-family":"Verdana"}), 
            dcc.Markdown('''
            ## What is a Random Forest Model?  
            >  
            > "Random Forests grow many classification trees. \[...] Each tree gives a classification, and we say the tree 'votes' for that class. The forest chooses
            the classification having the most votes (over all the trees in the forest)." - [Breiman and Cutler](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm)
            >  

            ## Code to generate the Random Forest Model:  
            ```py
            def get_models(): 
                models = dict() 
                #exploring ratios from 10% to 100% 
                for i in arange(0.1, 1.1, 0.1): 
                    key = "%.1f" % i 
                    #setting the max samples to none 
                    if i == 1.0: 
                        i = None 
                    models[key] = RandomForestRegressor(max_samples = i)
                return models 
            def evaluate_model(model, x, y): 
                #defining the evaluation procedure 
                cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 1) 
                #scores = cross_val_score(model, dataX, dataY, scoring = "neg_mean_absolute_error", cv = cv, n_jobs = 1, error_score = "raise")
                scores = cross_val_score(model, dataX, dataY, scoring = "neg_mean_squared_error", cv = cv, n_jobs = 1, error_score = "raise")
                return np.absolute(scores) 
            ```
            ## Testing the models:
            ```py
            models = get_models()
            results, names = list(), list()

            for name, model in models.items():
                #evaluate the model
                scores = evaluate_model(model, dataX, dataY)
                #storing the results
                results.append(scores)
                names.append(name)
                #summarizing the performance
                #print("Mean MAE scores and STD", name, mean(scores), std(scores))
                print("RMSE scores and STD", name, mean(np.sqrt(scores)))

            ans = np.sqrt(results)
            #converting the ans variable to a list in order to plot it with the names list - otherwise it won't run
            ans = list(ans)
            plt.boxplot(ans, labels = names, showmeans = True)
            ```  
            For this model I decided to include two error metrics - RMSE (Root Mean Squared Error) and MAE (Mean Squared Error). RMSE tends to penalize bigger errors, 
            therefore when MAEs for a given problem tend to be the same, the RMSE could be a factor deciding which is the 'best' model - or at least the one that fits
            the problem.  

            In the above code, if you want to get the same results for the MAE:  
            >Don't use **'ans = np.sqrt(results)'**. Instead, uncomment the **'print("Mean MAE ...'** and use the **'results'** variable in the **'plt.boxplot(results, labels = names, showmeans = True)'** function.   

            Since we are dealing with insurance charges, I opted to include both error metrics and use the RMSE as the benchmark.  
            
            \*[File](https://raw.githubusercontent.com/arcelioeperez/dash-app/gh-pages/source/random_forest.py) that produced the model and the plots.\* 
            
            To run the above file and create a text file with the output, type:  
            ```
            python random_forest.py > random_forest.txt
            ```  
            
            These files contain the MAE (mean and standard deviation) and the RMSE from the random forest model:  
            [MAE-Random Forest](https://raw.githubusercontent.com/arcelioeperez/dash-app/gh-pages/source/random_forest_mae.txt)|[RMSE-Random Forest](https://raw.githubusercontent.com/arcelioeperez/dash-app/gh-pages/source/random_forest_rmse.txt)  
            ## MAE Plot  
            ![MAE Plot](https://github.com/arcelioeperez/dash-app/raw/gh-pages/demo/randomforest.PNG)
            ## RMSE Plot  
            ![RMSE Plot](https://github.com/arcelioeperez/dash-app/raw/gh-pages/demo/randomforestrmse0.PNG)
            ''', style={"font-family":"Verdana"}, className = "container", highlight_config={"theme":"dark"})   

     ])
 
    ],style = {"font-family": "Verdana"})
   

])

@app.callback(Output('linear', 'figure'), 
[Input('my-dropdown', 'value')])
def update_graph(selected_dropdown):
    
    #dropdown = {'Age':'age', 'Sex':'sex', 'BMI':'bmi', 'Children':'children','Smoker':'smoker', 'Region':'region'}
    for i in selected_dropdown: 
        if i == "age": 
            dfx = df["age"]
        elif i == "sex":
            dfx = df["sex"]
        elif i == "bmi":
            dfx = df["bmi"]
        elif i == "children":
            dfx = df["children"]
        elif i == "smoker":
            dfx = df["smoker"]
        else:
            dfx = df["region"]
        dfx = np.array(dfx) 
        dfx = dfx.reshape(-1,1)
        
        #results = evaluate_model(dfx, dataY, model)
        #print("MAE (mean) and MAE (stdev): ", np.mean(results), np.std(results)) 
        model = HuberRegressor() 
        model.fit(dfx, df["charges"])
        x_range = np.linspace(dfx.min(), dfx.max(), 100) 
        y_range = model.predict(x_range.reshape(-1,1))
        figure3 = px.scatter(data,x=df[f"{i}"], y=df["charges"])
        figure3.add_traces(go.Scatter(x=x_range, y=y_range, name = "Regression Fit"))
    return figure3

    
if __name__ == "__main__": 
    app.run_server(debug=False)