"""Medical Language Model Learner (MLML)

Author: Georgi Tancev. https://github.com/gtancev
Source: https://github.com/gtancev/MLML/blob/master/nlp_app.py
Article: https://towardsdatascience.com/mining-and-classifying-medical-text-documents-1876462f73bc
Data: https://www.kaggle.com/tboyle10/medicaltranscriptions/
Credits: Marc Skov Madsen (for refactoring)
"""
# Todo:
# 1. Do something about charts. They don't have the right size and the text characters do not
# look well.
# 2. Refactor into smaller functions. Some with @st.cache to speed up the application.

from __future__ import division, unicode_literals

import itertools
import pathlib

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from matplotlib import rc
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from streamlit.logger import get_logger

MT_SAMPLES_PATH = pathlib.Path(__file__).parent / "medical_language_learner/mtsamples.csv"

MT_SAMPLES_URL = (
    "https://raw.githubusercontent.com/MarcSkovMadsen/awesome-streamlit/master/gallery/"
    "medical_language_learner/mtsamples.csv"
)

enc = LabelEncoder()

plt.rcParams['lines.linewidth'] = 1.0
plt.rcParams['font.size'] = 6.0

LOGGER = get_logger(__name__)

def main():
    st.title("Medical Language Model Learner")
    st.sidebar.markdown("""
Author: [Georgi Tancev](https://github.com/gtancev)

[Original Code](https://github.com/gtancev/MLML/blob/master/nlp_app.py),
[Article](https://towardsdatascience.com/mining-and-classifying-medical-text-documents-1876462f73bc),
[Data](https://www.kaggle.com/tboyle10/medicaltranscriptions/)
""")
    st.sidebar.header("Sample Selection.")
    filename = st.sidebar.selectbox("Choose a file.",("None", "mtsamples"))

    st.header("Introduction")
    st.markdown("""
This application guides you through the development of **a language model
that classifies clinical documents** according to their medical speciality.

It is based on a **term frequency-inverse document frequency (tf-idf)** approach. Tf-idf is a
numerical statistic that is intended to reflect how important a word is to a document
in a collection or corpus. It is often used as a weighting factor in searches of information
retrieval, text mining, and user modeling.

The tf-idf value increases proportionally to the number of times a word appears in the document
and is offset by the number of documents in the corpus that contain the word, which helps to
adjust for the fact that some words appear more frequently in general.

Tf-idf is one of the most popular term-weighting schemes today; 83% of text-based recommender
systems in digital libraries use tf-idf.

**Note that tf-idf ignores the sequential aspect of a language**.

The actual model itself is based on a **random forest** classifier.

Random forests are an ensemble learning method for classification, regression and other tasks
that operates by constructing a multitude of decision trees at training time and outputting the
class that is the mode of the classes (classification) or mean prediction (regression) of the
individual trees. In particular, trees that are grown very deep tend to learn highly irregular
patterns: they overfit their training sets, i.e. have low bias, but very high variance.

Random forests are a way of averaging multiple deep decision trees, trained on different parts
of the same training set, with the goal of reducing the variance.

Random forests can be used to rank the importance of variables in a regression or classification
problem in a natural way.

The model is developed with scikit-learn. **Some possible degrees of freedom are shown in the
sidebar**. By adjusting them, the model is retrained and its performance re-evaluated.

**Start by choosing a file in the sidebar**.
""")

    if filename != "mtsamples":
        return

    try:
        data = get_mt_samples()
    except:
        st.error("The MT Samples could not be loaded!")
        return

    # DEFINE DATA.
    st.header("Preprocessing")
    st.write("First, data has to be preprocessed by adjusting the number of classes and transforming the data into tf-idf representation.",
    "As you can imagine, not every class might be represented enough in a data set to be able to classify it properly.",
    "It is advisable to remove classes which are underrepresented or whose abundance is below some treshhold to achieve a better performance.",
    "Alternatively, try to collect more data.")
    st.write("In addition, there are several degrees of freedom for the construction of tf-idf the dictionary. Adjusting them changes the amount and kind of words (**features**) which are included in the model.")
    st.subheader("Load data.")
    st.write("Display sample data by ticking the checkbox in the sidebar.")
    #filename = st.sidebar.text_input('Enter the filename of a csv-file.')

    agree = st.sidebar.checkbox('Display raw data.')
    if agree:
        st.dataframe(data)
    samples = data.transcription
    text_labels  = [label_name.lower() for label_name in data.medical_specialty]
    labels = enc.fit_transform(np.array(text_labels))
    labels = np.ravel(labels)
    unique_values, counts = np.unique(labels, return_counts=True)
    relative_counts = counts/np.sum(counts)
    st.write("The initial data set contains",np.shape(unique_values)[0],"classes and",data.shape[0],"samples.")

    # EXTRACT SAMPLES AND LABELS.
    st.sidebar.header("Preprocessing class distributions.")
    treshhold_to_consider = st.sidebar.slider("Minimum fraction of class in data set in order to be considered.", min_value=0.01, max_value=0.1, value=0.02, step=0.01)
    classes_to_consider = unique_values[relative_counts>=treshhold_to_consider]

    index_to_consider = np.empty((labels.shape[0]),dtype="bool")
    for i,label in enumerate(labels):
        if label in classes_to_consider:
            index_to_consider[i] = True
        else:
            index_to_consider[i] = False

    # EXTRACT CLASSES
    labels = labels[index_to_consider]
    samples = samples[index_to_consider]
    unique_values, counts = np.unique(labels, return_counts=True)
    relative_counts = counts/np.sum(counts)
    label_names = enc.inverse_transform(unique_values)

    # INSTRUCTION
    st.info("Some classes might be **underrepresented** in the data set and should be dropped. Choose a treshhold for minimum abundance in the original data set.")
    st.write("The final number of classes is",np.size(unique_values)," and the residual number of samples is",np.sum(index_to_consider),".")
    rel_counts = pd.DataFrame(data=relative_counts,columns=["fraction of class in data set"]).set_index(label_names)
    st.table(rel_counts)

    # DATA TRANSFORMATION
    st.subheader("Transform data.")
    st.sidebar.header("Constructing dictonary of words.")
    if st.sidebar.checkbox("Use word counts instead."):
        st.write("Transform the text data into **word count** representation.")
        max_df = st.sidebar.slider("Maximum fraction for a word to be considered.", min_value=0.05, max_value=0.4, value=0.3, step=0.01)
        min_df = st.sidebar.slider("Minimum fraction for a word to be considered.", min_value=0.001, max_value=0.05, value=0.01, step=0.001)
        max_features = st.sidebar.slider("Size of vocabulary.", min_value=100, max_value=2000, value=1000, step=100)
        ngram_range = (1,1)
        tfidf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,max_features=max_features,stop_words='english',ngram_range=ngram_range)
    else:
        st.write("Transform the text data into **tf-idf representation**. Customize the dictionary in the sidebar. Alternatively, you can also work with pure word counts.")
        max_df = st.sidebar.slider("Maximum tf-idf for a word to be considered.", min_value=0.05, max_value=0.4, value=0.3, step=0.01)
        min_df = st.sidebar.slider("Minimum tf-idf for a word to be considered.", min_value=0.001, max_value=0.05, value=0.01, step=0.001)
        max_features = st.sidebar.slider("Size of vocabulary.", min_value=100, max_value=2000, value=1000, step=100)
        ngram_range = (1,1)
        tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,max_features=max_features,stop_words='english',ngram_range=ngram_range)

    #dimred =  NMF(n_components=2, random_state=1,beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.4,l1_ratio=.5)
    #dimred = LatentDirichletAllocation(n_components=2, max_iter=5,learning_method='online',learning_offset=50,random_state=1)
    dimred = TruncatedSVD(n_components=2)

    #@st.cache(show_spinner=False)
    def transform():
        tfidf = tfidf_vectorizer.fit_transform(samples)
        feature_names = tfidf_vectorizer.get_feature_names()
        return tfidf,feature_names

    # TF-IDF
    with st.spinner('Data is being transformed.'):
        tfidf,feature_names = transform()
        st.success("Transformation finished.")
        st.subheader("Visualize data.")
        st.write("Examine the distribution of classes by dimensionality reduction based on **singular value decomposition (SVD)**.")
        data_ = dimred.fit_transform(tfidf)
        data__ = pd.DataFrame(data=data_,columns=["principal component 1","principal component 2"])
        labels_ = pd.DataFrame(data=enc.inverse_transform(labels),columns=["class"])
        data___ = pd.concat((data__,labels_),axis=1)
        c = alt.Chart(data___,title="dimensionality reduction",height=600).mark_circle(size=20).encode(x='principal component 1', y='principal component 2',color=alt.Color('class', scale=alt.Scale(scheme='blues')),tooltip=["class"]).interactive()
        st.altair_chart(c)
        st.write("The explained variance is",np.round(np.sum(dimred.explained_variance_ratio_)*100,2),"%.")

    # MODEL BUILDING.
    st.header("Model Building")
    st.write("The model is based on a **random forest**. Customize the model in the sidebar.")
    st.sidebar.header("Customizing the model.")
    n_estimators = st.sidebar.text_input('Number of trees in random forest.', '1000')
    max_leaf_nodes = st.sidebar.text_input('Maximum number of lead nodes.', '25')
    max_depth = st.sidebar.text_input('Maximum depth.', '5')
    class_weight = st.sidebar.selectbox("Class weights for the model.",('balanced','balanced_subsample'))
    forest_clf = RandomForestClassifier(n_estimators=int(n_estimators),max_depth=int(max_depth),max_leaf_nodes=int(max_leaf_nodes),class_weight=class_weight,oob_score=True,n_jobs=-1,random_state=0) # Define classifier to optimize.
    #parameters = {'max_leaf_nodes':np.linspace(20,35,14,dtype='int')} # Define grid.
    #clf = RandomizedSearchCV(forest_clf, parameters, n_iter=10, cv=3,iid=False, scoring='accuracy',n_jobs=-1) # Balanced accuracy as performance measure.

    #@st.cache(show_spinner=False)
    def train():
        classifier = forest_clf.fit(tfidf, labels) # Train/optimize classifier.
        #forest = classifier.best_estimator_
        feature_importances = classifier.feature_importances_
        indices = np.argsort(feature_importances)[::-1]

        # Analyze Feature Importance.
        n_f = 30 # Amount of Desired Features.
        sorted_feature_names = []
        for f in range(n_f):
            sorted_feature_names.append(feature_names[indices[f]])
        #feature_importance = pd.DataFrame(data=feature_importances[indices[0:n_f]], index=sorted_feature_names,columns=["feature importance"])
        feature_importance = pd.DataFrame(data=np.transpose(np.array((np.round(feature_importances[indices[0:n_f]],5),sorted_feature_names))),columns=["feature importance","features"])
        return classifier,feature_importance

    with st.spinner('Model is being trained.'):
        classifier,feature_importance = train() # Train/optimize classifier.
        st.success("Training finished.")
        st.write("Examine the importance of the most meaningful words for the overall classification performance.")
        bars = alt.Chart(feature_importance,height=600, title="discriminative power of features").mark_bar(color='steelblue',opacity=0.7).encode(
        y='features:N',
        x='feature importance:Q',tooltip="feature importance")
        st.altair_chart(bars)
        st.write('The test set accuracy (from out-of-bag samples) is',np.round(classifier.oob_score_,2),".")

    # MODEL EVALUATION
    st.header("Model Evaluation")
    y_true = labels
    y_pred = classifier.predict(tfidf)
    f1_score_ = f1_score(y_true,y_pred,average="weighted")
    st.write("The F1-score is",np.round(f1_score_,2),".")
    st.write("Below, the confusion matrix for the classification problem is provided.")
    cm = confusion_matrix(y_true,y_pred)
    labels_repeated = []
    for _ in range(np.unique(labels_).shape[0]):
        labels_repeated.append(np.unique(labels_))
    source = pd.DataFrame({'predicted class': np.transpose(np.array(labels_repeated)).ravel(),
                        'true class': np.array(labels_repeated).ravel(),
                        'count': np.round(cm.ravel(),2)})
    heat = alt.Chart(source,height=1000, title="confusion matrix").mark_rect(opacity=0.7).encode(
    x='predicted class:N',
    y='true class:N',
    color=alt.Color('count:Q', scale=alt.Scale(scheme='blues')),
    tooltip="count")
    st.altair_chart(heat)


@st.cache
def get_mt_samples() -> pd.DataFrame:
    """A dataframe of mt samples

    Returns:
        pd.DataFrame -- A dataframe
    """
    if MT_SAMPLES_PATH.exists():
        location = MT_SAMPLES_PATH
    else:
        location = MT_SAMPLES_URL
    return pd.read_csv(location, index_col=0,usecols=[0,1,2,4]).dropna()


main()