###
# Classic libraries
###

import pandas as pd
import numpy as np

###
# Data science libraries
###

import sklearn as sk
from sklearn.cluster import KMeans   # KMeans function
from sklearn.datasets import make_moons, make_circles, make_classification, make_blobs  # Easy simulations
from sklearn.model_selection import train_test_split   # Cross validation library

###
# Data visualization libraries
###

import matplotlib.pyplot as plt
import seaborn as sns   # A must! For nice an easy figures - look for sns command in the notebook
from matplotlib.pyplot import cm   # This is the color chart that I personnaly prefer


def color(label, dim):
    """
    Description:
        Mix rows of a "tab10" colour chart according to the weights in label.
    Args:
        label: weights applied to the chart colours; it is matrix-multiplied
            with the chart, so its last axis must match the number of chart
            rows (presumably per-cluster membership weights -- confirm
            against the caller)
        dim: 4 selects a four-colour chart, any other value a two-colour chart
    Return:
        np.dot(label, chart), i.e. the weighted mix of the chart's RGB rows
    """
    # Sample the colormap once on a 100-point grid and drop the alpha channel.
    palette = plt.get_cmap("tab10")(np.linspace(0.0, 1.0, 100))[:, :3]

    # Grid rows used as chart colours: four of them when dim == 4, else two.
    picks = (0, 10, 20, 30) if dim == 4 else (0, 10)
    chart = [palette[i] for i in picks]

    return np.dot(label, chart)


def super_scat_it(X, y, dim, clusters_center=0, task='kmeans', wcolor=True):
    """
    Description:
        Scatter-plot two-column data on the current matplotlib axes.
        With task='kmeans' one series is drawn per label value 0..dim-1 and,
        when provided, the cluster centers are overlaid as black stars.
        With task='EM' all points are drawn in one call, each coloured by
        color(y, dim).
    Args:
        X: observations; columns 0 and 1 are plotted as x1 and x2
        y: for 'kmeans', one label per observation; for 'EM', the weights
            forwarded to color() (presumably membership weights -- confirm
            against the caller)
        dim: number of label values to draw ('kmeans') or the chart size
            selector passed to color() ('EM')
        clusters_center: array of center coordinates (columns 0 and 1 are
            used); the default 0, None, or an all-zero array draws nothing
        task: 'kmeans' or 'EM'; any other value only sets up axes and legend
        wcolor: if True use one "tab10" colour per label, otherwise blue
    Return:
        None; the function draws on the current axes
    """
    sns.set(rc={'figure.figsize': (8, 6)})
    sns.set(font_scale=1.5)
    cmap = plt.get_cmap("tab10")
    plt.gca().set_aspect(1)

    if task == 'kmeans':

        data = np.concatenate((X, y.reshape(len(y), 1)), axis=1)
        ens = pd.DataFrame(data)
        ens.columns = ['x1', 'x2', 'y']

        for k in range(dim):
            # Select the members once instead of re-filtering per coordinate.
            members = ens[ens['y'] == k]
            point_color = cmap(k) if wcolor else 'b'
            plt.scatter(members['x1'], members['x2'], color=point_color, label=f'Distribution {k+1}')

        ###
        # Plot presentation
        ###
        # When a label exceeds 1, add a blank legend entry (presumably a
        # visual spacer). It is plotted from explicitly empty data: the former
        # sentinel filters (y == 100 for x1, y == 1000 for x2) could return
        # different lengths and make scatter fail on real labels 100 or 1000.
        if len(y) > 0 and np.max(np.abs(y)) > 1:
            plt.scatter([], [], color="w", marker="x", label=' ')

        # The scalar default 0 (or None, or all-zero centers) means no centers.
        if clusters_center is not None and np.sum(np.abs(clusters_center)) != 0:
            plt.scatter(clusters_center[:, 0], clusters_center[:, 1], marker="*", color="k", s=200, label='Clusters center')

    if task == 'EM':
        plt.scatter(X[:, 0], X[:, 1], color=color(y, dim), s=10)

    # Axes
    plt.xlabel('x$_1$')
    plt.ylabel('x$_2$')

    # Ghosting the legend: place it outside the axes with a transparent frame
    leg = plt.gca().legend(loc='center left', bbox_to_anchor=(1, .85))
    leg.get_frame().set_alpha(0)


def distance(data, cluster_centers):
    """
    Description:
        Compute the squared Euclidean distance between every observation and
        every cluster center (no square root is taken and nothing is averaged).
    Args:
        data: observations, one per row
        cluster_centers: cluster centers, one per row
    Return:
        float array of shape (n_observations, n_centers) whose entry [i, k]
        is the squared distance between observation i and center k
    """
    n_obs = data.shape[0]
    n_centers = cluster_centers.shape[0]
    squared = np.zeros((n_obs, n_centers))

    # One column per center: sum of squared coordinate differences.
    for col, center in enumerate(cluster_centers):
        offsets = data - center
        squared[:, col] = np.sum(offsets ** 2, axis=1)

    return squared


def initiate(data, k, seed=None):
    """
    Description:
        Randomly initiate k cluster centers, each coordinate drawn uniformly
        between the minimum and maximum of the matching data column.
    Args:
        data: unlabelled data; only columns 0 and 1 are used
        k: number of clusters
        seed: optional seed passed to np.random.seed (this reseeds NumPy's
            global generator); None leaves the generator untouched
    Return:
        array of shape (k, 2) holding the initial cluster centers
    """
    x1_low, x1_high = np.min(data[:, 0]), np.max(data[:, 0])
    x2_low, x2_high = np.min(data[:, 1]), np.max(data[:, 1])

    # Identity test: `seed != None` is elementwise for array seeds and raises
    # on the ambiguous truth value.
    if seed is not None:
        np.random.seed(seed)

    # Draw order (all x1 values, then all x2 values) is kept so that a given
    # seed keeps producing the same centers.
    x1_means = np.random.uniform(x1_low, x1_high, k)
    x2_means = np.random.uniform(x2_low, x2_high, k)

    return np.column_stack((x1_means, x2_means))


def estimate_centroid(data, labels, k=2):
    """
    Description:
        Estimate one centroid per cluster as the mean of the observations
        carrying that cluster's label.
    Args:
        data: unlabelled data, one observation per row
        labels: label associated with each observation, using the values
            0 .. k-1
        k: number of clusters; defaults to 2, the only case handled before
            this parameter existed
    Return:
        array of shape (k, n_features); row j is the centroid of label j,
        or NaN when no observation carries label j
    """
    points = np.asarray(data, dtype=float)
    flat_labels = np.asarray(labels).reshape(-1)

    # Start from NaN so an empty cluster yields a NaN row, which is what the
    # mean of an empty selection produced before.
    centroids = np.full((k, points.shape[1]), np.nan)

    for cluster in range(k):
        members = points[flat_labels == cluster]
        if len(members) > 0:
            centroids[cluster] = members.mean(axis=0)

    return centroids