"""
Copyright IBM Corp. 2021
"""


import logging

import numpy as np

from ibmfl.data.data_handler import DataHandler
from ibmfl.data.data_util import get_reweighing_weights, get_hist_counts
from sklearn.model_selection import train_test_split
import pandas as pd

logger = logging.getLogger(__name__)

TEST_SIZE = 0.2
RANDOM_STATE = 42
SENSITIVE_ATTRIBUTE = 'sex'

class AdultSklearnDataHandler(DataHandler):
    """
    Data handler for Adult dataset to train a Logistic Regression Classifier on scikit-learn.
    TEST_SIZE is set to 0.2, and RANDOM_STATE is set to 42.
    """
    def __init__(self, data_config=None):
        super().__init__()
        self.file_name = None
        if data_config is not None:
            if 'txt_file' in data_config:
                self.file_name = data_config['txt_file']
            if 'epsilon' in data_config:
                self.epsilon = data_config['epsilon']

        # load dataset
        training_dataset = self.load_dataset()

        # pre-process the data
        self.training_dataset = self.preprocess(training_dataset)
        x_0 = self.training_dataset.iloc[:, :-1]
        y_0 = self.training_dataset.iloc[:, -1]
        x = np.array(x_0)
        y = np.array(y_0)

        self.x_train, self.x_test, self.y_train, self.y_test =\
            train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    def get_data(self):
        """
        Returns pre-processed adult training and testing data.

        :return: training and testing data
        :rtype: `tuple`
        """
        return (self.x_train, self.y_train), (self.x_test, self.y_test)

    def load_dataset(self):
        """
        Loads the training dataset from a given local path.

        :return: raw dataset
        :rtype: `pandas.core.frame.DataFrame`
        """
        try:
            logger.info('Loaded training data from '+ str(self.file_name))
            training_dataset = pd.read_csv(self.file_name,
                                           dtype='category')
        except Exception:
            raise IOError('Unable to load training data from path '
                          'provided in config file: ' +
                          self.file_name)
        return training_dataset

    def get_weight(self):
        """
        Gets pre-processed adult training and testing data, calculates weights for points
        weight = P-expected(sensitive_attribute & class)/P-observed(sensitive_attribute & class)

        :return: weights
        :rtype: `np.array`
        """
        cols = self.get_col_names()
        training_data, (_) = self.get_data()
        return get_reweighing_weights(training_data, SENSITIVE_ATTRIBUTE, cols)


    def get_hist(self):
        """
        Gets pre-processed adult training and testing data, calculates counts for sensitive attribute
        and label

        :return: weights
        :rtype: `np.array`
        """
        e = self.epsilon
        cols = self.get_col_names()
        training_data, (_) = self.get_data()
        return get_hist_counts(training_data, SENSITIVE_ATTRIBUTE, cols, e)

    @staticmethod
    def get_col_names():
        """
        Returns the names of the dataset columns

        :return: column names
        :rtype: `list`
        """
        cols = ['race', 'sex', 'age1', 'age2', 'age3', 'age4', 'age5', 'age6',
                'age7', 'ed6less', 'ed6', 'ed7', 'ed8', 'ed9',
                'ed10', 'ed11', 'ed12', 'ed12more']

        return cols

    @staticmethod
    def get_sa():
        """
        Returns the sensitive attribute

        :return: sensitive attribute
        :rtype: `str`
        """
        return SENSITIVE_ATTRIBUTE

    def preprocess(self, training_data):
        """
        Performs the following preprocessing on adult training and testing data:
        * Drop following features: 'workclass', 'fnlwgt', 'education', 'marital-status', 'occupation',
          'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
        * Map 'race', 'sex' and 'class' values to 0/1
            * ' White': 1, ' Amer-Indian-Eskimo': 0, ' Asian-Pac-Islander': 0, ' Black': 0, ' Other': 0
            * ' Male': 1, ' Female': 0
            * Further details in Kamiran, F. and Calders, T. Data preprocessing techniques for classification without discrimination
        * Split 'age' and 'education' columns into multiple columns based on value

        :param training_data: Raw training data
        :type training_data: `pandas.core.frame.DataFrame
        :return: Preprocessed training data
        :rtype: `pandas.core.frame.DataFrame`
        """
        if len(training_data.columns)==15:
            # drop 'fnlwgt' column
            training_data = training_data.drop(
                training_data.columns[2], axis='columns')

        training_data.columns = ['age', 'workclass', 'education', 'education-num', 'marital-status',
                                    'occupation',
                                    'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                                    'native-country',
                                    'class']

        # filter out columns unused in training, and reorder columns
        training_dataset = training_data.loc[:,['race', 'sex', 'age', 'education-num', 'class']]

        # map 'sex' and 'race' feature values based on sensitive attribute privileged/unpriveleged groups
        training_dataset['sex'] = training_dataset['sex'].map({' Female': 0, ' Male': 1})
        training_dataset['race'] = training_dataset['race'].map(
            {' Asian-Pac-Islander': 0, ' Amer-Indian-Eskimo': 0, ' Other': 0, ' Black': 0, ' White': 1})

        # map 'class' values to 0/1 based on positive and negative classification
        training_dataset['class'] = training_dataset['class'].map({' <=50K': 0, ' >50K': 1})

        training_dataset['age'] = training_dataset['age'].astype(int)
        training_dataset['education-num'] = training_dataset['education-num'].astype(int)

        # split age column into category columns
        for i in range(8):
            if i != 0:
                training_dataset['age' + str(i)] = 0

        for index, row in training_dataset.iterrows():
            if row['age'] < 20:
                training_dataset.loc[index, 'age1'] = 1
            elif ((row['age'] < 30) & (row['age'] >= 20)):
                training_dataset.loc[index, 'age2'] = 1
            elif ((row['age'] < 40) & (row['age'] >= 30)):
                training_dataset.loc[index, 'age3'] = 1
            elif ((row['age'] < 50) & (row['age'] >= 40)):
                training_dataset.loc[index, 'age4'] = 1
            elif ((row['age'] < 60) & (row['age'] >= 50)):
                training_dataset.loc[index, 'age5'] = 1
            elif ((row['age'] < 70) & (row['age'] >= 60)):
                training_dataset.loc[index, 'age6'] = 1
            elif row['age'] >= 70:
                training_dataset.loc[index, 'age7'] = 1

        # split age column into multiple columns
        training_dataset['ed6less'] = 0
        for i in range(13):
            if i >= 6:
                training_dataset['ed' + str(i)] = 0
        training_dataset['ed12more'] = 0

        for index, row in training_dataset.iterrows():
            if row['education-num'] < 6:
                training_dataset.loc[index, 'ed6less'] = 1
            elif row['education-num'] == 6:
                training_dataset.loc[index, 'ed6'] = 1
            elif row['education-num'] == 7:
                training_dataset.loc[index, 'ed7'] = 1
            elif row['education-num'] == 8:
                training_dataset.loc[index, 'ed8'] = 1
            elif row['education-num'] == 9:
                training_dataset.loc[index, 'ed9'] = 1
            elif row['education-num'] == 10:
                training_dataset.loc[index, 'ed10'] = 1
            elif row['education-num'] == 11:
                training_dataset.loc[index, 'ed11'] = 1
            elif row['education-num'] == 12:
                training_dataset.loc[index, 'ed12'] = 1
            elif row['education-num'] > 12:
                training_dataset.loc[index, 'ed12more'] = 1

        training_dataset.drop(['age', 'education-num'], axis=1, inplace=True)

        # move class column to be last column
        label = training_dataset['class']
        training_dataset.drop('class', axis=1, inplace=True)
        training_dataset['class'] = label

        return training_dataset