{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data cleaning and preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configure data path variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path where the input csv-files are stored\n",
    "fpath_src_data_dir = 'src_data/'\n",
    "\n",
    "# Path where the cleaned data is stored\n",
    "fpath_clean_data_dir = 'clean_data/'\n",
    "\n",
    "# Path where the data ready for the ML analysis is stored\n",
    "fpath_prepared_data_dir = 'ready_data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "src_data/hm_animal.csv\t   src_data/hm_mastitis_bact_tfpcr.csv\r\n",
      "src_data/hm_BCS.csv\t   src_data/hm_milkrecording.csv\r\n",
      "src_data/hm_ebv.csv\t   src_data/hm_NSAIET.csv\r\n",
      "src_data/hm_health.csv\t   src_data/hm_pregnancy.csv\r\n",
      "src_data/hm_lactation.csv\r\n"
     ]
    }
   ],
   "source": [
    "!ls {fpath_src_data_dir}*.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p {fpath_clean_data_dir}\n",
    "!mkdir -p {fpath_prepared_data_dir}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Definitions to read in data and what columns to ignore\n",
    "According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless.\n",
    "\n",
    "**Remarks**:\n",
    "* We are not taking into consideration the dataset 'hm_mastitis_bact_tfpcr.csv' in this analysis.\n",
    "* The columns 'bloodBHB', 'bloodNEFA', 'milkAcetone', 'CH4' of the dataset 'Hm_milkrecording.csv' are removed in this analysis. However, they are stored to a separate file in a different Jupyter notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_files = {'hm_BCS.csv': {'delimiter': ';',\n",
    "                             'columns_to_ignore': []\n",
    "                            },\n",
    "            \n",
    "              'hm_lactation.csv': {'delimiter': ';',\n",
    "                                   'columns_to_ignore': ['dry_date_prev_lact']\n",
    "                                  },\n",
    "\n",
    "              'hm_NSAIET.csv': {'delimiter': ';',\n",
    "                                'columns_to_ignore': ['ejaculate',\n",
    "                                                      'INSEM_SEXINGBEHANDLUNG',\n",
    "                                                      'AI_organization',\n",
    "                                                      'INSEM_SEXINGGESCHLECHT',\n",
    "                                                      'idhrd_anon'\n",
    "                                                     ]\n",
    "                               },\n",
    "\n",
    "#               'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';',\n",
    "#                                              'columns_to_ignore': ['Ct_value']\n",
    "#                                             },\n",
    "\n",
    "              'hm_animal.csv': {'delimiter': ';',\n",
    "                                'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam']\n",
    "                               },\n",
    "\n",
    "              'hm_milkrecording.csv': {'delimiter': ';',\n",
    "                                       'columns_to_ignore': ['idspectrum_num',\n",
    "                                                             'AR_PESEE_PESCODEMALADEANALYSE',\n",
    "                                                             'AR_PESEE_PESCODEMALADEPESEE',\n",
    "                                                             'bloodBHB',\n",
    "                                                             'bloodNEFA',\n",
    "                                                             'milkAcetone',\n",
    "                                                             'CH4',\n",
    "                                                             'milk_yield_morning_measured',\n",
    "                                                             'milk_yield_evening_measured'\n",
    "                                                            ]\n",
    "                                      },\n",
    "\n",
    "              'hm_ebv.csv': {'delimiter': ' ',\n",
    "                             'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW']\n",
    "                            },\n",
    "\n",
    "              'hm_pregnancy.csv': {'delimiter': ';',\n",
    "                                   'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value']\n",
    "                                  },\n",
    "\n",
    "              'hm_health.csv': {'delimiter': ';',\n",
    "                                'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg']\n",
    "                               }\n",
    "             }\n",
    "\n",
    "fnames = list(data_files.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Useful functions to clean data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_in_csv(file_path, sep):\n",
    "    \"\"\"\n",
    "    Reads in the specified csv-file.\n",
    "    \n",
    "    :param file_path: Path to the csv-file\n",
    "    :param sep: Separator used in the csv-file\n",
    "    :return: Pandas dataframe\n",
    "    \"\"\"\n",
    "    return pd.read_csv(file_path, sep=sep)\n",
    "\n",
    "\n",
    "def save_csv(df, file_path, index=False):\n",
    "    \"\"\"\n",
    "    Save the specified csv-file.\n",
    "    \n",
    "    :param df: Pandas dataframe\n",
    "    :param file_path: Path to the output csv-file\n",
    "    :param index: Boolean value whether or not the first column (bool) is written to file as well (default: False)\n",
    "    :return: None\n",
    "    \"\"\"\n",
    "    return df.to_csv(file_path, index=index)\n",
    "\n",
    "\n",
    "def remove_rows_with_missing_values(df):\n",
    "    \"\"\"\n",
    "    Compute the number of missing values per column and remove all rows with missing values.\n",
    "    \n",
    "    :param df: Pandas dataframe\n",
    "    :return: Pandas dataframe\n",
    "    \"\"\"\n",
    "    \n",
    "    for _, val in enumerate(df.columns):\n",
    "        mask = df[val].isnull().values\n",
    "        if np.sum(mask) > 0:\n",
    "            print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100))\n",
    "            df = df.drop(np.arange(mask.size)[mask], axis=0)\n",
    "            df = df.reset_index(drop=True)\n",
    "    return df\n",
    "\n",
    "\n",
    "def remove_columns_to_be_ignored(df, columns_to_ignore):\n",
    "    \"\"\"\n",
    "    Remove all columns to be ignored.\n",
    "    \n",
    "    :param df: Pandas dataframe\n",
    "    :param columns_to_ignore: List of the column names of the columns to ignore\n",
    "    :return: Pandas dataframe\n",
    "    \"\"\"\n",
    "    return df.drop(labels=columns_to_ignore, axis=1)\n",
    "\n",
    "\n",
    "def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False):\n",
    "    \"\"\"\n",
    "    Convert a date-like column to a pandas datetime-column.\n",
    "    \n",
    "    :param df: Pandas dataframe\n",
    "    :param column: Column name of the datetime column\n",
    "    :param format: Format of the date-entry (default: YYYYMMDD)\n",
    "    :param convert_to_int: Bool value whether the column needs to be converted to an\n",
    "                           integer column first (default: False)\n",
    "    :return: Pandas dataframe\n",
    "    \"\"\"\n",
    "    if convert_to_int:\n",
    "        df[column] = df[column].values.astype(np.int64)\n",
    "    df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in src-data & clean it (with modifications as suggested by Thomas Denninger)\n",
    "\n",
    "Mögliche Verbesserungen an den Daten:\n",
    "Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde\n",
    "Information der Art der Milchleistungsprüfmethode hineinnehmen\n",
    "milking_time_morning und milking_time_evening ignorieren\n",
    "Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben\n",
    "Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist\n",
    "Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat\n",
    "milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren\n",
    "Werte mit nsaiet_type == Belegung oder == ET entfernen\n",
    "Herausfinden, was bei calving_ease schief lief"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Reading in hm_BCS.csv ---\n",
      "         BCS_date   BCS       idani_anon\n",
      "0      2018-10-23  3.00  CHE000000000561\n",
      "1      2017-02-16  2.50  CHE000000000781\n",
      "2      2013-12-19  3.00  CHE000000002000\n",
      "3      2015-02-05  3.00  CHE000000002000\n",
      "4      2010-11-05  3.25  CHE000000002251\n",
      "...           ...   ...              ...\n",
      "171212 2016-10-18  2.75  CHE000099998376\n",
      "171213 2018-03-09  2.75  CHE000099999361\n",
      "171214 2019-05-28  2.75  CHE000099999361\n",
      "171215 2020-03-05  2.50  CHE000099999361\n",
      "171216 2013-06-04  3.25  CHE000099999926\n",
      "\n",
      "[171217 rows x 3 columns]\n",
      "\n",
      "--- Reading in hm_lactation.csv ---\n",
      "        parity calving_date calving_ease       idani_anon\n",
      "0            1   2018-09-06            2  CHE000000000561\n",
      "1            2   2019-09-15            2  CHE000000000561\n",
      "2            1   2016-09-07            2  CHE000000000781\n",
      "3            2   2017-08-05            1  CHE000000000781\n",
      "4            3   2018-10-18          2.5  CHE000000000781\n",
      "...        ...          ...          ...              ...\n",
      "637269       1   2013-02-26            2  CHE000099999926\n",
      "637270       2   2014-03-24            1  CHE000099999926\n",
      "637271       3   2015-03-05            2  CHE000099999926\n",
      "637272       4   2016-06-18            3  CHE000099999926\n",
      "637273       5   2017-11-14            2  CHE000099999926\n",
      "\n",
      "[637274 rows x 4 columns]\n",
      "\n",
      "--- Reading in hm_NSAIET.csv ---\n",
      "         parity nsaiet_date nsaiet_type  AI_technician       idani_anon  \\\n",
      "0             0  2017-11-05    Besamung             10  CHE000000000561   \n",
      "1             0  2017-11-28    Besamung             10  CHE000000000561   \n",
      "2             1  2018-11-16    Besamung             10  CHE000000000561   \n",
      "3             1  2018-12-05    Besamung             10  CHE000000000561   \n",
      "4             2  2019-12-09    Besamung             10  CHE000000000561   \n",
      "...         ...         ...         ...            ...              ...   \n",
      "1366133       1  2013-06-13    Besamung             10  CHE000099999926   \n",
      "1366134       2  2014-05-19    Besamung             10  CHE000099999926   \n",
      "1366135       3  2015-09-02    Belegung              5  CHE000099999926   \n",
      "1366136       4  2016-10-14    Besamung             10  CHE000099999926   \n",
      "1366137       4  2017-01-24    Besamung             10  CHE000099999926   \n",
      "\n",
      "        idani_anon_aisire  \n",
      "0         CHE000020282537  \n",
      "1         CHE000020282537  \n",
      "2         CHE000002123235  \n",
      "3         CHE000002123235  \n",
      "4         CHE000001110094  \n",
      "...                   ...  \n",
      "1366133   CHE000025255973  \n",
      "1366134   CHE000038915903  \n",
      "1366135   CHE000099239508  \n",
      "1366136   CHE000045797183  \n",
      "1366137   CHE000079801145  \n",
      "\n",
      "[1366138 rows x 6 columns]\n",
      "\n",
      "--- Reading in hm_animal.csv ---\n",
      "       birth_date brd_abbr_icar       idani_anon\n",
      "0      2016-03-08           HOL  CHE000000000559\n",
      "1      2016-02-27           HOL  CHE000000000561\n",
      "2      2011-05-09           HOL  CHE000000000620\n",
      "3      2014-06-23           HOL  CHE000000000781\n",
      "4      2015-11-25           HOL  CHE000000001494\n",
      "...           ...           ...              ...\n",
      "275766 2014-01-06           HOL  CHE000099998134\n",
      "275767 2016-11-28           HOL  CHE000099998152\n",
      "275768 2013-11-12           HOL  CHE000099998376\n",
      "275769 2015-09-07           HOL  CHE000099999361\n",
      "275770 2011-01-26           HOL  CHE000099999926\n",
      "\n",
      "[275771 rows x 3 columns]\n",
      "\n",
      "--- Reading in hm_milkrecording.csv ---\n",
      "Column: milking_time_morning; Removed missing values: 0.04%\n",
      "Column: milking_time_evening; Removed missing values: 0.01%\n",
      "Column: lab_date; Removed missing values: 0.00%\n",
      "Column: DIM; Removed missing values: 0.02%\n",
      "Column: lactose_24h; Removed missing values: 0.00%\n",
      "Column: scc_24h; Removed missing values: 0.00%\n",
      "Column: urea_24h; Removed missing values: 0.05%\n",
      "        mlksmpl_date  milking_time_morning  milking_time_evening   lab_date  \\\n",
      "0         2018-10-10               50000.0              163000.0 2018-10-11   \n",
      "1         2018-11-13               50000.0              163000.0 2018-11-15   \n",
      "2         2018-12-18               50000.0              163000.0 2018-12-19   \n",
      "3         2019-01-21               50000.0              163000.0 2019-01-23   \n",
      "4         2019-02-23               50000.0              163000.0 2019-02-26   \n",
      "...              ...                   ...                   ...        ...   \n",
      "5116724   2018-08-06               55000.0              172000.0 2018-08-08   \n",
      "5116725   2018-09-07               55000.0              171000.0 2018-09-10   \n",
      "5116726   2018-10-10               53000.0              171000.0 2018-10-11   \n",
      "5116727   2018-11-12               53000.0              165000.0 2018-11-14   \n",
      "5116728   2018-12-14               53500.0              165000.0 2018-12-17   \n",
      "\n",
      "           DIM pruefmethode melkmethode  milk_yield_24h  fat_24h  protein_24h  \\\n",
      "0         34.0          AT4      Normal            34.2     36.5         24.9   \n",
      "1         68.0          AT4      Normal            34.1     34.9         29.2   \n",
      "2        103.0          AT4      Normal            30.9     37.2         26.7   \n",
      "3        137.0          AT4      Normal            36.3     36.8         31.1   \n",
      "4        170.0          AT4      Normal            35.3     34.2         29.6   \n",
      "...        ...          ...         ...             ...      ...          ...   \n",
      "5116724  265.0          AT4      Normal            27.9     43.5         32.7   \n",
      "5116725  297.0          AT4      Normal            30.7     58.8         38.4   \n",
      "5116726  330.0          AT4      Normal            25.3     52.2         40.6   \n",
      "5116727  363.0          AT4      Normal            19.8     52.8         43.3   \n",
      "5116728  395.0          AT4      Normal            14.7     53.6         43.6   \n",
      "\n",
      "         lactose_24h  scc_24h  urea_24h  AR_PESEE_PESCODEALPAGE  \\\n",
      "0               49.8     21.0       2.4                     0.0   \n",
      "1               50.3     80.0       2.2                     0.0   \n",
      "2               48.3    132.0       1.6                     0.0   \n",
      "3               50.0    106.0       2.1                     0.0   \n",
      "4               48.6    133.0       1.5                     0.0   \n",
      "...              ...      ...       ...                     ...   \n",
      "5116724         44.1   1004.0       3.3                     0.0   \n",
      "5116725         44.6    517.0       3.2                     0.0   \n",
      "5116726         43.5    235.0       2.9                     0.0   \n",
      "5116727         43.6    166.0       3.2                     0.0   \n",
      "5116728         41.7    163.0       2.9                     0.0   \n",
      "\n",
      "              idani_anon       idhrd_anon  milk_yield_msrmt_type  \\\n",
      "0        CHE000000000561  CHE000000095710                      2   \n",
      "1        CHE000000000561  CHE000000095710                      3   \n",
      "2        CHE000000000561  CHE000000095710                      2   \n",
      "3        CHE000000000561  CHE000000095710                      3   \n",
      "4        CHE000000000561  CHE000000095710                      2   \n",
      "...                  ...              ...                    ...   \n",
      "5116724  CHE000099999926  CHE000000031281                      2   \n",
      "5116725  CHE000099999926  CHE000000031281                      3   \n",
      "5116726  CHE000099999926  CHE000000031281                      2   \n",
      "5116727  CHE000099999926  CHE000000031281                      3   \n",
      "5116728  CHE000099999926  CHE000000031281                      2   \n",
      "\n",
      "         fat_protein_24h_ratio  \n",
      "0                     1.465863  \n",
      "1                     1.195205  \n",
      "2                     1.393258  \n",
      "3                     1.183280  \n",
      "4                     1.155405  \n",
      "...                        ...  \n",
      "5116724               1.330275  \n",
      "5116725               1.531250  \n",
      "5116726               1.285714  \n",
      "5116727               1.219400  \n",
      "5116728               1.229358  \n",
      "\n",
      "[5116729 rows x 18 columns]\n",
      "\n",
      "--- Reading in hm_ebv.csv ---\n",
      "Column: ekg; Removed missing values: 63.15%\n",
      "Column: scs; Removed missing values: 0.00%\n",
      "        base label       idani_anon   ekg   epr   fkg   fpr     mkg    per  \\\n",
      "0       HO20     A  CHE000000000559 -23.0 -0.02 -31.0 -0.07  -636.0   93.0   \n",
      "1       HO20    CH  CHE000000000561  18.0 -0.23  26.0 -0.24  1173.0   93.0   \n",
      "2       HO20     A  CHE000000000620   5.0  0.11  -1.0  0.05  -147.0   99.0   \n",
      "3       HO20    CH  CHE000000000781   0.0 -0.07  -2.0 -0.11   179.0   95.0   \n",
      "4       HO20    CH  CHE000000001494   3.0  0.11  -4.0  0.04  -179.0   93.0   \n",
      "...      ...   ...              ...   ...   ...   ...   ...     ...    ...   \n",
      "271769  HO20    CH  CHE000099998134   6.0  0.11  18.0  0.26   -82.0   89.0   \n",
      "271770  HO20    CH  CHE000099998152   8.0 -0.03   3.0 -0.11   311.0   89.0   \n",
      "271771  HO20    CH  CHE000099998376   8.0  0.16  46.0  0.64  -167.0  104.0   \n",
      "271772  HO20    CH  CHE000099999361  10.0 -0.02  12.0 -0.03   365.0  103.0   \n",
      "271773  HO20    CH  CHE000099999926  -1.0 -0.01  11.0  0.13    -3.0   94.0   \n",
      "\n",
      "          scs  \n",
      "0       102.0  \n",
      "1        89.0  \n",
      "2        94.0  \n",
      "3       106.0  \n",
      "4       111.0  \n",
      "...       ...  \n",
      "271769   90.0  \n",
      "271770  110.0  \n",
      "271771   99.0  \n",
      "271772  111.0  \n",
      "271773   98.0  \n",
      "\n",
      "[271774 rows x 10 columns]\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Reading in hm_pregnancy.csv ---\n",
      "       pregnancy_detection_date   finding       idani_anon       idhrd_anon\n",
      "0                    2019-01-08  pregnant  CHE000000000561  CHE000000095710\n",
      "1                    2020-03-11      open  CHE000000000561  CHE000000095710\n",
      "2                    2018-01-09      open  CHE000000000781  CHE000000072353\n",
      "3                    2018-03-13  pregnant  CHE000000000781  CHE000000072353\n",
      "4                    2018-03-20  pregnant  CHE000000001494  CHE000000052966\n",
      "...                         ...       ...              ...              ...\n",
      "143945               2018-11-27  pregnant  CHE000099992995  CHE000000014814\n",
      "143946               2020-01-17      open  CHE000099993267  CHE000000035579\n",
      "143947               2019-11-21  pregnant  CHE000099998152  CHE000000064265\n",
      "143948               2020-01-27  pregnant  CHE000099998152  CHE000000064265\n",
      "143949               2018-05-01  pregnant  CHE000099999926  CHE000000031281\n",
      "\n",
      "[143950 rows x 4 columns]\n",
      "\n",
      "--- Reading in hm_health.csv ---\n",
      "Column: idhrd_anon; Removed missing values: 0.07%\n",
      "      hecode_ASR healthevent_date       idani_anon       idhrd_anon\n",
      "0        10.7.1.       2018-11-19  CHE000000005877  CHE000000079291\n",
      "1         2.1.1.       2018-12-20  CHE000000005877  CHE000000079291\n",
      "2          10.4.       2019-09-06  CHE000000005877  CHE000000079291\n",
      "3           3.5.       2020-03-14  CHE000000005877  CHE000000079291\n",
      "4           6.1.       2014-06-02  CHE000000006772  CHE000000055108\n",
      "...          ...              ...              ...              ...\n",
      "53730     2.1.2.       2019-06-15  CHE000099990201  CHE000000088759\n",
      "53731       2.5.       2018-01-30  CHE000099990433  CHE000000059284\n",
      "53732     2.1.1.       2020-03-18  CHE000099992995  CHE000000014814\n",
      "53733    9.3.99.       2013-11-25  CHE000099998376  CHE000000086408\n",
      "53734      10.2.       2013-12-19  CHE000099998376  CHE000000086408\n",
      "\n",
      "[53735 rows x 4 columns]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for _, fname in enumerate(fnames):\n",
    "    print('--- Reading in {:} ---'.format(fname))\n",
    "    df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])\n",
    "\n",
    "    # Convert date columns to datetime objects & sort dataframe\n",
    "    if fname == 'hm_BCS.csv':\n",
    "        df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False)\n",
    "        df = df.sort_values(by=['idani_anon', 'BCS_date'])\n",
    "    if fname == 'hm_lactation.csv':\n",
    "        df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False)\n",
    "        df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True)\n",
    "        df = df.sort_values(by=['idani_anon', 'calving_date'])\n",
    "    if fname == 'hm_NSAIET.csv':\n",
    "        df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False)\n",
    "        df = df.sort_values(by=['idani_anon', 'nsaiet_date'])\n",
    "    if fname == 'hm_animal.csv':\n",
    "        df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False)\n",
    "        df = df.sort_values(by=['idani_anon', 'birth_date'])\n",
    "    if fname == 'hm_milkrecording.csv':\n",
    "        df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)\n",
    "        df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False)\n",
    "        df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])\n",
    "#     if fname == 'hm_ebv.csv':  # Sort afterwards\n",
    "#         df = df.sort_values(by=['idani_anon'])\n",
    "    if fname == 'hm_pregnancy.csv':\n",
    "        df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False)\n",
    "        df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date'])\n",
    "    if fname == 'hm_health.csv':\n",
    "        df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False)\n",
    "        df = df.sort_values(by=['idani_anon', 'healthevent_date'])\n",
    "\n",
    "    # Modifications to individual datasets, as suggested by Thomas Denninger\n",
    "    if fname == 'hm_milkrecording.csv':\n",
    "        # Add new column depending on how the milk_yield was measured\n",
    "        # \"Only milk_yield_24h\": 1; \"milk_yield_morning_measured\": 2; \"milk_yield_evening_measured\": 3\n",
    "        new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)\n",
    "        new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2\n",
    "        new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3\n",
    "        df['milk_yield_msrmt_type'] = new_col.copy()\n",
    "\n",
    "        # Add new column of the fat_24h vs. protein_24h ratio\n",
    "        new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)\n",
    "        new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3\n",
    "        df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h']\n",
    "\n",
    "        # Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)\n",
    "        df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)\n",
    "\n",
    "    if fname == 'hm_lactation.csv':\n",
    "#         # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days\n",
    "#         values = df['dry_date_prev_lact'].values.copy()\n",
    "#         mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1)\n",
    "#         values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values\n",
    "#         df['dry_date_prev_lact'] = values.copy()\n",
    "\n",
    "        # Fill empty 'calving_ease' with constant value 2.5\n",
    "        df['calving_ease'] = df['calving_ease'].fillna(2.5)\n",
    "\n",
    "    if fname == 'hm_NSAIET.csv':\n",
    "        # Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with\n",
    "        # \"nsaiet_type=Belegung\": 5; \"nsaiet_type=Besamung\": 10; \"nsaiet_type=ET\": 15\n",
    "        new_col = np.zeros(df['AI_technician'].size, dtype=np.int)\n",
    "        new_col[df['nsaiet_type'].values == 'Belegung'] = 5\n",
    "        new_col[df['nsaiet_type'].values == 'Besamung'] = 10\n",
    "        new_col[df['nsaiet_type'].values == 'ET'] = 15\n",
    "        df['AI_technician'] = new_col.copy()\n",
    "\n",
    "    if fname == 'hm_ebv.csv':\n",
    "        # Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt)\n",
    "        values = df['label'].values.copy()\n",
    "        mask = df['label'].values == 'XXXX'\n",
    "        values[mask] = 'U'\n",
    "        df['label'] = values.copy()\n",
    "        \n",
    "        # Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table \n",
    "        column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']]\n",
    "        column_names.remove('trait')\n",
    "        column_names.remove('estimate')\n",
    "        df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records())\n",
    "        \n",
    "        # Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest\n",
    "        considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs']\n",
    "        \n",
    "        df = df[column_names+considered_traits]\n",
    "        \n",
    "        # Sort dataframe\n",
    "        df = df.sort_values(by=['idani_anon'])\n",
    "    \n",
    "    # Remove columns to be ignored and rows with emtpy values\n",
    "    if fname != 'hm_ebv.csv':\n",
    "        df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore'])\n",
    "    \n",
    "    # Remove rows with missing values\n",
    "    df = df.reset_index(drop=True)\n",
    "    df = remove_rows_with_missing_values(df)\n",
    "    \n",
    "    assert(0 == df.isnull().sum().sum())\n",
    "    print(df)\n",
    "    print()\n",
    "\n",
    "    # Save file to clean directory\n",
    "    save_csv(df=df, file_path=fpath_clean_data_dir+fname)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Make an extra file of the hm_milkrecording.csv blood, methane, and acetone values (for other challenge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column: milking_time_morning; Removed missing values: 0.04%\n",
      "Column: milking_time_evening; Removed missing values: 0.01%\n",
      "Column: DIM; Removed missing values: 0.02%\n",
      "Column: lactose_24h; Removed missing values: 0.00%\n",
      "Column: scc_24h; Removed missing values: 0.00%\n",
      "Column: urea_24h; Removed missing values: 0.05%\n",
      "Column: bloodBHB; Removed missing values: 88.89%\n",
      "Column: CH4; Removed missing values: 5.46%\n"
     ]
    }
   ],
   "source": [
    "fname = 'hm_milkrecording.csv'\n",
    "columns_to_ignore = ['idspectrum_num',\n",
    "                     'AR_PESEE_PESCODEMALADEANALYSE',\n",
    "                     'AR_PESEE_PESCODEMALADEPESEE',\n",
    "                     'milk_yield_morning_measured',\n",
    "                     'milk_yield_evening_measured'\n",
    "                    ]\n",
    "\n",
    "df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])\n",
    "\n",
    "# Convert date columns to datetime objects & sort dataframe\n",
    "df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)\n",
    "df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])\n",
    "\n",
    "# Add new column depending on how the milk_yield was measured\n",
    "# \"Only milk_yield_24h\": 1; \"milk_yield_morning_measured\": 2; \"milk_yield_evening_measured\": 3\n",
    "new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)\n",
    "new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2\n",
    "new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3\n",
    "df['milk_yield_msrmt_type'] = new_col.copy()\n",
    "\n",
    "# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)\n",
    "df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)\n",
    "\n",
    "# Remove columns to be ignored\n",
    "df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore)\n",
    "\n",
    "# Remove rows with emtpy values\n",
    "df = df.reset_index(drop=True)\n",
    "df = remove_rows_with_missing_values(df)\n",
    "\n",
    "assert(0 == df.isnull().sum().sum())\n",
    "\n",
    "# Save file to clean directory\n",
    "save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}