{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data cleaning and preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import datetime" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure data path variables" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Path where the input csv-files are stored\n", "fpath_src_data_dir = 'src_data/'\n", "\n", "# Path where the cleaned data is stored\n", "fpath_clean_data_dir = 'clean_data/'\n", "\n", "# Path where the data ready for the ML analysis is stored\n", "fpath_prepared_data_dir = 'ready_data/'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "src_data/hm_animal.csv\t src_data/hm_mastitis_bact_tfpcr.csv\r\n", "src_data/hm_BCS.csv\t src_data/hm_milkrecording.csv\r\n", "src_data/hm_ebv.csv\t src_data/hm_NSAIET.csv\r\n", "src_data/hm_health.csv\t src_data/hm_pregnancy.csv\r\n", "src_data/hm_lactation.csv\r\n" ] } ], "source": [ "!ls {fpath_src_data_dir}*.csv" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "!mkdir -p {fpath_clean_data_dir}\n", "!mkdir -p {fpath_prepared_data_dir}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Definitions to read in data and what columns to ignore\n", "According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless.\n", "\n", "**Remarks**:\n", "* We are not taking into consideration the dataset 'hm_mastitis_bact_tfpcr.csv' in this analysis.\n", "* The columns 'bloodBHB', 'bloodNEFA', 'milkAcetone', 'CH4' of the dataset 'Hm_milkrecording.csv' are removed in this analysis. However, they are stored to a separate file in a different Jupyter notebook." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "data_files = {'hm_BCS.csv': {'delimiter': ';',\n", " 'columns_to_ignore': []\n", " },\n", " \n", " 'hm_lactation.csv': {'delimiter': ';',\n", " 'columns_to_ignore': ['dry_date_prev_lact']\n", " },\n", "\n", " 'hm_NSAIET.csv': {'delimiter': ';',\n", " 'columns_to_ignore': ['ejaculate',\n", " 'INSEM_SEXINGBEHANDLUNG',\n", " 'AI_organization',\n", " 'INSEM_SEXINGGESCHLECHT',\n", " 'idhrd_anon'\n", " ]\n", " },\n", "\n", "# 'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';',\n", "# 'columns_to_ignore': ['Ct_value']\n", "# },\n", "\n", " 'hm_animal.csv': {'delimiter': ';',\n", " 'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam']\n", " },\n", "\n", " 'hm_milkrecording.csv': {'delimiter': ';',\n", " 'columns_to_ignore': ['idspectrum_num',\n", " 'AR_PESEE_PESCODEMALADEANALYSE',\n", " 'AR_PESEE_PESCODEMALADEPESEE',\n", " 'bloodBHB',\n", " 'bloodNEFA',\n", " 'milkAcetone',\n", " 'CH4',\n", " 'milk_yield_morning_measured',\n", " 'milk_yield_evening_measured'\n", " ]\n", " },\n", "\n", " 'hm_ebv.csv': {'delimiter': ' ',\n", " 'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW']\n", " },\n", "\n", " 'hm_pregnancy.csv': {'delimiter': ';',\n", " 'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value']\n", " },\n", "\n", " 'hm_health.csv': {'delimiter': ';',\n", " 'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg']\n", " }\n", " }\n", "\n", "fnames = list(data_files.keys())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Useful functions to clean data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def read_in_csv(file_path, sep):\n", " \"\"\"\n", " Reads in the specified csv-file.\n", " \n", " :param file_path: Path to the csv-file\n", " :param sep: Separator used in the csv-file\n", " :return: Pandas dataframe\n", " \"\"\"\n", " return pd.read_csv(file_path, sep=sep)\n", "\n", "\n", "def save_csv(df, file_path, index=False):\n", " \"\"\"\n", " Save the specified csv-file.\n", " \n", " :param df: Pandas dataframe\n", " :param file_path: Path to the output csv-file\n", " :param index: Boolean value whether or not the first column (bool) is written to file as well (default: False)\n", " :return: None\n", " \"\"\"\n", " return df.to_csv(file_path, index=index)\n", "\n", "\n", "def remove_rows_with_missing_values(df):\n", " \"\"\"\n", " Compute the number of missing values per column and remove all rows with missing values.\n", " \n", " :param df: Pandas dataframe\n", " :return: Pandas dataframe\n", " \"\"\"\n", " \n", " for _, val in enumerate(df.columns):\n", " mask = df[val].isnull().values\n", " if np.sum(mask) > 0:\n", " print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100))\n", " df = df.drop(np.arange(mask.size)[mask], axis=0)\n", " df = df.reset_index(drop=True)\n", " return df\n", "\n", "\n", "def remove_columns_to_be_ignored(df, columns_to_ignore):\n", " \"\"\"\n", " Remove all columns to be ignored.\n", " \n", " :param df: Pandas dataframe\n", " :param columns_to_ignore: List of the column names of the columns to ignore\n", " :return: Pandas dataframe\n", " \"\"\"\n", " return df.drop(labels=columns_to_ignore, axis=1)\n", "\n", "\n", "def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False):\n", " \"\"\"\n", " Convert a date-like column to a pandas datetime-column.\n", " \n", " :param df: Pandas dataframe\n", " :param column: Column name of the datetime column\n", " :param format: Format of the date-entry (default: YYYYMMDD)\n", " :param convert_to_int: Bool value whether the column needs to be converted to an\n", " integer column first (default: False)\n", " :return: Pandas dataframe\n", " \"\"\"\n", " if convert_to_int:\n", " df[column] = df[column].values.astype(np.int64)\n", " df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values\n", " return df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read in src-data & clean it (with modifications as suggested by Thomas Denninger)\n", "\n", "Mögliche Verbesserungen an den Daten:\n", "Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde\n", "Information der Art der Milchleistungsprüfmethode hineinnehmen\n", "milking_time_morning und milking_time_evening ignorieren\n", "Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben\n", "Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist\n", "Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat\n", "milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren\n", "Werte mit nsaiet_type == Belegung oder == ET entfernen\n", "Herausfinden, was bei calving_ease schief lief" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--- Reading in hm_BCS.csv ---\n", " BCS_date BCS idani_anon\n", "0 2018-10-23 3.00 CHE000000000561\n", "1 2017-02-16 2.50 CHE000000000781\n", "2 2013-12-19 3.00 CHE000000002000\n", "3 2015-02-05 3.00 CHE000000002000\n", "4 2010-11-05 3.25 CHE000000002251\n", "... ... ... ...\n", "171212 2016-10-18 2.75 CHE000099998376\n", "171213 2018-03-09 2.75 CHE000099999361\n", "171214 2019-05-28 2.75 CHE000099999361\n", "171215 2020-03-05 2.50 CHE000099999361\n", "171216 2013-06-04 3.25 CHE000099999926\n", "\n", "[171217 rows x 3 columns]\n", "\n", "--- Reading in hm_lactation.csv ---\n", " parity calving_date calving_ease idani_anon\n", "0 1 2018-09-06 2 CHE000000000561\n", "1 2 2019-09-15 2 CHE000000000561\n", "2 1 2016-09-07 2 CHE000000000781\n", "3 2 2017-08-05 1 CHE000000000781\n", "4 3 2018-10-18 2.5 CHE000000000781\n", "... ... ... ... ...\n", "637269 1 2013-02-26 2 CHE000099999926\n", "637270 2 2014-03-24 1 CHE000099999926\n", "637271 3 2015-03-05 2 CHE000099999926\n", "637272 4 2016-06-18 3 CHE000099999926\n", "637273 5 2017-11-14 2 CHE000099999926\n", "\n", "[637274 rows x 4 columns]\n", "\n", "--- Reading in hm_NSAIET.csv ---\n", " parity nsaiet_date nsaiet_type AI_technician idani_anon \\\n", "0 0 2017-11-05 Besamung 10 CHE000000000561 \n", "1 0 2017-11-28 Besamung 10 CHE000000000561 \n", "2 1 2018-11-16 Besamung 10 CHE000000000561 \n", "3 1 2018-12-05 Besamung 10 CHE000000000561 \n", "4 2 2019-12-09 Besamung 10 CHE000000000561 \n", "... ... ... ... ... ... \n", "1366133 1 2013-06-13 Besamung 10 CHE000099999926 \n", "1366134 2 2014-05-19 Besamung 10 CHE000099999926 \n", "1366135 3 2015-09-02 Belegung 5 CHE000099999926 \n", "1366136 4 2016-10-14 Besamung 10 CHE000099999926 \n", "1366137 4 2017-01-24 Besamung 10 CHE000099999926 \n", "\n", " idani_anon_aisire \n", "0 CHE000020282537 \n", "1 CHE000020282537 \n", "2 CHE000002123235 \n", "3 CHE000002123235 \n", "4 CHE000001110094 \n", "... ... \n", "1366133 CHE000025255973 \n", "1366134 CHE000038915903 \n", "1366135 CHE000099239508 \n", "1366136 CHE000045797183 \n", "1366137 CHE000079801145 \n", "\n", "[1366138 rows x 6 columns]\n", "\n", "--- Reading in hm_animal.csv ---\n", " birth_date brd_abbr_icar idani_anon\n", "0 2016-03-08 HOL CHE000000000559\n", "1 2016-02-27 HOL CHE000000000561\n", "2 2011-05-09 HOL CHE000000000620\n", "3 2014-06-23 HOL CHE000000000781\n", "4 2015-11-25 HOL CHE000000001494\n", "... ... ... ...\n", "275766 2014-01-06 HOL CHE000099998134\n", "275767 2016-11-28 HOL CHE000099998152\n", "275768 2013-11-12 HOL CHE000099998376\n", "275769 2015-09-07 HOL CHE000099999361\n", "275770 2011-01-26 HOL CHE000099999926\n", "\n", "[275771 rows x 3 columns]\n", "\n", "--- Reading in hm_milkrecording.csv ---\n", "Column: milking_time_morning; Removed missing values: 0.04%\n", "Column: milking_time_evening; Removed missing values: 0.01%\n", "Column: lab_date; Removed missing values: 0.00%\n", "Column: DIM; Removed missing values: 0.02%\n", "Column: lactose_24h; Removed missing values: 0.00%\n", "Column: scc_24h; Removed missing values: 0.00%\n", "Column: urea_24h; Removed missing values: 0.05%\n", " mlksmpl_date milking_time_morning milking_time_evening lab_date \\\n", "0 2018-10-10 50000.0 163000.0 2018-10-11 \n", "1 2018-11-13 50000.0 163000.0 2018-11-15 \n", "2 2018-12-18 50000.0 163000.0 2018-12-19 \n", "3 2019-01-21 50000.0 163000.0 2019-01-23 \n", "4 2019-02-23 50000.0 163000.0 2019-02-26 \n", "... ... ... ... ... \n", "5116724 2018-08-06 55000.0 172000.0 2018-08-08 \n", "5116725 2018-09-07 55000.0 171000.0 2018-09-10 \n", "5116726 2018-10-10 53000.0 171000.0 2018-10-11 \n", "5116727 2018-11-12 53000.0 165000.0 2018-11-14 \n", "5116728 2018-12-14 53500.0 165000.0 2018-12-17 \n", "\n", " DIM pruefmethode melkmethode milk_yield_24h fat_24h protein_24h \\\n", "0 34.0 AT4 Normal 34.2 36.5 24.9 \n", "1 68.0 AT4 Normal 34.1 34.9 29.2 \n", "2 103.0 AT4 Normal 30.9 37.2 26.7 \n", "3 137.0 AT4 Normal 36.3 36.8 31.1 \n", "4 170.0 AT4 Normal 35.3 34.2 29.6 \n", "... ... ... ... ... ... ... \n", "5116724 265.0 AT4 Normal 27.9 43.5 32.7 \n", "5116725 297.0 AT4 Normal 30.7 58.8 38.4 \n", "5116726 330.0 AT4 Normal 25.3 52.2 40.6 \n", "5116727 363.0 AT4 Normal 19.8 52.8 43.3 \n", "5116728 395.0 AT4 Normal 14.7 53.6 43.6 \n", "\n", " lactose_24h scc_24h urea_24h AR_PESEE_PESCODEALPAGE \\\n", "0 49.8 21.0 2.4 0.0 \n", "1 50.3 80.0 2.2 0.0 \n", "2 48.3 132.0 1.6 0.0 \n", "3 50.0 106.0 2.1 0.0 \n", "4 48.6 133.0 1.5 0.0 \n", "... ... ... ... ... \n", "5116724 44.1 1004.0 3.3 0.0 \n", "5116725 44.6 517.0 3.2 0.0 \n", "5116726 43.5 235.0 2.9 0.0 \n", "5116727 43.6 166.0 3.2 0.0 \n", "5116728 41.7 163.0 2.9 0.0 \n", "\n", " idani_anon idhrd_anon milk_yield_msrmt_type \\\n", "0 CHE000000000561 CHE000000095710 2 \n", "1 CHE000000000561 CHE000000095710 3 \n", "2 CHE000000000561 CHE000000095710 2 \n", "3 CHE000000000561 CHE000000095710 3 \n", "4 CHE000000000561 CHE000000095710 2 \n", "... ... ... ... \n", "5116724 CHE000099999926 CHE000000031281 2 \n", "5116725 CHE000099999926 CHE000000031281 3 \n", "5116726 CHE000099999926 CHE000000031281 2 \n", "5116727 CHE000099999926 CHE000000031281 3 \n", "5116728 CHE000099999926 CHE000000031281 2 \n", "\n", " fat_protein_24h_ratio \n", "0 1.465863 \n", "1 1.195205 \n", "2 1.393258 \n", "3 1.183280 \n", "4 1.155405 \n", "... ... \n", "5116724 1.330275 \n", "5116725 1.531250 \n", "5116726 1.285714 \n", "5116727 1.219400 \n", "5116728 1.229358 \n", "\n", "[5116729 rows x 18 columns]\n", "\n", "--- Reading in hm_ebv.csv ---\n", "Column: ekg; Removed missing values: 63.15%\n", "Column: scs; Removed missing values: 0.00%\n", " base label idani_anon ekg epr fkg fpr mkg per \\\n", "0 HO20 A CHE000000000559 -23.0 -0.02 -31.0 -0.07 -636.0 93.0 \n", "1 HO20 CH CHE000000000561 18.0 -0.23 26.0 -0.24 1173.0 93.0 \n", "2 HO20 A CHE000000000620 5.0 0.11 -1.0 0.05 -147.0 99.0 \n", "3 HO20 CH CHE000000000781 0.0 -0.07 -2.0 -0.11 179.0 95.0 \n", "4 HO20 CH CHE000000001494 3.0 0.11 -4.0 0.04 -179.0 93.0 \n", "... ... ... ... ... ... ... ... ... ... \n", "271769 HO20 CH CHE000099998134 6.0 0.11 18.0 0.26 -82.0 89.0 \n", "271770 HO20 CH CHE000099998152 8.0 -0.03 3.0 -0.11 311.0 89.0 \n", "271771 HO20 CH CHE000099998376 8.0 0.16 46.0 0.64 -167.0 104.0 \n", "271772 HO20 CH CHE000099999361 10.0 -0.02 12.0 -0.03 365.0 103.0 \n", "271773 HO20 CH CHE000099999926 -1.0 -0.01 11.0 0.13 -3.0 94.0 \n", "\n", " scs \n", "0 102.0 \n", "1 89.0 \n", "2 94.0 \n", "3 106.0 \n", "4 111.0 \n", "... ... \n", "271769 90.0 \n", "271770 110.0 \n", "271771 99.0 \n", "271772 111.0 \n", "271773 98.0 \n", "\n", "[271774 rows x 10 columns]\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "--- Reading in hm_pregnancy.csv ---\n", " pregnancy_detection_date finding idani_anon idhrd_anon\n", "0 2019-01-08 pregnant CHE000000000561 CHE000000095710\n", "1 2020-03-11 open CHE000000000561 CHE000000095710\n", "2 2018-01-09 open CHE000000000781 CHE000000072353\n", "3 2018-03-13 pregnant CHE000000000781 CHE000000072353\n", "4 2018-03-20 pregnant CHE000000001494 CHE000000052966\n", "... ... ... ... ...\n", "143945 2018-11-27 pregnant CHE000099992995 CHE000000014814\n", "143946 2020-01-17 open CHE000099993267 CHE000000035579\n", "143947 2019-11-21 pregnant CHE000099998152 CHE000000064265\n", "143948 2020-01-27 pregnant CHE000099998152 CHE000000064265\n", "143949 2018-05-01 pregnant CHE000099999926 CHE000000031281\n", "\n", "[143950 rows x 4 columns]\n", "\n", "--- Reading in hm_health.csv ---\n", "Column: idhrd_anon; Removed missing values: 0.07%\n", " hecode_ASR healthevent_date idani_anon idhrd_anon\n", "0 10.7.1. 2018-11-19 CHE000000005877 CHE000000079291\n", "1 2.1.1. 2018-12-20 CHE000000005877 CHE000000079291\n", "2 10.4. 2019-09-06 CHE000000005877 CHE000000079291\n", "3 3.5. 2020-03-14 CHE000000005877 CHE000000079291\n", "4 6.1. 2014-06-02 CHE000000006772 CHE000000055108\n", "... ... ... ... ...\n", "53730 2.1.2. 2019-06-15 CHE000099990201 CHE000000088759\n", "53731 2.5. 2018-01-30 CHE000099990433 CHE000000059284\n", "53732 2.1.1. 2020-03-18 CHE000099992995 CHE000000014814\n", "53733 9.3.99. 2013-11-25 CHE000099998376 CHE000000086408\n", "53734 10.2. 2013-12-19 CHE000099998376 CHE000000086408\n", "\n", "[53735 rows x 4 columns]\n", "\n" ] } ], "source": [ "for _, fname in enumerate(fnames):\n", " print('--- Reading in {:} ---'.format(fname))\n", " df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])\n", "\n", " # Convert date columns to datetime objects & sort dataframe\n", " if fname == 'hm_BCS.csv':\n", " df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False)\n", " df = df.sort_values(by=['idani_anon', 'BCS_date'])\n", " if fname == 'hm_lactation.csv':\n", " df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False)\n", " df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True)\n", " df = df.sort_values(by=['idani_anon', 'calving_date'])\n", " if fname == 'hm_NSAIET.csv':\n", " df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False)\n", " df = df.sort_values(by=['idani_anon', 'nsaiet_date'])\n", " if fname == 'hm_animal.csv':\n", " df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False)\n", " df = df.sort_values(by=['idani_anon', 'birth_date'])\n", " if fname == 'hm_milkrecording.csv':\n", " df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)\n", " df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False)\n", " df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])\n", "# if fname == 'hm_ebv.csv': # Sort afterwards\n", "# df = df.sort_values(by=['idani_anon'])\n", " if fname == 'hm_pregnancy.csv':\n", " df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False)\n", " df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date'])\n", " if fname == 'hm_health.csv':\n", " df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False)\n", " df = df.sort_values(by=['idani_anon', 'healthevent_date'])\n", "\n", " # Modifications to individual datasets, as suggested by Thomas Denninger\n", " if fname == 'hm_milkrecording.csv':\n", " # Add new column depending on how the milk_yield was measured\n", " # \"Only milk_yield_24h\": 1; \"milk_yield_morning_measured\": 2; \"milk_yield_evening_measured\": 3\n", " new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)\n", " new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2\n", " new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3\n", " df['milk_yield_msrmt_type'] = new_col.copy()\n", "\n", " # Add new column of the fat_24h vs. protein_24h ratio\n", " new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)\n", " new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3\n", " df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h']\n", "\n", " # Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)\n", " df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)\n", "\n", " if fname == 'hm_lactation.csv':\n", "# # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days\n", "# values = df['dry_date_prev_lact'].values.copy()\n", "# mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1)\n", "# values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values\n", "# df['dry_date_prev_lact'] = values.copy()\n", "\n", " # Fill empty 'calving_ease' with constant value 2.5\n", " df['calving_ease'] = df['calving_ease'].fillna(2.5)\n", "\n", " if fname == 'hm_NSAIET.csv':\n", " # Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with\n", " # \"nsaiet_type=Belegung\": 5; \"nsaiet_type=Besamung\": 10; \"nsaiet_type=ET\": 15\n", " new_col = np.zeros(df['AI_technician'].size, dtype=np.int)\n", " new_col[df['nsaiet_type'].values == 'Belegung'] = 5\n", " new_col[df['nsaiet_type'].values == 'Besamung'] = 10\n", " new_col[df['nsaiet_type'].values == 'ET'] = 15\n", " df['AI_technician'] = new_col.copy()\n", "\n", " if fname == 'hm_ebv.csv':\n", " # Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt)\n", " values = df['label'].values.copy()\n", " mask = df['label'].values == 'XXXX'\n", " values[mask] = 'U'\n", " df['label'] = values.copy()\n", " \n", " # Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table \n", " column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']]\n", " column_names.remove('trait')\n", " column_names.remove('estimate')\n", " df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records())\n", " \n", " # Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest\n", " considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs']\n", " \n", " df = df[column_names+considered_traits]\n", " \n", " # Sort dataframe\n", " df = df.sort_values(by=['idani_anon'])\n", " \n", " # Remove columns to be ignored and rows with emtpy values\n", " if fname != 'hm_ebv.csv':\n", " df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore'])\n", " \n", " # Remove rows with missing values\n", " df = df.reset_index(drop=True)\n", " df = remove_rows_with_missing_values(df)\n", " \n", " assert(0 == df.isnull().sum().sum())\n", " print(df)\n", " print()\n", "\n", " # Save file to clean directory\n", " save_csv(df=df, file_path=fpath_clean_data_dir+fname)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Make an extra file of the hm_milkrecording.csv blood, methane, and acetone values (for other challenge)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Column: milking_time_morning; Removed missing values: 0.04%\n", "Column: milking_time_evening; Removed missing values: 0.01%\n", "Column: DIM; Removed missing values: 0.02%\n", "Column: lactose_24h; Removed missing values: 0.00%\n", "Column: scc_24h; Removed missing values: 0.00%\n", "Column: urea_24h; Removed missing values: 0.05%\n", "Column: bloodBHB; Removed missing values: 88.89%\n", "Column: CH4; Removed missing values: 5.46%\n" ] } ], "source": [ "fname = 'hm_milkrecording.csv'\n", "columns_to_ignore = ['idspectrum_num',\n", " 'AR_PESEE_PESCODEMALADEANALYSE',\n", " 'AR_PESEE_PESCODEMALADEPESEE',\n", " 'milk_yield_morning_measured',\n", " 'milk_yield_evening_measured'\n", " ]\n", "\n", "df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])\n", "\n", "# Convert date columns to datetime objects & sort dataframe\n", "df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)\n", "df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])\n", "\n", "# Add new column depending on how the milk_yield was measured\n", "# \"Only milk_yield_24h\": 1; \"milk_yield_morning_measured\": 2; \"milk_yield_evening_measured\": 3\n", "new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)\n", "new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2\n", "new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3\n", "df['milk_yield_msrmt_type'] = new_col.copy()\n", "\n", "# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)\n", "df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)\n", "\n", "# Remove columns to be ignored\n", "df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore)\n", "\n", "# Remove rows with emtpy values\n", "df = df.reset_index(drop=True)\n", "df = remove_rows_with_missing_values(df)\n", "\n", "assert(0 == df.isnull().sum().sum())\n", "\n", "# Save file to clean directory\n", "save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }