{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "955d03bd", "metadata": {}, "outputs": [], "source": [ "%%bash\n", "pip install numpy\n", "pip install pandas\n", "pip install seaborn\n", "pip install mlxtend\n", "pip install arch" ] }, { "cell_type": "code", "execution_count": 1, "id": "08a2f02f", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import datetime\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "1c6d55d7", "metadata": {}, "outputs": [], "source": [ "def print_full(x):\n", " pd.set_option('display.max_rows', len(x))\n", " print(x)\n", " pd.reset_option('display.max_rows')\n", "def add_year(df):\n", " df['year'] = df['Date'].apply(lambda x: int(x[2:4]))\n", " return df" ] }, { "cell_type": "code", "execution_count": 20, "id": "d197c2c6", "metadata": {}, "outputs": [], "source": [ "#Load rain and hotel occupancy data\n", "r16 = pd.read_csv(\"data/rain2016.csv\")\n", "r17 = pd.read_csv(\"data/rain2017.csv\")\n", "r20 = pd.read_csv(\"data/rain2020.csv\")\n", "R=pd.concat([r16,r17,r20])[['Date', 'Rain']]\n", "R=add_year(R)\n", "R.to_csv('data/rain_all.csv', index=False)\n", "H = pd.read_csv(\"data/hotel-nights.csv\")\n", "H['per_day'] = H['Rooms']/H['Days']\n", "R['rooms']=0\n", "for i, row in R.iterrows(): \n", " R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]" ] }, { "cell_type": "code", "execution_count": 21, "id": "a0880450", "metadata": { "tags": [] }, "outputs": [], "source": [ "def expand_columns(ds, n=3):\n", " m=len(ds)\n", " Z=[]\n", " for i in range(n):\n", " ds['Plate_number']=i+1\n", " Z.append(ds.copy())\n", " plate_prefix = [f\"P{i+1}_\" for i in range(n)]\n", " plates=[s for s in ds.columns.tolist() if any(xs in s for xs in plate_prefix)]\n", " for p in plates:\n", " colname=p[3:]\n", " Z[int(p[1])-1][colname] =ds[p]\n", " e=pd.concat(Z)\n", " return e\n", " \n", "def map_columns(ds, col_dict):\n", " col_dict={**common_cols, **col_dict}\n", " x=ds.rename(columns=col_dict)\n", " return x[list(col_dict.values())]\n", "\n", "def calcRain(w):\n", " i=R.loc[R['Date']==w].index.values[0]\n", " r=R.iloc[i-2:i].agg('sum')['Rain']\n", " h=R.iloc[i-6:i].agg('sum')['rooms']\n", " return (h,r)\n", "\n", "def annotate_weeks(ds, weeks):\n", " ds['week']=0\n", " ds['festival']=''\n", " ds['rain']=0\n", " ds['rooms']=0\n", " for w, i in zip(weeks, range(len(weeks))):\n", " I=ds['Date']==w\n", " ds.loc[I, 'week']= i+1\n", " ds.loc[I, 'festival']= weeks[w]\n", " h, r = calcRain(w)\n", " ds.loc[I, 'rain'] = r\n", " ds.loc[I, 'rooms']= h\n", " return ds\n", "\n", "def prepare_year(ds, col_dict, all_indicators, weeks, locations, medium):\n", " ds=ds.query(' or '.join([ f\"(Location=='{u}')\" for u in locations])) \\\n", " .query(f\"medium==\\'{medium}\\'\")\n", " x=map_columns(ds, col_dict)\n", " x['total']=ds[all_indicators].agg(\"sum\", axis=\"columns\")\n", " z= annotate_weeks(x, weeks)\n", " z=z.sort_values(['Date', 'Location', 'Plate_number'],ignore_index=True) \n", " return z\n", "\n", "def normalize_cols(ds, cols, factor):\n", " for col in cols:\n", " ds[col] = 100 * ds[col] / ds[factor]\n", " return ds\n", " \n", "sites=['VNX','MRD','SVT']\n", "common_cols={'Date' : 'Date',\n", " 'Location' : 'Location' ,\n", " 'Plate_number' : 'Plate_number',\n", " 'medium' : 'medium'}" ] }, { "cell_type": "code", "execution_count": 22, "id": "d35644f1", "metadata": { "tags": [] }, "outputs": [], "source": [ "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n", "ds16['medium']='easy_gel'\n", "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n", "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n", "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n", "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n", "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n", "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])" ] }, { "cell_type": "code", "execution_count": 23, "id": "0552bc9a", "metadata": {}, "outputs": [], "source": [ "# 2016\n", "weeks16= {'2016-06-21' : 'before',\n", " '2016-06-28' : 'before',\n", " '2016-07-05' : 'during',\n", " '2016-07-12' : 'during',\n", " '2016-07-19' : 'after',\n", " '2016-07-26' : 'after',\n", " '2016-08-02' : 'after',\n", " '2016-08-09' : 'after'} \n", "Y16 = prepare_year(expand_columns(ds16,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n", " weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')" ] }, { "cell_type": "code", "execution_count": 24, "id": "1ee648cd", "metadata": {}, "outputs": [], "source": [ "# 2017\n", "weeks17 = {'2017-06-12' : 'before',\n", " '2017-06-19' : 'before',\n", " '2017-06-26' : 'before',\n", " '2017-07-03' : 'during',\n", " '2017-07-10' : 'during',\n", " '2017-07-17' : 'after',\n", " '2017-07-24' : 'after',\n", " '2017-07-31' : 'after',}\n", "\n", "Y17 = prepare_year(expand_columns(ds17,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " 'fluo_halo_colonies' : 'Bioindicator UV+',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n", " weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')" ] }, { "cell_type": "code", "execution_count": 25, "id": "d535af61", "metadata": { "tags": [] }, "outputs": [], "source": [ "# 2020\n", "weeks20 = {'2020-06-11' : 'before',\n", " '2020-06-18' : 'before',\n", " '2020-06-25' : 'before',\n", " '2020-07-01' : 'before',\n", " '2020-07-09' : 'before',\n", " '2020-07-16' : 'before',\n", " '2020-07-22' : 'before',\n", " '2020-08-06' : 'before'}\n", "\n", "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n", " 'purple' : 'Coliform',\n", " 'volume' : 'volume'\n", " }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n", " weeks20 , sites, 'levine')\n", "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "dff09705", "metadata": {}, "outputs": [], "source": [ "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n", "D=add_year(D)\n", "D.to_pickle(\"data/allyears.pkl\")" ] }, { "cell_type": "code", "execution_count": 27, "id": "9d44c41d", "metadata": {}, "outputs": [], "source": [ "#Take average over plates\n", "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n", "A.to_pickle(\"data/allyears-avg_plate.pkl\")\n", "vars_cols=['Bioindicator', 'Coliform', 'total']\n", "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n", "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]" ] }, { "cell_type": "markdown", "id": "ce930101", "metadata": {}, "source": [ "# Cumulative 48 h data" ] }, { "cell_type": "code", "execution_count": 28, "id": "9b4386aa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Date</th>\n", " <th>Rain</th>\n", " <th>year</th>\n", " <th>rain_same_day</th>\n", " <th>rain_48h</th>\n", " <th>rain_48</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2016-06-13</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2016-06-14</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2016-06-15</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2016-06-16</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2016-06-17</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>168</th>\n", " <td>2020-08-02</td>\n", " <td>16.2</td>\n", " <td>20</td>\n", " <td>14.8</td>\n", " <td>1.4</td>\n", " <td>0.5</td>\n", " </tr>\n", " <tr>\n", " <th>169</th>\n", " <td>2020-08-03</td>\n", " <td>29.1</td>\n", " <td>20</td>\n", " <td>14.3</td>\n", " <td>14.8</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>170</th>\n", " <td>2020-08-04</td>\n", " <td>18.6</td>\n", " <td>20</td>\n", " <td>4.3</td>\n", " <td>14.3</td>\n", " <td>14.8</td>\n", " </tr>\n", " <tr>\n", " <th>171</th>\n", " <td>2020-08-05</td>\n", " <td>4.3</td>\n", " <td>20</td>\n", " <td>0.0</td>\n", " <td>4.3</td>\n", " <td>14.3</td>\n", " </tr>\n", " <tr>\n", " <th>172</th>\n", " <td>2020-08-06</td>\n", " <td>0.0</td>\n", " <td>20</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>4.3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>173 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " Date Rain year rain_same_day rain_48h rain_48\n", "0 2016-06-13 0.0 16 0.0 0.0 0.0\n", "1 2016-06-14 0.0 16 0.0 0.0 0.0\n", "2 2016-06-15 0.0 16 0.0 0.0 0.0\n", "3 2016-06-16 0.0 16 0.0 0.0 0.0\n", "4 2016-06-17 0.0 16 0.0 0.0 0.0\n", ".. ... ... ... ... ... ...\n", "168 2020-08-02 16.2 20 14.8 1.4 0.5\n", "169 2020-08-03 29.1 20 14.3 14.8 1.4\n", "170 2020-08-04 18.6 20 4.3 14.3 14.8\n", "171 2020-08-05 4.3 20 0.0 4.3 14.3\n", "172 2020-08-06 0.0 20 0.0 0.0 4.3\n", "\n", "[173 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "R = pd.read_csv(\"data/rain_all_cumulative_48h.csv\")\n", "display(R)\n", "H = pd.read_csv(\"data/hotel-nights.csv\")\n", "H['per_day'] = H['Rooms']/H['Days']\n", "R['rooms']=0\n", "for i, row in R.iterrows(): \n", " R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]\n", " \n", "sites=['VNX','MRD','SVT']\n", "common_cols={'Date' : 'Date',\n", " 'Location' : 'Location' ,\n", " 'Plate_number' : 'Plate_number',\n", " 'medium' : 'medium'}\n", "\n", "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n", "ds16['medium']='easy_gel'\n", "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n", "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n", "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n", "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n", "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n", "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])\n", "\n", "# 2016\n", "weeks16= {'2016-06-21' : 'before',\n", " '2016-06-28' : 'before',\n", " '2016-07-05' : 'during',\n", " '2016-07-12' : 'during',\n", " '2016-07-19' : 'after',\n", " '2016-07-26' : 'after',\n", " '2016-08-02' : 'after',\n", " '2016-08-09' : 'after'} \n", "Y16 = prepare_year(expand_columns(ds16,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n", " weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')\n", "\n", "# 2017\n", "weeks17 = {'2017-06-12' : 'before',\n", " '2017-06-19' : 'before',\n", " '2017-06-26' : 'before',\n", " '2017-07-03' : 'during',\n", " '2017-07-10' : 'during',\n", " '2017-07-17' : 'after',\n", " '2017-07-24' : 'after',\n", " '2017-07-31' : 'after',}\n", "\n", "Y17 = prepare_year(expand_columns(ds17,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " 'fluo_halo_colonies' : 'Bioindicator UV+',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n", " weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')\n", "\n", "# 2020\n", "weeks20 = {'2020-06-11' : 'before',\n", " '2020-06-18' : 'before',\n", " '2020-06-25' : 'before',\n", " '2020-07-01' : 'before',\n", " '2020-07-09' : 'before',\n", " '2020-07-16' : 'before',\n", " '2020-07-22' : 'before',\n", " '2020-08-06' : 'before'}\n", "\n", "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n", " 'purple' : 'Coliform',\n", " 'volume' : 'volume'\n", " }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n", " weeks20 , sites, 'levine')\n", "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n", "\n", "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n", "D=add_year(D)\n", "D.to_pickle(\"data/allyears_48h_cumulative.pkl\")\n", "\n", "#Take average over plates\n", "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n", "A.to_pickle(\"data/allyears-avg_plate_48h_cumulative.pkl\")\n", "vars_cols=['Bioindicator', 'Coliform', 'total']\n", "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n", "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]" ] }, { "cell_type": "markdown", "id": "2cc8aed2", "metadata": {}, "source": [ "### Cumulative 72h" ] }, { "cell_type": "code", "execution_count": 30, "id": "901fd9ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Date</th>\n", " <th>Rain</th>\n", " <th>year</th>\n", " <th>rain_same_day</th>\n", " <th>rain_48h</th>\n", " <th>rain_48</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2016-06-13</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2016-06-14</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2016-06-15</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2016-06-16</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2016-06-17</td>\n", " <td>0.0</td>\n", " <td>16</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>168</th>\n", " <td>2020-08-02</td>\n", " <td>16.7</td>\n", " <td>20</td>\n", " <td>14.8</td>\n", " <td>1.4</td>\n", " <td>0.5</td>\n", " </tr>\n", " <tr>\n", " <th>169</th>\n", " <td>2020-08-03</td>\n", " <td>30.5</td>\n", " <td>20</td>\n", " <td>14.3</td>\n", " <td>14.8</td>\n", " <td>1.4</td>\n", " </tr>\n", " <tr>\n", " <th>170</th>\n", " <td>2020-08-04</td>\n", " <td>33.4</td>\n", " <td>20</td>\n", " <td>4.3</td>\n", " <td>14.3</td>\n", " <td>14.8</td>\n", " </tr>\n", " <tr>\n", " <th>171</th>\n", " <td>2020-08-05</td>\n", " <td>18.6</td>\n", " <td>20</td>\n", " <td>0.0</td>\n", " <td>4.3</td>\n", " <td>14.3</td>\n", " </tr>\n", " <tr>\n", " <th>172</th>\n", " <td>2020-08-06</td>\n", " <td>4.3</td>\n", " <td>20</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>4.3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>173 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " Date Rain year rain_same_day rain_48h rain_48\n", "0 2016-06-13 0.0 16 0.0 0.0 0.0\n", "1 2016-06-14 0.0 16 0.0 0.0 0.0\n", "2 2016-06-15 0.0 16 0.0 0.0 0.0\n", "3 2016-06-16 0.0 16 0.0 0.0 0.0\n", "4 2016-06-17 0.0 16 0.0 0.0 0.0\n", ".. ... ... ... ... ... ...\n", "168 2020-08-02 16.7 20 14.8 1.4 0.5\n", "169 2020-08-03 30.5 20 14.3 14.8 1.4\n", "170 2020-08-04 33.4 20 4.3 14.3 14.8\n", "171 2020-08-05 18.6 20 0.0 4.3 14.3\n", "172 2020-08-06 4.3 20 0.0 0.0 4.3\n", "\n", "[173 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "R = pd.read_csv(\"data/rain_all_cumulative_72h.csv\")\n", "display(R)\n", "H = pd.read_csv(\"data/hotel-nights.csv\")\n", "H['per_day'] = H['Rooms']/H['Days']\n", "R['rooms']=0\n", "for i, row in R.iterrows(): \n", " R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]\n", " \n", "sites=['VNX','MRD','SVT']\n", "common_cols={'Date' : 'Date',\n", " 'Location' : 'Location' ,\n", " 'Plate_number' : 'Plate_number',\n", " 'medium' : 'medium'}\n", "\n", "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n", "ds16['medium']='easy_gel'\n", "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n", "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n", "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n", "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n", "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n", "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])\n", "\n", "# 2016\n", "weeks16= {'2016-06-21' : 'before',\n", " '2016-06-28' : 'before',\n", " '2016-07-05' : 'during',\n", " '2016-07-12' : 'during',\n", " '2016-07-19' : 'after',\n", " '2016-07-26' : 'after',\n", " '2016-08-02' : 'after',\n", " '2016-08-09' : 'after'} \n", "Y16 = prepare_year(expand_columns(ds16,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n", " weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')\n", "\n", "# 2017\n", "weeks17 = {'2017-06-12' : 'before',\n", " '2017-06-19' : 'before',\n", " '2017-06-26' : 'before',\n", " '2017-07-03' : 'during',\n", " '2017-07-10' : 'during',\n", " '2017-07-17' : 'after',\n", " '2017-07-24' : 'after',\n", " '2017-07-31' : 'after',}\n", "\n", "Y17 = prepare_year(expand_columns(ds17,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " 'fluo_halo_colonies' : 'Bioindicator UV+',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n", " weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')\n", "\n", "# 2020\n", "weeks20 = {'2020-06-11' : 'before',\n", " '2020-06-18' : 'before',\n", " '2020-06-25' : 'before',\n", " '2020-07-01' : 'before',\n", " '2020-07-09' : 'before',\n", " '2020-07-16' : 'before',\n", " '2020-07-22' : 'before',\n", " '2020-08-06' : 'before'}\n", "\n", "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n", " 'purple' : 'Coliform',\n", " 'volume' : 'volume'\n", " }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n", " weeks20 , sites, 'levine')\n", "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n", "\n", "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n", "D=add_year(D)\n", "D.to_pickle(\"data/allyears_72h_cumulative.pkl\")\n", "\n", "#Take average over plates\n", "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n", "A.to_pickle(\"data/allyears-avg_plate_72h_cumulative.pkl\")\n", "vars_cols=['Bioindicator', 'Coliform', 'total']\n", "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n", "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]" ] } ], "metadata": { "kernelspec": { "display_name": "Python (3.7)", "language": "python", "name": "python3.7" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }