{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "955d03bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "pip install numpy\n",
    "pip install pandas\n",
    "pip install seaborn\n",
    "pip install mlxtend\n",
    "pip install arch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "08a2f02f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import datetime\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1c6d55d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_full(x):\n",
    "    pd.set_option('display.max_rows', len(x))\n",
    "    print(x)\n",
    "    pd.reset_option('display.max_rows')\n",
    "def add_year(df):\n",
    "    df['year'] = df['Date'].apply(lambda x: int(x[2:4]))\n",
    "    return df"
   ]
  },
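  {
   "cell_type": "markdown",
   "id": "3f1a9c20",
   "metadata": {},
   "source": [
    "A quick, illustrative check of `add_year` (the toy dates below are made up): it reads the two-digit year out of `YYYY-MM-DD` strings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c2e4b71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only, not part of the pipeline: add_year should produce\n",
    "# year == 16 and 20 for these made-up dates\n",
    "demo = pd.DataFrame({'Date': ['2016-06-21', '2020-08-06']})\n",
    "print(add_year(demo))"
   ]
  },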
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "d197c2c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Load rain and hotel occupancy data\n",
    "r16 = pd.read_csv(\"data/rain2016.csv\")\n",
    "r17 = pd.read_csv(\"data/rain2017.csv\")\n",
    "r20 = pd.read_csv(\"data/rain2020.csv\")\n",
    "R=pd.concat([r16,r17,r20])[['Date', 'Rain']]\n",
    "R=add_year(R)\n",
    "R.to_csv('data/rain_all.csv', index=False)\n",
    "H = pd.read_csv(\"data/hotel-nights.csv\")\n",
    "H['per_day'] = H['Rooms']/H['Days']\n",
    "R['rooms']=0\n",
    "for i, row in R.iterrows(): \n",
    "    R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]"
   ]
  },
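  {
   "cell_type": "markdown",
   "id": "b7d61e02",
   "metadata": {},
   "source": [
    "The row-by-row loop above can also be written as a vectorized merge. A minimal sketch, assuming `H['Month']` holds unique `YYYY-MM` strings matching the date prefix; it builds a separate frame and asserts agreement rather than altering `R`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a9f3c55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative cross-check of the rooms lookup (assumes H['Month'] holds\n",
    "# unique 'YYYY-MM' strings matching R['Date'][:7]); leaves R untouched\n",
    "R_alt = (R.assign(Month=R['Date'].str[:7])\n",
    "          .merge(H[['Month', 'per_day']], on='Month', how='left'))\n",
    "assert (R_alt['per_day'] == R['rooms']).all()"
   ]
  },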
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a0880450",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def expand_columns(ds, n=3):\n",
    "    m=len(ds)\n",
    "    Z=[]\n",
    "    for i in range(n):\n",
    "        ds['Plate_number']=i+1\n",
    "        Z.append(ds.copy())\n",
    "    plate_prefix = [f\"P{i+1}_\" for i in range(n)]\n",
    "    plates=[s for s in ds.columns.tolist() if any(xs in s for xs in plate_prefix)]\n",
    "    for p in plates:\n",
    "        colname=p[3:]\n",
    "        Z[int(p[1])-1][colname] =ds[p]\n",
    "    e=pd.concat(Z)\n",
    "    return e\n",
    "   \n",
    "def map_columns(ds, col_dict):\n",
    "    col_dict={**common_cols, **col_dict}\n",
    "    x=ds.rename(columns=col_dict)\n",
    "    return x[list(col_dict.values())]\n",
    "\n",
    "def calcRain(w):\n",
    "    i=R.loc[R['Date']==w].index.values[0]\n",
    "    r=R.iloc[i-2:i].agg('sum')['Rain']\n",
    "    h=R.iloc[i-6:i].agg('sum')['rooms']\n",
    "    return (h,r)\n",
    "\n",
    "def annotate_weeks(ds, weeks):\n",
    "    ds['week']=0\n",
    "    ds['festival']=''\n",
    "    ds['rain']=0\n",
    "    ds['rooms']=0\n",
    "    for w, i in zip(weeks, range(len(weeks))):\n",
    "        I=ds['Date']==w\n",
    "        ds.loc[I, 'week']= i+1\n",
    "        ds.loc[I, 'festival']= weeks[w]\n",
    "        h, r = calcRain(w)\n",
    "        ds.loc[I, 'rain'] = r\n",
    "        ds.loc[I, 'rooms']= h\n",
    "    return ds\n",
    "\n",
    "def prepare_year(ds, col_dict, all_indicators, weeks, locations, medium):\n",
    "    ds=ds.query(' or '.join([ f\"(Location=='{u}')\" for u in locations])) \\\n",
    "            .query(f\"medium==\\'{medium}\\'\")\n",
    "    x=map_columns(ds, col_dict)\n",
    "    x['total']=ds[all_indicators].agg(\"sum\", axis=\"columns\")\n",
    "    z=  annotate_weeks(x, weeks)\n",
    "    z=z.sort_values(['Date', 'Location', 'Plate_number'],ignore_index=True) \n",
    "    return z\n",
    "\n",
    "def normalize_cols(ds, cols, factor):\n",
    "    for col in cols:\n",
    "        ds[col] = 100 * ds[col] / ds[factor]\n",
    "    return ds\n",
    "    \n",
    "sites=['VNX','MRD','SVT']\n",
    "common_cols={'Date' : 'Date',\n",
    "                'Location' : 'Location' ,\n",
    "                'Plate_number' : 'Plate_number',\n",
    "                'medium' : 'medium'}"
   ]
  },
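  {
   "cell_type": "markdown",
   "id": "e2d05a1b",
   "metadata": {},
   "source": [
    "`expand_columns` reshapes one row per sampling event, with per-plate counts in `P1_*`/`P2_*`/`P3_*` columns, into one row per plate. A toy illustration with made-up counts:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c3b9f44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy frame (made-up numbers): two plates' counts in prefixed columns\n",
    "# become two rows sharing the unprefixed column name\n",
    "toy = pd.DataFrame({'Date': ['2016-06-21'],\n",
    "                    'P1_24h_big_blue': [5],\n",
    "                    'P2_24h_big_blue': [7]})\n",
    "print(expand_columns(toy, n=2)[['Date', 'Plate_number', '24h_big_blue']])"
   ]
  },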
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d35644f1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n",
    "ds16['medium']='easy_gel'\n",
    "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n",
    "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n",
    "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n",
    "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n",
    "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n",
    "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n",
    "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n",
    "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "0552bc9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2016\n",
    "weeks16= {'2016-06-21' : 'before',\n",
    "            '2016-06-28' : 'before',\n",
    "            '2016-07-05' : 'during',\n",
    "            '2016-07-12' : 'during',\n",
    "            '2016-07-19' : 'after',\n",
    "            '2016-07-26' : 'after',\n",
    "            '2016-08-02' : 'after',\n",
    "            '2016-08-09' : 'after'} \n",
    "Y16 = prepare_year(expand_columns(ds16,3), {\n",
    "                '24h_big_blue' : 'Bioindicator',\n",
    "                '24h_med_blue' : 'Coliform',\n",
    "                'qty_sample' : 'volume'\n",
    "            }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n",
    "                weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n",
    "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')"
   ]
  },
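  {
   "cell_type": "markdown",
   "id": "a84d2e19",
   "metadata": {},
   "source": [
    "`normalize_cols` rescales counts to a common per-100-volume basis. A worked check with made-up numbers: 5 colonies in a 20-unit sample become 25 per 100 units."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f0c7d28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only: 100 * 5 / 20 == 25\n",
    "chk = pd.DataFrame({'Bioindicator': [5.0], 'volume': [20.0]})\n",
    "assert normalize_cols(chk, ['Bioindicator'], 'volume')['Bioindicator'].iloc[0] == 25.0"
   ]
  },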
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "1ee648cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2017\n",
    "weeks17 = {'2017-06-12' : 'before',\n",
    "    '2017-06-19' : 'before',\n",
    "    '2017-06-26' : 'before',\n",
    "    '2017-07-03' : 'during',\n",
    "    '2017-07-10' : 'during',\n",
    "    '2017-07-17' : 'after',\n",
    "    '2017-07-24' : 'after',\n",
    "    '2017-07-31' : 'after',}\n",
    "\n",
    "Y17 = prepare_year(expand_columns(ds17,3), {\n",
    "            '24h_big_blue' : 'Bioindicator',\n",
    "            'fluo_halo_colonies' : 'Bioindicator UV+',\n",
    "            '24h_med_blue' : 'Coliform',\n",
    "            'qty_sample' : 'volume'\n",
    "        }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n",
    "                   weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n",
    "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "d535af61",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 2020\n",
    "weeks20 = {'2020-06-11' : 'before',\n",
    "                            '2020-06-18' : 'before',\n",
    "                            '2020-06-25' : 'before',\n",
    "                            '2020-07-01' : 'before',\n",
    "                            '2020-07-09' : 'before',\n",
    "                            '2020-07-16' : 'before',\n",
    "                            '2020-07-22' : 'before',\n",
    "                            '2020-08-06' : 'before'}\n",
    "\n",
    "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n",
    "                            'purple' : 'Coliform',\n",
    "                            'volume' : 'volume'\n",
    "                        }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n",
    "                 weeks20 , sites, 'levine')\n",
    "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "dff09705",
   "metadata": {},
   "outputs": [],
   "source": [
    "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n",
    "D=add_year(D)\n",
    "D.to_pickle(\"data/allyears.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "9d44c41d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Take average over plates\n",
    "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n",
    "A.to_pickle(\"data/allyears-avg_plate.pkl\")\n",
    "vars_cols=['Bioindicator', 'Coliform', 'total']\n",
    "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n",
    "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]"
   ]
  },
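  {
   "cell_type": "markdown",
   "id": "c91b3e67",
   "metadata": {},
   "source": [
    "Alongside the plate mean, the same grouping can quantify plate-to-plate spread. A minimal sketch that leaves `A` untouched:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4f82a05",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only: per-group standard deviation of the indicator counts,\n",
    "# using the same grouping keys as the plate average above\n",
    "spread = D.groupby(grouped_cols, as_index=False)[vars_cols].std()\n",
    "spread.columns = grouped_cols + [f'{c}_std' for c in vars_cols]\n",
    "print(spread.head())"
   ]
  },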
  {
   "cell_type": "markdown",
   "id": "ce930101",
   "metadata": {},
   "source": [
    "# Cumulative 48 h data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "9b4386aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Rain</th>\n",
       "      <th>year</th>\n",
       "      <th>rain_same_day</th>\n",
       "      <th>rain_48h</th>\n",
       "      <th>rain_48</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-06-13</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016-06-14</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-06-15</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2016-06-16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-06-17</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>2020-08-02</td>\n",
       "      <td>16.2</td>\n",
       "      <td>20</td>\n",
       "      <td>14.8</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>2020-08-03</td>\n",
       "      <td>29.1</td>\n",
       "      <td>20</td>\n",
       "      <td>14.3</td>\n",
       "      <td>14.8</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>2020-08-04</td>\n",
       "      <td>18.6</td>\n",
       "      <td>20</td>\n",
       "      <td>4.3</td>\n",
       "      <td>14.3</td>\n",
       "      <td>14.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>171</th>\n",
       "      <td>2020-08-05</td>\n",
       "      <td>4.3</td>\n",
       "      <td>20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>14.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>2020-08-06</td>\n",
       "      <td>0.0</td>\n",
       "      <td>20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>173 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           Date  Rain  year  rain_same_day  rain_48h  rain_48\n",
       "0    2016-06-13   0.0    16            0.0       0.0      0.0\n",
       "1    2016-06-14   0.0    16            0.0       0.0      0.0\n",
       "2    2016-06-15   0.0    16            0.0       0.0      0.0\n",
       "3    2016-06-16   0.0    16            0.0       0.0      0.0\n",
       "4    2016-06-17   0.0    16            0.0       0.0      0.0\n",
       "..          ...   ...   ...            ...       ...      ...\n",
       "168  2020-08-02  16.2    20           14.8       1.4      0.5\n",
       "169  2020-08-03  29.1    20           14.3      14.8      1.4\n",
       "170  2020-08-04  18.6    20            4.3      14.3     14.8\n",
       "171  2020-08-05   4.3    20            0.0       4.3     14.3\n",
       "172  2020-08-06   0.0    20            0.0       0.0      4.3\n",
       "\n",
       "[173 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "R = pd.read_csv(\"data/rain_all_cumulative_48h.csv\")\n",
    "display(R)\n",
    "H = pd.read_csv(\"data/hotel-nights.csv\")\n",
    "H['per_day'] = H['Rooms']/H['Days']\n",
    "R['rooms']=0\n",
    "for i, row in R.iterrows(): \n",
    "    R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]\n",
    "    \n",
    "sites=['VNX','MRD','SVT']\n",
    "common_cols={'Date' : 'Date',\n",
    "                'Location' : 'Location' ,\n",
    "                'Plate_number' : 'Plate_number',\n",
    "                'medium' : 'medium'}\n",
    "\n",
    "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n",
    "ds16['medium']='easy_gel'\n",
    "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n",
    "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n",
    "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n",
    "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n",
    "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n",
    "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n",
    "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n",
    "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])\n",
    "\n",
    "# 2016\n",
    "weeks16= {'2016-06-21' : 'before',\n",
    "            '2016-06-28' : 'before',\n",
    "            '2016-07-05' : 'during',\n",
    "            '2016-07-12' : 'during',\n",
    "            '2016-07-19' : 'after',\n",
    "            '2016-07-26' : 'after',\n",
    "            '2016-08-02' : 'after',\n",
    "            '2016-08-09' : 'after'} \n",
    "Y16 = prepare_year(expand_columns(ds16,3), {\n",
    "                '24h_big_blue' : 'Bioindicator',\n",
    "                '24h_med_blue' : 'Coliform',\n",
    "                'qty_sample' : 'volume'\n",
    "            }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n",
    "                weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n",
    "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')\n",
    "\n",
    "# 2017\n",
    "weeks17 = {'2017-06-12' : 'before',\n",
    "    '2017-06-19' : 'before',\n",
    "    '2017-06-26' : 'before',\n",
    "    '2017-07-03' : 'during',\n",
    "    '2017-07-10' : 'during',\n",
    "    '2017-07-17' : 'after',\n",
    "    '2017-07-24' : 'after',\n",
    "    '2017-07-31' : 'after',}\n",
    "\n",
    "Y17 = prepare_year(expand_columns(ds17,3), {\n",
    "            '24h_big_blue' : 'Bioindicator',\n",
    "            'fluo_halo_colonies' : 'Bioindicator UV+',\n",
    "            '24h_med_blue' : 'Coliform',\n",
    "            'qty_sample' : 'volume'\n",
    "        }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n",
    "                   weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n",
    "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')\n",
    "\n",
    "# 2020\n",
    "weeks20 = {'2020-06-11' : 'before',\n",
    "                            '2020-06-18' : 'before',\n",
    "                            '2020-06-25' : 'before',\n",
    "                            '2020-07-01' : 'before',\n",
    "                            '2020-07-09' : 'before',\n",
    "                            '2020-07-16' : 'before',\n",
    "                            '2020-07-22' : 'before',\n",
    "                            '2020-08-06' : 'before'}\n",
    "\n",
    "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n",
    "                            'purple' : 'Coliform',\n",
    "                            'volume' : 'volume'\n",
    "                        }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n",
    "                 weeks20 , sites, 'levine')\n",
    "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n",
    "\n",
    "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n",
    "D=add_year(D)\n",
    "D.to_pickle(\"data/allyears_48h_cumulative.pkl\")\n",
    "\n",
    "#Take average over plates\n",
    "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n",
    "A.to_pickle(\"data/allyears-avg_plate_48h_cumulative.pkl\")\n",
    "vars_cols=['Bioindicator', 'Coliform', 'total']\n",
    "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n",
    "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2cc8aed2",
   "metadata": {},
   "source": [
    "### Cumulative 72h"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "901fd9ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Rain</th>\n",
       "      <th>year</th>\n",
       "      <th>rain_same_day</th>\n",
       "      <th>rain_48h</th>\n",
       "      <th>rain_48</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-06-13</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016-06-14</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-06-15</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2016-06-16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-06-17</td>\n",
       "      <td>0.0</td>\n",
       "      <td>16</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>2020-08-02</td>\n",
       "      <td>16.7</td>\n",
       "      <td>20</td>\n",
       "      <td>14.8</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>2020-08-03</td>\n",
       "      <td>30.5</td>\n",
       "      <td>20</td>\n",
       "      <td>14.3</td>\n",
       "      <td>14.8</td>\n",
       "      <td>1.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>2020-08-04</td>\n",
       "      <td>33.4</td>\n",
       "      <td>20</td>\n",
       "      <td>4.3</td>\n",
       "      <td>14.3</td>\n",
       "      <td>14.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>171</th>\n",
       "      <td>2020-08-05</td>\n",
       "      <td>18.6</td>\n",
       "      <td>20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.3</td>\n",
       "      <td>14.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>2020-08-06</td>\n",
       "      <td>4.3</td>\n",
       "      <td>20</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>173 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           Date  Rain  year  rain_same_day  rain_48h  rain_48\n",
       "0    2016-06-13   0.0    16            0.0       0.0      0.0\n",
       "1    2016-06-14   0.0    16            0.0       0.0      0.0\n",
       "2    2016-06-15   0.0    16            0.0       0.0      0.0\n",
       "3    2016-06-16   0.0    16            0.0       0.0      0.0\n",
       "4    2016-06-17   0.0    16            0.0       0.0      0.0\n",
       "..          ...   ...   ...            ...       ...      ...\n",
       "168  2020-08-02  16.7    20           14.8       1.4      0.5\n",
       "169  2020-08-03  30.5    20           14.3      14.8      1.4\n",
       "170  2020-08-04  33.4    20            4.3      14.3     14.8\n",
       "171  2020-08-05  18.6    20            0.0       4.3     14.3\n",
       "172  2020-08-06   4.3    20            0.0       0.0      4.3\n",
       "\n",
       "[173 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "R = pd.read_csv(\"data/rain_all_cumulative_72h.csv\")\n",
    "display(R)\n",
    "H = pd.read_csv(\"data/hotel-nights.csv\")\n",
    "H['per_day'] = H['Rooms']/H['Days']\n",
    "R['rooms']=0\n",
    "for i, row in R.iterrows(): \n",
    "    R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]\n",
    "    \n",
    "sites=['VNX','MRD','SVT']\n",
    "common_cols={'Date' : 'Date',\n",
    "                'Location' : 'Location' ,\n",
    "                'Plate_number' : 'Plate_number',\n",
    "                'medium' : 'medium'}\n",
    "\n",
    "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n",
    "ds16['medium']='easy_gel'\n",
    "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n",
    "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n",
    "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n",
    "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n",
    "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n",
    "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n",
    "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n",
    "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])\n",
    "\n",
    "# 2016\n",
    "weeks16= {'2016-06-21' : 'before',\n",
    "            '2016-06-28' : 'before',\n",
    "            '2016-07-05' : 'during',\n",
    "            '2016-07-12' : 'during',\n",
    "            '2016-07-19' : 'after',\n",
    "            '2016-07-26' : 'after',\n",
    "            '2016-08-02' : 'after',\n",
    "            '2016-08-09' : 'after'} \n",
    "Y16 = prepare_year(expand_columns(ds16,3), {\n",
    "                '24h_big_blue' : 'Bioindicator',\n",
    "                '24h_med_blue' : 'Coliform',\n",
    "                'qty_sample' : 'volume'\n",
    "            }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n",
    "                weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n",
    "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')\n",
    "\n",
    "# 2017\n",
    "weeks17 = {'2017-06-12' : 'before',\n",
    "    '2017-06-19' : 'before',\n",
    "    '2017-06-26' : 'before',\n",
    "    '2017-07-03' : 'during',\n",
    "    '2017-07-10' : 'during',\n",
    "    '2017-07-17' : 'after',\n",
    "    '2017-07-24' : 'after',\n",
    "    '2017-07-31' : 'after',}\n",
    "\n",
    "Y17 = prepare_year(expand_columns(ds17,3), {\n",
    "            '24h_big_blue' : 'Bioindicator',\n",
    "            'fluo_halo_colonies' : 'Bioindicator UV+',\n",
    "            '24h_med_blue' : 'Coliform',\n",
    "            'qty_sample' : 'volume'\n",
    "        }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n",
    "                   weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n",
    "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')\n",
    "\n",
    "# 2020\n",
    "weeks20 = {'2020-06-11' : 'before',\n",
    "                            '2020-06-18' : 'before',\n",
    "                            '2020-06-25' : 'before',\n",
    "                            '2020-07-01' : 'before',\n",
    "                            '2020-07-09' : 'before',\n",
    "                            '2020-07-16' : 'before',\n",
    "                            '2020-07-22' : 'before',\n",
    "                            '2020-08-06' : 'before'}\n",
    "\n",
    "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n",
    "                            'purple' : 'Coliform',\n",
    "                            'volume' : 'volume'\n",
    "                        }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n",
    "                 weeks20 , sites, 'levine')\n",
    "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n",
    "\n",
    "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n",
    "D=add_year(D)\n",
    "D.to_pickle(\"data/allyears_72h_cumulative.pkl\")\n",
    "\n",
    "#Take average over plates\n",
    "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n",
    "A.to_pickle(\"data/allyears-avg_plate_72h_cumulative.pkl\")\n",
    "vars_cols=['Bioindicator', 'Coliform', 'total']\n",
    "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n",
    "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (3.7)",
   "language": "python",
   "name": "python3.7"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}