{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "955d03bd", "metadata": {}, "outputs": [], "source": [ "%%bash\n", "pip install numpy\n", "pip install pandas\n", "pip install seaborn\n", "pip install mlxtend\n", "pip install arch" ] }, { "cell_type": "code", "execution_count": 1, "id": "08a2f02f", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import datetime\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "1c6d55d7", "metadata": {}, "outputs": [], "source": [ "def print_full(x):\n", " pd.set_option('display.max_rows', len(x))\n", " print(x)\n", " pd.reset_option('display.max_rows')\n", "def add_year(df):\n", " df['year'] = df['Date'].apply(lambda x: int(x[2:4]))\n", " return df" ] }, { "cell_type": "code", "execution_count": 20, "id": "d197c2c6", "metadata": {}, "outputs": [], "source": [ "#Load rain and hotel occupancy data\n", "r16 = pd.read_csv(\"data/rain2016.csv\")\n", "r17 = pd.read_csv(\"data/rain2017.csv\")\n", "r20 = pd.read_csv(\"data/rain2020.csv\")\n", "R=pd.concat([r16,r17,r20])[['Date', 'Rain']]\n", "R=add_year(R)\n", "R.to_csv('data/rain_all.csv', index=False)\n", "H = pd.read_csv(\"data/hotel-nights.csv\")\n", "H['per_day'] = H['Rooms']/H['Days']\n", "R['rooms']=0\n", "for i, row in R.iterrows(): \n", " R.at[i,\"rooms\"]=H.loc[ H['Month']==row[\"Date\"][:7] ,'per_day'].values[0]" ] }, { "cell_type": "code", "execution_count": 21, "id": "a0880450", "metadata": { "tags": [] }, "outputs": [], "source": [ "def expand_columns(ds, n=3):\n", " m=len(ds)\n", " Z=[]\n", " for i in range(n):\n", " ds['Plate_number']=i+1\n", " Z.append(ds.copy())\n", " plate_prefix = [f\"P{i+1}_\" for i in range(n)]\n", " plates=[s for s in ds.columns.tolist() if any(xs in s for xs in plate_prefix)]\n", " for p in plates:\n", " colname=p[3:]\n", " Z[int(p[1])-1][colname] =ds[p]\n", " e=pd.concat(Z)\n", " return e\n", " \n", "def map_columns(ds, col_dict):\n", " col_dict={**common_cols, **col_dict}\n", " x=ds.rename(columns=col_dict)\n", " return x[list(col_dict.values())]\n", "\n", "def calcRain(w):\n", " i=R.loc[R['Date']==w].index.values[0]\n", " r=R.iloc[i-2:i].agg('sum')['Rain']\n", " h=R.iloc[i-6:i].agg('sum')['rooms']\n", " return (h,r)\n", "\n", "def annotate_weeks(ds, weeks):\n", " ds['week']=0\n", " ds['festival']=''\n", " ds['rain']=0\n", " ds['rooms']=0\n", " for w, i in zip(weeks, range(len(weeks))):\n", " I=ds['Date']==w\n", " ds.loc[I, 'week']= i+1\n", " ds.loc[I, 'festival']= weeks[w]\n", " h, r = calcRain(w)\n", " ds.loc[I, 'rain'] = r\n", " ds.loc[I, 'rooms']= h\n", " return ds\n", "\n", "def prepare_year(ds, col_dict, all_indicators, weeks, locations, medium):\n", " ds=ds.query(' or '.join([ f\"(Location=='{u}')\" for u in locations])) \\\n", " .query(f\"medium==\\'{medium}\\'\")\n", " x=map_columns(ds, col_dict)\n", " x['total']=ds[all_indicators].agg(\"sum\", axis=\"columns\")\n", " z= annotate_weeks(x, weeks)\n", " z=z.sort_values(['Date', 'Location', 'Plate_number'],ignore_index=True) \n", " return z\n", "\n", "def normalize_cols(ds, cols, factor):\n", " for col in cols:\n", " ds[col] = 100 * ds[col] / ds[factor]\n", " return ds\n", " \n", "sites=['VNX','MRD','SVT']\n", "common_cols={'Date' : 'Date',\n", " 'Location' : 'Location' ,\n", " 'Plate_number' : 'Plate_number',\n", " 'medium' : 'medium'}" ] }, { "cell_type": "code", "execution_count": 22, "id": "d35644f1", "metadata": { "tags": [] }, "outputs": [], "source": [ "ds16 = pd.read_csv(\"data/2016_Data.csv\")\n", "ds16['medium']='easy_gel'\n", "ds16['P2_qty_sample'] = ds16['P1_qty_sample']\n", "ds16['P3_qty_sample'] = ds16['P1_qty_sample']\n", "#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds17 = pd.read_csv(\"data/2017_Data.csv\")\n", "ds17['P2_qty_sample'] = ds17['P1_qty_sample']\n", "ds17['P3_qty_sample'] = ds17['P1_qty_sample']\n", "s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)\n", "ds20 = pd.read_csv(\"data/2020_Data.csv\").query(\"medium=='levine'\").sort_values(['Date', 'Location', 'Plate_number'])" ] }, { "cell_type": "code", "execution_count": 23, "id": "0552bc9a", "metadata": {}, "outputs": [], "source": [ "# 2016\n", "weeks16= {'2016-06-21' : 'before',\n", " '2016-06-28' : 'before',\n", " '2016-07-05' : 'during',\n", " '2016-07-12' : 'during',\n", " '2016-07-19' : 'after',\n", " '2016-07-26' : 'after',\n", " '2016-08-02' : 'after',\n", " '2016-08-09' : 'after'} \n", "Y16 = prepare_year(expand_columns(ds16,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] , \n", " weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')" ] }, { "cell_type": "code", "execution_count": 24, "id": "1ee648cd", "metadata": {}, "outputs": [], "source": [ "# 2017\n", "weeks17 = {'2017-06-12' : 'before',\n", " '2017-06-19' : 'before',\n", " '2017-06-26' : 'before',\n", " '2017-07-03' : 'during',\n", " '2017-07-10' : 'during',\n", " '2017-07-17' : 'after',\n", " '2017-07-24' : 'after',\n", " '2017-07-31' : 'after',}\n", "\n", "Y17 = prepare_year(expand_columns(ds17,3), {\n", " '24h_big_blue' : 'Bioindicator',\n", " 'fluo_halo_colonies' : 'Bioindicator UV+',\n", " '24h_med_blue' : 'Coliform',\n", " 'qty_sample' : 'volume'\n", " }, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],\n", " weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])\n", "Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')" ] }, { "cell_type": "code", "execution_count": 25, "id": "d535af61", "metadata": { "tags": [] }, "outputs": [], "source": [ "# 2020\n", "weeks20 = {'2020-06-11' : 'before',\n", " '2020-06-18' : 'before',\n", " '2020-06-25' : 'before',\n", " '2020-07-01' : 'before',\n", " '2020-07-09' : 'before',\n", " '2020-07-16' : 'before',\n", " '2020-07-22' : 'before',\n", " '2020-08-06' : 'before'}\n", "\n", "Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',\n", " 'purple' : 'Coliform',\n", " 'volume' : 'volume'\n", " }, ['green_met', 'purple', 'mauve', 'pink', 'other'] , \n", " weeks20 , sites, 'levine')\n", "Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "dff09705", "metadata": {}, "outputs": [], "source": [ "D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])\n", "D=add_year(D)\n", "D.to_pickle(\"data/allyears.pkl\")" ] }, { "cell_type": "code", "execution_count": 27, "id": "9d44c41d", "metadata": {}, "outputs": [], "source": [ "#Take average over plates\n", "A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])\n", "A.to_pickle(\"data/allyears-avg_plate.pkl\")\n", "vars_cols=['Bioindicator', 'Coliform', 'total']\n", "#A[[f\"{col}_std\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]\n", "#A[[f\"{col}_max\" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]" ] }, { "cell_type": "markdown", "id": "ce930101", "metadata": {}, "source": [ "# Cumulative 48 h data" ] }, { "cell_type": "code", "execution_count": 28, "id": "9b4386aa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Date | \n", "Rain | \n", "year | \n", "rain_same_day | \n", "rain_48h | \n", "rain_48 | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "2016-06-13 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 1 | \n", "2016-06-14 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 2 | \n", "2016-06-15 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 3 | \n", "2016-06-16 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 4 | \n", "2016-06-17 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 168 | \n", "2020-08-02 | \n", "16.2 | \n", "20 | \n", "14.8 | \n", "1.4 | \n", "0.5 | \n", "
| 169 | \n", "2020-08-03 | \n", "29.1 | \n", "20 | \n", "14.3 | \n", "14.8 | \n", "1.4 | \n", "
| 170 | \n", "2020-08-04 | \n", "18.6 | \n", "20 | \n", "4.3 | \n", "14.3 | \n", "14.8 | \n", "
| 171 | \n", "2020-08-05 | \n", "4.3 | \n", "20 | \n", "0.0 | \n", "4.3 | \n", "14.3 | \n", "
| 172 | \n", "2020-08-06 | \n", "0.0 | \n", "20 | \n", "0.0 | \n", "0.0 | \n", "4.3 | \n", "
173 rows × 6 columns
\n", "| \n", " | Date | \n", "Rain | \n", "year | \n", "rain_same_day | \n", "rain_48h | \n", "rain_48 | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "2016-06-13 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 1 | \n", "2016-06-14 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 2 | \n", "2016-06-15 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 3 | \n", "2016-06-16 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| 4 | \n", "2016-06-17 | \n", "0.0 | \n", "16 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 168 | \n", "2020-08-02 | \n", "16.7 | \n", "20 | \n", "14.8 | \n", "1.4 | \n", "0.5 | \n", "
| 169 | \n", "2020-08-03 | \n", "30.5 | \n", "20 | \n", "14.3 | \n", "14.8 | \n", "1.4 | \n", "
| 170 | \n", "2020-08-04 | \n", "33.4 | \n", "20 | \n", "4.3 | \n", "14.3 | \n", "14.8 | \n", "
| 171 | \n", "2020-08-05 | \n", "18.6 | \n", "20 | \n", "0.0 | \n", "4.3 | \n", "14.3 | \n", "
| 172 | \n", "2020-08-06 | \n", "4.3 | \n", "20 | \n", "0.0 | \n", "0.0 | \n", "4.3 | \n", "
173 rows × 6 columns
\n", "