{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np  # this module handles arrays, but here we need it for its NaN value\n",
    "import pandas as pd # this module contains a lot of tools for handling tabular data\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define paths to the source files and eventual output file\n",
    "pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC//All 2015 SoG_bottle.xlsx'\n",
    "pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'\n",
    "pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading sheet 2015-17\n",
      "Reading sheet 2015-20\n",
      "Reading sheet 2015-18\n",
      "Reading sheet 2015-21\n",
      "Reading sheet 2015-19\n",
      "removed empty columns: {'Ammonium', 'Flag:Ammonium'}\n"
     ]
    }
   ],
   "source": [
    "# load each sheet in the 2015 bottle Excel file and concatenate them together into one table\n",
    "dfbotlist=list()\n",
    "for sheet in ('2015-17','2015-20','2015-18','2015-21','2015-19'):\n",
    "    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,na_values=(-99,-99.9)) # read each sheet\n",
    "    df0['Cruise']=sheet  # create and populate Cruise column based on sheet name\n",
    "    dfbotlist.append(df0) # append the sheet to a list\n",
    "dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table\n",
    "# Drop columns with no data in them\n",
    "l1=set(dfbot.keys())\n",
    "dfbot.dropna(axis=1,how='all',inplace=True)\n",
    "print('removed empty columns:',l1-set(dfbot.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
      "       'LOC:EVENT_NUMBER', 'LOC:STATION', 'LOC:LATITUDE', 'LOC:LONGITUDE',\n",
      "       'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure',\n",
      "       'Temperature:Primary', 'Transmissivity', 'Fluorescence:URU:Seapoint',\n",
      "       'PAR', 'Salinity:T0:C0', 'Oxygen:Dissolved:SBE',\n",
      "       'Oxygen:Dissolved:SBE.1', 'pH:SBE:Nominal', 'Salinity:Bottle',\n",
      "       'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n",
      "       'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite',\n",
      "       'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n",
      "       'Flag:Phosphate', 'Cruise', 'Temperature:Draw',\n",
      "       'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1',\n",
      "       'Flag:Oxygen:Dissolved', 'Temperature:Secondary', 'Salinity:T1:C1',\n",
      "       'Number_of_bin_records'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# list the column names in the resulting table\n",
    "print(dfbot.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def subval(idf,colList):\n",
    "    # first value in colList should be the column you are going to keep\n",
    "    # follow with other columns that will be used to fill in when that column is NaN\n",
    "    # in order of precedence\n",
    "    if len(colList)==2:\n",
    "        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
    "                         else  r[colList[1]] for i,r in idf.iterrows()]\n",
    "    elif len(colList)==3:\n",
    "        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
    "                         else  r[colList[1]] if not pd.isna(r[colList[1]]) \\\n",
    "                         else r[colList[2]] for i,r in idf.iterrows()]\n",
    "    else:\n",
    "        raise NotImplementedError('Add to code to handle this case')\n",
    "    idf.drop(columns=list(colList[1:]),inplace=True)\n",
    "    return idf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfbot=subval(dfbot,('Oxygen:Dissolved','Oxygen:Dissolved.1'))\n",
    "dfbot=subval(dfbot,('Temperature:Primary','Temperature:Secondary'))\n",
    "dfbot=subval(dfbot,('Salinity:T0:C0','Salinity:T1:C1'))\n",
    "dfbot=subval(dfbot,('Oxygen:Dissolved:SBE','Oxygen:Dissolved:SBE.1'))\n",
    "dfbot.rename(columns={'LOC:LATITUDE':'Lat',\n",
    "                      'LOC:LONGITUDE':'Lon',\n",
    "                      'Temperature:Primary':'Temperature',\n",
    "                      'Oxygen:Dissolved:SBE':'Oxygen:Dissolved:CTD',\n",
    "                      'Salinity:T0:C0':'Salinity',},inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define a function that will be applied to the values in the index column;\n",
    "# this makes it easier to drop non-data rows later\n",
    "def convertIndex(val):\n",
    "    try:\n",
    "        x =int(val)\n",
    "    except ValueError:\n",
    "        x=np.nan\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading sheet 2015 CHEMTAX abs results\n"
     ]
    }
   ],
   "source": [
    "# load the 2015 phytoplankton data with the following options:\n",
    "#   sheet_name='2015 CHEMTAX abs results'  -> choose the 2015 sheet\n",
    "#   usecols='A:I,T:AC'   -> read only columns A:I and T:AC from the Excel sheet\n",
    "#   skiprows=2     -> start reading at the 3rd row of the sheet, \n",
    "#                     which contains the column headings\n",
    "#   converters={'Index': convertIndex,}   -> apply the function defined above to the Index column\n",
    "#   verbose = True   -> print extra information/ warnings/ errors\n",
    "dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015 CHEMTAX abs results',usecols='A:I,T:AC',\n",
    "                      skiprows=2,converters={'Index': convertIndex,},\n",
    "                      verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bin #</th>\n",
       "      <th>Index</th>\n",
       "      <th>Subgroup</th>\n",
       "      <th>Cruise</th>\n",
       "      <th>Month</th>\n",
       "      <th>Station</th>\n",
       "      <th>Sample#</th>\n",
       "      <th>rep</th>\n",
       "      <th>depth</th>\n",
       "      <th>Cyanobacteria.1</th>\n",
       "      <th>Prasinophytes.1</th>\n",
       "      <th>Cryptophytes.1</th>\n",
       "      <th>Diatoms-1.1</th>\n",
       "      <th>Diatoms-2.1</th>\n",
       "      <th>Dinoflagellates-1.1</th>\n",
       "      <th>Haptophytes.1</th>\n",
       "      <th>Dictyo.1</th>\n",
       "      <th>Raphido.1</th>\n",
       "      <th>TchlA (ug/L)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>12</td>\n",
       "      <td>49.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>22</td>\n",
       "      <td>303</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0915533</td>\n",
       "      <td>0.0933184</td>\n",
       "      <td>9.81634</td>\n",
       "      <td>1.17253</td>\n",
       "      <td>0.00630329</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>11.1801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>12</td>\n",
       "      <td>50.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>22</td>\n",
       "      <td>303</td>\n",
       "      <td>B</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>Absolute Pigment Compositions - Bin # 16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>From Sheet:  OutR4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>Tchl_a</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>Bin #</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Subgroup</td>\n",
       "      <td>Cruise</td>\n",
       "      <td>Month</td>\n",
       "      <td>Station</td>\n",
       "      <td>Sample#</td>\n",
       "      <td>rep</td>\n",
       "      <td>depth</td>\n",
       "      <td>Cyanobacteria</td>\n",
       "      <td>Prasinophytes</td>\n",
       "      <td>Cryptophytes</td>\n",
       "      <td>Diatoms-1</td>\n",
       "      <td>Diatoms-2</td>\n",
       "      <td>Dinoflagellates-1</td>\n",
       "      <td>Haptophytes</td>\n",
       "      <td>Dictyo</td>\n",
       "      <td>Raphido</td>\n",
       "      <td>TchlA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>16</td>\n",
       "      <td>51.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF2</td>\n",
       "      <td>396</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0672112</td>\n",
       "      <td>0.10427</td>\n",
       "      <td>0.0489969</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0163572</td>\n",
       "      <td>0</td>\n",
       "      <td>0.184355</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0422233</td>\n",
       "      <td>0.463414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>16</td>\n",
       "      <td>52.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF2</td>\n",
       "      <td>396</td>\n",
       "      <td>B</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>16</td>\n",
       "      <td>53.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF1</td>\n",
       "      <td>397</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0043579</td>\n",
       "      <td>0.495659</td>\n",
       "      <td>0.462661</td>\n",
       "      <td>0.692599</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0954998</td>\n",
       "      <td>0.246477</td>\n",
       "      <td>0.0565426</td>\n",
       "      <td>0.0569377</td>\n",
       "      <td>2.11073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>16</td>\n",
       "      <td>54.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF1</td>\n",
       "      <td>397</td>\n",
       "      <td>B</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>16</td>\n",
       "      <td>55.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>22</td>\n",
       "      <td>398</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0140808</td>\n",
       "      <td>0.169415</td>\n",
       "      <td>0.087069</td>\n",
       "      <td>0.392301</td>\n",
       "      <td>0.0331626</td>\n",
       "      <td>3.15016e-06</td>\n",
       "      <td>0.0847585</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00837901</td>\n",
       "      <td>0.789169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>16</td>\n",
       "      <td>56.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>22</td>\n",
       "      <td>398</td>\n",
       "      <td>B</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>16</td>\n",
       "      <td>57.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2015-20</td>\n",
       "      <td>June</td>\n",
       "      <td>24</td>\n",
       "      <td>399</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0200784</td>\n",
       "      <td>0.101276</td>\n",
       "      <td>0.0692901</td>\n",
       "      <td>0.406107</td>\n",
       "      <td>0.00241722</td>\n",
       "      <td>0.000152693</td>\n",
       "      <td>0.00213347</td>\n",
       "      <td>6.5546e-05</td>\n",
       "      <td>0</td>\n",
       "      <td>0.60152</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       Bin #  Index  Subgroup   Cruise  Month  \\\n",
       "48                                        12   49.0         1  2015-17  April   \n",
       "49                                        12   50.0         1  2015-17  April   \n",
       "50  Absolute Pigment Compositions - Bin # 16    NaN       NaN      NaN    NaN   \n",
       "51                                    Tchl_a    NaN       NaN      NaN    NaN   \n",
       "52                                     Bin #    NaN  Subgroup   Cruise  Month   \n",
       "53                                        16   51.0         2  2015-20   June   \n",
       "54                                        16   52.0         2  2015-20   June   \n",
       "55                                        16   53.0         2  2015-20   June   \n",
       "56                                        16   54.0         2  2015-20   June   \n",
       "57                                        16   55.0         2  2015-20   June   \n",
       "58                                        16   56.0         2  2015-20   June   \n",
       "59                                        16   57.0         2  2015-20   June   \n",
       "\n",
       "               Station  Sample#  rep  depth Cyanobacteria.1 Prasinophytes.1  \\\n",
       "48                  22      303    A      0               0       0.0915533   \n",
       "49                  22      303    B      0             NaN             NaN   \n",
       "50  From Sheet:  OutR4      NaN  NaN    NaN             NaN             NaN   \n",
       "51                 NaN      NaN  NaN    NaN             NaN             NaN   \n",
       "52             Station  Sample#  rep  depth   Cyanobacteria   Prasinophytes   \n",
       "53                CPF2      396    A      0       0.0672112         0.10427   \n",
       "54                CPF2      396    B      0             NaN             NaN   \n",
       "55                CPF1      397    A      0       0.0043579        0.495659   \n",
       "56                CPF1      397    B      0             NaN             NaN   \n",
       "57                  22      398    A      0       0.0140808        0.169415   \n",
       "58                  22      398    B      0             NaN             NaN   \n",
       "59                  24      399    A      0       0.0200784        0.101276   \n",
       "\n",
       "   Cryptophytes.1 Diatoms-1.1 Diatoms-2.1 Dinoflagellates-1.1 Haptophytes.1  \\\n",
       "48      0.0933184     9.81634     1.17253          0.00630329             0   \n",
       "49            NaN         NaN         NaN                 NaN           NaN   \n",
       "50            NaN         NaN         NaN                 NaN           NaN   \n",
       "51            NaN         NaN         NaN                 NaN           NaN   \n",
       "52   Cryptophytes   Diatoms-1   Diatoms-2   Dinoflagellates-1   Haptophytes   \n",
       "53      0.0489969           0   0.0163572                   0      0.184355   \n",
       "54            NaN         NaN         NaN                 NaN           NaN   \n",
       "55       0.462661    0.692599           0           0.0954998      0.246477   \n",
       "56            NaN         NaN         NaN                 NaN           NaN   \n",
       "57       0.087069    0.392301   0.0331626         3.15016e-06     0.0847585   \n",
       "58            NaN         NaN         NaN                 NaN           NaN   \n",
       "59      0.0692901    0.406107  0.00241722         0.000152693    0.00213347   \n",
       "\n",
       "      Dictyo.1   Raphido.1 TchlA (ug/L)  \n",
       "48           0           0      11.1801  \n",
       "49         NaN         NaN          NaN  \n",
       "50         NaN         NaN          NaN  \n",
       "51         NaN         NaN          NaN  \n",
       "52      Dictyo     Raphido        TchlA  \n",
       "53           0   0.0422233     0.463414  \n",
       "54         NaN         NaN          NaN  \n",
       "55   0.0565426   0.0569377      2.11073  \n",
       "56         NaN         NaN          NaN  \n",
       "57           0  0.00837901     0.789169  \n",
       "58         NaN         NaN          NaN  \n",
       "59  6.5546e-05           0      0.60152  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# display rows 48 to 59 of the resulting table\n",
    "dfPhyto[48:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now, drop any rows from the table that have NaN values in either of the columns\n",
    "#  'Index' or 'TchlA (ug/L)'\n",
    "# This is why we applied a function to the Index column to make sure all \n",
    "#  non-numeric Index values would have a consistent NaN entry, making them easy to identify\n",
    "#  and remove\n",
    "dfPhyto.dropna(subset=['Index', 'TchlA (ug/L)'],how='any',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas creates its own index, and after dropping rows I like to reset it -\n",
    "# this is just for convenience\n",
    "dfPhyto.reset_index(drop=True,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bin #</th>\n",
       "      <th>Index</th>\n",
       "      <th>Subgroup</th>\n",
       "      <th>Cruise</th>\n",
       "      <th>Month</th>\n",
       "      <th>Station</th>\n",
       "      <th>Sample#</th>\n",
       "      <th>rep</th>\n",
       "      <th>depth</th>\n",
       "      <th>Cyanobacteria.1</th>\n",
       "      <th>Prasinophytes.1</th>\n",
       "      <th>Cryptophytes.1</th>\n",
       "      <th>Diatoms-1.1</th>\n",
       "      <th>Diatoms-2.1</th>\n",
       "      <th>Dinoflagellates-1.1</th>\n",
       "      <th>Haptophytes.1</th>\n",
       "      <th>Dictyo.1</th>\n",
       "      <th>Raphido.1</th>\n",
       "      <th>TchlA (ug/L)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>7</td>\n",
       "      <td>95.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>2</td>\n",
       "      <td>202</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.1201</td>\n",
       "      <td>0.0876395</td>\n",
       "      <td>0.151612</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.360452</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0861721</td>\n",
       "      <td>0.805976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>7</td>\n",
       "      <td>97.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>3</td>\n",
       "      <td>218</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.108716</td>\n",
       "      <td>0.100695</td>\n",
       "      <td>0.113358</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.437597</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00902126</td>\n",
       "      <td>0.769387</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>7</td>\n",
       "      <td>99.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>6</td>\n",
       "      <td>249</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0899446</td>\n",
       "      <td>0.0257313</td>\n",
       "      <td>0.0671602</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.347255</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.530091</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>7</td>\n",
       "      <td>101.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>9</td>\n",
       "      <td>261</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0929619</td>\n",
       "      <td>0.0873561</td>\n",
       "      <td>0.102004</td>\n",
       "      <td>0.00641763</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.279578</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.568318</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>7</td>\n",
       "      <td>103.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>12</td>\n",
       "      <td>277</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0786977</td>\n",
       "      <td>0.1356</td>\n",
       "      <td>0.168394</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.444634</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0214413</td>\n",
       "      <td>0.848767</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>7</td>\n",
       "      <td>105.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>14</td>\n",
       "      <td>292</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0520265</td>\n",
       "      <td>0.135979</td>\n",
       "      <td>0.161225</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.933363</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.28259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>7</td>\n",
       "      <td>107.0</td>\n",
       "      <td>3</td>\n",
       "      <td>2015-18</td>\n",
       "      <td>June</td>\n",
       "      <td>16</td>\n",
       "      <td>303</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0162557</td>\n",
       "      <td>0.0659037</td>\n",
       "      <td>0.131307</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.79041</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>1</td>\n",
       "      <td>109.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-21</td>\n",
       "      <td>Sept</td>\n",
       "      <td>14</td>\n",
       "      <td>327</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.103725</td>\n",
       "      <td>0.411755</td>\n",
       "      <td>0.541998</td>\n",
       "      <td>0.644463</td>\n",
       "      <td>0.161644</td>\n",
       "      <td>0.0743051</td>\n",
       "      <td>0.794115</td>\n",
       "      <td>0.0455306</td>\n",
       "      <td>0.0427693</td>\n",
       "      <td>2.82031</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>1</td>\n",
       "      <td>111.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-21</td>\n",
       "      <td>Sept</td>\n",
       "      <td>11</td>\n",
       "      <td>328</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.473666</td>\n",
       "      <td>0.863616</td>\n",
       "      <td>0.922717</td>\n",
       "      <td>0.198315</td>\n",
       "      <td>0.0325998</td>\n",
       "      <td>0.186888</td>\n",
       "      <td>1.37823</td>\n",
       "      <td>0.0334639</td>\n",
       "      <td>0.106627</td>\n",
       "      <td>4.19612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>1</td>\n",
       "      <td>113.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-21</td>\n",
       "      <td>Sept</td>\n",
       "      <td>CPF2</td>\n",
       "      <td>329</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.270905</td>\n",
       "      <td>1.43341</td>\n",
       "      <td>0.910451</td>\n",
       "      <td>0.195409</td>\n",
       "      <td>0.61877</td>\n",
       "      <td>0</td>\n",
       "      <td>0.4964</td>\n",
       "      <td>0.127837</td>\n",
       "      <td>0.387924</td>\n",
       "      <td>4.4411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>1</td>\n",
       "      <td>115.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-21</td>\n",
       "      <td>Sept</td>\n",
       "      <td>CPF1</td>\n",
       "      <td>330</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.369211</td>\n",
       "      <td>1.56642</td>\n",
       "      <td>0.752309</td>\n",
       "      <td>0.207375</td>\n",
       "      <td>0.392154</td>\n",
       "      <td>0.000738665</td>\n",
       "      <td>0.142002</td>\n",
       "      <td>0.130842</td>\n",
       "      <td>0.4017</td>\n",
       "      <td>3.96275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>1</td>\n",
       "      <td>117.0</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-21</td>\n",
       "      <td>Sept</td>\n",
       "      <td>22</td>\n",
       "      <td>331</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.21602</td>\n",
       "      <td>1.21956</td>\n",
       "      <td>0.471939</td>\n",
       "      <td>0.421989</td>\n",
       "      <td>0.256469</td>\n",
       "      <td>0.00168525</td>\n",
       "      <td>0.402357</td>\n",
       "      <td>0.0810156</td>\n",
       "      <td>0.180523</td>\n",
       "      <td>3.25155</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bin #  Index Subgroup   Cruise Month Station Sample# rep depth  \\\n",
       "48     7   95.0        3  2015-18  June       2     202   A     0   \n",
       "49     7   97.0        3  2015-18  June       3     218   A     0   \n",
       "50     7   99.0        3  2015-18  June       6     249   A     0   \n",
       "51     7  101.0        3  2015-18  June       9     261   A     0   \n",
       "52     7  103.0        3  2015-18  June      12     277   A     0   \n",
       "53     7  105.0        3  2015-18  June      14     292   A     0   \n",
       "54     7  107.0        3  2015-18  June      16     303   A     0   \n",
       "55     1  109.0        4  2015-21  Sept      14     327   A     0   \n",
       "56     1  111.0        4  2015-21  Sept      11     328   A     0   \n",
       "57     1  113.0        4  2015-21  Sept    CPF2     329   A     0   \n",
       "58     1  115.0        4  2015-21  Sept    CPF1     330   A     0   \n",
       "59     1  117.0        4  2015-21  Sept      22     331   A     0   \n",
       "\n",
       "   Cyanobacteria.1 Prasinophytes.1 Cryptophytes.1 Diatoms-1.1 Diatoms-2.1  \\\n",
       "48          0.1201       0.0876395       0.151612           0           0   \n",
       "49        0.108716        0.100695       0.113358           0           0   \n",
       "50       0.0899446       0.0257313      0.0671602           0           0   \n",
       "51       0.0929619       0.0873561       0.102004  0.00641763           0   \n",
       "52       0.0786977          0.1356       0.168394           0           0   \n",
       "53       0.0520265        0.135979       0.161225           0           0   \n",
       "54       0.0162557       0.0659037       0.131307           0           0   \n",
       "55        0.103725        0.411755       0.541998    0.644463    0.161644   \n",
       "56        0.473666        0.863616       0.922717    0.198315   0.0325998   \n",
       "57        0.270905         1.43341       0.910451    0.195409     0.61877   \n",
       "58        0.369211         1.56642       0.752309    0.207375    0.392154   \n",
       "59         0.21602         1.21956       0.471939    0.421989    0.256469   \n",
       "\n",
       "   Dinoflagellates-1.1 Haptophytes.1   Dictyo.1   Raphido.1 TchlA (ug/L)  \n",
       "48                   0      0.360452          0   0.0861721     0.805976  \n",
       "49                   0      0.437597          0  0.00902126     0.769387  \n",
       "50                   0      0.347255          0           0     0.530091  \n",
       "51                   0      0.279578          0           0     0.568318  \n",
       "52                   0      0.444634          0   0.0214413     0.848767  \n",
       "53                   0      0.933363          0           0      1.28259  \n",
       "54                   0       0.79041          0           0      1.00388  \n",
       "55           0.0743051      0.794115  0.0455306   0.0427693      2.82031  \n",
       "56            0.186888       1.37823  0.0334639    0.106627      4.19612  \n",
       "57                   0        0.4964   0.127837    0.387924       4.4411  \n",
       "58         0.000738665      0.142002   0.130842      0.4017      3.96275  \n",
       "59          0.00168525      0.402357  0.0810156    0.180523      3.25155  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# display part of the table, confirming that non-data rows have been removed\n",
    "dfPhyto[48:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n",
    "# of the phytoplankton columns; \n",
    "# these lines correct the column names, removing the '.1':\n",
    "renameDict=dict()\n",
    "for colName in dfPhyto.keys():\n",
    "    if colName.endswith('.1'):\n",
    "        renameDict[colName]=colName.split('.1')[0]\n",
    "dfPhyto.rename(columns=renameDict,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Bin #</th>\n",
       "      <th>Index</th>\n",
       "      <th>Subgroup</th>\n",
       "      <th>Cruise</th>\n",
       "      <th>Month</th>\n",
       "      <th>Station</th>\n",
       "      <th>Sample#</th>\n",
       "      <th>rep</th>\n",
       "      <th>depth</th>\n",
       "      <th>Cyanobacteria</th>\n",
       "      <th>Prasinophytes</th>\n",
       "      <th>Cryptophytes</th>\n",
       "      <th>Diatoms-1</th>\n",
       "      <th>Diatoms-2</th>\n",
       "      <th>Dinoflagellates-1</th>\n",
       "      <th>Haptophytes</th>\n",
       "      <th>Dictyo</th>\n",
       "      <th>Raphido</th>\n",
       "      <th>TchlA (ug/L)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>SI</td>\n",
       "      <td>9</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>24.4709</td>\n",
       "      <td>1.60687</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0143717</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0695617</td>\n",
       "      <td>26.1618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>59</td>\n",
       "      <td>23</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>3.57707e-05</td>\n",
       "      <td>4.01063e-05</td>\n",
       "      <td>0</td>\n",
       "      <td>0.337957</td>\n",
       "      <td>0.039984</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0447033</td>\n",
       "      <td>2.42373e-05</td>\n",
       "      <td>0.000804232</td>\n",
       "      <td>0.423549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>102</td>\n",
       "      <td>37</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00264467</td>\n",
       "      <td>0.021158</td>\n",
       "      <td>0.0450918</td>\n",
       "      <td>2.40675</td>\n",
       "      <td>0.610975</td>\n",
       "      <td>0.00463661</td>\n",
       "      <td>0.0242601</td>\n",
       "      <td>0.00118509</td>\n",
       "      <td>0.0118163</td>\n",
       "      <td>3.12852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>75</td>\n",
       "      <td>51</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00488975</td>\n",
       "      <td>0.00514994</td>\n",
       "      <td>0.0671581</td>\n",
       "      <td>3.17296</td>\n",
       "      <td>0.881844</td>\n",
       "      <td>0.00563953</td>\n",
       "      <td>0.0438563</td>\n",
       "      <td>0.0151278</td>\n",
       "      <td>0.0224263</td>\n",
       "      <td>4.21906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12</td>\n",
       "      <td>9.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2015-17</td>\n",
       "      <td>April</td>\n",
       "      <td>72</td>\n",
       "      <td>64</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.130189</td>\n",
       "      <td>0.326276</td>\n",
       "      <td>4.42956</td>\n",
       "      <td>0.991716</td>\n",
       "      <td>0.50695</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6.38469</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>16</td>\n",
       "      <td>165.0</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-19</td>\n",
       "      <td>Sept</td>\n",
       "      <td>9</td>\n",
       "      <td>248</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0903742</td>\n",
       "      <td>0.844792</td>\n",
       "      <td>0.15371</td>\n",
       "      <td>0.552866</td>\n",
       "      <td>0.20032</td>\n",
       "      <td>0.000378653</td>\n",
       "      <td>0.314222</td>\n",
       "      <td>0.0342132</td>\n",
       "      <td>0.0465154</td>\n",
       "      <td>2.23739</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>16</td>\n",
       "      <td>167.0</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-19</td>\n",
       "      <td>Sept</td>\n",
       "      <td>6</td>\n",
       "      <td>261</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0838546</td>\n",
       "      <td>0.81372</td>\n",
       "      <td>0.308694</td>\n",
       "      <td>0.291579</td>\n",
       "      <td>0.32757</td>\n",
       "      <td>8.07944e-05</td>\n",
       "      <td>0.561729</td>\n",
       "      <td>0.0746783</td>\n",
       "      <td>0.0588162</td>\n",
       "      <td>2.52072</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>16</td>\n",
       "      <td>169.0</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-19</td>\n",
       "      <td>Sept</td>\n",
       "      <td>BS11</td>\n",
       "      <td>269</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.125485</td>\n",
       "      <td>10.1442</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>10.2697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86</th>\n",
       "      <td>16</td>\n",
       "      <td>171.0</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-19</td>\n",
       "      <td>Sept</td>\n",
       "      <td>3</td>\n",
       "      <td>285</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.12782</td>\n",
       "      <td>0.515151</td>\n",
       "      <td>0.262525</td>\n",
       "      <td>0.634216</td>\n",
       "      <td>0.170571</td>\n",
       "      <td>0.000441982</td>\n",
       "      <td>0.48741</td>\n",
       "      <td>0.033452</td>\n",
       "      <td>0.1269</td>\n",
       "      <td>2.35849</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87</th>\n",
       "      <td>16</td>\n",
       "      <td>173.0</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-19</td>\n",
       "      <td>Sept</td>\n",
       "      <td>2</td>\n",
       "      <td>300</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>0.111882</td>\n",
       "      <td>1.00504</td>\n",
       "      <td>0.165712</td>\n",
       "      <td>1.52858</td>\n",
       "      <td>0.0134512</td>\n",
       "      <td>0</td>\n",
       "      <td>0.396609</td>\n",
       "      <td>0.0252189</td>\n",
       "      <td>0.216654</td>\n",
       "      <td>3.46315</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>88 rows × 19 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Bin #  Index Subgroup   Cruise  Month Station Sample# rep depth  \\\n",
       "0     12    1.0        1  2015-17  April      SI       9   A     0   \n",
       "1     12    3.0        1  2015-17  April      59      23   A     0   \n",
       "2     12    5.0        1  2015-17  April     102      37   A     0   \n",
       "3     12    7.0        1  2015-17  April      75      51   A     0   \n",
       "4     12    9.0        1  2015-17  April      72      64   A     0   \n",
       "..   ...    ...      ...      ...    ...     ...     ...  ..   ...   \n",
       "83    16  165.0        5  2015-19   Sept       9     248   A     0   \n",
       "84    16  167.0        5  2015-19   Sept       6     261   A     0   \n",
       "85    16  169.0        5  2015-19   Sept    BS11     269   A     0   \n",
       "86    16  171.0        5  2015-19   Sept       3     285   A     0   \n",
       "87    16  173.0        5  2015-19   Sept       2     300   A     0   \n",
       "\n",
       "   Cyanobacteria Prasinophytes Cryptophytes Diatoms-1  Diatoms-2  \\\n",
       "0              0             0            0   24.4709    1.60687   \n",
       "1    3.57707e-05   4.01063e-05            0  0.337957   0.039984   \n",
       "2     0.00264467      0.021158    0.0450918   2.40675   0.610975   \n",
       "3     0.00488975    0.00514994    0.0671581   3.17296   0.881844   \n",
       "4              0      0.130189     0.326276   4.42956   0.991716   \n",
       "..           ...           ...          ...       ...        ...   \n",
       "83     0.0903742      0.844792      0.15371  0.552866    0.20032   \n",
       "84     0.0838546       0.81372     0.308694  0.291579    0.32757   \n",
       "85             0             0     0.125485   10.1442          0   \n",
       "86       0.12782      0.515151     0.262525  0.634216   0.170571   \n",
       "87      0.111882       1.00504     0.165712   1.52858  0.0134512   \n",
       "\n",
       "   Dinoflagellates-1 Haptophytes       Dictyo      Raphido TchlA (ug/L)  \n",
       "0                  0   0.0143717            0    0.0695617      26.1618  \n",
       "1                  0   0.0447033  2.42373e-05  0.000804232     0.423549  \n",
       "2         0.00463661   0.0242601   0.00118509    0.0118163      3.12852  \n",
       "3         0.00563953   0.0438563    0.0151278    0.0224263      4.21906  \n",
       "4            0.50695           0            0            0      6.38469  \n",
       "..               ...         ...          ...          ...          ...  \n",
       "83       0.000378653    0.314222    0.0342132    0.0465154      2.23739  \n",
       "84       8.07944e-05    0.561729    0.0746783    0.0588162      2.52072  \n",
       "85                 0           0            0            0      10.2697  \n",
       "86       0.000441982     0.48741     0.033452       0.1269      2.35849  \n",
       "87                 0    0.396609    0.0252189     0.216654      3.46315  \n",
       "\n",
       "[88 rows x 19 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfPhyto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is the important step- join the two tables ('left' and 'right'), \n",
    "#  matching the cruise IDs and sample numbers\n",
    "#   how='outer'  -> all rows from both the left and the right tables will be included, \n",
    "#                   even if they cannot be matched; this makes it easy to check for \n",
    "#                   unmatched data later\n",
    "#   left_on  specifies the name of the column to match in the left table (dfbot) \n",
    "#   right_on specifies the name of the column to match in the right table (dfPhyto)\n",
    "dfout = pd.merge(dfbot, dfPhyto,  how='outer', \n",
    "                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
       "       'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH',\n",
       "       'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature',\n",
       "       'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity',\n",
       "       'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle',\n",
       "       'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n",
       "       'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite',\n",
       "       'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n",
       "       'Flag:Phosphate', 'Cruise', 'Temperature:Draw',\n",
       "       'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved',\n",
       "       'Number_of_bin_records', 'Bin #', 'Index', 'Subgroup', 'Month',\n",
       "       'Station', 'Sample#', 'rep', 'depth', 'Cyanobacteria', 'Prasinophytes',\n",
       "       'Cryptophytes', 'Diatoms-1', 'Diatoms-2', 'Dinoflagellates-1',\n",
       "       'Haptophytes', 'Dictyo', 'Raphido', 'TchlA (ug/L)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show the column names in the resulting table\n",
    "dfout.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Checks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n",
    "dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n",
    "temp.columns = ['icount']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(temp.icount)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample#</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for Phyto samples matched to multiple bottle samples:\n",
    "temp.loc[temp.icount>1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample#</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for phyto samples not matched to bottle samples:\n",
    "temp.loc[temp.icount==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample_Number</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n",
    "temp2.columns = ['icount']\n",
    "# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n",
    "temp2.loc[temp2.icount>1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample#</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for phyto samples not matched to bottle samples:\n",
    "temp.loc[temp.icount==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(938, 88, 938)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if the output table is longer than either of the input tables, some columns were not matched\n",
    "len(dfout), len(dfPhyto), len(dfbot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(88, 88)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check that the number of cells with data in the 'Cyanobacteria' column is \n",
    "#  the same for the input and output tables:\n",
    "np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Series([], Name: Index, dtype: float64)\n"
     ]
    }
   ],
   "source": [
    "# If there were data rows from the phytoplankton table that were not matched to \n",
    "#  rows from the bottle table, their indices from the phytoplankton table would be \n",
    "#  displayed below (the series [] would not be empty)\n",
    "print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop repetetive/unecessary columns:\n",
    "dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep',\n",
    "                   'depth',],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# truncate phyto group values to 3 decimal places:\n",
    "for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n",
    "       'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
    "       'TchlA (ug/L)'):\n",
    "    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
       "       'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH',\n",
       "       'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature',\n",
       "       'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity',\n",
       "       'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle',\n",
       "       'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n",
       "       'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite',\n",
       "       'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n",
       "       'Flag:Phosphate', 'Cruise', 'Temperature:Draw',\n",
       "       'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved',\n",
       "       'Number_of_bin_records', 'Cyanobacteria', 'Prasinophytes',\n",
       "       'Cryptophytes', 'Diatoms-1', 'Diatoms-2', 'Dinoflagellates-1',\n",
       "       'Haptophytes', 'Dictyo', 'Raphido', 'TchlA (ug/L)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfout.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now write the output table to a .csv file:\n",
    "dfout.to_csv(pathOut, index=False)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (py38_ct)",
   "language": "python",
   "name": "py38_ct"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}