{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np  # this module handles arrays, but here we need it for its NaN value\n",
    "import pandas as pd # this module contains a lot of tools for handling tabular data\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define paths to the source files and eventual output file\n",
    "pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2016 SoG bottle.xlsx'\n",
    "pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'\n",
    "pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2016_NewALLO.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#formatting function to get year-(3digit) format\n",
    "def fmtCruise(istr):\n",
    "    if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): \n",
    "        sp=re.split('-',istr)\n",
    "        rstr=sp[0]+'-0'+sp[1]\n",
    "    elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):\n",
    "        rstr=istr\n",
    "    else:\n",
    "        raise ValueError('Input had unexpected format:',istr)\n",
    "    return rstr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['2016-05', '2016-47', '2016-07', '2016-62', '2016-010', '2016-071']\n"
     ]
    }
   ],
   "source": [
    "# get names of sheets in notebook\n",
    "with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:\n",
    "    sheets=xl.sheet_names\n",
    "print(sheets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading sheet 2016-05\n",
      "Reading sheet 2016-47\n",
      "Reading sheet 2016-07\n",
      "Reading sheet 2016-62\n",
      "Reading sheet 2016-010\n",
      "Reading sheet 2016-071\n",
      "removed empty columns: {'Flag:Ammonium', 'Oxygen:Dissolved.1', 'Ammonium'}\n"
     ]
    }
   ],
   "source": [
    "# load each sheet in the 2015 bottle Excel file and concatenate them together into one table\n",
    "dfbotlist=list()\n",
    "for sheet in sheets:\n",
    "    df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',\n",
    "                      na_values=(-99,-99.9)) # read each sheet; include additional na values\n",
    "    df0['Cruise']=fmtCruise(sheet)  # create and populate Cruise column based on sheet name\n",
    "    dfbotlist.append(df0) # append the sheet to a list\n",
    "dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table\n",
    "# Drop columns with no data in them\n",
    "l1=set(dfbot.keys())\n",
    "dfbot.dropna(axis=1,how='all',inplace=True)\n",
    "print('removed empty columns:',l1-set(dfbot.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
      "       'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',\n",
      "       'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'Pressure',\n",
      "       'Temperature:Secondary', 'Transmissivity', 'Fluorescence:URU:Seapoint',\n",
      "       'PAR', 'PAR:Reference', 'pH:SBE:Nominal', 'Salinity:T1:C1',\n",
      "       'Oxygen:Dissolved:SBE', 'Temperature:Draw', 'Salinity:Bottle',\n",
      "       'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n",
      "       'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',\n",
      "       'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite',\n",
      "       'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n",
      "       'Flag:Phosphate', 'Cruise', 'PAR1', 'Sample_Number.1',\n",
      "       'Number_of_bin_records', 'YYYY/MM/DD HH:MM:SS', 'ADM:MISSION',\n",
      "       'ADM:PROJECT', 'Pressure [decibar]',\n",
      "       'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
      "       'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
      "       'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]',\n",
      "       'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n",
      "       'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]',\n",
      "       'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n",
      "       'Silicate [umol/L]', 'Phosphate [umol/L]', 'Bottle_Number',\n",
      "       'Bottle:Firing_Sequence', 'Oxygen:Dissolved [mL/L]',\n",
      "       'Oxygen:Dissolved [umol/kg]'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# list the column names in the resulting table\n",
    "print(dfbot.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def subval(idf,colList):\n",
    "    # first value in colList should be the column you are going to keep\n",
    "    # follow with other columns that will be used to fill in when that column is NaN\n",
    "    # in order of precedence\n",
    "    if len(colList)==2:\n",
    "        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
    "                         else  r[colList[1]] for i,r in idf.iterrows()]\n",
    "    elif len(colList)==3:\n",
    "        idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
    "                         else  r[colList[1]] if not pd.isna(r[colList[1]]) \\\n",
    "                         else r[colList[2]] for i,r in idf.iterrows()]\n",
    "    else:\n",
    "        raise NotImplementedError('Add to code to handle this case')\n",
    "    return idf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0 : zero means no difference between Sample_Number and Sample_number.1 columns containing data\n",
      "0 : zero means no missing values in Sample_Number\n"
     ]
    }
   ],
   "source": [
    "# there are some duplicate columns here; handle them:\n",
    "print(np.max(np.abs(dfbot['Sample_Number']-dfbot['Sample_Number.1'])),\n",
    "      ': zero means no difference between Sample_Number and Sample_number.1 columns containing data')\n",
    "# check there are no rows missing a value in 'Sample_Number':\n",
    "print(np.sum(pd.isna(dfbot['Sample_Number'])),\n",
    "      ': zero means no missing values in Sample_Number')\n",
    "# -> extra copy is repeat so drop it\n",
    "# 'ADM:MISSION' contains cruise id so drop it\n",
    "# 'YYYY/MM/DD HH:MM:SS' data should be in 'FIL:START TIME YYYY/MM/DD HH:MM:SS'; move it there:\n",
    "dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))\n",
    "dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))\n",
    "dfbot=subval(dfbot,('Temperature:Secondary [deg C (ITS90)]','Temperature:Secondary'))\n",
    "dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity'))\n",
    "dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]','Fluorescence:URU:Seapoint'))\n",
    "dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))\n",
    "dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))\n",
    "dfbot=subval(dfbot,('Salinity:T1:C1 [PSS-78]','Salinity:T1:C1'))\n",
    "dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))\n",
    "dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))\n",
    "dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))\n",
    "dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]','Oxygen:Dissolved:SBE'))\n",
    "dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]','Oxygen:Dissolved'))\n",
    "dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]','Nitrate_plus_Nitrite'))\n",
    "dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))\n",
    "dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))\n",
    "dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop repetetive/unecessary columns:\n",
    "dfbot.drop(labels=['Sample_Number.1','ADM:MISSION','YYYY/MM/DD HH:MM:SS','Transmissivity',\n",
    "                   'Pressure','Temperature:Secondary','Fluorescence:URU:Seapoint','PAR',\n",
    "                   'PAR:Reference','Salinity:T1:C1','Chlorophyll:Extracted',\n",
    "                   'Oxygen:Dissolved:SBE','Oxygen:Dissolved',\n",
    "                   'Nitrate_plus_Nitrite','Silicate','PAR1'],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define a function that will be applied to the values in the index column;\n",
    "# this makes it easier to drop non-data rows later\n",
    "def convertIndex(val):\n",
    "    try:\n",
    "        x =int(val)\n",
    "    except ValueError:\n",
    "        x=np.nan\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading sheet 2016\n"
     ]
    }
   ],
   "source": [
    "# load the 2015 phytoplankton data with the following options:\n",
    "#   sheet_name='2015 CHEMTAX abs results'  -> choose the 2015 sheet\n",
    "#   usecols='A:I,T:AC'   -> read only columns A:I and T:AC from the Excel sheet\n",
    "#   skiprows=2     -> start reading at the 3rd row of the sheet, \n",
    "#                     which contains the column headings\n",
    "#   converters={'Index': convertIndex,}   -> apply the function defined above to the Index column\n",
    "#   verbose = True   -> print extra information/ warnings/ errors\n",
    "dfPhyto=pd.read_excel(pathPhyto,sheet_name='2016',engine='openpyxl',\n",
    "                      skiprows=0,converters={'Sample#': convertIndex,},\n",
    "                      verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cruise</th>\n",
       "      <th>Date</th>\n",
       "      <th>Station</th>\n",
       "      <th>Sample#</th>\n",
       "      <th>rep</th>\n",
       "      <th>depth</th>\n",
       "      <th>Diatoms-1</th>\n",
       "      <th>Diatoms-2</th>\n",
       "      <th>Prasinophytes</th>\n",
       "      <th>Cryptophytes</th>\n",
       "      <th>Dinoflagellates-1</th>\n",
       "      <th>Haptophytes</th>\n",
       "      <th>Dictyo</th>\n",
       "      <th>Raphido</th>\n",
       "      <th>Cyanobacteria</th>\n",
       "      <th>TchlA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>14</td>\n",
       "      <td>306.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.215782</td>\n",
       "      <td>0.568259</td>\n",
       "      <td>0.155735</td>\n",
       "      <td>1.02837</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00485905</td>\n",
       "      <td>1.973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>16</td>\n",
       "      <td>318.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.176595</td>\n",
       "      <td>0</td>\n",
       "      <td>0.197922</td>\n",
       "      <td>0.556587</td>\n",
       "      <td>0.129862</td>\n",
       "      <td>0.704256</td>\n",
       "      <td>0.0753036</td>\n",
       "      <td>0.0859203</td>\n",
       "      <td>0.0115539</td>\n",
       "      <td>1.938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>OKE</td>\n",
       "      <td>329.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.534565</td>\n",
       "      <td>1.30459</td>\n",
       "      <td>0.310359</td>\n",
       "      <td>1.03751</td>\n",
       "      <td>0.435442</td>\n",
       "      <td>2.29758</td>\n",
       "      <td>2.14958</td>\n",
       "      <td>0.247075</td>\n",
       "      <td>0.0502969</td>\n",
       "      <td>8.367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>22</td>\n",
       "      <td>333.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.748735</td>\n",
       "      <td>2.2245</td>\n",
       "      <td>0.0510914</td>\n",
       "      <td>0.270685</td>\n",
       "      <td>0.417001</td>\n",
       "      <td>0.156895</td>\n",
       "      <td>3.49932</td>\n",
       "      <td>0.0870553</td>\n",
       "      <td>0.0197189</td>\n",
       "      <td>7.475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>Cruise</td>\n",
       "      <td>Date</td>\n",
       "      <td>Station</td>\n",
       "      <td>NaN</td>\n",
       "      <td>rep</td>\n",
       "      <td>depth</td>\n",
       "      <td>Diatoms-1</td>\n",
       "      <td>Diatoms-2</td>\n",
       "      <td>Prasinophytes</td>\n",
       "      <td>Cryptophytes</td>\n",
       "      <td>Dinoflagellates-1</td>\n",
       "      <td>Haptophytes</td>\n",
       "      <td>Dictyo</td>\n",
       "      <td>Raphido</td>\n",
       "      <td>Cyanobacteria</td>\n",
       "      <td>TchlA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>14</td>\n",
       "      <td>438.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>1.03723</td>\n",
       "      <td>0.402856</td>\n",
       "      <td>0.0826985</td>\n",
       "      <td>0.215331</td>\n",
       "      <td>0.117579</td>\n",
       "      <td>0.224789</td>\n",
       "      <td>0.037861</td>\n",
       "      <td>0.0581045</td>\n",
       "      <td>0.0265498</td>\n",
       "      <td>2.203</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>11</td>\n",
       "      <td>449.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.37708</td>\n",
       "      <td>0.165123</td>\n",
       "      <td>0.087552</td>\n",
       "      <td>0.113769</td>\n",
       "      <td>0.101932</td>\n",
       "      <td>0.26103</td>\n",
       "      <td>0.0118683</td>\n",
       "      <td>0.0485396</td>\n",
       "      <td>0.0161065</td>\n",
       "      <td>1.183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF1</td>\n",
       "      <td>452.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00981477</td>\n",
       "      <td>0</td>\n",
       "      <td>0.125556</td>\n",
       "      <td>0.143086</td>\n",
       "      <td>0.0616604</td>\n",
       "      <td>0.287119</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0535119</td>\n",
       "      <td>0.00525178</td>\n",
       "      <td>0.686</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF2</td>\n",
       "      <td>455.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.223128</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0825152</td>\n",
       "      <td>0.324354</td>\n",
       "      <td>0.067458</td>\n",
       "      <td>0.251326</td>\n",
       "      <td>0.0179209</td>\n",
       "      <td>0.125422</td>\n",
       "      <td>0.026876</td>\n",
       "      <td>1.119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>22</td>\n",
       "      <td>466.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.612099</td>\n",
       "      <td>0.124002</td>\n",
       "      <td>0.0585194</td>\n",
       "      <td>0.158803</td>\n",
       "      <td>0.0759229</td>\n",
       "      <td>0.119778</td>\n",
       "      <td>0.0561802</td>\n",
       "      <td>0.0190252</td>\n",
       "      <td>0.00866994</td>\n",
       "      <td>1.233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>24</td>\n",
       "      <td>469.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.696984</td>\n",
       "      <td>0.106455</td>\n",
       "      <td>0.0752098</td>\n",
       "      <td>0.160245</td>\n",
       "      <td>0.0776968</td>\n",
       "      <td>0.182455</td>\n",
       "      <td>0.0441467</td>\n",
       "      <td>0.0226451</td>\n",
       "      <td>0.00516366</td>\n",
       "      <td>1.371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>28</td>\n",
       "      <td>472.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.818837</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0915953</td>\n",
       "      <td>0.174323</td>\n",
       "      <td>0.055937</td>\n",
       "      <td>0.0619514</td>\n",
       "      <td>0.0265586</td>\n",
       "      <td>0.0612878</td>\n",
       "      <td>0.0205105</td>\n",
       "      <td>1.311</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Cruise  Date  Station  Sample#   rep  depth   Diatoms-1  Diatoms-2  \\\n",
       "48  2016-007  June       14    306.0  mean      0           0          0   \n",
       "49  2016-007  June       16    318.0  mean      0    0.176595          0   \n",
       "50  2016-007  June      OKE    329.0  mean      0    0.534565    1.30459   \n",
       "51  2016-007  June       22    333.0  mean      0    0.748735     2.2245   \n",
       "52    Cruise  Date  Station      NaN   rep  depth   Diatoms-1  Diatoms-2   \n",
       "53  2016-047  June       14    438.0  mean      0     1.03723   0.402856   \n",
       "54  2016-047  June       11    449.0  mean      0     0.37708   0.165123   \n",
       "55  2016-047  June     CPF1    452.0  mean      0  0.00981477          0   \n",
       "56  2016-047  June     CPF2    455.0  mean      0    0.223128          0   \n",
       "57  2016-047  June       22    466.0  mean      0    0.612099   0.124002   \n",
       "58  2016-047  June       24    469.0  mean      0    0.696984   0.106455   \n",
       "59  2016-047  June       28    472.0  mean      0    0.818837          0   \n",
       "\n",
       "    Prasinophytes  Cryptophytes  Dinoflagellates-1  Haptophytes     Dictyo  \\\n",
       "48       0.215782      0.568259           0.155735      1.02837          0   \n",
       "49       0.197922      0.556587           0.129862     0.704256  0.0753036   \n",
       "50       0.310359       1.03751           0.435442      2.29758    2.14958   \n",
       "51      0.0510914      0.270685           0.417001     0.156895    3.49932   \n",
       "52  Prasinophytes  Cryptophytes  Dinoflagellates-1  Haptophytes     Dictyo   \n",
       "53      0.0826985      0.215331           0.117579     0.224789   0.037861   \n",
       "54       0.087552      0.113769           0.101932      0.26103  0.0118683   \n",
       "55       0.125556      0.143086          0.0616604     0.287119          0   \n",
       "56      0.0825152      0.324354           0.067458     0.251326  0.0179209   \n",
       "57      0.0585194      0.158803          0.0759229     0.119778  0.0561802   \n",
       "58      0.0752098      0.160245          0.0776968     0.182455  0.0441467   \n",
       "59      0.0915953      0.174323           0.055937    0.0619514  0.0265586   \n",
       "\n",
       "      Raphido  Cyanobacteria  TchlA  \n",
       "48          0     0.00485905  1.973  \n",
       "49  0.0859203      0.0115539  1.938  \n",
       "50   0.247075      0.0502969  8.367  \n",
       "51  0.0870553      0.0197189  7.475  \n",
       "52    Raphido  Cyanobacteria  TchlA  \n",
       "53  0.0581045      0.0265498  2.203  \n",
       "54  0.0485396      0.0161065  1.183  \n",
       "55  0.0535119     0.00525178  0.686  \n",
       "56   0.125422       0.026876  1.119  \n",
       "57  0.0190252     0.00866994  1.233  \n",
       "58  0.0226451     0.00516366  1.371  \n",
       "59  0.0612878      0.0205105  1.311  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# display rows 48 to 59 of the resulting table\n",
    "dfPhyto[48:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now, drop any rows from the table that have NaN values in either of the columns\n",
    "#  'Index' or 'TchlA (ug/L)'\n",
    "# This is why we applied a function to the Index column to make sure all \n",
    "#  non-numeric Index values would have a consistent NaN entry, making them easy to identify\n",
    "#  and remove\n",
    "dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas creates its own index, and after dropping rows I like to reset it -\n",
    "# this is just for convenience\n",
    "dfPhyto.reset_index(drop=True,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# apply formatting function all rows in Cruise column to get year-3digit format\n",
    "dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cruise</th>\n",
       "      <th>Date</th>\n",
       "      <th>Station</th>\n",
       "      <th>Sample#</th>\n",
       "      <th>rep</th>\n",
       "      <th>depth</th>\n",
       "      <th>Diatoms-1</th>\n",
       "      <th>Diatoms-2</th>\n",
       "      <th>Prasinophytes</th>\n",
       "      <th>Cryptophytes</th>\n",
       "      <th>Dinoflagellates-1</th>\n",
       "      <th>Haptophytes</th>\n",
       "      <th>Dictyo</th>\n",
       "      <th>Raphido</th>\n",
       "      <th>Cyanobacteria</th>\n",
       "      <th>TchlA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>16</td>\n",
       "      <td>318.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.176595</td>\n",
       "      <td>0</td>\n",
       "      <td>0.197922</td>\n",
       "      <td>0.556587</td>\n",
       "      <td>0.129862</td>\n",
       "      <td>0.704256</td>\n",
       "      <td>0.0753036</td>\n",
       "      <td>0.0859203</td>\n",
       "      <td>0.0115539</td>\n",
       "      <td>1.938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>OKE</td>\n",
       "      <td>329.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.534565</td>\n",
       "      <td>1.30459</td>\n",
       "      <td>0.310359</td>\n",
       "      <td>1.03751</td>\n",
       "      <td>0.435442</td>\n",
       "      <td>2.29758</td>\n",
       "      <td>2.14958</td>\n",
       "      <td>0.247075</td>\n",
       "      <td>0.0502969</td>\n",
       "      <td>8.367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>2016-007</td>\n",
       "      <td>June</td>\n",
       "      <td>22</td>\n",
       "      <td>333.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.748735</td>\n",
       "      <td>2.2245</td>\n",
       "      <td>0.0510914</td>\n",
       "      <td>0.270685</td>\n",
       "      <td>0.417001</td>\n",
       "      <td>0.156895</td>\n",
       "      <td>3.49932</td>\n",
       "      <td>0.0870553</td>\n",
       "      <td>0.0197189</td>\n",
       "      <td>7.475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>14</td>\n",
       "      <td>438.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>1.03723</td>\n",
       "      <td>0.402856</td>\n",
       "      <td>0.0826985</td>\n",
       "      <td>0.215331</td>\n",
       "      <td>0.117579</td>\n",
       "      <td>0.224789</td>\n",
       "      <td>0.037861</td>\n",
       "      <td>0.0581045</td>\n",
       "      <td>0.0265498</td>\n",
       "      <td>2.203</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>11</td>\n",
       "      <td>449.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.37708</td>\n",
       "      <td>0.165123</td>\n",
       "      <td>0.087552</td>\n",
       "      <td>0.113769</td>\n",
       "      <td>0.101932</td>\n",
       "      <td>0.26103</td>\n",
       "      <td>0.0118683</td>\n",
       "      <td>0.0485396</td>\n",
       "      <td>0.0161065</td>\n",
       "      <td>1.183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF1</td>\n",
       "      <td>452.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00981477</td>\n",
       "      <td>0</td>\n",
       "      <td>0.125556</td>\n",
       "      <td>0.143086</td>\n",
       "      <td>0.0616604</td>\n",
       "      <td>0.287119</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0535119</td>\n",
       "      <td>0.00525178</td>\n",
       "      <td>0.686</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>CPF2</td>\n",
       "      <td>455.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.223128</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0825152</td>\n",
       "      <td>0.324354</td>\n",
       "      <td>0.067458</td>\n",
       "      <td>0.251326</td>\n",
       "      <td>0.0179209</td>\n",
       "      <td>0.125422</td>\n",
       "      <td>0.026876</td>\n",
       "      <td>1.119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>22</td>\n",
       "      <td>466.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.612099</td>\n",
       "      <td>0.124002</td>\n",
       "      <td>0.0585194</td>\n",
       "      <td>0.158803</td>\n",
       "      <td>0.0759229</td>\n",
       "      <td>0.119778</td>\n",
       "      <td>0.0561802</td>\n",
       "      <td>0.0190252</td>\n",
       "      <td>0.00866994</td>\n",
       "      <td>1.233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>24</td>\n",
       "      <td>469.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.696984</td>\n",
       "      <td>0.106455</td>\n",
       "      <td>0.0752098</td>\n",
       "      <td>0.160245</td>\n",
       "      <td>0.0776968</td>\n",
       "      <td>0.182455</td>\n",
       "      <td>0.0441467</td>\n",
       "      <td>0.0226451</td>\n",
       "      <td>0.00516366</td>\n",
       "      <td>1.371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>28</td>\n",
       "      <td>472.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.818837</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0915953</td>\n",
       "      <td>0.174323</td>\n",
       "      <td>0.055937</td>\n",
       "      <td>0.0619514</td>\n",
       "      <td>0.0265586</td>\n",
       "      <td>0.0612878</td>\n",
       "      <td>0.0205105</td>\n",
       "      <td>1.311</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>GE01</td>\n",
       "      <td>483.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>1.71173</td>\n",
       "      <td>0</td>\n",
       "      <td>0.120347</td>\n",
       "      <td>0.210731</td>\n",
       "      <td>0.0864201</td>\n",
       "      <td>0.0260724</td>\n",
       "      <td>0.0207969</td>\n",
       "      <td>0.0199069</td>\n",
       "      <td>0</td>\n",
       "      <td>2.196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>2016-047</td>\n",
       "      <td>June</td>\n",
       "      <td>38</td>\n",
       "      <td>486.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>6.43463</td>\n",
       "      <td>0</td>\n",
       "      <td>0.186531</td>\n",
       "      <td>0.432792</td>\n",
       "      <td>0.137378</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.196669</td>\n",
       "      <td>0</td>\n",
       "      <td>7.388</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Cruise  Date Station  Sample#   rep depth   Diatoms-1 Diatoms-2  \\\n",
       "48  2016-007  June      16    318.0  mean     0    0.176595         0   \n",
       "49  2016-007  June     OKE    329.0  mean     0    0.534565   1.30459   \n",
       "50  2016-007  June      22    333.0  mean     0    0.748735    2.2245   \n",
       "51  2016-047  June      14    438.0  mean     0     1.03723  0.402856   \n",
       "52  2016-047  June      11    449.0  mean     0     0.37708  0.165123   \n",
       "53  2016-047  June    CPF1    452.0  mean     0  0.00981477         0   \n",
       "54  2016-047  June    CPF2    455.0  mean     0    0.223128         0   \n",
       "55  2016-047  June      22    466.0  mean     0    0.612099  0.124002   \n",
       "56  2016-047  June      24    469.0  mean     0    0.696984  0.106455   \n",
       "57  2016-047  June      28    472.0  mean     0    0.818837         0   \n",
       "58  2016-047  June    GE01    483.0  mean     0     1.71173         0   \n",
       "59  2016-047  June      38    486.0  mean     0     6.43463         0   \n",
       "\n",
       "   Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes     Dictyo  \\\n",
       "48      0.197922     0.556587          0.129862    0.704256  0.0753036   \n",
       "49      0.310359      1.03751          0.435442     2.29758    2.14958   \n",
       "50     0.0510914     0.270685          0.417001    0.156895    3.49932   \n",
       "51     0.0826985     0.215331          0.117579    0.224789   0.037861   \n",
       "52      0.087552     0.113769          0.101932     0.26103  0.0118683   \n",
       "53      0.125556     0.143086         0.0616604    0.287119          0   \n",
       "54     0.0825152     0.324354          0.067458    0.251326  0.0179209   \n",
       "55     0.0585194     0.158803         0.0759229    0.119778  0.0561802   \n",
       "56     0.0752098     0.160245         0.0776968    0.182455  0.0441467   \n",
       "57     0.0915953     0.174323          0.055937   0.0619514  0.0265586   \n",
       "58      0.120347     0.210731         0.0864201   0.0260724  0.0207969   \n",
       "59      0.186531     0.432792          0.137378           0          0   \n",
       "\n",
       "      Raphido Cyanobacteria  TchlA  \n",
       "48  0.0859203     0.0115539  1.938  \n",
       "49   0.247075     0.0502969  8.367  \n",
       "50  0.0870553     0.0197189  7.475  \n",
       "51  0.0581045     0.0265498  2.203  \n",
       "52  0.0485396     0.0161065  1.183  \n",
       "53  0.0535119    0.00525178  0.686  \n",
       "54   0.125422      0.026876  1.119  \n",
       "55  0.0190252    0.00866994  1.233  \n",
       "56  0.0226451    0.00516366  1.371  \n",
       "57  0.0612878     0.0205105  1.311  \n",
       "58  0.0199069             0  2.196  \n",
       "59   0.196669             0  7.388  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# display part of the table, confirming that non-data rows have been removed\n",
    "dfPhyto[48:60]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n",
    "# of the phytoplankton columns; \n",
    "# these lines correct the column names, removing the '.1':\n",
    "renameDict=dict()\n",
    "for colName in dfPhyto.keys():\n",
    "    if colName.endswith('.1'):\n",
    "        renameDict[colName]=colName.split('.1')[0]\n",
    "dfPhyto.rename(columns=renameDict,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Cruise</th>\n",
       "      <th>Date</th>\n",
       "      <th>Station</th>\n",
       "      <th>Sample#</th>\n",
       "      <th>rep</th>\n",
       "      <th>depth</th>\n",
       "      <th>Diatoms-1</th>\n",
       "      <th>Diatoms-2</th>\n",
       "      <th>Prasinophytes</th>\n",
       "      <th>Cryptophytes</th>\n",
       "      <th>Dinoflagellates-1</th>\n",
       "      <th>Haptophytes</th>\n",
       "      <th>Dictyo</th>\n",
       "      <th>Raphido</th>\n",
       "      <th>Cyanobacteria</th>\n",
       "      <th>TchlA</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-005</td>\n",
       "      <td>April</td>\n",
       "      <td>59</td>\n",
       "      <td>23.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>4.50144</td>\n",
       "      <td>0.438107</td>\n",
       "      <td>0.048483</td>\n",
       "      <td>0.338791</td>\n",
       "      <td>0.0478202</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00362949</td>\n",
       "      <td>0.000129969</td>\n",
       "      <td>0.0086402</td>\n",
       "      <td>5.38704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016-005</td>\n",
       "      <td>April</td>\n",
       "      <td>102</td>\n",
       "      <td>37.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>3.55538</td>\n",
       "      <td>1.73326</td>\n",
       "      <td>0.385956</td>\n",
       "      <td>1.82211</td>\n",
       "      <td>0.139308</td>\n",
       "      <td>0.186364</td>\n",
       "      <td>0.0440193</td>\n",
       "      <td>0.0767997</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9432</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-005</td>\n",
       "      <td>April</td>\n",
       "      <td>75</td>\n",
       "      <td>51.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>14.7795</td>\n",
       "      <td>0.563364</td>\n",
       "      <td>0.397391</td>\n",
       "      <td>1.18036</td>\n",
       "      <td>0.075357</td>\n",
       "      <td>0.0941965</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>17.0901</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2016-005</td>\n",
       "      <td>April</td>\n",
       "      <td>72</td>\n",
       "      <td>64.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.923699</td>\n",
       "      <td>0.745832</td>\n",
       "      <td>0.337639</td>\n",
       "      <td>1.20012</td>\n",
       "      <td>0.0704119</td>\n",
       "      <td>0.0893556</td>\n",
       "      <td>0.00326812</td>\n",
       "      <td>0.0523156</td>\n",
       "      <td>0</td>\n",
       "      <td>3.42264</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-005</td>\n",
       "      <td>April</td>\n",
       "      <td>69</td>\n",
       "      <td>76.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>3.88279</td>\n",
       "      <td>0.290172</td>\n",
       "      <td>0.183778</td>\n",
       "      <td>0.531872</td>\n",
       "      <td>0.0478852</td>\n",
       "      <td>0.0609817</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4.99748</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>2016-071</td>\n",
       "      <td>November</td>\n",
       "      <td>9</td>\n",
       "      <td>250.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0142285</td>\n",
       "      <td>0.082281</td>\n",
       "      <td>0.0897016</td>\n",
       "      <td>0.0562649</td>\n",
       "      <td>0.000186116</td>\n",
       "      <td>0.00849243</td>\n",
       "      <td>0.017071</td>\n",
       "      <td>0.00322275</td>\n",
       "      <td>0.00355171</td>\n",
       "      <td>0.275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>2016-071</td>\n",
       "      <td>November</td>\n",
       "      <td>6</td>\n",
       "      <td>264.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00399836</td>\n",
       "      <td>0.104654</td>\n",
       "      <td>0.145557</td>\n",
       "      <td>0.109826</td>\n",
       "      <td>0.00047569</td>\n",
       "      <td>0.040834</td>\n",
       "      <td>0.0139237</td>\n",
       "      <td>0.00948663</td>\n",
       "      <td>0.00424517</td>\n",
       "      <td>0.433</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>2016-071</td>\n",
       "      <td>November</td>\n",
       "      <td>3</td>\n",
       "      <td>281.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0634224</td>\n",
       "      <td>0.0775545</td>\n",
       "      <td>0.0395764</td>\n",
       "      <td>0.0314152</td>\n",
       "      <td>0.0276703</td>\n",
       "      <td>0.0142296</td>\n",
       "      <td>0.0202238</td>\n",
       "      <td>0.00390783</td>\n",
       "      <td>0.278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>2016-071</td>\n",
       "      <td>November</td>\n",
       "      <td>2</td>\n",
       "      <td>297.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0636933</td>\n",
       "      <td>0.181767</td>\n",
       "      <td>0.274402</td>\n",
       "      <td>0.0402761</td>\n",
       "      <td>0.0463697</td>\n",
       "      <td>0.0196038</td>\n",
       "      <td>0.0268882</td>\n",
       "      <td>0</td>\n",
       "      <td>0.653</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>2016-071</td>\n",
       "      <td>November</td>\n",
       "      <td>27</td>\n",
       "      <td>314.0</td>\n",
       "      <td>mean</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00343804</td>\n",
       "      <td>0.112049</td>\n",
       "      <td>0.343388</td>\n",
       "      <td>0.400357</td>\n",
       "      <td>0.035914</td>\n",
       "      <td>0.0469057</td>\n",
       "      <td>0.00552154</td>\n",
       "      <td>0.0769891</td>\n",
       "      <td>0.00243813</td>\n",
       "      <td>1.027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>120 rows × 16 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Cruise      Date Station  Sample#   rep depth   Diatoms-1  Diatoms-2  \\\n",
       "0    2016-005     April      59     23.0  mean     0     4.50144   0.438107   \n",
       "1    2016-005     April     102     37.0  mean     0     3.55538    1.73326   \n",
       "2    2016-005     April      75     51.0  mean     0     14.7795   0.563364   \n",
       "3    2016-005     April      72     64.0  mean     0    0.923699   0.745832   \n",
       "4    2016-005     April      69     76.0  mean     0     3.88279   0.290172   \n",
       "..        ...       ...     ...      ...   ...   ...         ...        ...   \n",
       "115  2016-071  November       9    250.0  mean     0   0.0142285   0.082281   \n",
       "116  2016-071  November       6    264.0  mean     0  0.00399836   0.104654   \n",
       "117  2016-071  November       3    281.0  mean     0           0  0.0634224   \n",
       "118  2016-071  November       2    297.0  mean     0           0  0.0636933   \n",
       "119  2016-071  November      27    314.0  mean     0  0.00343804   0.112049   \n",
       "\n",
       "    Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes      Dictyo  \\\n",
       "0        0.048483     0.338791         0.0478202           0  0.00362949   \n",
       "1        0.385956      1.82211          0.139308    0.186364   0.0440193   \n",
       "2        0.397391      1.18036          0.075357   0.0941965           0   \n",
       "3        0.337639      1.20012         0.0704119   0.0893556  0.00326812   \n",
       "4        0.183778     0.531872         0.0478852   0.0609817           0   \n",
       "..            ...          ...               ...         ...         ...   \n",
       "115     0.0897016    0.0562649       0.000186116  0.00849243    0.017071   \n",
       "116      0.145557     0.109826        0.00047569    0.040834   0.0139237   \n",
       "117     0.0775545    0.0395764         0.0314152   0.0276703   0.0142296   \n",
       "118      0.181767     0.274402         0.0402761   0.0463697   0.0196038   \n",
       "119      0.343388     0.400357          0.035914   0.0469057  0.00552154   \n",
       "\n",
       "         Raphido Cyanobacteria    TchlA  \n",
       "0    0.000129969     0.0086402  5.38704  \n",
       "1      0.0767997             0   7.9432  \n",
       "2              0             0  17.0901  \n",
       "3      0.0523156             0  3.42264  \n",
       "4              0             0  4.99748  \n",
       "..           ...           ...      ...  \n",
       "115   0.00322275    0.00355171    0.275  \n",
       "116   0.00948663    0.00424517    0.433  \n",
       "117    0.0202238    0.00390783    0.278  \n",
       "118    0.0268882             0    0.653  \n",
       "119    0.0769891    0.00243813    1.027  \n",
       "\n",
       "[120 rows x 16 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfPhyto"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is the important step- join the two tables ('left' and 'right'), \n",
    "#  matching the cruise IDs and sample numbers\n",
    "#   how='outer'  -> all rows from both the left and the right tables will be included, \n",
    "#                   even if they cannot be matched; this makes it easy to check for \n",
    "#                   unmatched data later\n",
    "#   left_on  specifies the name of the column to match in the left table (dfbot) \n",
    "#   right_on specifies the name of the column to match in the right table (dfPhyto)\n",
    "dfout = pd.merge(dfbot, dfPhyto,  how='outer', \n",
    "                 left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
       "       'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',\n",
       "       'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'pH:SBE:Nominal',\n",
       "       'Temperature:Draw', 'Salinity:Bottle', 'Flag:Salinity:Bottle',\n",
       "       'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',\n",
       "       'Flag:Oxygen:Dissolved', 'Flag:Nitrate_plus_Nitrite', 'Flag:Silicate',\n",
       "       'Phosphate', 'Flag:Phosphate', 'Cruise', 'Number_of_bin_records',\n",
       "       'ADM:PROJECT', 'Pressure [decibar]',\n",
       "       'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
       "       'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
       "       'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]',\n",
       "       'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n",
       "       'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]',\n",
       "       'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n",
       "       'Silicate [umol/L]', 'Phosphate [umol/L]', 'Bottle_Number',\n",
       "       'Bottle:Firing_Sequence', 'Oxygen:Dissolved [mL/L]',\n",
       "       'Oxygen:Dissolved [umol/kg]', 'Date', 'Station', 'Sample#', 'rep',\n",
       "       'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',\n",
       "       'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
       "       'Cyanobacteria', 'TchlA'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show the column names in the resulting table\n",
    "dfout.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Checks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n",
    "dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n",
    "temp.columns = ['icount']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(temp.icount)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample#</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for Phyto samples matched to multiple bottle samples:\n",
    "temp.loc[temp.icount>1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample#</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for phyto samples not matched to bottle samples:\n",
    "temp.loc[temp.icount==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample_Number</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n",
    "temp2.columns = ['icount']\n",
    "# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n",
    "temp2.loc[temp2.icount>1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>icount</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise</th>\n",
       "      <th>Sample#</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [icount]\n",
       "Index: []"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for phyto samples not matched to bottle samples:\n",
    "temp.loc[temp.icount==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1420, 120, 1420)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if the output table is longer than either of the input tables, some columns were not matched\n",
    "len(dfout), len(dfPhyto), len(dfbot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(120, 120)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check that the number of cells with data in the 'Cyanobacteria' column is \n",
    "#  the same for the input and output tables to show that no rows are missing:\n",
    "np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Series([], Name: Sample#, dtype: float64)\n"
     ]
    }
   ],
   "source": [
    "# If there were data rows from the phytoplankton table that were not matched to \n",
    "#  rows from the bottle table, their indices from the phytoplankton table would be \n",
    "#  displayed below (the series [] would not be empty)\n",
    "print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop repetetive/unecessary columns:\n",
    "dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',\n",
    "                   'depth',],axis=1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# truncate phyto group values to 3 decimal places:\n",
    "for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n",
    "       'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
    "       'TchlA'):\n",
    "    dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         NaN\n",
       "1         NaN\n",
       "2         NaN\n",
       "3         NaN\n",
       "4         NaN\n",
       "        ...  \n",
       "1415      NaN\n",
       "1416      NaN\n",
       "1417      NaN\n",
       "1418    0.002\n",
       "1419      NaN\n",
       "Name: Cyanobacteria, Length: 1420, dtype: float64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfout['Cyanobacteria']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'Depth [metres]'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m   2897\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2898\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2899\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'Depth [metres]'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-34-98a40a30babe>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Check for unexpected depths\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdfout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Pressure [decibar]'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m|\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Depth [metres]'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m&\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Diatoms-1'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;31m#,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;31m#         ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   2904\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2905\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2906\u001b[0;31m             \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2907\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2908\u001b[0m                 \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m   2898\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2899\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2900\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2902\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mtolerance\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'Depth [metres]'"
     ]
    }
   ],
   "source": [
    "# Check for unexpected depths\n",
    "dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,\n",
    "#         ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now write the output table to a .csv file:\n",
    "dfout.to_csv(pathOut, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfout.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (py39)",
   "language": "python",
   "name": "py39"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}