{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np # this module handles arrays, but here we need it for its NaN value\n", "import pandas as pd # this module contains a lot of tools for handling tabular data\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# define paths to the source files and eventual output file\n", "#pathBottle='/ocean/eolson/MEOPAR/obs/Nina/All 2018 SoG bottle.xlsx'\n", "#pathPhyto='/ocean/eolson/MEOPAR/obs/Nina/2015-2018 Abs phyto groups.xlsx'\n", "pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2019 SoG bottle_revWithNew044.xlsx'\n", "pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'\n", "pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2019_NewALLO.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#formatting function to get year-(3digit) format\n", "def fmtCruise(istr):\n", " if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): \n", " sp=re.split('-',istr)\n", " rstr=sp[0]+'-0'+sp[1]\n", " elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):\n", " rstr=istr\n", " else:\n", " raise ValueError('Input had unexpected format:',istr)\n", " return rstr" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['2019-001', '2019-007', '2019-016', '2019-044', '2019-005', '2019-045', '2019-050', '2019-023', '2019-062']\n" ] } ], "source": [ "# get names of sheets in notebook\n", "with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:\n", " sheets=xl.sheet_names\n", "print(sheets)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2019-001\n", "Reading sheet 2019-007\n", "Reading sheet 2019-016\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/eolson/anaconda3/envs/py39/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:300: UserWarning: Unknown extension is not supported and will be removed\n", " warn(msg)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2019-044\n", "Reading sheet 2019-005\n", "Reading sheet 2019-045\n", "Reading sheet 2019-050\n", "Reading sheet 2019-023\n", "Reading sheet 2019-062\n" ] } ], "source": [ "# load each sheet in the bottle Excel file and concatenate them together into one table\n", "dfbotlist=list()\n", "for sheet in sheets:\n", " df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',\n", " na_values=(-99,-99.9)) # read each sheet; include additional na values\n", " df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name\n", " dfbotlist.append(df0) # append the sheet to a list\n", "dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "removed empty columns: {'Dimethylsulfoniopropionate_Total [nmol/L]', 'Flag:Dimethyl_Sulphide', 'Flag:Dimethylsulfoniopropionate_Total', 'Dimethyl_Sulphide [nmol/L]', 'Ammonium [umol/L]', 'Dimethylsulfoniopropionate_Dissolved [nmol/L]', 'Flag:Ammonium', 'Flag:Dimethylsulfoniopropionate_Dissolve'}\n" ] } ], "source": [ "# Drop columns with no data in them\n", "l1=set(dfbot.keys())\n", "dfbot.dropna(axis=1,how='all',inplace=True)\n", "print('removed empty columns:',l1-set(dfbot.keys()))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Chlorophyll:Extracted\n", "Chlorophyll:Extracted [mg/m^3]\n", "Comments by sample_numbeR\n", "Conductivity:Primary [S/m]\n", "Conductivity:Secondary\n", "Conductivity:Secondary [S/m]\n", "Cruise\n", "Depth\n", "Depth [metres]\n", "Depth:Nominal [metres]\n", "FIL:DATA DESCRIPTION\n", "FIL:START TIME YYYY/MM/DD HH:MM:SS\n", "File Name\n", "Fluorescence:URU:Seapoint\n", "Fluorescence:URU:Seapoint [mg/m^3]\n", "Fluorescence:URU:Wetlabs [mg/m^3]\n", "Nitrate_plus_Nitrite\n", "Nitrate_plus_Nitrite [umol/L]\n", "Number_of_bin_records\n", "Oxygen:Dissolved\n", "Oxygen:Dissolved [mL/L]\n", "Oxygen:Dissolved [umol/kg]\n", "Oxygen:Dissolved.1\n", "Oxygen:Dissolved:SBE\n", "Oxygen:Dissolved:SBE [mL/L]\n", "Oxygen:Dissolved:SBE [umol/kg]\n", "Oxygen:Dissolved:SBE.1\n", "PAR\n", "PAR [uE/m^2/sec]\n", "PAR:Reference\n", "PAR:Reference [uE/m^2/sec]\n", "Phaeo-Pigment:Extracted\n", "Phaeo-Pigment:Extracted [mg/m^3]\n", "Phosphate\n", "Phosphate [umol/L]\n", "Pressure\n", "Pressure [decibar]\n", "Salinity:Bottle\n", "Salinity:Bottle [PSS-78]\n", "Salinity:T0:C0 [PSS-78]\n", "Salinity:T1:C1\n", "Salinity:T1:C1 [PSS-78]\n", "Sample_Number\n", "Silicate\n", "Silicate [umol/L]\n", "Temperature:Draw\n", "Temperature:Draw [deg C (ITS90)]\n", "Temperature:Primary [deg C (ITS90)]\n", "Temperature:Secondary\n", "Temperature:Secondary [deg C (ITS90)]\n", "Transmissivity\n", "Transmissivity [*/metre]\n", "Transmissivity2 [*/metre]\n", "Transmissivity:Green [*/metre]\n", "YYYY/MM/DD HH:MM:SS\n", "Zone\n", "Zone.1\n", "pH:SBE:Nominal\n" ] } ], "source": [ "# list the column names in the resulting table\n", "for el in sorted(dfbot.keys()):\n", " if not el.startswith('ADM') and not el.startswith('Flag') and not el.startswith('LOC') \\\n", " and not el.startswith('Bottle'):\n", " print(el)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Oxygen:Dissolved:SBE [mL/L] 275.2\n", "Oxygen:Dissolved:SBE [umol/kg] 447.5\n", "Oxygen:Dissolved [mL/L] 9.571\n", "Oxygen:Dissolved [umol/kg] 419.6\n", "Flag:Oxygen:Dissolved 46.0\n", "Oxygen:Dissolved:SBE 9.19\n", "Oxygen:Dissolved:SBE.1 9.19\n", "Oxygen:Dissolved 8.924\n", "Oxygen:Dissolved.1 8.924\n" ] } ], "source": [ "for el in dfbot.keys():\n", " if 'Oxy' in el:\n", " print(el,np.max(dfbot[el]))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1104 1104 340 340\n" ] }, { "data": { "text/plain": [ "340" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with both primary and secondary temperature values\n", "print(np.sum(dfbot['Depth [metres]']>=0),\n", " np.sum(dfbot['Pressure [decibar]']>=0),\n", " np.sum(dfbot['Pressure']>=0),\n", " np.sum(dfbot['Depth']>=0))\n", "len(dfbot.loc[~(dfbot['Depth [metres]']>=0)&\\\n", " ~(dfbot['Pressure [decibar]']>=0)&\\\n", " (dfbot['Pressure']>=0)])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "726 378 340\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ZoneLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTADM:MISSIONLOC:STATIONADM:PROJECTYYYY/MM/DD HH:MM:SS...Temperature:DrawSalinity:BottleChlorophyll:ExtractedPhaeo-Pigment:ExtractedOxygen:DissolvedOxygen:Dissolved.1Nitrate_plus_NitriteSilicatePhosphateTransmissivity:Green [*/metre]
\n", "

0 rows × 77 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n", "Index: []\n", "\n", "[0 rows x 77 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with both primary and secondary temperature values\n", "print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),\n", " np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),\n", " np.sum(dfbot['Temperature:Secondary']>=0))\n", "dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Temperature:Secondary']>=0)]))>1]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "993 340 110\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ZoneLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTADM:MISSIONLOC:STATIONADM:PROJECTYYYY/MM/DD HH:MM:SS...Temperature:DrawSalinity:BottleChlorophyll:ExtractedPhaeo-Pigment:ExtractedOxygen:DissolvedOxygen:Dissolved.1Nitrate_plus_NitriteSilicatePhosphateTransmissivity:Green [*/metre]
\n", "

0 rows × 77 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n", "Index: []\n", "\n", "[0 rows x 77 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with both primary and secondary temperature values\n", "print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),\n", " np.sum(dfbot['Fluorescence:URU:Seapoint']>=0),\n", " np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))\n", "dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "726 378 340\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ZoneLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTADM:MISSIONLOC:STATIONADM:PROJECTYYYY/MM/DD HH:MM:SS...Temperature:DrawSalinity:BottleChlorophyll:ExtractedPhaeo-Pigment:ExtractedOxygen:DissolvedOxygen:Dissolved.1Nitrate_plus_NitriteSilicatePhosphateTransmissivity:Green [*/metre]
\n", "

0 rows × 77 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n", "Index: []\n", "\n", "[0 rows x 77 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with both both salinity fields\n", "print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),\n", " np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),\n", " np.sum(dfbot['Salinity:T1:C1']>=0))\n", "dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Salinity:T1:C1']>=0)]))>1]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1104 340\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ZoneLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTADM:MISSIONLOC:STATIONADM:PROJECTYYYY/MM/DD HH:MM:SS...Temperature:DrawSalinity:BottleChlorophyll:ExtractedPhaeo-Pigment:ExtractedOxygen:DissolvedOxygen:Dissolved.1Nitrate_plus_NitriteSilicatePhosphateTransmissivity:Green [*/metre]
\n", "

0 rows × 77 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n", "Index: []\n", "\n", "[0 rows x 77 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with both primary and secondary temperature values\n", "print(np.sum(dfbot['Pressure [decibar]']>=0),\n", " np.sum(dfbot['Pressure']>=0))\n", "dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\\\n", " (dfbot['Pressure']>=0)]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def subval(idf,colList):\n", " # first value in colList should be the column you are going to keep\n", " # follow with other columns that will be used to fill in when that column is NaN\n", " # in order of precedence\n", " if len(colList)==2:\n", " idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n", " else r[colList[1]] for i,r in idf.iterrows()]\n", " elif len(colList)==3:\n", " idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n", " else r[colList[1]] if not pd.isna(r[colList[1]]) \\\n", " else r[colList[2]] for i,r in idf.iterrows()]\n", " else:\n", " raise NotImplementedError('Add to code to handle this case')\n", " idf.drop(columns=list(colList[1:]),inplace=True)\n", " return idf" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# there are some duplicate columns here; handle them:\n", "dfbot=subval(dfbot,('Depth [metres]','Depth'))\n", "dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))\n", "dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',\n", " 'Temperature:Secondary [deg C (ITS90)]',\n", " 'Temperature:Secondary'))\n", "dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',\n", " 'Salinity:T1:C1 [PSS-78]',\n", " 'Salinity:T1:C1'))\n", "dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',\n", " 'Fluorescence:URU:Wetlabs [mg/m^3]',\n", " 'Fluorescence:URU:Seapoint'))\n", "dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))\n", "dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))\n", "dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',\n", " 'Oxygen:Dissolved:SBE',\n", " 'Oxygen:Dissolved:SBE.1'))\n", "dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1'))\n", "dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity2 [*/metre]','Transmissivity'))\n", "dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]', 'Nitrate_plus_Nitrite'))\n", "dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))\n", "dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))\n", "dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))\n", "dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))\n", "dfbot=subval(dfbot,('Conductivity:Primary [S/m]',\n", " 'Conductivity:Secondary [S/m]',\n", " 'Conductivity:Secondary'))\n", "dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))\n", "dfbot=subval(dfbot,('Zone', 'Zone.1'))\n", "dfbot=subval(dfbot,('Temperature:Draw [deg C (ITS90)]','Temperature:Draw'))\n", "dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))\n", "\n", "dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},\n", " inplace=True)\n", "dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},\n", " inplace=True)\n", "dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence:URU [mg/m^3]'},\n", " inplace=True)\n", "dfbot.rename(columns={'Conductivity:Primary [S/m]':'Conductivity [S/m]'},\n", " inplace=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# define a function that will be applied to the values in the index column;\n", "# this makes it easier to drop non-data rows later\n", "def convertIndex(val):\n", " try:\n", " x =int(val)\n", " except ValueError:\n", " x=np.nan\n", " return x" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2019\n" ] } ], "source": [ "# load the phytoplankton data with the following options:\n", "# sheet_name='???? CHEMTAX abs results' -> choose the sheet\n", "# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet\n", "# skiprows=2 -> start reading at the 3rd row of the sheet, \n", "# which contains the column headings\n", "# converters={'Index': convertIndex,} -> apply the function defined above to the Index column\n", "# verbose = True -> print extra information/ warnings/ errors\n", "dfPhyto=pd.read_excel(pathPhyto,sheet_name='2019',engine='openpyxl',\n", " skiprows=0,converters={'Sample#': convertIndex,},\n", " verbose=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CruiseDateStationSample#repdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
482019-0162019-04-11 00:00:00102318.0mean01.818610.2541110.02103250.1068420.05268730.22264400.003691540.01171182.49133
492019-0162019-04-11 00:00:0075332.0mean01.661030.1455490.01618150.1135340.05437280.23678500.009869790.007492112.24481
502019-0162019-04-11 00:00:0072345.0mean01.692530.06070430.02738420.209950.1070620.2359140.01712220.0193430.005435822.37545
512019-0162019-04-11 00:00:0069357.0mean01.208150.1899210.02377310.2198370.06971940.27559900.01288930.004300022.00419
522019-0162019-04-11 00:00:00ADCP367.0mean00.2771470.03764730.01820110.2220860.02077150.05088630.00271470.005334820.003128480.637917
532019-0162019-04-11 00:00:0065377.0mean00.912550.03741690.02956070.2260870.0469380.2320940.006586760.01169960.005121611.50806
542019-0162019-04-11 00:00:0063380.0mean00.7042810.02661050.03176020.2545170.05259360.2079930.001729550.00595130.005521831.29096
552019-0162019-04-11 00:00:0062391.0mean00.521770.08005260.01186010.1177730.01964370.06269850.011508900.00460340.82991
562019-0162019-04-12 00:00:00SC04394.0mean03.58720.13906100.1121510.02948170.0468341000.01522133.92995
57CruiseDateStationNaNrepdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
582019-0442019-04-29 00:00:00GEO19.0mean05.625570.4098930.38740.2415050.03969820.1555950.0700280.1172730.03901357.08597
592019-0442019-04-30 00:00:003910.0mean08.911750.8036370.2389990.5234770.235220.1530880.1200470.100953011.0872
\n", "
" ], "text/plain": [ " Cruise Date Station Sample# rep depth Diatoms-1 \\\n", "48 2019-016 2019-04-11 00:00:00 102 318.0 mean 0 1.81861 \n", "49 2019-016 2019-04-11 00:00:00 75 332.0 mean 0 1.66103 \n", "50 2019-016 2019-04-11 00:00:00 72 345.0 mean 0 1.69253 \n", "51 2019-016 2019-04-11 00:00:00 69 357.0 mean 0 1.20815 \n", "52 2019-016 2019-04-11 00:00:00 ADCP 367.0 mean 0 0.277147 \n", "53 2019-016 2019-04-11 00:00:00 65 377.0 mean 0 0.91255 \n", "54 2019-016 2019-04-11 00:00:00 63 380.0 mean 0 0.704281 \n", "55 2019-016 2019-04-11 00:00:00 62 391.0 mean 0 0.52177 \n", "56 2019-016 2019-04-12 00:00:00 SC04 394.0 mean 0 3.5872 \n", "57 Cruise Date Station NaN rep depth Diatoms-1 \n", "58 2019-044 2019-04-29 00:00:00 GEO1 9.0 mean 0 5.62557 \n", "59 2019-044 2019-04-30 00:00:00 39 10.0 mean 0 8.91175 \n", "\n", " Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes \\\n", "48 0.254111 0.0210325 0.106842 0.0526873 0.222644 \n", "49 0.145549 0.0161815 0.113534 0.0543728 0.236785 \n", "50 0.0607043 0.0273842 0.20995 0.107062 0.235914 \n", "51 0.189921 0.0237731 0.219837 0.0697194 0.275599 \n", "52 0.0376473 0.0182011 0.222086 0.0207715 0.0508863 \n", "53 0.0374169 0.0295607 0.226087 0.046938 0.232094 \n", "54 0.0266105 0.0317602 0.254517 0.0525936 0.207993 \n", "55 0.0800526 0.0118601 0.117773 0.0196437 0.0626985 \n", "56 0.139061 0 0.112151 0.0294817 0.0468341 \n", "57 Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes \n", "58 0.409893 0.3874 0.241505 0.0396982 0.155595 \n", "59 0.803637 0.238999 0.523477 0.23522 0.153088 \n", "\n", " Dictyo Raphido Cyanobacteria TchlA \n", "48 0 0.00369154 0.0117118 2.49133 \n", "49 0 0.00986979 0.00749211 2.24481 \n", "50 0.0171222 0.019343 0.00543582 2.37545 \n", "51 0 0.0128893 0.00430002 2.00419 \n", "52 0.0027147 0.00533482 0.00312848 0.637917 \n", "53 0.00658676 0.0116996 0.00512161 1.50806 \n", "54 0.00172955 0.0059513 0.00552183 1.29096 \n", "55 0.0115089 0 0.0046034 0.82991 \n", "56 0 0 0.0152213 3.92995 \n", "57 Dictyo Raphido Cyanobacteria TchlA \n", "58 0.070028 0.117273 0.0390135 7.08597 \n", "59 0.120047 0.100953 0 11.0872 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display rows 48 to 59 of the resulting table\n", "dfPhyto[48:60]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# now, drop any rows from the table that have NaN values in either of the columns\n", "# 'Index' or 'TchlA (ug/L)'\n", "# This is why we applied a function to the Index column to make sure all \n", "# non-numeric Index values would have a consistent NaN entry, making them easy to identify\n", "# and remove\n", "dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# pandas creates its own index, and after dropping rows I like to reset it -\n", "# this is just for convenience\n", "dfPhyto.reset_index(drop=True,inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# apply formatting function all rows in Cruise column to get year-3digit format\n", "dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CruiseDateStationSample#repdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
482019-0162019-04-11 00:00:0075332.0mean01.661030.1455490.01618150.1135340.05437280.23678500.009869790.007492112.24481
492019-0162019-04-11 00:00:0072345.0mean01.692530.06070430.02738420.209950.1070620.2359140.01712220.0193430.005435822.37545
502019-0162019-04-11 00:00:0069357.0mean01.208150.1899210.02377310.2198370.06971940.27559900.01288930.004300022.00419
512019-0162019-04-11 00:00:00ADCP367.0mean00.2771470.03764730.01820110.2220860.02077150.05088630.00271470.005334820.003128480.637917
522019-0162019-04-11 00:00:0065377.0mean00.912550.03741690.02956070.2260870.0469380.2320940.006586760.01169960.005121611.50806
532019-0162019-04-11 00:00:0063380.0mean00.7042810.02661050.03176020.2545170.05259360.2079930.001729550.00595130.005521831.29096
542019-0162019-04-11 00:00:0062391.0mean00.521770.08005260.01186010.1177730.01964370.06269850.011508900.00460340.82991
552019-0162019-04-12 00:00:00SC04394.0mean03.58720.13906100.1121510.02948170.0468341000.01522133.92995
562019-0442019-04-29 00:00:00GEO19.0mean05.625570.4098930.38740.2415050.03969820.1555950.0700280.1172730.03901357.08597
572019-0442019-04-30 00:00:003910.0mean08.911750.8036370.2389990.5234770.235220.1530880.1200470.100953011.0872
582019-0442019-04-30 00:00:002711.0mean012.12511.065350.1686720.288090.002467240.2406020.09526870.2519480.039394814.2769
592019-0442019-04-30 00:00:002212.0mean010.95180.6814240.1743550.5730970.08554250000.06274812.529
\n", "
" ], "text/plain": [ " Cruise Date Station Sample# rep depth Diatoms-1 \\\n", "48 2019-016 2019-04-11 00:00:00 75 332.0 mean 0 1.66103 \n", "49 2019-016 2019-04-11 00:00:00 72 345.0 mean 0 1.69253 \n", "50 2019-016 2019-04-11 00:00:00 69 357.0 mean 0 1.20815 \n", "51 2019-016 2019-04-11 00:00:00 ADCP 367.0 mean 0 0.277147 \n", "52 2019-016 2019-04-11 00:00:00 65 377.0 mean 0 0.91255 \n", "53 2019-016 2019-04-11 00:00:00 63 380.0 mean 0 0.704281 \n", "54 2019-016 2019-04-11 00:00:00 62 391.0 mean 0 0.52177 \n", "55 2019-016 2019-04-12 00:00:00 SC04 394.0 mean 0 3.5872 \n", "56 2019-044 2019-04-29 00:00:00 GEO1 9.0 mean 0 5.62557 \n", "57 2019-044 2019-04-30 00:00:00 39 10.0 mean 0 8.91175 \n", "58 2019-044 2019-04-30 00:00:00 27 11.0 mean 0 12.1251 \n", "59 2019-044 2019-04-30 00:00:00 22 12.0 mean 0 10.9518 \n", "\n", " Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes \\\n", "48 0.145549 0.0161815 0.113534 0.0543728 0.236785 \n", "49 0.0607043 0.0273842 0.20995 0.107062 0.235914 \n", "50 0.189921 0.0237731 0.219837 0.0697194 0.275599 \n", "51 0.0376473 0.0182011 0.222086 0.0207715 0.0508863 \n", "52 0.0374169 0.0295607 0.226087 0.046938 0.232094 \n", "53 0.0266105 0.0317602 0.254517 0.0525936 0.207993 \n", "54 0.0800526 0.0118601 0.117773 0.0196437 0.0626985 \n", "55 0.139061 0 0.112151 0.0294817 0.0468341 \n", "56 0.409893 0.3874 0.241505 0.0396982 0.155595 \n", "57 0.803637 0.238999 0.523477 0.23522 0.153088 \n", "58 1.06535 0.168672 0.28809 0.00246724 0.240602 \n", "59 0.681424 0.174355 0.573097 0.0855425 0 \n", "\n", " Dictyo Raphido Cyanobacteria TchlA \n", "48 0 0.00986979 0.00749211 2.24481 \n", "49 0.0171222 0.019343 0.00543582 2.37545 \n", "50 0 0.0128893 0.00430002 2.00419 \n", "51 0.0027147 0.00533482 0.00312848 0.637917 \n", "52 0.00658676 0.0116996 0.00512161 1.50806 \n", "53 0.00172955 0.0059513 0.00552183 1.29096 \n", "54 0.0115089 0 0.0046034 0.82991 \n", "55 0 0 0.0152213 3.92995 \n", "56 0.070028 0.117273 0.0390135 7.08597 \n", "57 0.120047 0.100953 0 11.0872 \n", "58 0.0952687 0.251948 0.0393948 14.2769 \n", "59 0 0 0.062748 12.529 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display part of the table, confirming that non-data rows have been removed\n", "dfPhyto[48:60]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n", "# of the phytoplankton columns; \n", "# these lines correct the column names, removing the '.1':\n", "renameDict=dict()\n", "for colName in dfPhyto.keys():\n", " if colName.endswith('.1'):\n", " renameDict[colName]=colName.split('.1')[0]\n", "dfPhyto.rename(columns=renameDict,inplace=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CruiseDateStationSample#repdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
02019-0012019-02-20 00:00:0014729.0mean00.03311460.1016550.1238360.31170.008762110.14620.02253880.02341230.000909720.772129
12019-0012019-02-20 00:00:0012732.0mean00.1564160.176660.1125190.2375140.01278720.1799820.02569410.01653420.0009554740.919063
22019-0012019-02-20 00:00:0022743.0mean00.3563740.3403040.1250080.3157790.01325820.076920.02033370.03626380.002785541.28703
32019-0012019-02-20 00:00:00CPF2746.0mean00.06066560.1160130.1354160.3320630.0076420.1598740.02608620.02746730.001376430.866603
42019-0012019-02-20 00:00:00CPF1749.0mean00.4048520.413670.1316830.1977870.01286830.05821330.01588650.03753230.005855061.27835
...................................................
1712019-0622019-10-05 00:00:00IND0349.0mean09.26450.01114260.3444240.2917750.1089780.1091220.09272350.0746044010.2973
1722019-0622019-10-05 00:00:00IND2361.0mean06.24590.180380.3770890.2377360.06885380.1129860.07238970.04786860.02628377.36948
1732019-0622019-10-05 00:00:00IND4375.0mean08.5743600.6179250.2249240.1477930.2547750.04110090.1415330.067078610.0695
1742019-0622019-10-05 00:00:00IND7389.0mean07.1725100.6512714.732320.3308530.01664830.03341970.1522750.16940113.2587
1752019-0622019-10-05 00:00:00VAN5395.0mean01.975410.07067650.345760.1796190.1159750.02826040.03357610.05993090.009477092.81868
\n", "

176 rows × 16 columns

\n", "
" ], "text/plain": [ " Cruise Date Station Sample# rep depth Diatoms-1 \\\n", "0 2019-001 2019-02-20 00:00:00 14 729.0 mean 0 0.0331146 \n", "1 2019-001 2019-02-20 00:00:00 12 732.0 mean 0 0.156416 \n", "2 2019-001 2019-02-20 00:00:00 22 743.0 mean 0 0.356374 \n", "3 2019-001 2019-02-20 00:00:00 CPF2 746.0 mean 0 0.0606656 \n", "4 2019-001 2019-02-20 00:00:00 CPF1 749.0 mean 0 0.404852 \n", ".. ... ... ... ... ... ... ... \n", "171 2019-062 2019-10-05 00:00:00 IND0 349.0 mean 0 9.2645 \n", "172 2019-062 2019-10-05 00:00:00 IND2 361.0 mean 0 6.2459 \n", "173 2019-062 2019-10-05 00:00:00 IND4 375.0 mean 0 8.57436 \n", "174 2019-062 2019-10-05 00:00:00 IND7 389.0 mean 0 7.17251 \n", "175 2019-062 2019-10-05 00:00:00 VAN5 395.0 mean 0 1.97541 \n", "\n", " Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes \\\n", "0 0.101655 0.123836 0.3117 0.00876211 0.1462 \n", "1 0.17666 0.112519 0.237514 0.0127872 0.179982 \n", "2 0.340304 0.125008 0.315779 0.0132582 0.07692 \n", "3 0.116013 0.135416 0.332063 0.007642 0.159874 \n", "4 0.41367 0.131683 0.197787 0.0128683 0.0582133 \n", ".. ... ... ... ... ... \n", "171 0.0111426 0.344424 0.291775 0.108978 0.109122 \n", "172 0.18038 0.377089 0.237736 0.0688538 0.112986 \n", "173 0 0.617925 0.224924 0.147793 0.254775 \n", "174 0 0.651271 4.73232 0.330853 0.0166483 \n", "175 0.0706765 0.34576 0.179619 0.115975 0.0282604 \n", "\n", " Dictyo Raphido Cyanobacteria TchlA \n", "0 0.0225388 0.0234123 0.00090972 0.772129 \n", "1 0.0256941 0.0165342 0.000955474 0.919063 \n", "2 0.0203337 0.0362638 0.00278554 1.28703 \n", "3 0.0260862 0.0274673 0.00137643 0.866603 \n", "4 0.0158865 0.0375323 0.00585506 1.27835 \n", ".. ... ... ... ... \n", "171 0.0927235 0.0746044 0 10.2973 \n", "172 0.0723897 0.0478686 0.0262837 7.36948 \n", "173 0.0411009 0.141533 0.0670786 10.0695 \n", "174 0.0334197 0.152275 0.169401 13.2587 \n", "175 0.0335761 0.0599309 0.00947709 2.81868 \n", "\n", "[176 rows x 16 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfPhyto" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# This is the important step- join the two tables ('left' and 'right'), \n", "# matching the cruise IDs and sample numbers\n", "# how='outer' -> all rows from both the left and the right tables will be included, \n", "# even if they cannot be matched; this makes it easy to check for \n", "# unmatched data later\n", "# left_on specifies the name of the column to match in the left table (dfbot) \n", "# right_on specifies the name of the column to match in the right table (dfPhyto)\n", "dfout = pd.merge(dfbot, dfPhyto, how='outer', \n", " left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n", "dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE',\n", " 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION',\n", " 'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence',\n", " 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',\n", " 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]',\n", " 'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]',\n", " 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n", " 'Conductivity [S/m]', 'Number_of_bin_records',\n", " 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n", " 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n", " 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',\n", " 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',\n", " 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n", " 'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]',\n", " 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n", " 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',\n", " 'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',\n", " 'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA',\n", " 'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Date',\n", " 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2',\n", " 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes',\n", " 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'],\n", " dtype='object')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the column names in the resulting table\n", "dfout.keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checks" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n", "temp.columns = ['icount']" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.unique(temp.icount)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
2019-062349.02
361.02
375.02
389.02
395.02
\n", "
" ], "text/plain": [ " icount\n", "Cruise Sample# \n", "2019-062 349.0 2\n", " 361.0 2\n", " 375.0 2\n", " 389.0 2\n", " 395.0 2" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for Phyto samples matched to multiple bottle samples:\n", "temp.loc[temp.icount>1]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for phyto samples not matched to bottle samples:\n", "temp.loc[temp.icount==0]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample_Number
2019-0623492
3612
3752
3892
3952
\n", "
" ], "text/plain": [ " icount\n", "Cruise Sample_Number \n", "2019-062 349 2\n", " 361 2\n", " 375 2\n", " 389 2\n", " 395 2" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n", "temp2.columns = ['icount']\n", "# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n", "temp2.loc[temp2.icount>1]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for phyto samples not matched to bottle samples:\n", "temp.loc[temp.icount==0]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1449, 176, 1444)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if the output table is longer than either of the input tables, some columns were not matched\n", "len(dfout), len(dfPhyto), len(dfbot)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(176, 176)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check that the number of cells with data in the 'Cyanobacteria' column is \n", "# the same for the input and output tables to show that no rows are missing:\n", "np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Empty DataFrame\n", "Columns: [Cruise, Sample#]\n", "Index: []\n" ] } ], "source": [ "# If there were data rows from the phytoplankton table that were not matched to \n", "# rows from the bottle table, their indices from the phytoplankton table would be \n", "# displayed below (the series [] would not be empty)\n", "print(dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Cruise','Sample#']])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sample#CruiseDiatoms-1Prasinophytes
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Sample#, Cruise, Diatoms-1, Prasinophytes]\n", "Index: []" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Sample#','Cruise','Diatoms-1','Prasinophytes']]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# drop repetetive/unecessary columns:\n", "dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',\n", " 'depth',],axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# truncate phyto group values to 3 decimal places:\n", "for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n", " 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n", " 'TchlA'):\n", " dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ZoneLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTADM:MISSIONLOC:STATIONADM:PROJECTBottle_Number...Diatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
\n", "

0 rows × 60 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU [mg/m^3], PAR [uE/m^2/sec], Salinity [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, File Name, LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Transmissivity:Green [*/metre], Diatoms-1, Diatoms-2, Prasinophytes, Cryptophytes, Dinoflagellates-1, Haptophytes, Dictyo, Raphido, Cyanobacteria, TchlA]\n", "Index: []\n", "\n", "[0 rows x 60 columns]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check for unexpected depths\n", "dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,\n", "# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 NaN\n", "1 NaN\n", "2 0.001\n", "3 NaN\n", "4 NaN\n", " ... \n", "1444 NaN\n", "1445 NaN\n", "1446 NaN\n", "1447 0.002\n", "1448 0.009\n", "Name: Cyanobacteria, Length: 1449, dtype: float64" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout['Cyanobacteria']" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# now write the output table to a .csv file:\n", "dfout.to_csv(pathOut, index=False) " ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE',\n", " 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION',\n", " 'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence',\n", " 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',\n", " 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]',\n", " 'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]',\n", " 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n", " 'Conductivity [S/m]', 'Number_of_bin_records',\n", " 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n", " 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n", " 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',\n", " 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',\n", " 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n", " 'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]',\n", " 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n", " 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',\n", " 'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',\n", " 'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA',\n", " 'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Diatoms-1',\n", " 'Diatoms-2', 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates-1',\n", " 'Haptophytes', 'Dictyo', 'Raphido', 'Cyanobacteria', 'TchlA'],\n", " dtype='object')" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout.keys()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (py39)", "language": "python", "name": "py39" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 4 }