{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np # this module handles arrays, but here we need it for its NaN value\n",
"import pandas as pd # this module contains a lot of tools for handling tabular data\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# define paths to the source files and eventual output file\n",
"#pathBottle='/ocean/eolson/MEOPAR/obs/Nina/All 2018 SoG bottle.xlsx'\n",
"#pathPhyto='/ocean/eolson/MEOPAR/obs/Nina/2015-2018 Abs phyto groups.xlsx'\n",
"pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2019 SoG bottle_revWithNew044.xlsx'\n",
"pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2019 Abs phyto groups.xlsx'\n",
"\n",
"pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2019.csv'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#formatting function to get year-(3digit) format\n",
"def fmtCruise(istr):\n",
" if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): \n",
" sp=re.split('-',istr)\n",
" rstr=sp[0]+'-0'+sp[1]\n",
" elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):\n",
" rstr=istr\n",
" else:\n",
" raise ValueError('Input had unexpected format:',istr)\n",
" return rstr"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['2019-001', '2019-007', '2019-016', '2019-044', '2019-005', '2019-045', '2019-050', '2019-023', '2019-062']\n"
]
}
],
"source": [
"# get names of sheets in notebook\n",
"with pd.ExcelFile(pathBottle) as xl:\n",
" sheets=xl.sheet_names\n",
"print(sheets)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading sheet 2019-001\n",
"Reading sheet 2019-007\n",
"Reading sheet 2019-016\n",
"Reading sheet 2019-044\n",
"Reading sheet 2019-005\n",
"Reading sheet 2019-045\n",
"Reading sheet 2019-050\n",
"Reading sheet 2019-023\n",
"Reading sheet 2019-062\n"
]
}
],
"source": [
"# load each sheet in the bottle Excel file and concatenate them together into one table\n",
"dfbotlist=list()\n",
"for sheet in sheets:\n",
" df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,\n",
" na_values=(-99,-99.9)) # read each sheet; include additional na values\n",
" df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name\n",
" dfbotlist.append(df0) # append the sheet to a list\n",
"dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"removed empty columns: {'Flag:Dimethylsulfoniopropionate_Total', 'Dimethyl_Sulphide [nmol/L]', 'Flag:Dimethyl_Sulphide', 'Flag:Dimethylsulfoniopropionate_Dissolve', 'Dimethylsulfoniopropionate_Total [nmol/L]', 'Ammonium [umol/L]', 'Dimethylsulfoniopropionate_Dissolved [nmol/L]', 'Flag:Ammonium'}\n"
]
}
],
"source": [
"# Drop columns with no data in them\n",
"l1=set(dfbot.keys())\n",
"dfbot.dropna(axis=1,how='all',inplace=True)\n",
"print('removed empty columns:',l1-set(dfbot.keys()))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chlorophyll:Extracted\n",
"Chlorophyll:Extracted [mg/m^3]\n",
"Comments by sample_numbeR\n",
"Conductivity:Primary [S/m]\n",
"Conductivity:Secondary\n",
"Conductivity:Secondary [S/m]\n",
"Cruise\n",
"Depth\n",
"Depth [metres]\n",
"Depth:Nominal [metres]\n",
"FIL:DATA DESCRIPTION\n",
"FIL:START TIME YYYY/MM/DD HH:MM:SS\n",
"File Name\n",
"Fluorescence:URU:Seapoint\n",
"Fluorescence:URU:Seapoint [mg/m^3]\n",
"Fluorescence:URU:Wetlabs [mg/m^3]\n",
"Nitrate_plus_Nitrite\n",
"Nitrate_plus_Nitrite [umol/L]\n",
"Number_of_bin_records\n",
"Oxygen:Dissolved\n",
"Oxygen:Dissolved [mL/L]\n",
"Oxygen:Dissolved [umol/kg]\n",
"Oxygen:Dissolved.1\n",
"Oxygen:Dissolved:SBE\n",
"Oxygen:Dissolved:SBE [mL/L]\n",
"Oxygen:Dissolved:SBE [umol/kg]\n",
"Oxygen:Dissolved:SBE.1\n",
"PAR\n",
"PAR [uE/m^2/sec]\n",
"PAR:Reference\n",
"PAR:Reference [uE/m^2/sec]\n",
"Phaeo-Pigment:Extracted\n",
"Phaeo-Pigment:Extracted [mg/m^3]\n",
"Phosphate\n",
"Phosphate [umol/L]\n",
"Pressure\n",
"Pressure [decibar]\n",
"Salinity:Bottle\n",
"Salinity:Bottle [PSS-78]\n",
"Salinity:T0:C0 [PSS-78]\n",
"Salinity:T1:C1\n",
"Salinity:T1:C1 [PSS-78]\n",
"Sample_Number\n",
"Silicate\n",
"Silicate [umol/L]\n",
"Temperature:Draw\n",
"Temperature:Draw [deg C (ITS90)]\n",
"Temperature:Primary [deg C (ITS90)]\n",
"Temperature:Secondary\n",
"Temperature:Secondary [deg C (ITS90)]\n",
"Transmissivity\n",
"Transmissivity [*/metre]\n",
"Transmissivity2 [*/metre]\n",
"Transmissivity:Green [*/metre]\n",
"YYYY/MM/DD HH:MM:SS\n",
"Zone\n",
"Zone.1\n",
"pH:SBE:Nominal\n"
]
}
],
"source": [
"# list the column names in the resulting table\n",
"for el in sorted(dfbot.keys()):\n",
" if not el.startswith('ADM') and not el.startswith('Flag') and not el.startswith('LOC') \\\n",
" and not el.startswith('Bottle'):\n",
" print(el)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Oxygen:Dissolved:SBE [mL/L] 275.2\n",
"Oxygen:Dissolved:SBE [umol/kg] 447.5\n",
"Oxygen:Dissolved [mL/L] 9.571\n",
"Oxygen:Dissolved [umol/kg] 419.6\n",
"Flag:Oxygen:Dissolved 46.0\n",
"Oxygen:Dissolved:SBE 9.19\n",
"Oxygen:Dissolved:SBE.1 9.19\n",
"Oxygen:Dissolved 8.924\n",
"Oxygen:Dissolved.1 8.924\n"
]
}
],
"source": [
"for el in dfbot.keys():\n",
" if 'Oxy' in el:\n",
" print(el,np.max(dfbot[el]))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1104 1104 340 340\n"
]
},
{
"data": {
"text/plain": [
"340"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Depth [metres]']>=0),\n",
" np.sum(dfbot['Pressure [decibar]']>=0),\n",
" np.sum(dfbot['Pressure']>=0),\n",
" np.sum(dfbot['Depth']>=0))\n",
"len(dfbot.loc[~(dfbot['Depth [metres]']>=0)&\\\n",
" ~(dfbot['Pressure [decibar]']>=0)&\\\n",
" (dfbot['Pressure']>=0)])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"726 378 340\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" YYYY/MM/DD HH:MM:SS | \n",
" ... | \n",
" Temperature:Draw | \n",
" Salinity:Bottle | \n",
" Chlorophyll:Extracted | \n",
" Phaeo-Pigment:Extracted | \n",
" Oxygen:Dissolved | \n",
" Oxygen:Dissolved.1 | \n",
" Nitrate_plus_Nitrite | \n",
" Silicate | \n",
" Phosphate | \n",
" Transmissivity:Green [*/metre] | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 77 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n",
"Index: []\n",
"\n",
"[0 rows x 77 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),\n",
" np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),\n",
" np.sum(dfbot['Temperature:Secondary']>=0))\n",
"dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Temperature:Secondary']>=0)]))>1]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"993 340 110\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" YYYY/MM/DD HH:MM:SS | \n",
" ... | \n",
" Temperature:Draw | \n",
" Salinity:Bottle | \n",
" Chlorophyll:Extracted | \n",
" Phaeo-Pigment:Extracted | \n",
" Oxygen:Dissolved | \n",
" Oxygen:Dissolved.1 | \n",
" Nitrate_plus_Nitrite | \n",
" Silicate | \n",
" Phosphate | \n",
" Transmissivity:Green [*/metre] | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 77 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n",
"Index: []\n",
"\n",
"[0 rows x 77 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),\n",
" np.sum(dfbot['Fluorescence:URU:Seapoint']>=0),\n",
" np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))\n",
"dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"726 378 340\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" YYYY/MM/DD HH:MM:SS | \n",
" ... | \n",
" Temperature:Draw | \n",
" Salinity:Bottle | \n",
" Chlorophyll:Extracted | \n",
" Phaeo-Pigment:Extracted | \n",
" Oxygen:Dissolved | \n",
" Oxygen:Dissolved.1 | \n",
" Nitrate_plus_Nitrite | \n",
" Silicate | \n",
" Phosphate | \n",
" Transmissivity:Green [*/metre] | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 77 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n",
"Index: []\n",
"\n",
"[0 rows x 77 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both both salinity fields\n",
"print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),\n",
" np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),\n",
" np.sum(dfbot['Salinity:T1:C1']>=0))\n",
"dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Salinity:T1:C1']>=0)]))>1]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1104 340\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" YYYY/MM/DD HH:MM:SS | \n",
" ... | \n",
" Temperature:Draw | \n",
" Salinity:Bottle | \n",
" Chlorophyll:Extracted | \n",
" Phaeo-Pigment:Extracted | \n",
" Oxygen:Dissolved | \n",
" Oxygen:Dissolved.1 | \n",
" Nitrate_plus_Nitrite | \n",
" Silicate | \n",
" Phosphate | \n",
" Transmissivity:Green [*/metre] | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 77 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, YYYY/MM/DD HH:MM:SS, Bottle_Number, Bottle:Firing_Sequence, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Conductivity:Primary [S/m], Number_of_bin_records, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Fluorescence:URU:Wetlabs [mg/m^3], Depth:Nominal [metres], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Temperature:Secondary [deg C (ITS90)], Conductivity:Secondary [S/m], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T1:C1 [PSS-78], File Name, Zone.1, Transmissivity2 [*/metre], LOC:ALTIMETER (M), ADM:PLATFORM, LOC:GEOGRAPHIC AREA, FIL:DATA DESCRIPTION, Pressure, Depth, Temperature:Secondary, Conductivity:Secondary, Transmissivity, Fluorescence:URU:Seapoint, PAR, PAR:Reference, Salinity:T1:C1, Oxygen:Dissolved:SBE, Oxygen:Dissolved:SBE.1, Temperature:Draw, Salinity:Bottle, Chlorophyll:Extracted, Phaeo-Pigment:Extracted, Oxygen:Dissolved, Oxygen:Dissolved.1, Nitrate_plus_Nitrite, Silicate, Phosphate, Transmissivity:Green [*/metre]]\n",
"Index: []\n",
"\n",
"[0 rows x 77 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Pressure [decibar]']>=0),\n",
" np.sum(dfbot['Pressure']>=0))\n",
"dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\\\n",
" (dfbot['Pressure']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def subval(idf,colList):\n",
" # first value in colList should be the column you are going to keep\n",
" # follow with other columns that will be used to fill in when that column is NaN\n",
" # in order of precedence\n",
" if len(colList)==2:\n",
" idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
" else r[colList[1]] for i,r in idf.iterrows()]\n",
" elif len(colList)==3:\n",
" idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
" else r[colList[1]] if not pd.isna(r[colList[1]]) \\\n",
" else r[colList[2]] for i,r in idf.iterrows()]\n",
" else:\n",
" raise NotImplementedError('Add to code to handle this case')\n",
" idf.drop(columns=list(colList[1:]),inplace=True)\n",
" return idf"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# there are some duplicate columns here; handle them:\n",
"dfbot=subval(dfbot,('Depth [metres]','Depth'))\n",
"dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))\n",
"dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',\n",
" 'Temperature:Secondary [deg C (ITS90)]',\n",
" 'Temperature:Secondary'))\n",
"dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',\n",
" 'Salinity:T1:C1 [PSS-78]',\n",
" 'Salinity:T1:C1'))\n",
"dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',\n",
" 'Fluorescence:URU:Wetlabs [mg/m^3]',\n",
" 'Fluorescence:URU:Seapoint'))\n",
"dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))\n",
"dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))\n",
"dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',\n",
" 'Oxygen:Dissolved:SBE',\n",
" 'Oxygen:Dissolved:SBE.1'))\n",
"dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1'))\n",
"dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity2 [*/metre]','Transmissivity'))\n",
"dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]', 'Nitrate_plus_Nitrite'))\n",
"dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))\n",
"dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))\n",
"dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))\n",
"dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))\n",
"dfbot=subval(dfbot,('Conductivity:Primary [S/m]',\n",
" 'Conductivity:Secondary [S/m]',\n",
" 'Conductivity:Secondary'))\n",
"dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))\n",
"dfbot=subval(dfbot,('Zone', 'Zone.1'))\n",
"dfbot=subval(dfbot,('Temperature:Draw [deg C (ITS90)]','Temperature:Draw'))\n",
"dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))\n",
"\n",
"dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence:URU [mg/m^3]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Conductivity:Primary [S/m]':'Conductivity [S/m]'},\n",
" inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# define a function that will be applied to the values in the index column;\n",
"# this makes it easier to drop non-data rows later\n",
"def convertIndex(val):\n",
" try:\n",
" x =int(val)\n",
" except ValueError:\n",
" x=np.nan\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading sheet 2019 CHEMTAX abs rerun results\n"
]
}
],
"source": [
"# load the phytoplankton data with the following options:\n",
"# sheet_name='???? CHEMTAX abs results' -> choose the sheet\n",
"# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet\n",
"# skiprows=2 -> start reading at the 3rd row of the sheet, \n",
"# which contains the column headings\n",
"# converters={'Index': convertIndex,} -> apply the function defined above to the Index column\n",
"# verbose = True -> print extra information/ warnings/ errors\n",
"dfPhyto=pd.read_excel(pathPhyto,sheet_name='2019 CHEMTAX abs rerun results',usecols='A:S',\n",
" skiprows=2,converters={'Index': convertIndex,},\n",
" verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Bin # | \n",
" Index | \n",
" Subgroup | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 48 | \n",
" 7 | \n",
" 46.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-10 00:00:00 | \n",
" 56 | \n",
" 289 | \n",
" mean | \n",
" 0 | \n",
" 0.235929 | \n",
" 0.0557088 | \n",
" 0.000116464 | \n",
" 0.252259 | \n",
" 0.00720253 | \n",
" 0.015618 | \n",
" 0.00990757 | \n",
" 0.00374288 | \n",
" 0.00721907 | \n",
" 0.587704 | \n",
"
\n",
" \n",
" 49 | \n",
" 7 | \n",
" 47.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-10 00:00:00 | \n",
" 59 | \n",
" 303 | \n",
" mean | \n",
" 0 | \n",
" 1.17636 | \n",
" 0.0649978 | \n",
" 0.00151012 | \n",
" 0.116654 | \n",
" 0.0114187 | \n",
" 0.0189902 | \n",
" 0.00697747 | \n",
" 0.000317128 | \n",
" 0.00746592 | \n",
" 1.40469 | \n",
"
\n",
" \n",
" 50 | \n",
" 7 | \n",
" 48.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 102 | \n",
" 318 | \n",
" mean | \n",
" 0 | \n",
" 1.81861 | \n",
" 0.254111 | \n",
" 0.0210325 | \n",
" 0.106842 | \n",
" 0.0526873 | \n",
" 0.222644 | \n",
" 0 | \n",
" 0.00369154 | \n",
" 0.0117118 | \n",
" 2.49133 | \n",
"
\n",
" \n",
" 51 | \n",
" 7 | \n",
" 49.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 75 | \n",
" 332 | \n",
" mean | \n",
" 0 | \n",
" 1.66103 | \n",
" 0.145549 | \n",
" 0.0161815 | \n",
" 0.113534 | \n",
" 0.0543728 | \n",
" 0.236785 | \n",
" 0 | \n",
" 0.00986979 | \n",
" 0.00749211 | \n",
" 2.24481 | \n",
"
\n",
" \n",
" 52 | \n",
" 7 | \n",
" 50.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 72 | \n",
" 345 | \n",
" mean | \n",
" 0 | \n",
" 1.69253 | \n",
" 0.0607043 | \n",
" 0.0273842 | \n",
" 0.20995 | \n",
" 0.107062 | \n",
" 0.235914 | \n",
" 0.0171222 | \n",
" 0.019343 | \n",
" 0.00543582 | \n",
" 2.37545 | \n",
"
\n",
" \n",
" 53 | \n",
" 7 | \n",
" 51.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 69 | \n",
" 357 | \n",
" mean | \n",
" 0 | \n",
" 1.20815 | \n",
" 0.189921 | \n",
" 0.0237731 | \n",
" 0.219837 | \n",
" 0.0697194 | \n",
" 0.275599 | \n",
" 0 | \n",
" 0.0128893 | \n",
" 0.00430002 | \n",
" 2.00419 | \n",
"
\n",
" \n",
" 54 | \n",
" 7 | \n",
" 52.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" ADCP | \n",
" 367 | \n",
" mean | \n",
" 0 | \n",
" 0.277147 | \n",
" 0.0376473 | \n",
" 0.0182011 | \n",
" 0.222086 | \n",
" 0.0207715 | \n",
" 0.0508863 | \n",
" 0.0027147 | \n",
" 0.00533482 | \n",
" 0.00312848 | \n",
" 0.637917 | \n",
"
\n",
" \n",
" 55 | \n",
" 7 | \n",
" 53.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 65 | \n",
" 377 | \n",
" mean | \n",
" 0 | \n",
" 0.91255 | \n",
" 0.0374169 | \n",
" 0.0295607 | \n",
" 0.226087 | \n",
" 0.046938 | \n",
" 0.232094 | \n",
" 0.00658676 | \n",
" 0.0116996 | \n",
" 0.00512161 | \n",
" 1.50806 | \n",
"
\n",
" \n",
" 56 | \n",
" 7 | \n",
" 54.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 63 | \n",
" 380 | \n",
" mean | \n",
" 0 | \n",
" 0.704281 | \n",
" 0.0266105 | \n",
" 0.0317602 | \n",
" 0.254517 | \n",
" 0.0525936 | \n",
" 0.207993 | \n",
" 0.00172955 | \n",
" 0.0059513 | \n",
" 0.00552183 | \n",
" 1.29096 | \n",
"
\n",
" \n",
" 57 | \n",
" 7 | \n",
" 55.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 62 | \n",
" 391 | \n",
" mean | \n",
" 0 | \n",
" 0.52177 | \n",
" 0.0800526 | \n",
" 0.0118601 | \n",
" 0.117773 | \n",
" 0.0196437 | \n",
" 0.0626985 | \n",
" 0.0115089 | \n",
" 0 | \n",
" 0.0046034 | \n",
" 0.82991 | \n",
"
\n",
" \n",
" 58 | \n",
" 7 | \n",
" 56.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-12 00:00:00 | \n",
" SC04 | \n",
" 394 | \n",
" mean | \n",
" 0 | \n",
" 3.5872 | \n",
" 0.139061 | \n",
" 0 | \n",
" 0.112151 | \n",
" 0.0294817 | \n",
" 0.0468341 | \n",
" 0 | \n",
" 0 | \n",
" 0.0152213 | \n",
" 3.92995 | \n",
"
\n",
" \n",
" 59 | \n",
" Absolute Pigment Compositions - Bin # 13 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" From Sheet: OutR9 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Bin # Index Subgroup Cruise \\\n",
"48 7 46.0 2 2019-016 \n",
"49 7 47.0 2 2019-016 \n",
"50 7 48.0 2 2019-016 \n",
"51 7 49.0 2 2019-016 \n",
"52 7 50.0 2 2019-016 \n",
"53 7 51.0 2 2019-016 \n",
"54 7 52.0 2 2019-016 \n",
"55 7 53.0 2 2019-016 \n",
"56 7 54.0 2 2019-016 \n",
"57 7 55.0 2 2019-016 \n",
"58 7 56.0 2 2019-016 \n",
"59 Absolute Pigment Compositions - Bin # 13 NaN NaN NaN \n",
"\n",
" Date Station Sample# rep depth Diatoms-1 \\\n",
"48 2019-04-10 00:00:00 56 289 mean 0 0.235929 \n",
"49 2019-04-10 00:00:00 59 303 mean 0 1.17636 \n",
"50 2019-04-11 00:00:00 102 318 mean 0 1.81861 \n",
"51 2019-04-11 00:00:00 75 332 mean 0 1.66103 \n",
"52 2019-04-11 00:00:00 72 345 mean 0 1.69253 \n",
"53 2019-04-11 00:00:00 69 357 mean 0 1.20815 \n",
"54 2019-04-11 00:00:00 ADCP 367 mean 0 0.277147 \n",
"55 2019-04-11 00:00:00 65 377 mean 0 0.91255 \n",
"56 2019-04-11 00:00:00 63 380 mean 0 0.704281 \n",
"57 2019-04-11 00:00:00 62 391 mean 0 0.52177 \n",
"58 2019-04-12 00:00:00 SC04 394 mean 0 3.5872 \n",
"59 NaN From Sheet: OutR9 NaN NaN NaN NaN \n",
"\n",
" Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes \\\n",
"48 0.0557088 0.000116464 0.252259 0.00720253 0.015618 \n",
"49 0.0649978 0.00151012 0.116654 0.0114187 0.0189902 \n",
"50 0.254111 0.0210325 0.106842 0.0526873 0.222644 \n",
"51 0.145549 0.0161815 0.113534 0.0543728 0.236785 \n",
"52 0.0607043 0.0273842 0.20995 0.107062 0.235914 \n",
"53 0.189921 0.0237731 0.219837 0.0697194 0.275599 \n",
"54 0.0376473 0.0182011 0.222086 0.0207715 0.0508863 \n",
"55 0.0374169 0.0295607 0.226087 0.046938 0.232094 \n",
"56 0.0266105 0.0317602 0.254517 0.0525936 0.207993 \n",
"57 0.0800526 0.0118601 0.117773 0.0196437 0.0626985 \n",
"58 0.139061 0 0.112151 0.0294817 0.0468341 \n",
"59 NaN NaN NaN NaN NaN \n",
"\n",
" Dictyo Raphido Cyanobacteria TchlA \n",
"48 0.00990757 0.00374288 0.00721907 0.587704 \n",
"49 0.00697747 0.000317128 0.00746592 1.40469 \n",
"50 0 0.00369154 0.0117118 2.49133 \n",
"51 0 0.00986979 0.00749211 2.24481 \n",
"52 0.0171222 0.019343 0.00543582 2.37545 \n",
"53 0 0.0128893 0.00430002 2.00419 \n",
"54 0.0027147 0.00533482 0.00312848 0.637917 \n",
"55 0.00658676 0.0116996 0.00512161 1.50806 \n",
"56 0.00172955 0.0059513 0.00552183 1.29096 \n",
"57 0.0115089 0 0.0046034 0.82991 \n",
"58 0 0 0.0152213 3.92995 \n",
"59 NaN NaN NaN NaN "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display rows 48 to 59 of the resulting table\n",
"dfPhyto[48:60]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# now, drop any rows from the table that have NaN values in either of the columns\n",
"# 'Index' or 'TchlA (ug/L)'\n",
"# This is why we applied a function to the Index column to make sure all \n",
"# non-numeric Index values would have a consistent NaN entry, making them easy to identify\n",
"# and remove\n",
"dfPhyto.dropna(subset=['Index', 'TchlA'],how='any',inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# pandas creates its own index, and after dropping rows I like to reset it -\n",
"# this is just for convenience\n",
"dfPhyto.reset_index(drop=True,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# apply formatting function all rows in Cruise column to get year-3digit format\n",
"dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Bin # | \n",
" Index | \n",
" Subgroup | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 48 | \n",
" 7 | \n",
" 49.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 75 | \n",
" 332 | \n",
" mean | \n",
" 0 | \n",
" 1.66103 | \n",
" 0.145549 | \n",
" 0.0161815 | \n",
" 0.113534 | \n",
" 0.0543728 | \n",
" 0.236785 | \n",
" 0 | \n",
" 0.00986979 | \n",
" 0.00749211 | \n",
" 2.24481 | \n",
"
\n",
" \n",
" 49 | \n",
" 7 | \n",
" 50.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 72 | \n",
" 345 | \n",
" mean | \n",
" 0 | \n",
" 1.69253 | \n",
" 0.0607043 | \n",
" 0.0273842 | \n",
" 0.20995 | \n",
" 0.107062 | \n",
" 0.235914 | \n",
" 0.0171222 | \n",
" 0.019343 | \n",
" 0.00543582 | \n",
" 2.37545 | \n",
"
\n",
" \n",
" 50 | \n",
" 7 | \n",
" 51.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 69 | \n",
" 357 | \n",
" mean | \n",
" 0 | \n",
" 1.20815 | \n",
" 0.189921 | \n",
" 0.0237731 | \n",
" 0.219837 | \n",
" 0.0697194 | \n",
" 0.275599 | \n",
" 0 | \n",
" 0.0128893 | \n",
" 0.00430002 | \n",
" 2.00419 | \n",
"
\n",
" \n",
" 51 | \n",
" 7 | \n",
" 52.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" ADCP | \n",
" 367 | \n",
" mean | \n",
" 0 | \n",
" 0.277147 | \n",
" 0.0376473 | \n",
" 0.0182011 | \n",
" 0.222086 | \n",
" 0.0207715 | \n",
" 0.0508863 | \n",
" 0.0027147 | \n",
" 0.00533482 | \n",
" 0.00312848 | \n",
" 0.637917 | \n",
"
\n",
" \n",
" 52 | \n",
" 7 | \n",
" 53.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 65 | \n",
" 377 | \n",
" mean | \n",
" 0 | \n",
" 0.91255 | \n",
" 0.0374169 | \n",
" 0.0295607 | \n",
" 0.226087 | \n",
" 0.046938 | \n",
" 0.232094 | \n",
" 0.00658676 | \n",
" 0.0116996 | \n",
" 0.00512161 | \n",
" 1.50806 | \n",
"
\n",
" \n",
" 53 | \n",
" 7 | \n",
" 54.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 63 | \n",
" 380 | \n",
" mean | \n",
" 0 | \n",
" 0.704281 | \n",
" 0.0266105 | \n",
" 0.0317602 | \n",
" 0.254517 | \n",
" 0.0525936 | \n",
" 0.207993 | \n",
" 0.00172955 | \n",
" 0.0059513 | \n",
" 0.00552183 | \n",
" 1.29096 | \n",
"
\n",
" \n",
" 54 | \n",
" 7 | \n",
" 55.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-11 00:00:00 | \n",
" 62 | \n",
" 391 | \n",
" mean | \n",
" 0 | \n",
" 0.52177 | \n",
" 0.0800526 | \n",
" 0.0118601 | \n",
" 0.117773 | \n",
" 0.0196437 | \n",
" 0.0626985 | \n",
" 0.0115089 | \n",
" 0 | \n",
" 0.0046034 | \n",
" 0.82991 | \n",
"
\n",
" \n",
" 55 | \n",
" 7 | \n",
" 56.0 | \n",
" 2 | \n",
" 2019-016 | \n",
" 2019-04-12 00:00:00 | \n",
" SC04 | \n",
" 394 | \n",
" mean | \n",
" 0 | \n",
" 3.5872 | \n",
" 0.139061 | \n",
" 0 | \n",
" 0.112151 | \n",
" 0.0294817 | \n",
" 0.0468341 | \n",
" 0 | \n",
" 0 | \n",
" 0.0152213 | \n",
" 3.92995 | \n",
"
\n",
" \n",
" 56 | \n",
" 13 | \n",
" 57.0 | \n",
" 3 | \n",
" 2019-044 | \n",
" 2019-04-29 00:00:00 | \n",
" GEO1 | \n",
" 9 | \n",
" mean | \n",
" 0 | \n",
" 5.62557 | \n",
" 0.409893 | \n",
" 0.3874 | \n",
" 0.241505 | \n",
" 0.0396982 | \n",
" 0.155595 | \n",
" 0.070028 | \n",
" 0.117273 | \n",
" 0.0390135 | \n",
" 7.08597 | \n",
"
\n",
" \n",
" 57 | \n",
" 13 | \n",
" 58.0 | \n",
" 3 | \n",
" 2019-044 | \n",
" 2019-04-30 00:00:00 | \n",
" 39 | \n",
" 10 | \n",
" mean | \n",
" 0 | \n",
" 8.91175 | \n",
" 0.803637 | \n",
" 0.238999 | \n",
" 0.523477 | \n",
" 0.23522 | \n",
" 0.153088 | \n",
" 0.120047 | \n",
" 0.100953 | \n",
" 0 | \n",
" 11.0872 | \n",
"
\n",
" \n",
" 58 | \n",
" 13 | \n",
" 59.0 | \n",
" 3 | \n",
" 2019-044 | \n",
" 2019-04-30 00:00:00 | \n",
" 27 | \n",
" 11 | \n",
" mean | \n",
" 0 | \n",
" 12.1251 | \n",
" 1.06535 | \n",
" 0.168672 | \n",
" 0.28809 | \n",
" 0.00246724 | \n",
" 0.240602 | \n",
" 0.0952687 | \n",
" 0.251948 | \n",
" 0.0393948 | \n",
" 14.2769 | \n",
"
\n",
" \n",
" 59 | \n",
" 13 | \n",
" 60.0 | \n",
" 3 | \n",
" 2019-044 | \n",
" 2019-04-30 00:00:00 | \n",
" 22 | \n",
" 12 | \n",
" mean | \n",
" 0 | \n",
" 10.9518 | \n",
" 0.681424 | \n",
" 0.174355 | \n",
" 0.573097 | \n",
" 0.0855425 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.062748 | \n",
" 12.529 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Bin # Index Subgroup Cruise Date Station Sample# rep \\\n",
"48 7 49.0 2 2019-016 2019-04-11 00:00:00 75 332 mean \n",
"49 7 50.0 2 2019-016 2019-04-11 00:00:00 72 345 mean \n",
"50 7 51.0 2 2019-016 2019-04-11 00:00:00 69 357 mean \n",
"51 7 52.0 2 2019-016 2019-04-11 00:00:00 ADCP 367 mean \n",
"52 7 53.0 2 2019-016 2019-04-11 00:00:00 65 377 mean \n",
"53 7 54.0 2 2019-016 2019-04-11 00:00:00 63 380 mean \n",
"54 7 55.0 2 2019-016 2019-04-11 00:00:00 62 391 mean \n",
"55 7 56.0 2 2019-016 2019-04-12 00:00:00 SC04 394 mean \n",
"56 13 57.0 3 2019-044 2019-04-29 00:00:00 GEO1 9 mean \n",
"57 13 58.0 3 2019-044 2019-04-30 00:00:00 39 10 mean \n",
"58 13 59.0 3 2019-044 2019-04-30 00:00:00 27 11 mean \n",
"59 13 60.0 3 2019-044 2019-04-30 00:00:00 22 12 mean \n",
"\n",
" depth Diatoms-1 Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 \\\n",
"48 0 1.66103 0.145549 0.0161815 0.113534 0.0543728 \n",
"49 0 1.69253 0.0607043 0.0273842 0.20995 0.107062 \n",
"50 0 1.20815 0.189921 0.0237731 0.219837 0.0697194 \n",
"51 0 0.277147 0.0376473 0.0182011 0.222086 0.0207715 \n",
"52 0 0.91255 0.0374169 0.0295607 0.226087 0.046938 \n",
"53 0 0.704281 0.0266105 0.0317602 0.254517 0.0525936 \n",
"54 0 0.52177 0.0800526 0.0118601 0.117773 0.0196437 \n",
"55 0 3.5872 0.139061 0 0.112151 0.0294817 \n",
"56 0 5.62557 0.409893 0.3874 0.241505 0.0396982 \n",
"57 0 8.91175 0.803637 0.238999 0.523477 0.23522 \n",
"58 0 12.1251 1.06535 0.168672 0.28809 0.00246724 \n",
"59 0 10.9518 0.681424 0.174355 0.573097 0.0855425 \n",
"\n",
" Haptophytes Dictyo Raphido Cyanobacteria TchlA \n",
"48 0.236785 0 0.00986979 0.00749211 2.24481 \n",
"49 0.235914 0.0171222 0.019343 0.00543582 2.37545 \n",
"50 0.275599 0 0.0128893 0.00430002 2.00419 \n",
"51 0.0508863 0.0027147 0.00533482 0.00312848 0.637917 \n",
"52 0.232094 0.00658676 0.0116996 0.00512161 1.50806 \n",
"53 0.207993 0.00172955 0.0059513 0.00552183 1.29096 \n",
"54 0.0626985 0.0115089 0 0.0046034 0.82991 \n",
"55 0.0468341 0 0 0.0152213 3.92995 \n",
"56 0.155595 0.070028 0.117273 0.0390135 7.08597 \n",
"57 0.153088 0.120047 0.100953 0 11.0872 \n",
"58 0.240602 0.0952687 0.251948 0.0393948 14.2769 \n",
"59 0 0 0 0.062748 12.529 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display part of the table, confirming that non-data rows have been removed\n",
"dfPhyto[48:60]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n",
"# of the phytoplankton columns; \n",
"# these lines correct the column names, removing the '.1':\n",
"renameDict=dict()\n",
"for colName in dfPhyto.keys():\n",
" if colName.endswith('.1'):\n",
" renameDict[colName]=colName.split('.1')[0]\n",
"dfPhyto.rename(columns=renameDict,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Bin # | \n",
" Index | \n",
" Subgroup | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 16 | \n",
" 1.0 | \n",
" 1 | \n",
" 2019-001 | \n",
" 2019-02-20 00:00:00 | \n",
" 14 | \n",
" 729 | \n",
" mean | \n",
" 0 | \n",
" 0.0331146 | \n",
" 0.101655 | \n",
" 0.123836 | \n",
" 0.3117 | \n",
" 0.00876211 | \n",
" 0.1462 | \n",
" 0.0225388 | \n",
" 0.0234123 | \n",
" 0.00090972 | \n",
" 0.772129 | \n",
"
\n",
" \n",
" 1 | \n",
" 16 | \n",
" 2.0 | \n",
" 1 | \n",
" 2019-001 | \n",
" 2019-02-20 00:00:00 | \n",
" 12 | \n",
" 732 | \n",
" mean | \n",
" 0 | \n",
" 0.156416 | \n",
" 0.17666 | \n",
" 0.112519 | \n",
" 0.237514 | \n",
" 0.0127872 | \n",
" 0.179982 | \n",
" 0.0256941 | \n",
" 0.0165342 | \n",
" 0.000955474 | \n",
" 0.919063 | \n",
"
\n",
" \n",
" 2 | \n",
" 16 | \n",
" 3.0 | \n",
" 1 | \n",
" 2019-001 | \n",
" 2019-02-20 00:00:00 | \n",
" 22 | \n",
" 743 | \n",
" mean | \n",
" 0 | \n",
" 0.356374 | \n",
" 0.340304 | \n",
" 0.125008 | \n",
" 0.315779 | \n",
" 0.0132582 | \n",
" 0.07692 | \n",
" 0.0203337 | \n",
" 0.0362638 | \n",
" 0.00278554 | \n",
" 1.28703 | \n",
"
\n",
" \n",
" 3 | \n",
" 16 | \n",
" 4.0 | \n",
" 1 | \n",
" 2019-001 | \n",
" 2019-02-20 00:00:00 | \n",
" CPF2 | \n",
" 746 | \n",
" mean | \n",
" 0 | \n",
" 0.0606656 | \n",
" 0.116013 | \n",
" 0.135416 | \n",
" 0.332063 | \n",
" 0.007642 | \n",
" 0.159874 | \n",
" 0.0260862 | \n",
" 0.0274673 | \n",
" 0.00137643 | \n",
" 0.866603 | \n",
"
\n",
" \n",
" 4 | \n",
" 16 | \n",
" 5.0 | \n",
" 1 | \n",
" 2019-001 | \n",
" 2019-02-20 00:00:00 | \n",
" CPF1 | \n",
" 749 | \n",
" mean | \n",
" 0 | \n",
" 0.404852 | \n",
" 0.41367 | \n",
" 0.131683 | \n",
" 0.197787 | \n",
" 0.0128683 | \n",
" 0.0582133 | \n",
" 0.0158865 | \n",
" 0.0375323 | \n",
" 0.00585506 | \n",
" 1.27835 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 166 | \n",
" 19 | \n",
" 167.0 | \n",
" 7 | \n",
" 2019-062 | \n",
" 2019-10-05 00:00:00 | \n",
" IND0 | \n",
" 349 | \n",
" mean | \n",
" 0 | \n",
" 9.11885 | \n",
" 0 | \n",
" 0.289786 | \n",
" 0.612077 | \n",
" 0.0876324 | \n",
" 0.0656088 | \n",
" 0.0868017 | \n",
" 0.0365193 | \n",
" 0 | \n",
" 10.2973 | \n",
"
\n",
" \n",
" 167 | \n",
" 19 | \n",
" 168.0 | \n",
" 7 | \n",
" 2019-062 | \n",
" 2019-10-05 00:00:00 | \n",
" IND2 | \n",
" 361 | \n",
" mean | \n",
" 0 | \n",
" 6.2426 | \n",
" 0 | \n",
" 0.312427 | \n",
" 0.510974 | \n",
" 0.0604039 | \n",
" 0.146785 | \n",
" 0.0594772 | \n",
" 0.0247012 | \n",
" 0.0121179 | \n",
" 7.36948 | \n",
"
\n",
" \n",
" 168 | \n",
" 19 | \n",
" 169.0 | \n",
" 7 | \n",
" 2019-062 | \n",
" 2019-10-05 00:00:00 | \n",
" IND4 | \n",
" 375 | \n",
" mean | \n",
" 0 | \n",
" 8.54631 | \n",
" 0 | \n",
" 0.520754 | \n",
" 0.511822 | \n",
" 0.158578 | \n",
" 0.177592 | \n",
" 0.0323331 | \n",
" 0.0887445 | \n",
" 0.0333497 | \n",
" 10.0695 | \n",
"
\n",
" \n",
" 169 | \n",
" 19 | \n",
" 170.0 | \n",
" 7 | \n",
" 2019-062 | \n",
" 2019-10-05 00:00:00 | \n",
" IND7 | \n",
" 389 | \n",
" mean | \n",
" 0 | \n",
" 4.28515 | \n",
" 0 | \n",
" 0.365631 | \n",
" 8.53139 | \n",
" 0 | \n",
" 0 | \n",
" 0.00631162 | \n",
" 0.00703949 | \n",
" 0.0631797 | \n",
" 13.2587 | \n",
"
\n",
" \n",
" 170 | \n",
" 19 | \n",
" 171.0 | \n",
" 7 | \n",
" 2019-062 | \n",
" 2019-10-05 00:00:00 | \n",
" VAN5 | \n",
" 395 | \n",
" mean | \n",
" 0 | \n",
" 1.94256 | \n",
" 0 | \n",
" 0.279896 | \n",
" 0.383945 | \n",
" 0.106905 | \n",
" 0.0399761 | \n",
" 0.0289777 | \n",
" 0.0341708 | \n",
" 0.00224957 | \n",
" 2.81868 | \n",
"
\n",
" \n",
"
\n",
"
171 rows × 19 columns
\n",
"
"
],
"text/plain": [
" Bin # Index Subgroup Cruise Date Station Sample# \\\n",
"0 16 1.0 1 2019-001 2019-02-20 00:00:00 14 729 \n",
"1 16 2.0 1 2019-001 2019-02-20 00:00:00 12 732 \n",
"2 16 3.0 1 2019-001 2019-02-20 00:00:00 22 743 \n",
"3 16 4.0 1 2019-001 2019-02-20 00:00:00 CPF2 746 \n",
"4 16 5.0 1 2019-001 2019-02-20 00:00:00 CPF1 749 \n",
".. ... ... ... ... ... ... ... \n",
"166 19 167.0 7 2019-062 2019-10-05 00:00:00 IND0 349 \n",
"167 19 168.0 7 2019-062 2019-10-05 00:00:00 IND2 361 \n",
"168 19 169.0 7 2019-062 2019-10-05 00:00:00 IND4 375 \n",
"169 19 170.0 7 2019-062 2019-10-05 00:00:00 IND7 389 \n",
"170 19 171.0 7 2019-062 2019-10-05 00:00:00 VAN5 395 \n",
"\n",
" rep depth Diatoms-1 Diatoms-2 Prasinophytes Cryptophytes \\\n",
"0 mean 0 0.0331146 0.101655 0.123836 0.3117 \n",
"1 mean 0 0.156416 0.17666 0.112519 0.237514 \n",
"2 mean 0 0.356374 0.340304 0.125008 0.315779 \n",
"3 mean 0 0.0606656 0.116013 0.135416 0.332063 \n",
"4 mean 0 0.404852 0.41367 0.131683 0.197787 \n",
".. ... ... ... ... ... ... \n",
"166 mean 0 9.11885 0 0.289786 0.612077 \n",
"167 mean 0 6.2426 0 0.312427 0.510974 \n",
"168 mean 0 8.54631 0 0.520754 0.511822 \n",
"169 mean 0 4.28515 0 0.365631 8.53139 \n",
"170 mean 0 1.94256 0 0.279896 0.383945 \n",
"\n",
" Dinoflagellates-1 Haptophytes Dictyo Raphido Cyanobacteria \\\n",
"0 0.00876211 0.1462 0.0225388 0.0234123 0.00090972 \n",
"1 0.0127872 0.179982 0.0256941 0.0165342 0.000955474 \n",
"2 0.0132582 0.07692 0.0203337 0.0362638 0.00278554 \n",
"3 0.007642 0.159874 0.0260862 0.0274673 0.00137643 \n",
"4 0.0128683 0.0582133 0.0158865 0.0375323 0.00585506 \n",
".. ... ... ... ... ... \n",
"166 0.0876324 0.0656088 0.0868017 0.0365193 0 \n",
"167 0.0604039 0.146785 0.0594772 0.0247012 0.0121179 \n",
"168 0.158578 0.177592 0.0323331 0.0887445 0.0333497 \n",
"169 0 0 0.00631162 0.00703949 0.0631797 \n",
"170 0.106905 0.0399761 0.0289777 0.0341708 0.00224957 \n",
"\n",
" TchlA \n",
"0 0.772129 \n",
"1 0.919063 \n",
"2 1.28703 \n",
"3 0.866603 \n",
"4 1.27835 \n",
".. ... \n",
"166 10.2973 \n",
"167 7.36948 \n",
"168 10.0695 \n",
"169 13.2587 \n",
"170 2.81868 \n",
"\n",
"[171 rows x 19 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfPhyto"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# This is the important step- join the two tables ('left' and 'right'), \n",
"# matching the cruise IDs and sample numbers\n",
"# how='outer' -> all rows from both the left and the right tables will be included, \n",
"# even if they cannot be matched; this makes it easy to check for \n",
"# unmatched data later\n",
"# left_on specifies the name of the column to match in the left table (dfbot) \n",
"# right_on specifies the name of the column to match in the right table (dfPhyto)\n",
"dfout = pd.merge(dfbot, dfPhyto, how='outer', \n",
" left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n",
"dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE',\n",
" 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION',\n",
" 'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence',\n",
" 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',\n",
" 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
" 'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]',\n",
" 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n",
" 'Conductivity [S/m]', 'Number_of_bin_records',\n",
" 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n",
" 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n",
" 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',\n",
" 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',\n",
" 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]',\n",
" 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n",
" 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',\n",
" 'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',\n",
" 'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA',\n",
" 'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Bin #',\n",
" 'Index', 'Subgroup', 'Date', 'Station', 'Sample#', 'rep', 'depth',\n",
" 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',\n",
" 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
" 'Cyanobacteria', 'TchlA'],\n",
" dtype='object')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show the column names in the resulting table\n",
"dfout.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checks"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n",
"temp.columns = ['icount']"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.unique(temp.icount)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for Phyto samples matched to multiple bottle samples:\n",
"temp.loc[temp.icount>1]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for phyto samples not matched to bottle samples:\n",
"temp.loc[temp.icount==0]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample_Number | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n",
"temp2.columns = ['icount']\n",
"# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n",
"temp2.loc[temp2.icount>1]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for phyto samples not matched to bottle samples:\n",
"temp.loc[temp.icount==0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1444, 171, 1444)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# if the output table is longer than either of the input tables, some columns were not matched\n",
"len(dfout), len(dfPhyto), len(dfbot)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(171, 171)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check that the number of cells with data in the 'Cyanobacteria' column is \n",
"# the same for the input and output tables to show that no rows are missing:\n",
"np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Series([], Name: Index, dtype: float64)\n"
]
}
],
"source": [
"# If there were data rows from the phytoplankton table that were not matched to \n",
"# rows from the bottle table, their indices from the phytoplankton table would be \n",
"# displayed below (the series [] would not be empty)\n",
"print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Index | \n",
" Cruise | \n",
" Diatoms-1 | \n",
" Prasinophytes | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Index, Cruise, Diatoms-1, Prasinophytes]\n",
"Index: []"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Index','Cruise','Diatoms-1','Prasinophytes']]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# drop repetetive/unecessary columns:\n",
"dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Station', 'Sample#', 'rep',\n",
" 'depth',],axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# truncate phyto group values to 3 decimal places:\n",
"for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n",
" 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
" 'TchlA'):\n",
" dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 NaN\n",
"1 NaN\n",
"2 0.001\n",
"3 NaN\n",
"4 NaN\n",
" ... \n",
"1439 NaN\n",
"1440 NaN\n",
"1441 NaN\n",
"1442 NaN\n",
"1443 0.002\n",
"Name: Cyanobacteria, Length: 1444, dtype: float64"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout['Cyanobacteria']"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# now write the output table to a .csv file:\n",
"dfout.to_csv(pathOut, index=False) "
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE',\n",
" 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION', 'LOC:STATION',\n",
" 'ADM:PROJECT', 'Bottle_Number', 'Bottle:Firing_Sequence',\n",
" 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',\n",
" 'Temperature [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
" 'Fluorescence:URU [mg/m^3]', 'PAR [uE/m^2/sec]', 'Salinity [PSS-78]',\n",
" 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n",
" 'Conductivity [S/m]', 'Number_of_bin_records',\n",
" 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n",
" 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n",
" 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',\n",
" 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',\n",
" 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'Depth:Nominal [metres]', 'Temperature:Draw [deg C (ITS90)]',\n",
" 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n",
" 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',\n",
" 'Flag:Oxygen:Dissolved', 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',\n",
" 'File Name', 'LOC:ALTIMETER (M)', 'ADM:PLATFORM', 'LOC:GEOGRAPHIC AREA',\n",
" 'FIL:DATA DESCRIPTION', 'Transmissivity:Green [*/metre]', 'Date',\n",
" 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',\n",
" 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
" 'Cyanobacteria', 'TchlA'],\n",
" dtype='object')"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (py38_ct)",
"language": "python",
"name": "py38_ct"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}