{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np # this module handles arrays, but here we need it for its NaN value\n",
"import pandas as pd # this module contains a lot of tools for handling tabular data\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# define paths to the source files and eventual output file\n",
"#pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottle.xlsx'\n",
"#pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groups.xlsx'\n",
"pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2018 SoG bottleCorrected.xlsx'\n",
"pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'\n",
"\n",
"pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2018.csv'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#formatting function to get year-(3digit) format\n",
"def fmtCruise(istr):\n",
" if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): \n",
" sp=re.split('-',istr)\n",
" rstr=sp[0]+'-0'+sp[1]\n",
" elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):\n",
" rstr=istr\n",
" else:\n",
" raise ValueError('Input had unexpected format:',istr)\n",
" return rstr"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['2018-005', '2018-01', '2018-96', '2018-29', '2018-035', '2018-39', '2018-036', '2018-030', '2018-037', '2018-34', '2018-40', '2018-31', '2018-032']\n"
]
}
],
"source": [
"# get names of sheets in notebook\n",
"with pd.ExcelFile(pathBottle) as xl:\n",
" sheets=xl.sheet_names\n",
"print(sheets)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading sheet 2018-005\n",
"Reading sheet 2018-01\n",
"Reading sheet 2018-96\n",
"Reading sheet 2018-29\n",
"Reading sheet 2018-035\n",
"Reading sheet 2018-39\n",
"Reading sheet 2018-036\n",
"Reading sheet 2018-030\n",
"Reading sheet 2018-037\n",
"Reading sheet 2018-34\n",
"Reading sheet 2018-40\n",
"Reading sheet 2018-31\n",
"Reading sheet 2018-032\n",
"removed empty columns: {'Flag:Ammonium', 'Ammonium [umol/L]'}\n"
]
}
],
"source": [
"# load each sheet in the bottle Excel file and concatenate them together into one table\n",
"dfbotlist=list()\n",
"for sheet in sheets:\n",
" df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,\n",
" na_values=(-99,-99.9)) # read each sheet; include additional na values\n",
" df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name\n",
" dfbotlist.append(df0) # append the sheet to a list\n",
"dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table\n",
"# Drop columns with no data in them\n",
"l1=set(dfbot.keys())\n",
"dfbot.dropna(axis=1,how='all',inplace=True)\n",
"print('removed empty columns:',l1-set(dfbot.keys()))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE',\n",
" 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION',\n",
" 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'YYYY/MM/DD HH:MM:SS',\n",
" 'Sample_Number', 'Pressure [decibar]', 'Depth [metres]',\n",
" 'Temperature:Primary [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
" 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
" 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal',\n",
" 'Salinity:T0:C0 [PSS-78]', 'Oxygen:Dissolved:SBE [mL/L]',\n",
" 'Oxygen:Dissolved:SBE [umol/kg]', 'Temperature:Draw [deg C (ITS90)]',\n",
" 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n",
" 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n",
" 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]',\n",
" 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved',\n",
" 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite',\n",
" 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]',\n",
" 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise',\n",
" 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'Temperature:Secondary [deg C (ITS90)]', 'Pressure:CTD [decibar]',\n",
" 'Depth:CTD [metres]', 'Temperature:CTD [deg C (ITS90)]',\n",
" 'Salinity:CTD [PSS-78]', 'Oxygen:Dissolved:Volume:CTD [mL/L]',\n",
" 'Oxygen:Dissolved:Mass:CTD [umol/kg]', 'Fluorescence:URU:CTD [mg/m^3]',\n",
" 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]',\n",
" 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]',\n",
" 'Flag:Carbon:Dissolved:Inorganic', 'Salinity:T1:C1 [PSS-78]',\n",
" 'Fluorescence:URU:Wetlabs [mg/m^3]', 'Bottle_Number',\n",
" 'Bottle:Firing_Sequence'],\n",
" dtype='object')\n"
]
}
],
"source": [
"# list the column names in the resulting table\n",
"print(dfbot.keys())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2010 36\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Depth [metres]']>=0),\n",
" np.sum(dfbot['Depth:CTD [metres]']>=0))\n",
"dfbot.loc[(dfbot['Depth [metres]']>=0)&\\\n",
" (dfbot['Depth:CTD [metres]']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1014 956 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),\n",
" np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),\n",
" np.sum(dfbot['Temperature:CTD [deg C (ITS90)]']>=0))\n",
"dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Temperature:CTD [deg C (ITS90)]']>=0)]))>1]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1517 36 104\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),\n",
" np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0),\n",
" np.sum(dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0))\n",
"dfbot.loc[(np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Fluorescence:URU:Wetlabs [mg/m^3]']>=0)]))>1]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1517 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0),\n",
" np.sum(dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0))\n",
"dfbot.loc[(dfbot['Fluorescence:URU:Seapoint [mg/m^3]']>=0)&\\\n",
" (dfbot['Fluorescence:URU:CTD [mg/m^3]']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1132 836 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both both salinity fields\n",
"print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),\n",
" np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0),\n",
" np.sum(dfbot['Salinity:CTD [PSS-78]']>=0))\n",
"dfbot.loc[(np.array([int(ii) for ii in (dfbot['Salinity:T0:C0 [PSS-78]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Salinity:T1:C1 [PSS-78]']>=0)])+\\\n",
" np.array([int(ii) for ii in (dfbot['Salinity:CTD [PSS-78]']>=0)]))>1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1983 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Pressure [decibar]']>=0),\n",
" np.sum(dfbot['Pressure:CTD [decibar]']>=0))\n",
"dfbot.loc[(dfbot['Pressure [decibar]']>=0)&\\\n",
" (dfbot['Pressure:CTD [decibar]']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2010 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Depth [metres]']>=0),\n",
" np.sum(dfbot['Depth:CTD [metres]']>=0))\n",
"dfbot.loc[(dfbot['Depth [metres]']>=0)&\\\n",
" (dfbot['Depth:CTD [metres]']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1975 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Oxygen:Dissolved:SBE [mL/L]']>=0),\n",
" np.sum(dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]']>=0))\n",
"dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [mL/L]']>=0)&\\\n",
" (dfbot['Oxygen:Dissolved:Volume:CTD [mL/L]']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1952 36\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" File Name | \n",
" Zone | \n",
" LOC:EVENT_NUMBER | \n",
" LOC:LATITUDE | \n",
" LOC:LONGITUDE | \n",
" LOC:WATER DEPTH | \n",
" ADM:SCIENTIST | \n",
" ADM:MISSION | \n",
" LOC:STATION | \n",
" ADM:PROJECT | \n",
" ... | \n",
" Fluorescence:URU:CTD [mg/m^3] | \n",
" Depth:Nominal [metres] | \n",
" Alkalinity:Total [umol/L] | \n",
" Flag:Alkalinity:Total | \n",
" Carbon:Dissolved:Inorganic [umol/kg] | \n",
" Flag:Carbon:Dissolved:Inorganic | \n",
" Salinity:T1:C1 [PSS-78] | \n",
" Fluorescence:URU:Wetlabs [mg/m^3] | \n",
" Bottle_Number | \n",
" Bottle:Firing_Sequence | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 59 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [File Name, Zone, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, ADM:MISSION, LOC:STATION, ADM:PROJECT, Zone.1, YYYY/MM/DD HH:MM:SS, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Transmissivity [*/metre], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], PAR:Reference [uE/m^2/sec], pH:SBE:Nominal, Salinity:T0:C0 [PSS-78], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, FIL:START TIME YYYY/MM/DD HH:MM:SS, Temperature:Secondary [deg C (ITS90)], Pressure:CTD [decibar], Depth:CTD [metres], Temperature:CTD [deg C (ITS90)], Salinity:CTD [PSS-78], Oxygen:Dissolved:Volume:CTD [mL/L], Oxygen:Dissolved:Mass:CTD [umol/kg], Fluorescence:URU:CTD [mg/m^3], Depth:Nominal [metres], Alkalinity:Total [umol/L], Flag:Alkalinity:Total, Carbon:Dissolved:Inorganic [umol/kg], Flag:Carbon:Dissolved:Inorganic, Salinity:T1:C1 [PSS-78], Fluorescence:URU:Wetlabs [mg/m^3], Bottle_Number, Bottle:Firing_Sequence]\n",
"Index: []\n",
"\n",
"[0 rows x 59 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# no rows returned, so there are no rows with both primary and secondary temperature values\n",
"print(np.sum(dfbot['Oxygen:Dissolved:SBE [umol/kg]']>=0),\n",
" np.sum(dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]']>=0))\n",
"dfbot.loc[(dfbot['Oxygen:Dissolved:SBE [umol/kg]']>=0)&\\\n",
" (dfbot['Oxygen:Dissolved:Mass:CTD [umol/kg]']>=0)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def subval(idf,colList):\n",
" # first value in colList should be the column you are going to keep\n",
" # follow with other columns that will be used to fill in when that column is NaN\n",
" # in order of precedence\n",
" if len(colList)==2:\n",
" idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
" else r[colList[1]] for i,r in idf.iterrows()]\n",
" elif len(colList)==3:\n",
" idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
" else r[colList[1]] if not pd.isna(r[colList[1]]) \\\n",
" else r[colList[2]] for i,r in idf.iterrows()]\n",
" else:\n",
" raise NotImplementedError('Add to code to handle this case')\n",
" idf.drop(columns=list(colList[1:]),inplace=True)\n",
" return idf"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# there are some duplicate columns here; handle them:\n",
"dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'YYYY/MM/DD HH:MM:SS'))\n",
"dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',\n",
" 'Temperature:Secondary [deg C (ITS90)]',\n",
" 'Temperature:CTD [deg C (ITS90)]'))\n",
"dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',\n",
" 'Salinity:T1:C1 [PSS-78]',\n",
" 'Salinity:CTD [PSS-78]'))\n",
"dfbot=subval(dfbot,('Pressure [decibar]',\n",
" 'Pressure:CTD [decibar]'))\n",
"dfbot=subval(dfbot,('Depth [metres]',\n",
" 'Depth:CTD [metres]'))\n",
"dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]',\n",
" 'Oxygen:Dissolved:Volume:CTD [mL/L]'))\n",
"dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [umol/kg]',\n",
" 'Oxygen:Dissolved:Mass:CTD [umol/kg]'))\n",
"dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]',\n",
" 'Fluorescence:URU:CTD [mg/m^3]',\n",
" 'Fluorescence:URU:Wetlabs [mg/m^3]'))\n",
"dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Oxygen:Dissolved:SBE [mL/L]':'Oxygen:Dissolved:CTD [mL/L]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Oxygen:Dissolved:SBE [umol/kg]':'Oxygen:Dissolved:CTD [umol/kg]'},\n",
" inplace=True)\n",
"dfbot.rename(columns={'Fluorescence:URU:Seapoint [mg/m^3]':'Fluorescence [mg/m^3]'},\n",
" inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# define a function that will be applied to the values in the index column;\n",
"# this makes it easier to drop non-data rows later\n",
"def convertIndex(val):\n",
" try:\n",
" x =int(val)\n",
" except ValueError:\n",
" x=np.nan\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading sheet 2018 CHEMTAX abs results \n"
]
}
],
"source": [
"# load the phytoplankton data with the following options:\n",
"# sheet_name='???? CHEMTAX abs results' -> choose the sheet\n",
"# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet\n",
"# skiprows=2 -> start reading at the 3rd row of the sheet, \n",
"# which contains the column headings\n",
"# converters={'Index': convertIndex,} -> apply the function defined above to the Index column\n",
"# verbose = True -> print extra information/ warnings/ errors\n",
"dfPhyto=pd.read_excel(pathPhyto,sheet_name='2018 CHEMTAX abs results ',usecols='A:I,T:AC',\n",
" skiprows=2,converters={'Index': convertIndex,},\n",
" verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Bin # | \n",
" Index | \n",
" Subgroup | \n",
" Cruise | \n",
" Month | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1.1 | \n",
" Diatoms-2.1 | \n",
" Prasinophytes.1 | \n",
" Cryptophytes.1 | \n",
" Dinoflagellates | \n",
" Haptophytes.1 | \n",
" Dictyochophytes | \n",
" Raphidophytes | \n",
" Cyanobacteria.1 | \n",
" TchlA.1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 48 | \n",
" 20 | \n",
" 44.0 | \n",
" 1 | \n",
" 2018-05 | \n",
" 2018-02-06 00:00:00 | \n",
" IND4 | \n",
" 283 | \n",
" A | \n",
" 0 | \n",
" 0.0649569 | \n",
" 0 | \n",
" 0.00302939 | \n",
" 0.00476828 | \n",
" 0.00010182 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.000196532 | \n",
" 0.0730529 | \n",
"
\n",
" \n",
" 49 | \n",
" 20 | \n",
" 45.0 | \n",
" 1 | \n",
" 2018-05 | \n",
" 2018-02-06 00:00:00 | \n",
" IND4 | \n",
" 283 | \n",
" B | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 50 | \n",
" 20 | \n",
" 42.0 | \n",
" 1 | \n",
" 2018-05 | \n",
" 2018-02-06 00:00:00 | \n",
" IND7 | \n",
" 258 | \n",
" A | \n",
" 0 | \n",
" 0.108673 | \n",
" 0 | \n",
" 0.0150159 | \n",
" 0.0356732 | \n",
" 0.000435557 | \n",
" 0.000280088 | \n",
" 0 | \n",
" 0 | \n",
" 0.000201243 | \n",
" 0.160279 | \n",
"
\n",
" \n",
" 51 | \n",
" 20 | \n",
" 43.0 | \n",
" 1 | \n",
" 2018-05 | \n",
" 2018-02-06 00:00:00 | \n",
" IND7 | \n",
" 258 | \n",
" B | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 52 | \n",
" Absolute Pigment Compositions - Bin # 6 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" From Sheet: OutR6 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 53 | \n",
" Tchl_a | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 54 | \n",
" Bin # | \n",
" NaN | \n",
" Subgroup | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" 55 | \n",
" 6 | \n",
" 52.0 | \n",
" 2 | \n",
" 2018-01 | \n",
" 2018-03-05 00:00:00 | \n",
" JF2 | \n",
" 515 | \n",
" A | \n",
" 2.311 | \n",
" 1.11284 | \n",
" 0.191776 | \n",
" 0.129091 | \n",
" 0.120473 | \n",
" 0.00356186 | \n",
" 0 | \n",
" 0.00256907 | \n",
" 0.0637685 | \n",
" 0 | \n",
" 1.62408 | \n",
"
\n",
" \n",
" 56 | \n",
" 6 | \n",
" 53.0 | \n",
" 2 | \n",
" 2018-01 | \n",
" 2018-03-05 00:00:00 | \n",
" JF2 | \n",
" 515 | \n",
" B | \n",
" 2.311 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 57 | \n",
" 6 | \n",
" 54.0 | \n",
" 2 | \n",
" 2018-01 | \n",
" 2018-03-06 00:00:00 | \n",
" 56 | \n",
" 556 | \n",
" A | \n",
" 2.218 | \n",
" 0.474503 | \n",
" 0.066797 | \n",
" 0.0240524 | \n",
" 0.0413634 | \n",
" 0 | \n",
" 0.101717 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.708433 | \n",
"
\n",
" \n",
" 58 | \n",
" 6 | \n",
" 55.0 | \n",
" 2 | \n",
" 2018-01 | \n",
" 2018-03-06 00:00:00 | \n",
" 56 | \n",
" 556 | \n",
" B | \n",
" 2.218 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 59 | \n",
" 6 | \n",
" 56.0 | \n",
" 2 | \n",
" 2018-01 | \n",
" 2018-03-06 00:00:00 | \n",
" 46 | \n",
" 559 | \n",
" A | \n",
" 2.596 | \n",
" 1.13658 | \n",
" 0.252949 | \n",
" 0.0463031 | \n",
" 0.0438257 | \n",
" 0 | \n",
" 0.230516 | \n",
" 0.0018858 | \n",
" 0 | \n",
" 0 | \n",
" 1.71206 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Bin # Index Subgroup Cruise \\\n",
"48 20 44.0 1 2018-05 \n",
"49 20 45.0 1 2018-05 \n",
"50 20 42.0 1 2018-05 \n",
"51 20 43.0 1 2018-05 \n",
"52 Absolute Pigment Compositions - Bin # 6 NaN NaN NaN \n",
"53 Tchl_a NaN NaN NaN \n",
"54 Bin # NaN Subgroup Cruise \n",
"55 6 52.0 2 2018-01 \n",
"56 6 53.0 2 2018-01 \n",
"57 6 54.0 2 2018-01 \n",
"58 6 55.0 2 2018-01 \n",
"59 6 56.0 2 2018-01 \n",
"\n",
" Month Station Sample# rep depth Diatoms-1.1 \\\n",
"48 2018-02-06 00:00:00 IND4 283 A 0 0.0649569 \n",
"49 2018-02-06 00:00:00 IND4 283 B 0 NaN \n",
"50 2018-02-06 00:00:00 IND7 258 A 0 0.108673 \n",
"51 2018-02-06 00:00:00 IND7 258 B 0 NaN \n",
"52 NaN From Sheet: OutR6 NaN NaN NaN NaN \n",
"53 NaN NaN NaN NaN NaN NaN \n",
"54 Date Station Sample# rep depth Diatoms-1 \n",
"55 2018-03-05 00:00:00 JF2 515 A 2.311 1.11284 \n",
"56 2018-03-05 00:00:00 JF2 515 B 2.311 NaN \n",
"57 2018-03-06 00:00:00 56 556 A 2.218 0.474503 \n",
"58 2018-03-06 00:00:00 56 556 B 2.218 NaN \n",
"59 2018-03-06 00:00:00 46 559 A 2.596 1.13658 \n",
"\n",
" Diatoms-2.1 Prasinophytes.1 Cryptophytes.1 Dinoflagellates \\\n",
"48 0 0.00302939 0.00476828 0.00010182 \n",
"49 NaN NaN NaN NaN \n",
"50 0 0.0150159 0.0356732 0.000435557 \n",
"51 NaN NaN NaN NaN \n",
"52 NaN NaN NaN NaN \n",
"53 NaN NaN NaN NaN \n",
"54 Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates-1 \n",
"55 0.191776 0.129091 0.120473 0.00356186 \n",
"56 NaN NaN NaN NaN \n",
"57 0.066797 0.0240524 0.0413634 0 \n",
"58 NaN NaN NaN NaN \n",
"59 0.252949 0.0463031 0.0438257 0 \n",
"\n",
" Haptophytes.1 Dictyochophytes Raphidophytes Cyanobacteria.1 TchlA.1 \n",
"48 0 0 0 0.000196532 0.0730529 \n",
"49 NaN NaN NaN NaN NaN \n",
"50 0.000280088 0 0 0.000201243 0.160279 \n",
"51 NaN NaN NaN NaN NaN \n",
"52 NaN NaN NaN NaN NaN \n",
"53 NaN NaN NaN NaN NaN \n",
"54 Haptophytes Dictyo Raphido Cyanobacteria TchlA \n",
"55 0 0.00256907 0.0637685 0 1.62408 \n",
"56 NaN NaN NaN NaN NaN \n",
"57 0.101717 0 0 0 0.708433 \n",
"58 NaN NaN NaN NaN NaN \n",
"59 0.230516 0.0018858 0 0 1.71206 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display rows 48 to 59 of the resulting table\n",
"dfPhyto[48:60]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# now, drop any rows from the table that have NaN values in either of the columns\n",
"# 'Index' or 'TchlA (ug/L)'\n",
"# This is why we applied a function to the Index column to make sure all \n",
"# non-numeric Index values would have a consistent NaN entry, making them easy to identify\n",
"# and remove\n",
"dfPhyto.dropna(subset=['Index', 'TchlA.1'],how='any',inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# pandas creates its own index, and after dropping rows I like to reset it -\n",
"# this is just for convenience\n",
"dfPhyto.reset_index(drop=True,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# apply formatting function all rows in Cruise column to get year-3digit format\n",
"dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Bin # | \n",
" Index | \n",
" Subgroup | \n",
" Cruise | \n",
" Month | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1.1 | \n",
" Diatoms-2.1 | \n",
" Prasinophytes.1 | \n",
" Cryptophytes.1 | \n",
" Dinoflagellates | \n",
" Haptophytes.1 | \n",
" Dictyochophytes | \n",
" Raphidophytes | \n",
" Cyanobacteria.1 | \n",
" TchlA.1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 48 | \n",
" 7 | \n",
" 95.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-07 00:00:00 | \n",
" 69 | \n",
" 36 | \n",
" A | \n",
" 0 | \n",
" 0.187208 | \n",
" 0.053063 | \n",
" 0.000209863 | \n",
" 0.064098 | \n",
" 0.000847701 | \n",
" 0 | \n",
" 0 | \n",
" 0.00279431 | \n",
" 0.000899571 | \n",
" 0.30912 | \n",
"
\n",
" \n",
" 49 | \n",
" 7 | \n",
" 99.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-07 00:00:00 | \n",
" ADCP | \n",
" 60 | \n",
" A | \n",
" 0 | \n",
" 0.19605 | \n",
" 0.0455146 | \n",
" 0.022445 | \n",
" 0.150537 | \n",
" 0.000617533 | \n",
" 0 | \n",
" 0.0101852 | \n",
" 0 | \n",
" 0 | \n",
" 0.42535 | \n",
"
\n",
" \n",
" 50 | \n",
" 7 | \n",
" 101.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-07 00:00:00 | \n",
" 65 | \n",
" 70 | \n",
" A | \n",
" 0 | \n",
" 0.190369 | \n",
" 0.0611141 | \n",
" 0.0298991 | \n",
" 0.13724 | \n",
" 0.00104771 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.41967 | \n",
"
\n",
" \n",
" 51 | \n",
" 7 | \n",
" 93.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-07 00:00:00 | \n",
" 59 | \n",
" 24 | \n",
" A | \n",
" 0 | \n",
" 0.624601 | \n",
" 0.0413709 | \n",
" 0 | \n",
" 0.0655129 | \n",
" 0.000867299 | \n",
" 0 | \n",
" 0 | \n",
" 0.00178241 | \n",
" 0 | \n",
" 0.734135 | \n",
"
\n",
" \n",
" 52 | \n",
" 7 | \n",
" 103.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-08 00:00:00 | \n",
" 56 | \n",
" 87 | \n",
" A | \n",
" 0 | \n",
" 0.92955 | \n",
" 0.00740467 | \n",
" 0 | \n",
" 0.0206606 | \n",
" 0.0600011 | \n",
" 0 | \n",
" 0.0201656 | \n",
" 0 | \n",
" 0 | \n",
" 1.03778 | \n",
"
\n",
" \n",
" 53 | \n",
" 7 | \n",
" 105.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-08 00:00:00 | \n",
" 46 | \n",
" 99 | \n",
" A | \n",
" 0 | \n",
" 3.42264 | \n",
" 0.392457 | \n",
" 0 | \n",
" 0.197309 | \n",
" 0.0680703 | \n",
" 0.150816 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 4.2313 | \n",
"
\n",
" \n",
" 54 | \n",
" 7 | \n",
" 107.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-08 00:00:00 | \n",
" 42 | \n",
" 115 | \n",
" A | \n",
" 0 | \n",
" 11.6617 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.00175876 | \n",
" 0.467956 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 12.1314 | \n",
"
\n",
" \n",
" 55 | \n",
" 7 | \n",
" 109.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-08 00:00:00 | \n",
" 39 | \n",
" 132 | \n",
" A | \n",
" 0 | \n",
" 9.28337 | \n",
" 0 | \n",
" 0 | \n",
" 0.00637019 | \n",
" 0.090733 | \n",
" 0.205921 | \n",
" 0 | \n",
" 0.00158903 | \n",
" 0 | \n",
" 9.58799 | \n",
"
\n",
" \n",
" 56 | \n",
" 7 | \n",
" 111.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-08 00:00:00 | \n",
" GEO1 | \n",
" 135 | \n",
" A | \n",
" 0 | \n",
" 8.67672 | \n",
" 0.0262928 | \n",
" 0 | \n",
" 0.146423 | \n",
" 0.221538 | \n",
" 0.415636 | \n",
" 0.0321994 | \n",
" 0 | \n",
" 0 | \n",
" 9.51881 | \n",
"
\n",
" \n",
" 57 | \n",
" 7 | \n",
" 113.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-09 00:00:00 | \n",
" 27 | \n",
" 151 | \n",
" A | \n",
" 0 | \n",
" 6.3555 | \n",
" 0.134811 | \n",
" 0 | \n",
" 0.234917 | \n",
" 0.19905 | \n",
" 0.418803 | \n",
" 0.0603483 | \n",
" 0 | \n",
" 0 | \n",
" 7.40343 | \n",
"
\n",
" \n",
" 58 | \n",
" 7 | \n",
" 115.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-09 00:00:00 | \n",
" CPF1 | \n",
" 154 | \n",
" A | \n",
" 0 | \n",
" 2.43778 | \n",
" 0.078365 | \n",
" 7.92647e-05 | \n",
" 0.00468596 | \n",
" 0.00262092 | \n",
" 0 | \n",
" 0 | \n",
" 0.00473855 | \n",
" 0.000425651 | \n",
" 2.52869 | \n",
"
\n",
" \n",
" 59 | \n",
" 7 | \n",
" 117.0 | \n",
" 3 | \n",
" 2018-029 | \n",
" 2018-04-09 00:00:00 | \n",
" 2 | \n",
" 169 | \n",
" A | \n",
" 0 | \n",
" 1.73363 | \n",
" 0.417418 | \n",
" 0.000101997 | \n",
" 0.198702 | \n",
" 0 | \n",
" 0.100577 | \n",
" 0 | \n",
" 0 | \n",
" 0.000486359 | \n",
" 2.45091 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Bin # Index Subgroup Cruise Month Station Sample# rep \\\n",
"48 7 95.0 3 2018-029 2018-04-07 00:00:00 69 36 A \n",
"49 7 99.0 3 2018-029 2018-04-07 00:00:00 ADCP 60 A \n",
"50 7 101.0 3 2018-029 2018-04-07 00:00:00 65 70 A \n",
"51 7 93.0 3 2018-029 2018-04-07 00:00:00 59 24 A \n",
"52 7 103.0 3 2018-029 2018-04-08 00:00:00 56 87 A \n",
"53 7 105.0 3 2018-029 2018-04-08 00:00:00 46 99 A \n",
"54 7 107.0 3 2018-029 2018-04-08 00:00:00 42 115 A \n",
"55 7 109.0 3 2018-029 2018-04-08 00:00:00 39 132 A \n",
"56 7 111.0 3 2018-029 2018-04-08 00:00:00 GEO1 135 A \n",
"57 7 113.0 3 2018-029 2018-04-09 00:00:00 27 151 A \n",
"58 7 115.0 3 2018-029 2018-04-09 00:00:00 CPF1 154 A \n",
"59 7 117.0 3 2018-029 2018-04-09 00:00:00 2 169 A \n",
"\n",
" depth Diatoms-1.1 Diatoms-2.1 Prasinophytes.1 Cryptophytes.1 \\\n",
"48 0 0.187208 0.053063 0.000209863 0.064098 \n",
"49 0 0.19605 0.0455146 0.022445 0.150537 \n",
"50 0 0.190369 0.0611141 0.0298991 0.13724 \n",
"51 0 0.624601 0.0413709 0 0.0655129 \n",
"52 0 0.92955 0.00740467 0 0.0206606 \n",
"53 0 3.42264 0.392457 0 0.197309 \n",
"54 0 11.6617 0 0 0 \n",
"55 0 9.28337 0 0 0.00637019 \n",
"56 0 8.67672 0.0262928 0 0.146423 \n",
"57 0 6.3555 0.134811 0 0.234917 \n",
"58 0 2.43778 0.078365 7.92647e-05 0.00468596 \n",
"59 0 1.73363 0.417418 0.000101997 0.198702 \n",
"\n",
" Dinoflagellates Haptophytes.1 Dictyochophytes Raphidophytes \\\n",
"48 0.000847701 0 0 0.00279431 \n",
"49 0.000617533 0 0.0101852 0 \n",
"50 0.00104771 0 0 0 \n",
"51 0.000867299 0 0 0.00178241 \n",
"52 0.0600011 0 0.0201656 0 \n",
"53 0.0680703 0.150816 0 0 \n",
"54 0.00175876 0.467956 0 0 \n",
"55 0.090733 0.205921 0 0.00158903 \n",
"56 0.221538 0.415636 0.0321994 0 \n",
"57 0.19905 0.418803 0.0603483 0 \n",
"58 0.00262092 0 0 0.00473855 \n",
"59 0 0.100577 0 0 \n",
"\n",
" Cyanobacteria.1 TchlA.1 \n",
"48 0.000899571 0.30912 \n",
"49 0 0.42535 \n",
"50 0 0.41967 \n",
"51 0 0.734135 \n",
"52 0 1.03778 \n",
"53 0 4.2313 \n",
"54 0 12.1314 \n",
"55 0 9.58799 \n",
"56 0 9.51881 \n",
"57 0 7.40343 \n",
"58 0.000425651 2.52869 \n",
"59 0.000486359 2.45091 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display part of the table, confirming that non-data rows have been removed\n",
"dfPhyto[48:60]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n",
"# of the phytoplankton columns; \n",
"# these lines correct the column names, removing the '.1':\n",
"renameDict=dict()\n",
"for colName in dfPhyto.keys():\n",
" if colName.endswith('.1'):\n",
" renameDict[colName]=colName.split('.1')[0]\n",
"dfPhyto.rename(columns=renameDict,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Bin # | \n",
" Index | \n",
" Subgroup | \n",
" Cruise | \n",
" Month | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates | \n",
" Haptophytes | \n",
" Dictyochophytes | \n",
" Raphidophytes | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 20 | \n",
" 1.0 | \n",
" 1 | \n",
" 2018-005 | \n",
" 2018-02-02 00:00:00 | \n",
" SI | \n",
" 9 | \n",
" A | \n",
" 0 | \n",
" 0.153863 | \n",
" 0.0714609 | \n",
" 0.166427 | \n",
" 0.0874306 | \n",
" 0 | \n",
" 0.185031 | \n",
" 0.0149052 | \n",
" 0.0734495 | \n",
" 0 | \n",
" 0.752567 | \n",
"
\n",
" \n",
" 1 | \n",
" 20 | \n",
" 3.0 | \n",
" 1 | \n",
" 2018-005 | \n",
" 2018-02-02 00:00:00 | \n",
" 59 | \n",
" 23 | \n",
" A | \n",
" 0 | \n",
" 0.205173 | \n",
" 0.0166544 | \n",
" 0.00668857 | \n",
" 0.00172965 | \n",
" 0.000179561 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 3.44658e-06 | \n",
" 0.230429 | \n",
"
\n",
" \n",
" 2 | \n",
" 20 | \n",
" 5.0 | \n",
" 1 | \n",
" 2018-005 | \n",
" 2018-02-02 00:00:00 | \n",
" 56 | \n",
" 37 | \n",
" A | \n",
" 0 | \n",
" 0.175883 | \n",
" 0.0100662 | \n",
" 0.013261 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1.1709e-05 | \n",
" 0 | \n",
" 0 | \n",
" 0.199222 | \n",
"
\n",
" \n",
" 3 | \n",
" 20 | \n",
" 7.0 | \n",
" 1 | \n",
" 2018-005 | \n",
" 2018-02-03 00:00:00 | \n",
" 46 | \n",
" 49 | \n",
" A | \n",
" 0 | \n",
" 0.326609 | \n",
" 0.0461415 | \n",
" 0.131239 | \n",
" 0.0709554 | \n",
" 0 | \n",
" 0.142245 | \n",
" 0.00767466 | \n",
" 0.0317834 | \n",
" 0 | \n",
" 0.756648 | \n",
"
\n",
" \n",
" 4 | \n",
" 20 | \n",
" 9.0 | \n",
" 1 | \n",
" 2018-005 | \n",
" 2018-02-03 00:00:00 | \n",
" 42 | \n",
" 65 | \n",
" A | \n",
" 0 | \n",
" 0.404061 | \n",
" 0.0270814 | \n",
" 0.095717 | \n",
" 0.135733 | \n",
" 0 | \n",
" 0 | \n",
" 0.0106509 | \n",
" 0 | \n",
" 0 | \n",
" 0.673244 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 221 | \n",
" 13 | \n",
" 424.0 | \n",
" 11 | \n",
" 2018-032 | \n",
" 2018-11-25 00:00:00 | \n",
" 9 | \n",
" 263 | \n",
" A | \n",
" 5 | \n",
" 0.186443 | \n",
" 0.000274801 | \n",
" 0.0594893 | \n",
" 0.0232346 | \n",
" 0.000438217 | \n",
" 0.00532654 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.275206 | \n",
"
\n",
" \n",
" 222 | \n",
" 13 | \n",
" 426.0 | \n",
" 11 | \n",
" 2018-032 | \n",
" 2018-11-25 00:00:00 | \n",
" 12 | \n",
" 281 | \n",
" A | \n",
" 0 | \n",
" 0.218516 | \n",
" 0 | \n",
" 0.10337 | \n",
" 0.104374 | \n",
" 0.000538635 | \n",
" 0.00475885 | \n",
" 8.41674e-05 | \n",
" 0 | \n",
" 0 | \n",
" 0.431642 | \n",
"
\n",
" \n",
" 223 | \n",
" 13 | \n",
" 428.0 | \n",
" 11 | \n",
" 2018-032 | \n",
" 2018-11-25 00:00:00 | \n",
" 14 | \n",
" 297 | \n",
" A | \n",
" 0 | \n",
" 0.137821 | \n",
" 0.000257954 | \n",
" 0.091518 | \n",
" 0.0677467 | \n",
" 0.000241816 | \n",
" 0.00206663 | \n",
" 7.29151e-05 | \n",
" 0 | \n",
" 0 | \n",
" 0.299725 | \n",
"
\n",
" \n",
" 224 | \n",
" 13 | \n",
" 430.0 | \n",
" 11 | \n",
" 2018-032 | \n",
" 2018-11-25 00:00:00 | \n",
" 16 | \n",
" 308 | \n",
" A | \n",
" 0 | \n",
" 0.159867 | \n",
" 0 | \n",
" 0.0727466 | \n",
" 0.0962076 | \n",
" 0.000445752 | \n",
" 0.00355829 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.332825 | \n",
"
\n",
" \n",
" 225 | \n",
" 13 | \n",
" 432.0 | \n",
" 11 | \n",
" 2018-032 | \n",
" 2018-11-25 00:00:00 | \n",
" 22 | \n",
" 311 | \n",
" A | \n",
" 0 | \n",
" 0.228341 | \n",
" 0 | \n",
" 0.154744 | \n",
" 0.162821 | \n",
" 0.00065139 | \n",
" 0.00554693 | \n",
" 0.0100801 | \n",
" 0 | \n",
" 0 | \n",
" 0.562185 | \n",
"
\n",
" \n",
"
\n",
"
226 rows × 19 columns
\n",
"
"
],
"text/plain": [
" Bin # Index Subgroup Cruise Month Station Sample# rep \\\n",
"0 20 1.0 1 2018-005 2018-02-02 00:00:00 SI 9 A \n",
"1 20 3.0 1 2018-005 2018-02-02 00:00:00 59 23 A \n",
"2 20 5.0 1 2018-005 2018-02-02 00:00:00 56 37 A \n",
"3 20 7.0 1 2018-005 2018-02-03 00:00:00 46 49 A \n",
"4 20 9.0 1 2018-005 2018-02-03 00:00:00 42 65 A \n",
".. ... ... ... ... ... ... ... .. \n",
"221 13 424.0 11 2018-032 2018-11-25 00:00:00 9 263 A \n",
"222 13 426.0 11 2018-032 2018-11-25 00:00:00 12 281 A \n",
"223 13 428.0 11 2018-032 2018-11-25 00:00:00 14 297 A \n",
"224 13 430.0 11 2018-032 2018-11-25 00:00:00 16 308 A \n",
"225 13 432.0 11 2018-032 2018-11-25 00:00:00 22 311 A \n",
"\n",
" depth Diatoms-1 Diatoms-2 Prasinophytes Cryptophytes Dinoflagellates \\\n",
"0 0 0.153863 0.0714609 0.166427 0.0874306 0 \n",
"1 0 0.205173 0.0166544 0.00668857 0.00172965 0.000179561 \n",
"2 0 0.175883 0.0100662 0.013261 0 0 \n",
"3 0 0.326609 0.0461415 0.131239 0.0709554 0 \n",
"4 0 0.404061 0.0270814 0.095717 0.135733 0 \n",
".. ... ... ... ... ... ... \n",
"221 5 0.186443 0.000274801 0.0594893 0.0232346 0.000438217 \n",
"222 0 0.218516 0 0.10337 0.104374 0.000538635 \n",
"223 0 0.137821 0.000257954 0.091518 0.0677467 0.000241816 \n",
"224 0 0.159867 0 0.0727466 0.0962076 0.000445752 \n",
"225 0 0.228341 0 0.154744 0.162821 0.00065139 \n",
"\n",
" Haptophytes Dictyochophytes Raphidophytes Cyanobacteria TchlA \n",
"0 0.185031 0.0149052 0.0734495 0 0.752567 \n",
"1 0 0 0 3.44658e-06 0.230429 \n",
"2 0 1.1709e-05 0 0 0.199222 \n",
"3 0.142245 0.00767466 0.0317834 0 0.756648 \n",
"4 0 0.0106509 0 0 0.673244 \n",
".. ... ... ... ... ... \n",
"221 0.00532654 0 0 0 0.275206 \n",
"222 0.00475885 8.41674e-05 0 0 0.431642 \n",
"223 0.00206663 7.29151e-05 0 0 0.299725 \n",
"224 0.00355829 0 0 0 0.332825 \n",
"225 0.00554693 0.0100801 0 0 0.562185 \n",
"\n",
"[226 rows x 19 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfPhyto"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# This is the important step- join the two tables ('left' and 'right'), \n",
"# matching the cruise IDs and sample numbers\n",
"# how='outer' -> all rows from both the left and the right tables will be included, \n",
"# even if they cannot be matched; this makes it easy to check for \n",
"# unmatched data later\n",
"# left_on specifies the name of the column to match in the left table (dfbot) \n",
"# right_on specifies the name of the column to match in the right table (dfPhyto)\n",
"dfout = pd.merge(dfbot, dfPhyto, how='outer', \n",
" left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n",
"dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE',\n",
" 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION',\n",
" 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'Sample_Number',\n",
" 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]',\n",
" 'Transmissivity [*/metre]', 'Fluorescence [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
" 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity [PSS-78]',\n",
" 'Oxygen:Dissolved:CTD [mL/L]', 'Oxygen:Dissolved:CTD [umol/kg]',\n",
" 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]',\n",
" 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]',\n",
" 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]',\n",
" 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',\n",
" 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]',\n",
" 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',\n",
" 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',\n",
" 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]',\n",
" 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]',\n",
" 'Flag:Carbon:Dissolved:Inorganic', 'Bottle_Number',\n",
" 'Bottle:Firing_Sequence', 'Bin #', 'Index', 'Subgroup', 'Month',\n",
" 'Station', 'Sample#', 'rep', 'depth', 'Diatoms-1', 'Diatoms-2',\n",
" 'Prasinophytes', 'Cryptophytes', 'Dinoflagellates', 'Haptophytes',\n",
" 'Dictyochophytes', 'Raphidophytes', 'Cyanobacteria', 'TchlA'],\n",
" dtype='object')"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show the column names in the resulting table\n",
"dfout.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checks"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n",
"temp.columns = ['icount']"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.unique(temp.icount)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for Phyto samples matched to multiple bottle samples:\n",
"temp.loc[temp.icount>1]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for phyto samples not matched to bottle samples:\n",
"temp.loc[temp.icount==0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample_Number | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 2018-035 | \n",
" 13 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" icount\n",
"Cruise Sample_Number \n",
"2018-035 13 2"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n",
"temp2.columns = ['icount']\n",
"# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n",
"temp2.loc[temp2.icount>1]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for phyto samples not matched to bottle samples:\n",
"temp.loc[temp.icount==0]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2082, 226, 2080)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# if the output table is longer than either of the input tables, some columns were not matched\n",
"len(dfout), len(dfPhyto), len(dfbot)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(226, 226)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check that the number of cells with data in the 'Cyanobacteria' column is \n",
"# the same for the input and output tables to show that no rows are missing:\n",
"np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2080 141.0\n",
"2081 192.0\n",
"Name: Index, dtype: float64\n"
]
}
],
"source": [
"# If there were data rows from the phytoplankton table that were not matched to \n",
"# rows from the bottle table, their indices from the phytoplankton table would be \n",
"# displayed below (the series [] would not be empty)\n",
"print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Index | \n",
" Cruise | \n",
" Diatoms-1 | \n",
" Prasinophytes | \n",
"
\n",
" \n",
" \n",
" \n",
" 2080 | \n",
" 141.0 | \n",
" 2018-035 | \n",
" 3.75868 | \n",
" 0.36334 | \n",
"
\n",
" \n",
" 2081 | \n",
" 192.0 | \n",
" 2018-039 | \n",
" 0.546726 | \n",
" 0.00136988 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Index Cruise Diatoms-1 Prasinophytes\n",
"2080 141.0 2018-035 3.75868 0.36334\n",
"2081 192.0 2018-039 0.546726 0.00136988"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout.loc[dfout['ADM:SCIENTIST'].isna(),['Index','Cruise','Diatoms-1','Prasinophytes']]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# drop repetetive/unecessary columns:\n",
"dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep',\n",
" 'depth',],axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# truncate phyto group values to 3 decimal places:\n",
"for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n",
" 'Diatoms-2', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes', 'Raphidophytes',\n",
" 'TchlA'):\n",
" dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 NaN\n",
"1 NaN\n",
"2 NaN\n",
"3 NaN\n",
"4 NaN\n",
" ... \n",
"2077 NaN\n",
"2078 NaN\n",
"2079 0.000\n",
"2080 0.047\n",
"2081 0.001\n",
"Name: Cyanobacteria, Length: 2082, dtype: float64"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout['Cyanobacteria']"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# now write the output table to a .csv file:\n",
"dfout.to_csv(pathOut, index=False) "
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['File Name', 'Zone', 'LOC:EVENT_NUMBER', 'LOC:LATITUDE',\n",
" 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'ADM:MISSION',\n",
" 'LOC:STATION', 'ADM:PROJECT', 'Zone.1', 'Sample_Number',\n",
" 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]',\n",
" 'Transmissivity [*/metre]', 'Fluorescence [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
" 'PAR:Reference [uE/m^2/sec]', 'pH:SBE:Nominal', 'Salinity [PSS-78]',\n",
" 'Oxygen:Dissolved:CTD [mL/L]', 'Oxygen:Dissolved:CTD [umol/kg]',\n",
" 'Temperature:Draw [deg C (ITS90)]', 'Salinity:Bottle [PSS-78]',\n",
" 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted [mg/m^3]',\n",
" 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted [mg/m^3]',\n",
" 'Oxygen:Dissolved [mL/L]', 'Oxygen:Dissolved [umol/kg]',\n",
" 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite [umol/L]',\n",
" 'Flag:Nitrate_plus_Nitrite', 'Silicate [umol/L]', 'Flag:Silicate',\n",
" 'Phosphate [umol/L]', 'Flag:Phosphate', 'Comments by sample_numbeR',\n",
" 'Cruise', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'Depth:Nominal [metres]', 'Alkalinity:Total [umol/L]',\n",
" 'Flag:Alkalinity:Total', 'Carbon:Dissolved:Inorganic [umol/kg]',\n",
" 'Flag:Carbon:Dissolved:Inorganic', 'Bottle_Number',\n",
" 'Bottle:Firing_Sequence', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes',\n",
" 'Cryptophytes', 'Dinoflagellates', 'Haptophytes', 'Dictyochophytes',\n",
" 'Raphidophytes', 'Cyanobacteria', 'TchlA'],\n",
" dtype='object')"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (py38_ct)",
"language": "python",
"name": "py38_ct"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}