{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np # this module handles arrays, but here we need it for its NaN value\n",
"import pandas as pd # this module contains a lot of tools for handling tabular data\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# define paths to the source files and eventual output file\n",
"pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2016 SoG bottle.xlsx'\n",
"pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'\n",
"pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2016_NewALLO.csv'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#formatting function to get year-(3digit) format\n",
"def fmtCruise(istr):\n",
" if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): \n",
" sp=re.split('-',istr)\n",
" rstr=sp[0]+'-0'+sp[1]\n",
" elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):\n",
" rstr=istr\n",
" else:\n",
" raise ValueError('Input had unexpected format:',istr)\n",
" return rstr"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['2016-05', '2016-47', '2016-07', '2016-62', '2016-010', '2016-071']\n"
]
}
],
"source": [
"# get names of sheets in notebook\n",
"with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:\n",
" sheets=xl.sheet_names\n",
"print(sheets)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading sheet 2016-05\n",
"Reading sheet 2016-47\n",
"Reading sheet 2016-07\n",
"Reading sheet 2016-62\n",
"Reading sheet 2016-010\n",
"Reading sheet 2016-071\n",
"removed empty columns: {'Flag:Ammonium', 'Oxygen:Dissolved.1', 'Ammonium'}\n"
]
}
],
"source": [
"# load each sheet in the 2015 bottle Excel file and concatenate them together into one table\n",
"dfbotlist=list()\n",
"for sheet in sheets:\n",
" df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',\n",
" na_values=(-99,-99.9)) # read each sheet; include additional na values\n",
" df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name\n",
" dfbotlist.append(df0) # append the sheet to a list\n",
"dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table\n",
"# Drop columns with no data in them\n",
"l1=set(dfbot.keys())\n",
"dfbot.dropna(axis=1,how='all',inplace=True)\n",
"print('removed empty columns:',l1-set(dfbot.keys()))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',\n",
" 'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'Pressure',\n",
" 'Temperature:Secondary', 'Transmissivity', 'Fluorescence:URU:Seapoint',\n",
" 'PAR', 'PAR:Reference', 'pH:SBE:Nominal', 'Salinity:T1:C1',\n",
" 'Oxygen:Dissolved:SBE', 'Temperature:Draw', 'Salinity:Bottle',\n",
" 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n",
" 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',\n",
" 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved', 'Nitrate_plus_Nitrite',\n",
" 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n",
" 'Flag:Phosphate', 'Cruise', 'PAR1', 'Sample_Number.1',\n",
" 'Number_of_bin_records', 'YYYY/MM/DD HH:MM:SS', 'ADM:MISSION',\n",
" 'ADM:PROJECT', 'Pressure [decibar]',\n",
" 'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
" 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
" 'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]',\n",
" 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n",
" 'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]',\n",
" 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n",
" 'Silicate [umol/L]', 'Phosphate [umol/L]', 'Bottle_Number',\n",
" 'Bottle:Firing_Sequence', 'Oxygen:Dissolved [mL/L]',\n",
" 'Oxygen:Dissolved [umol/kg]'],\n",
" dtype='object')\n"
]
}
],
"source": [
"# list the column names in the resulting table\n",
"print(dfbot.keys())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def subval(idf,colList):\n",
" # first value in colList should be the column you are going to keep\n",
" # follow with other columns that will be used to fill in when that column is NaN\n",
" # in order of precedence\n",
" if len(colList)==2:\n",
" idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
" else r[colList[1]] for i,r in idf.iterrows()]\n",
" elif len(colList)==3:\n",
" idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n",
" else r[colList[1]] if not pd.isna(r[colList[1]]) \\\n",
" else r[colList[2]] for i,r in idf.iterrows()]\n",
" else:\n",
" raise NotImplementedError('Add to code to handle this case')\n",
" return idf"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0 : zero means no difference between Sample_Number and Sample_number.1 columns containing data\n",
"0 : zero means no missing values in Sample_Number\n"
]
}
],
"source": [
"# there are some duplicate columns here; handle them:\n",
"print(np.max(np.abs(dfbot['Sample_Number']-dfbot['Sample_Number.1'])),\n",
" ': zero means no difference between Sample_Number and Sample_number.1 columns containing data')\n",
"# check there are no rows missing a value in 'Sample_Number':\n",
"print(np.sum(pd.isna(dfbot['Sample_Number'])),\n",
" ': zero means no missing values in Sample_Number')\n",
"# -> extra copy is repeat so drop it\n",
"# 'ADM:MISSION' contains cruise id so drop it\n",
"# 'YYYY/MM/DD HH:MM:SS' data should be in 'FIL:START TIME YYYY/MM/DD HH:MM:SS'; move it there:\n",
"dfbot=subval(dfbot,('FIL:START TIME YYYY/MM/DD HH:MM:SS','YYYY/MM/DD HH:MM:SS'))\n",
"dfbot=subval(dfbot,('Pressure [decibar]','Pressure'))\n",
"dfbot=subval(dfbot,('Temperature:Secondary [deg C (ITS90)]','Temperature:Secondary'))\n",
"dfbot=subval(dfbot,('Transmissivity [*/metre]','Transmissivity'))\n",
"dfbot=subval(dfbot,('Fluorescence:URU:Seapoint [mg/m^3]','Fluorescence:URU:Seapoint'))\n",
"dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR'))\n",
"dfbot=subval(dfbot,('PAR:Reference [uE/m^2/sec]','PAR:Reference'))\n",
"dfbot=subval(dfbot,('Salinity:T1:C1 [PSS-78]','Salinity:T1:C1'))\n",
"dfbot=subval(dfbot,('Salinity:Bottle [PSS-78]','Salinity:Bottle'))\n",
"dfbot=subval(dfbot,('Chlorophyll:Extracted [mg/m^3]','Chlorophyll:Extracted'))\n",
"dfbot=subval(dfbot,('Phaeo-Pigment:Extracted [mg/m^3]','Phaeo-Pigment:Extracted'))\n",
"dfbot=subval(dfbot,('Oxygen:Dissolved:SBE [mL/L]','Oxygen:Dissolved:SBE'))\n",
"dfbot=subval(dfbot,('Oxygen:Dissolved [mL/L]','Oxygen:Dissolved'))\n",
"dfbot=subval(dfbot,('Nitrate_plus_Nitrite [umol/L]','Nitrate_plus_Nitrite'))\n",
"dfbot=subval(dfbot,('Silicate [umol/L]','Silicate'))\n",
"dfbot=subval(dfbot,('Phosphate [umol/L]','Phosphate'))\n",
"dfbot=subval(dfbot,('PAR [uE/m^2/sec]','PAR1'))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# drop repetetive/unecessary columns:\n",
"dfbot.drop(labels=['Sample_Number.1','ADM:MISSION','YYYY/MM/DD HH:MM:SS','Transmissivity',\n",
" 'Pressure','Temperature:Secondary','Fluorescence:URU:Seapoint','PAR',\n",
" 'PAR:Reference','Salinity:T1:C1','Chlorophyll:Extracted',\n",
" 'Oxygen:Dissolved:SBE','Oxygen:Dissolved',\n",
" 'Nitrate_plus_Nitrite','Silicate','PAR1'],axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# define a function that will be applied to the values in the index column;\n",
"# this makes it easier to drop non-data rows later\n",
"def convertIndex(val):\n",
" try:\n",
" x =int(val)\n",
" except ValueError:\n",
" x=np.nan\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading sheet 2016\n"
]
}
],
"source": [
"# load the 2015 phytoplankton data with the following options:\n",
"# sheet_name='2015 CHEMTAX abs results' -> choose the 2015 sheet\n",
"# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet\n",
"# skiprows=2 -> start reading at the 3rd row of the sheet, \n",
"# which contains the column headings\n",
"# converters={'Index': convertIndex,} -> apply the function defined above to the Index column\n",
"# verbose = True -> print extra information/ warnings/ errors\n",
"dfPhyto=pd.read_excel(pathPhyto,sheet_name='2016',engine='openpyxl',\n",
" skiprows=0,converters={'Sample#': convertIndex,},\n",
" verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 48 | \n",
" 2016-007 | \n",
" June | \n",
" 14 | \n",
" 306.0 | \n",
" mean | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.215782 | \n",
" 0.568259 | \n",
" 0.155735 | \n",
" 1.02837 | \n",
" 0 | \n",
" 0 | \n",
" 0.00485905 | \n",
" 1.973 | \n",
"
\n",
" \n",
" 49 | \n",
" 2016-007 | \n",
" June | \n",
" 16 | \n",
" 318.0 | \n",
" mean | \n",
" 0 | \n",
" 0.176595 | \n",
" 0 | \n",
" 0.197922 | \n",
" 0.556587 | \n",
" 0.129862 | \n",
" 0.704256 | \n",
" 0.0753036 | \n",
" 0.0859203 | \n",
" 0.0115539 | \n",
" 1.938 | \n",
"
\n",
" \n",
" 50 | \n",
" 2016-007 | \n",
" June | \n",
" OKE | \n",
" 329.0 | \n",
" mean | \n",
" 0 | \n",
" 0.534565 | \n",
" 1.30459 | \n",
" 0.310359 | \n",
" 1.03751 | \n",
" 0.435442 | \n",
" 2.29758 | \n",
" 2.14958 | \n",
" 0.247075 | \n",
" 0.0502969 | \n",
" 8.367 | \n",
"
\n",
" \n",
" 51 | \n",
" 2016-007 | \n",
" June | \n",
" 22 | \n",
" 333.0 | \n",
" mean | \n",
" 0 | \n",
" 0.748735 | \n",
" 2.2245 | \n",
" 0.0510914 | \n",
" 0.270685 | \n",
" 0.417001 | \n",
" 0.156895 | \n",
" 3.49932 | \n",
" 0.0870553 | \n",
" 0.0197189 | \n",
" 7.475 | \n",
"
\n",
" \n",
" 52 | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" NaN | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" 53 | \n",
" 2016-047 | \n",
" June | \n",
" 14 | \n",
" 438.0 | \n",
" mean | \n",
" 0 | \n",
" 1.03723 | \n",
" 0.402856 | \n",
" 0.0826985 | \n",
" 0.215331 | \n",
" 0.117579 | \n",
" 0.224789 | \n",
" 0.037861 | \n",
" 0.0581045 | \n",
" 0.0265498 | \n",
" 2.203 | \n",
"
\n",
" \n",
" 54 | \n",
" 2016-047 | \n",
" June | \n",
" 11 | \n",
" 449.0 | \n",
" mean | \n",
" 0 | \n",
" 0.37708 | \n",
" 0.165123 | \n",
" 0.087552 | \n",
" 0.113769 | \n",
" 0.101932 | \n",
" 0.26103 | \n",
" 0.0118683 | \n",
" 0.0485396 | \n",
" 0.0161065 | \n",
" 1.183 | \n",
"
\n",
" \n",
" 55 | \n",
" 2016-047 | \n",
" June | \n",
" CPF1 | \n",
" 452.0 | \n",
" mean | \n",
" 0 | \n",
" 0.00981477 | \n",
" 0 | \n",
" 0.125556 | \n",
" 0.143086 | \n",
" 0.0616604 | \n",
" 0.287119 | \n",
" 0 | \n",
" 0.0535119 | \n",
" 0.00525178 | \n",
" 0.686 | \n",
"
\n",
" \n",
" 56 | \n",
" 2016-047 | \n",
" June | \n",
" CPF2 | \n",
" 455.0 | \n",
" mean | \n",
" 0 | \n",
" 0.223128 | \n",
" 0 | \n",
" 0.0825152 | \n",
" 0.324354 | \n",
" 0.067458 | \n",
" 0.251326 | \n",
" 0.0179209 | \n",
" 0.125422 | \n",
" 0.026876 | \n",
" 1.119 | \n",
"
\n",
" \n",
" 57 | \n",
" 2016-047 | \n",
" June | \n",
" 22 | \n",
" 466.0 | \n",
" mean | \n",
" 0 | \n",
" 0.612099 | \n",
" 0.124002 | \n",
" 0.0585194 | \n",
" 0.158803 | \n",
" 0.0759229 | \n",
" 0.119778 | \n",
" 0.0561802 | \n",
" 0.0190252 | \n",
" 0.00866994 | \n",
" 1.233 | \n",
"
\n",
" \n",
" 58 | \n",
" 2016-047 | \n",
" June | \n",
" 24 | \n",
" 469.0 | \n",
" mean | \n",
" 0 | \n",
" 0.696984 | \n",
" 0.106455 | \n",
" 0.0752098 | \n",
" 0.160245 | \n",
" 0.0776968 | \n",
" 0.182455 | \n",
" 0.0441467 | \n",
" 0.0226451 | \n",
" 0.00516366 | \n",
" 1.371 | \n",
"
\n",
" \n",
" 59 | \n",
" 2016-047 | \n",
" June | \n",
" 28 | \n",
" 472.0 | \n",
" mean | \n",
" 0 | \n",
" 0.818837 | \n",
" 0 | \n",
" 0.0915953 | \n",
" 0.174323 | \n",
" 0.055937 | \n",
" 0.0619514 | \n",
" 0.0265586 | \n",
" 0.0612878 | \n",
" 0.0205105 | \n",
" 1.311 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Cruise Date Station Sample# rep depth Diatoms-1 Diatoms-2 \\\n",
"48 2016-007 June 14 306.0 mean 0 0 0 \n",
"49 2016-007 June 16 318.0 mean 0 0.176595 0 \n",
"50 2016-007 June OKE 329.0 mean 0 0.534565 1.30459 \n",
"51 2016-007 June 22 333.0 mean 0 0.748735 2.2245 \n",
"52 Cruise Date Station NaN rep depth Diatoms-1 Diatoms-2 \n",
"53 2016-047 June 14 438.0 mean 0 1.03723 0.402856 \n",
"54 2016-047 June 11 449.0 mean 0 0.37708 0.165123 \n",
"55 2016-047 June CPF1 452.0 mean 0 0.00981477 0 \n",
"56 2016-047 June CPF2 455.0 mean 0 0.223128 0 \n",
"57 2016-047 June 22 466.0 mean 0 0.612099 0.124002 \n",
"58 2016-047 June 24 469.0 mean 0 0.696984 0.106455 \n",
"59 2016-047 June 28 472.0 mean 0 0.818837 0 \n",
"\n",
" Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \\\n",
"48 0.215782 0.568259 0.155735 1.02837 0 \n",
"49 0.197922 0.556587 0.129862 0.704256 0.0753036 \n",
"50 0.310359 1.03751 0.435442 2.29758 2.14958 \n",
"51 0.0510914 0.270685 0.417001 0.156895 3.49932 \n",
"52 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \n",
"53 0.0826985 0.215331 0.117579 0.224789 0.037861 \n",
"54 0.087552 0.113769 0.101932 0.26103 0.0118683 \n",
"55 0.125556 0.143086 0.0616604 0.287119 0 \n",
"56 0.0825152 0.324354 0.067458 0.251326 0.0179209 \n",
"57 0.0585194 0.158803 0.0759229 0.119778 0.0561802 \n",
"58 0.0752098 0.160245 0.0776968 0.182455 0.0441467 \n",
"59 0.0915953 0.174323 0.055937 0.0619514 0.0265586 \n",
"\n",
" Raphido Cyanobacteria TchlA \n",
"48 0 0.00485905 1.973 \n",
"49 0.0859203 0.0115539 1.938 \n",
"50 0.247075 0.0502969 8.367 \n",
"51 0.0870553 0.0197189 7.475 \n",
"52 Raphido Cyanobacteria TchlA \n",
"53 0.0581045 0.0265498 2.203 \n",
"54 0.0485396 0.0161065 1.183 \n",
"55 0.0535119 0.00525178 0.686 \n",
"56 0.125422 0.026876 1.119 \n",
"57 0.0190252 0.00866994 1.233 \n",
"58 0.0226451 0.00516366 1.371 \n",
"59 0.0612878 0.0205105 1.311 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display rows 48 to 59 of the resulting table\n",
"dfPhyto[48:60]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# now, drop any rows from the table that have NaN values in either of the columns\n",
"# 'Index' or 'TchlA (ug/L)'\n",
"# This is why we applied a function to the Index column to make sure all \n",
"# non-numeric Index values would have a consistent NaN entry, making them easy to identify\n",
"# and remove\n",
"dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# pandas creates its own index, and after dropping rows I like to reset it -\n",
"# this is just for convenience\n",
"dfPhyto.reset_index(drop=True,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# apply formatting function all rows in Cruise column to get year-3digit format\n",
"dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 48 | \n",
" 2016-007 | \n",
" June | \n",
" 16 | \n",
" 318.0 | \n",
" mean | \n",
" 0 | \n",
" 0.176595 | \n",
" 0 | \n",
" 0.197922 | \n",
" 0.556587 | \n",
" 0.129862 | \n",
" 0.704256 | \n",
" 0.0753036 | \n",
" 0.0859203 | \n",
" 0.0115539 | \n",
" 1.938 | \n",
"
\n",
" \n",
" 49 | \n",
" 2016-007 | \n",
" June | \n",
" OKE | \n",
" 329.0 | \n",
" mean | \n",
" 0 | \n",
" 0.534565 | \n",
" 1.30459 | \n",
" 0.310359 | \n",
" 1.03751 | \n",
" 0.435442 | \n",
" 2.29758 | \n",
" 2.14958 | \n",
" 0.247075 | \n",
" 0.0502969 | \n",
" 8.367 | \n",
"
\n",
" \n",
" 50 | \n",
" 2016-007 | \n",
" June | \n",
" 22 | \n",
" 333.0 | \n",
" mean | \n",
" 0 | \n",
" 0.748735 | \n",
" 2.2245 | \n",
" 0.0510914 | \n",
" 0.270685 | \n",
" 0.417001 | \n",
" 0.156895 | \n",
" 3.49932 | \n",
" 0.0870553 | \n",
" 0.0197189 | \n",
" 7.475 | \n",
"
\n",
" \n",
" 51 | \n",
" 2016-047 | \n",
" June | \n",
" 14 | \n",
" 438.0 | \n",
" mean | \n",
" 0 | \n",
" 1.03723 | \n",
" 0.402856 | \n",
" 0.0826985 | \n",
" 0.215331 | \n",
" 0.117579 | \n",
" 0.224789 | \n",
" 0.037861 | \n",
" 0.0581045 | \n",
" 0.0265498 | \n",
" 2.203 | \n",
"
\n",
" \n",
" 52 | \n",
" 2016-047 | \n",
" June | \n",
" 11 | \n",
" 449.0 | \n",
" mean | \n",
" 0 | \n",
" 0.37708 | \n",
" 0.165123 | \n",
" 0.087552 | \n",
" 0.113769 | \n",
" 0.101932 | \n",
" 0.26103 | \n",
" 0.0118683 | \n",
" 0.0485396 | \n",
" 0.0161065 | \n",
" 1.183 | \n",
"
\n",
" \n",
" 53 | \n",
" 2016-047 | \n",
" June | \n",
" CPF1 | \n",
" 452.0 | \n",
" mean | \n",
" 0 | \n",
" 0.00981477 | \n",
" 0 | \n",
" 0.125556 | \n",
" 0.143086 | \n",
" 0.0616604 | \n",
" 0.287119 | \n",
" 0 | \n",
" 0.0535119 | \n",
" 0.00525178 | \n",
" 0.686 | \n",
"
\n",
" \n",
" 54 | \n",
" 2016-047 | \n",
" June | \n",
" CPF2 | \n",
" 455.0 | \n",
" mean | \n",
" 0 | \n",
" 0.223128 | \n",
" 0 | \n",
" 0.0825152 | \n",
" 0.324354 | \n",
" 0.067458 | \n",
" 0.251326 | \n",
" 0.0179209 | \n",
" 0.125422 | \n",
" 0.026876 | \n",
" 1.119 | \n",
"
\n",
" \n",
" 55 | \n",
" 2016-047 | \n",
" June | \n",
" 22 | \n",
" 466.0 | \n",
" mean | \n",
" 0 | \n",
" 0.612099 | \n",
" 0.124002 | \n",
" 0.0585194 | \n",
" 0.158803 | \n",
" 0.0759229 | \n",
" 0.119778 | \n",
" 0.0561802 | \n",
" 0.0190252 | \n",
" 0.00866994 | \n",
" 1.233 | \n",
"
\n",
" \n",
" 56 | \n",
" 2016-047 | \n",
" June | \n",
" 24 | \n",
" 469.0 | \n",
" mean | \n",
" 0 | \n",
" 0.696984 | \n",
" 0.106455 | \n",
" 0.0752098 | \n",
" 0.160245 | \n",
" 0.0776968 | \n",
" 0.182455 | \n",
" 0.0441467 | \n",
" 0.0226451 | \n",
" 0.00516366 | \n",
" 1.371 | \n",
"
\n",
" \n",
" 57 | \n",
" 2016-047 | \n",
" June | \n",
" 28 | \n",
" 472.0 | \n",
" mean | \n",
" 0 | \n",
" 0.818837 | \n",
" 0 | \n",
" 0.0915953 | \n",
" 0.174323 | \n",
" 0.055937 | \n",
" 0.0619514 | \n",
" 0.0265586 | \n",
" 0.0612878 | \n",
" 0.0205105 | \n",
" 1.311 | \n",
"
\n",
" \n",
" 58 | \n",
" 2016-047 | \n",
" June | \n",
" GE01 | \n",
" 483.0 | \n",
" mean | \n",
" 0 | \n",
" 1.71173 | \n",
" 0 | \n",
" 0.120347 | \n",
" 0.210731 | \n",
" 0.0864201 | \n",
" 0.0260724 | \n",
" 0.0207969 | \n",
" 0.0199069 | \n",
" 0 | \n",
" 2.196 | \n",
"
\n",
" \n",
" 59 | \n",
" 2016-047 | \n",
" June | \n",
" 38 | \n",
" 486.0 | \n",
" mean | \n",
" 0 | \n",
" 6.43463 | \n",
" 0 | \n",
" 0.186531 | \n",
" 0.432792 | \n",
" 0.137378 | \n",
" 0 | \n",
" 0 | \n",
" 0.196669 | \n",
" 0 | \n",
" 7.388 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Cruise Date Station Sample# rep depth Diatoms-1 Diatoms-2 \\\n",
"48 2016-007 June 16 318.0 mean 0 0.176595 0 \n",
"49 2016-007 June OKE 329.0 mean 0 0.534565 1.30459 \n",
"50 2016-007 June 22 333.0 mean 0 0.748735 2.2245 \n",
"51 2016-047 June 14 438.0 mean 0 1.03723 0.402856 \n",
"52 2016-047 June 11 449.0 mean 0 0.37708 0.165123 \n",
"53 2016-047 June CPF1 452.0 mean 0 0.00981477 0 \n",
"54 2016-047 June CPF2 455.0 mean 0 0.223128 0 \n",
"55 2016-047 June 22 466.0 mean 0 0.612099 0.124002 \n",
"56 2016-047 June 24 469.0 mean 0 0.696984 0.106455 \n",
"57 2016-047 June 28 472.0 mean 0 0.818837 0 \n",
"58 2016-047 June GE01 483.0 mean 0 1.71173 0 \n",
"59 2016-047 June 38 486.0 mean 0 6.43463 0 \n",
"\n",
" Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \\\n",
"48 0.197922 0.556587 0.129862 0.704256 0.0753036 \n",
"49 0.310359 1.03751 0.435442 2.29758 2.14958 \n",
"50 0.0510914 0.270685 0.417001 0.156895 3.49932 \n",
"51 0.0826985 0.215331 0.117579 0.224789 0.037861 \n",
"52 0.087552 0.113769 0.101932 0.26103 0.0118683 \n",
"53 0.125556 0.143086 0.0616604 0.287119 0 \n",
"54 0.0825152 0.324354 0.067458 0.251326 0.0179209 \n",
"55 0.0585194 0.158803 0.0759229 0.119778 0.0561802 \n",
"56 0.0752098 0.160245 0.0776968 0.182455 0.0441467 \n",
"57 0.0915953 0.174323 0.055937 0.0619514 0.0265586 \n",
"58 0.120347 0.210731 0.0864201 0.0260724 0.0207969 \n",
"59 0.186531 0.432792 0.137378 0 0 \n",
"\n",
" Raphido Cyanobacteria TchlA \n",
"48 0.0859203 0.0115539 1.938 \n",
"49 0.247075 0.0502969 8.367 \n",
"50 0.0870553 0.0197189 7.475 \n",
"51 0.0581045 0.0265498 2.203 \n",
"52 0.0485396 0.0161065 1.183 \n",
"53 0.0535119 0.00525178 0.686 \n",
"54 0.125422 0.026876 1.119 \n",
"55 0.0190252 0.00866994 1.233 \n",
"56 0.0226451 0.00516366 1.371 \n",
"57 0.0612878 0.0205105 1.311 \n",
"58 0.0199069 0 2.196 \n",
"59 0.196669 0 7.388 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# display part of the table, confirming that non-data rows have been removed\n",
"dfPhyto[48:60]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n",
"# of the phytoplankton columns; \n",
"# these lines correct the column names, removing the '.1':\n",
"renameDict=dict()\n",
"for colName in dfPhyto.keys():\n",
" if colName.endswith('.1'):\n",
" renameDict[colName]=colName.split('.1')[0]\n",
"dfPhyto.rename(columns=renameDict,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Cruise | \n",
" Date | \n",
" Station | \n",
" Sample# | \n",
" rep | \n",
" depth | \n",
" Diatoms-1 | \n",
" Diatoms-2 | \n",
" Prasinophytes | \n",
" Cryptophytes | \n",
" Dinoflagellates-1 | \n",
" Haptophytes | \n",
" Dictyo | \n",
" Raphido | \n",
" Cyanobacteria | \n",
" TchlA | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2016-005 | \n",
" April | \n",
" 59 | \n",
" 23.0 | \n",
" mean | \n",
" 0 | \n",
" 4.50144 | \n",
" 0.438107 | \n",
" 0.048483 | \n",
" 0.338791 | \n",
" 0.0478202 | \n",
" 0 | \n",
" 0.00362949 | \n",
" 0.000129969 | \n",
" 0.0086402 | \n",
" 5.38704 | \n",
"
\n",
" \n",
" 1 | \n",
" 2016-005 | \n",
" April | \n",
" 102 | \n",
" 37.0 | \n",
" mean | \n",
" 0 | \n",
" 3.55538 | \n",
" 1.73326 | \n",
" 0.385956 | \n",
" 1.82211 | \n",
" 0.139308 | \n",
" 0.186364 | \n",
" 0.0440193 | \n",
" 0.0767997 | \n",
" 0 | \n",
" 7.9432 | \n",
"
\n",
" \n",
" 2 | \n",
" 2016-005 | \n",
" April | \n",
" 75 | \n",
" 51.0 | \n",
" mean | \n",
" 0 | \n",
" 14.7795 | \n",
" 0.563364 | \n",
" 0.397391 | \n",
" 1.18036 | \n",
" 0.075357 | \n",
" 0.0941965 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 17.0901 | \n",
"
\n",
" \n",
" 3 | \n",
" 2016-005 | \n",
" April | \n",
" 72 | \n",
" 64.0 | \n",
" mean | \n",
" 0 | \n",
" 0.923699 | \n",
" 0.745832 | \n",
" 0.337639 | \n",
" 1.20012 | \n",
" 0.0704119 | \n",
" 0.0893556 | \n",
" 0.00326812 | \n",
" 0.0523156 | \n",
" 0 | \n",
" 3.42264 | \n",
"
\n",
" \n",
" 4 | \n",
" 2016-005 | \n",
" April | \n",
" 69 | \n",
" 76.0 | \n",
" mean | \n",
" 0 | \n",
" 3.88279 | \n",
" 0.290172 | \n",
" 0.183778 | \n",
" 0.531872 | \n",
" 0.0478852 | \n",
" 0.0609817 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 4.99748 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 115 | \n",
" 2016-071 | \n",
" November | \n",
" 9 | \n",
" 250.0 | \n",
" mean | \n",
" 0 | \n",
" 0.0142285 | \n",
" 0.082281 | \n",
" 0.0897016 | \n",
" 0.0562649 | \n",
" 0.000186116 | \n",
" 0.00849243 | \n",
" 0.017071 | \n",
" 0.00322275 | \n",
" 0.00355171 | \n",
" 0.275 | \n",
"
\n",
" \n",
" 116 | \n",
" 2016-071 | \n",
" November | \n",
" 6 | \n",
" 264.0 | \n",
" mean | \n",
" 0 | \n",
" 0.00399836 | \n",
" 0.104654 | \n",
" 0.145557 | \n",
" 0.109826 | \n",
" 0.00047569 | \n",
" 0.040834 | \n",
" 0.0139237 | \n",
" 0.00948663 | \n",
" 0.00424517 | \n",
" 0.433 | \n",
"
\n",
" \n",
" 117 | \n",
" 2016-071 | \n",
" November | \n",
" 3 | \n",
" 281.0 | \n",
" mean | \n",
" 0 | \n",
" 0 | \n",
" 0.0634224 | \n",
" 0.0775545 | \n",
" 0.0395764 | \n",
" 0.0314152 | \n",
" 0.0276703 | \n",
" 0.0142296 | \n",
" 0.0202238 | \n",
" 0.00390783 | \n",
" 0.278 | \n",
"
\n",
" \n",
" 118 | \n",
" 2016-071 | \n",
" November | \n",
" 2 | \n",
" 297.0 | \n",
" mean | \n",
" 0 | \n",
" 0 | \n",
" 0.0636933 | \n",
" 0.181767 | \n",
" 0.274402 | \n",
" 0.0402761 | \n",
" 0.0463697 | \n",
" 0.0196038 | \n",
" 0.0268882 | \n",
" 0 | \n",
" 0.653 | \n",
"
\n",
" \n",
" 119 | \n",
" 2016-071 | \n",
" November | \n",
" 27 | \n",
" 314.0 | \n",
" mean | \n",
" 0 | \n",
" 0.00343804 | \n",
" 0.112049 | \n",
" 0.343388 | \n",
" 0.400357 | \n",
" 0.035914 | \n",
" 0.0469057 | \n",
" 0.00552154 | \n",
" 0.0769891 | \n",
" 0.00243813 | \n",
" 1.027 | \n",
"
\n",
" \n",
"
\n",
"
120 rows × 16 columns
\n",
"
"
],
"text/plain": [
" Cruise Date Station Sample# rep depth Diatoms-1 Diatoms-2 \\\n",
"0 2016-005 April 59 23.0 mean 0 4.50144 0.438107 \n",
"1 2016-005 April 102 37.0 mean 0 3.55538 1.73326 \n",
"2 2016-005 April 75 51.0 mean 0 14.7795 0.563364 \n",
"3 2016-005 April 72 64.0 mean 0 0.923699 0.745832 \n",
"4 2016-005 April 69 76.0 mean 0 3.88279 0.290172 \n",
".. ... ... ... ... ... ... ... ... \n",
"115 2016-071 November 9 250.0 mean 0 0.0142285 0.082281 \n",
"116 2016-071 November 6 264.0 mean 0 0.00399836 0.104654 \n",
"117 2016-071 November 3 281.0 mean 0 0 0.0634224 \n",
"118 2016-071 November 2 297.0 mean 0 0 0.0636933 \n",
"119 2016-071 November 27 314.0 mean 0 0.00343804 0.112049 \n",
"\n",
" Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \\\n",
"0 0.048483 0.338791 0.0478202 0 0.00362949 \n",
"1 0.385956 1.82211 0.139308 0.186364 0.0440193 \n",
"2 0.397391 1.18036 0.075357 0.0941965 0 \n",
"3 0.337639 1.20012 0.0704119 0.0893556 0.00326812 \n",
"4 0.183778 0.531872 0.0478852 0.0609817 0 \n",
".. ... ... ... ... ... \n",
"115 0.0897016 0.0562649 0.000186116 0.00849243 0.017071 \n",
"116 0.145557 0.109826 0.00047569 0.040834 0.0139237 \n",
"117 0.0775545 0.0395764 0.0314152 0.0276703 0.0142296 \n",
"118 0.181767 0.274402 0.0402761 0.0463697 0.0196038 \n",
"119 0.343388 0.400357 0.035914 0.0469057 0.00552154 \n",
"\n",
" Raphido Cyanobacteria TchlA \n",
"0 0.000129969 0.0086402 5.38704 \n",
"1 0.0767997 0 7.9432 \n",
"2 0 0 17.0901 \n",
"3 0.0523156 0 3.42264 \n",
"4 0 0 4.99748 \n",
".. ... ... ... \n",
"115 0.00322275 0.00355171 0.275 \n",
"116 0.00948663 0.00424517 0.433 \n",
"117 0.0202238 0.00390783 0.278 \n",
"118 0.0268882 0 0.653 \n",
"119 0.0769891 0.00243813 1.027 \n",
"\n",
"[120 rows x 16 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfPhyto"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# This is the important step- join the two tables ('left' and 'right'), \n",
"# matching the cruise IDs and sample numbers\n",
"# how='outer' -> all rows from both the left and the right tables will be included, \n",
"# even if they cannot be matched; this makes it easy to check for \n",
"# unmatched data later\n",
"# left_on specifies the name of the column to match in the left table (dfbot) \n",
"# right_on specifies the name of the column to match in the right table (dfPhyto)\n",
"dfout = pd.merge(dfbot, dfPhyto, how='outer', \n",
" left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n",
" 'LOC:EVENT_NUMBER', 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH',\n",
" 'ADM:SCIENTIST', 'LOC:STATION', 'Sample_Number', 'pH:SBE:Nominal',\n",
" 'Temperature:Draw', 'Salinity:Bottle', 'Flag:Salinity:Bottle',\n",
" 'Flag:Chlorophyll:Extracted', 'Phaeo-Pigment:Extracted',\n",
" 'Flag:Oxygen:Dissolved', 'Flag:Nitrate_plus_Nitrite', 'Flag:Silicate',\n",
" 'Phosphate', 'Flag:Phosphate', 'Cruise', 'Number_of_bin_records',\n",
" 'ADM:PROJECT', 'Pressure [decibar]',\n",
" 'Temperature:Secondary [deg C (ITS90)]', 'Transmissivity [*/metre]',\n",
" 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n",
" 'PAR:Reference [uE/m^2/sec]', 'Salinity:T1:C1 [PSS-78]',\n",
" 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n",
" 'Salinity:Bottle [PSS-78]', 'Chlorophyll:Extracted [mg/m^3]',\n",
" 'Phaeo-Pigment:Extracted [mg/m^3]', 'Nitrate_plus_Nitrite [umol/L]',\n",
" 'Silicate [umol/L]', 'Phosphate [umol/L]', 'Bottle_Number',\n",
" 'Bottle:Firing_Sequence', 'Oxygen:Dissolved [mL/L]',\n",
" 'Oxygen:Dissolved [umol/kg]', 'Date', 'Station', 'Sample#', 'rep',\n",
" 'depth', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',\n",
" 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
" 'Cyanobacteria', 'TchlA'],\n",
" dtype='object')"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show the column names in the resulting table\n",
"dfout.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checks"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n",
"dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n",
"temp.columns = ['icount']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.unique(temp.icount)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for Phyto samples matched to multiple bottle samples:\n",
"temp.loc[temp.icount>1]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for phyto samples not matched to bottle samples:\n",
"temp.loc[temp.icount==0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample_Number | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n",
"temp2.columns = ['icount']\n",
"# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n",
"temp2.loc[temp2.icount>1]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" icount | \n",
"
\n",
" \n",
" Cruise | \n",
" Sample# | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [icount]\n",
"Index: []"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for phyto samples not matched to bottle samples:\n",
"temp.loc[temp.icount==0]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1420, 120, 1420)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# if the output table is longer than either of the input tables, some columns were not matched\n",
"len(dfout), len(dfPhyto), len(dfbot)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(120, 120)"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check that the number of cells with data in the 'Cyanobacteria' column is \n",
"# the same for the input and output tables to show that no rows are missing:\n",
"np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Series([], Name: Sample#, dtype: float64)\n"
]
}
],
"source": [
"# If there were data rows from the phytoplankton table that were not matched to \n",
"# rows from the bottle table, their indices from the phytoplankton table would be \n",
"# displayed below (the series [] would not be empty)\n",
"print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# drop repetetive/unecessary columns:\n",
"dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',\n",
" 'depth',],axis=1,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# truncate phyto group values to 3 decimal places:\n",
"for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n",
" 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n",
" 'TchlA'):\n",
" dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 NaN\n",
"1 NaN\n",
"2 NaN\n",
"3 NaN\n",
"4 NaN\n",
" ... \n",
"1415 NaN\n",
"1416 NaN\n",
"1417 NaN\n",
"1418 0.002\n",
"1419 NaN\n",
"Name: Cyanobacteria, Length: 1420, dtype: float64"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfout['Cyanobacteria']"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'Depth [metres]'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2897\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2898\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2899\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'Depth [metres]'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Check for unexpected depths\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdfout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Pressure [decibar]'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m|\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Depth [metres]'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m&\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdfout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Diatoms-1'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;31m#,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2904\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2905\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2906\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2907\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2908\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2898\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2899\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2900\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2902\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtolerance\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'Depth [metres]'"
]
}
],
"source": [
"# Check for unexpected depths\n",
"dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,\n",
"# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# now write the output table to a .csv file:\n",
"dfout.to_csv(pathOut, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfout.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (py39)",
"language": "python",
"name": "py39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}