{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np # this module handles arrays, but here we need it for its NaN value\n", "import pandas as pd # this module contains a lot of tools for handling tabular data\n", "import re\n", "import gsw" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "## note: 2015-2019 SoG abs values New ALLO.xlsx has an incorrect sample number\n", " -- 2017-05 Station 28 sample 416 should be sample 9416 (9 is treatment of duplicate sample numbers)\n", "It is corrected in the code below rather than in the file itself so that the can be reproduced in future versions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# define paths to the source files and eventual output file\n", "pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC/All 2017 SoG bottle.xlsx'\n", "pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2019 SoG abs values New ALLO.xlsx'\n", "pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2017_NewALLO.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#formatting function to get year-(3digit) format\n", "def fmtCruise(istr):\n", " if re.fullmatch('[0-9]{4}-[0-9]{2}',istr): \n", " sp=re.split('-',istr)\n", " rstr=sp[0]+'-0'+sp[1]\n", " elif re.fullmatch('[0-9]{4}-[0-9]{3}',istr):\n", " rstr=istr\n", " else:\n", " raise ValueError('Input had unexpected format:',istr)\n", " return rstr" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['2017-01', '2017-63', '2017-05', '2017-64', '2017-09', '2017-65']\n" ] } ], "source": [ "# get names of sheets in notebook\n", "with pd.ExcelFile(pathBottle,engine='openpyxl') as xl:\n", " sheets=xl.sheet_names\n", "print(sheets)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2017-01\n", "Reading sheet 2017-63\n", "Reading sheet 2017-05\n", "Reading sheet 2017-64\n", "Reading sheet 2017-09\n", "Reading sheet 2017-65\n", "removed empty columns: set()\n" ] } ], "source": [ "# load each sheet in the bottle Excel file and concatenate them together into one table\n", "dfbotlist=list()\n", "for sheet in sheets:\n", " df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,engine='openpyxl',\n", " na_values=(-99,-99.9)) # read each sheet; include additional na values\n", " df0['Cruise']=fmtCruise(sheet) # create and populate Cruise column based on sheet name\n", " dfbotlist.append(df0) # append the sheet to a list\n", "dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table\n", "# Drop columns with no data in them\n", "l1=set(dfbot.keys())\n", "dfbot.dropna(axis=1,how='all',inplace=True)\n", "print('removed empty columns:',l1-set(dfbot.keys()))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER',\n", " 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST',\n", " 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number',\n", " 'Pressure [decibar]', 'Depth [metres]',\n", " 'Temperature:Primary [deg C (ITS90)]',\n", " 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n", " 'Salinity:T0:C0 [PSS-78]', 'Temperature:Draw [deg C (ITS90)]',\n", " 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n", " 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n", " 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]',\n", " 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved',\n", " 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite',\n", " 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]',\n", " 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone',\n", " 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Temperature*',\n", " 'Transmissivity [*/metre]', 'Oxygen:Dissolved:SBE [mL/L]',\n", " 'Oxygen:Dissolved:SBE [umol/kg]',\n", " 'Temperature:Secondary [deg C (ITS90)]', 'Salinity:T1:C1 [PSS-78]',\n", " 'Number_of_bin_records'],\n", " dtype='object')\n" ] } ], "source": [ "# list the column names in the resulting table\n", "print(dfbot.keys())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "204 71 1061\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FIL:START TIME YYYY/MM/DD HH:MM:SSLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTLOC:STATIONADM:PROJECTBottle_NumberSample_Number...ZoneADM:MISSIONBottle:Firing_SequenceTemperature*Transmissivity [*/metre]Oxygen:Dissolved:SBE [mL/L]Oxygen:Dissolved:SBE [umol/kg]Temperature:Secondary [deg C (ITS90)]Salinity:T1:C1 [PSS-78]Number_of_bin_records
\n", "

0 rows × 43 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [FIL:START TIME YYYY/MM/DD HH:MM:SS, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, LOC:STATION, ADM:PROJECT, Bottle_Number, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, Zone, ADM:MISSION, Bottle:Firing_Sequence, Temperature*, Transmissivity [*/metre], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Secondary [deg C (ITS90)], Salinity:T1:C1 [PSS-78], Number_of_bin_records]\n", "Index: []\n", "\n", "[0 rows x 43 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with multiple temperature values\n", "print(np.sum(dfbot['Temperature:Primary [deg C (ITS90)]']>=0),\n", " np.sum(dfbot['Temperature:Secondary [deg C (ITS90)]']>=0),\n", " np.sum(dfbot['Temperature*']>=0))\n", "dfbot.loc[(np.array([int(ii) for ii in (dfbot['Temperature:Primary [deg C (ITS90)]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Temperature:Secondary [deg C (ITS90)]']>=0)])+\\\n", " np.array([int(ii) for ii in (dfbot['Temperature*']>=0)]))>1]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "931 71\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FIL:START TIME YYYY/MM/DD HH:MM:SSLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTLOC:STATIONADM:PROJECTBottle_NumberSample_Number...ZoneADM:MISSIONBottle:Firing_SequenceTemperature*Transmissivity [*/metre]Oxygen:Dissolved:SBE [mL/L]Oxygen:Dissolved:SBE [umol/kg]Temperature:Secondary [deg C (ITS90)]Salinity:T1:C1 [PSS-78]Number_of_bin_records
\n", "

0 rows × 43 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [FIL:START TIME YYYY/MM/DD HH:MM:SS, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, LOC:STATION, ADM:PROJECT, Bottle_Number, Sample_Number, Pressure [decibar], Depth [metres], Temperature:Primary [deg C (ITS90)], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity:T0:C0 [PSS-78], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, Zone, ADM:MISSION, Bottle:Firing_Sequence, Temperature*, Transmissivity [*/metre], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Temperature:Secondary [deg C (ITS90)], Salinity:T1:C1 [PSS-78], Number_of_bin_records]\n", "Index: []\n", "\n", "[0 rows x 43 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# no rows returned, so there are no rows with both both salinity fields\n", "print(np.sum(dfbot['Salinity:T0:C0 [PSS-78]']>=0),\n", " np.sum(dfbot['Salinity:T1:C1 [PSS-78]']>=0))\n", "dfbot.loc[(dfbot['Salinity:T0:C0 [PSS-78]']>=0)&\\\n", " (dfbot['Salinity:T1:C1 [PSS-78]']>=0)]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def subval(idf,colList):\n", " # first value in colList should be the column you are going to keep\n", " # follow with other columns that will be used to fill in when that column is NaN\n", " # in order of precedence\n", " if len(colList)==2:\n", " idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n", " else r[colList[1]] for i,r in idf.iterrows()]\n", " elif len(colList)==3:\n", " idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n", " else r[colList[1]] if not pd.isna(r[colList[1]]) \\\n", " else r[colList[2]] for i,r in idf.iterrows()]\n", " else:\n", " raise NotImplementedError('Add to code to handle this case')\n", " return idf" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# there are some duplicate columns here; handle them:\n", "dfbot=subval(dfbot,('Temperature:Primary [deg C (ITS90)]',\n", " 'Temperature:Secondary [deg C (ITS90)]',\n", " 'Temperature*'))\n", "dfbot=subval(dfbot,('Salinity:T0:C0 [PSS-78]',\n", " 'Salinity:T1:C1 [PSS-78]'))\n", "dfbot.rename(columns={'Temperature:Primary [deg C (ITS90)]':'Temperature [deg C (ITS90)]'},\n", " inplace=True)\n", "dfbot.rename(columns={'Salinity:T0:C0 [PSS-78]':'Salinity [PSS-78]'},\n", " inplace=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# drop repetetive/unecessary columns:\n", "dfbot.drop(labels=['Temperature:Secondary [deg C (ITS90)]',\n", " 'Salinity:T1:C1 [PSS-78]','Temperature*'],axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# define a function that will be applied to the values in the index column;\n", "# this makes it easier to drop non-data rows later\n", "def convertIndex(val):\n", " try:\n", " x =int(val)\n", " except ValueError:\n", " x=np.nan\n", " return x" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2017\n" ] } ], "source": [ "# load the 2015 phytoplankton data with the following options:\n", "# sheet_name='2015 CHEMTAX abs results' -> choose the 2015 sheet\n", "# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet\n", "# skiprows=2 -> start reading at the 3rd row of the sheet, \n", "# which contains the column headings\n", "# converters={'Index': convertIndex,} -> apply the function defined above to the Index column\n", "# verbose = True -> print extra information/ warnings/ errors\n", "dfPhyto=pd.read_excel(pathPhyto,sheet_name='2017',engine='openpyxl',\n", " skiprows=0,converters={'Sample#': convertIndex,},\n", " verbose=True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CruiseDateStationSample#repdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
482017-005MayCPF1398.0mean09.726241.181630.09000090.4666540.12447000011.589
492017-005May22410.0mean02.291920.08597580.03562230.2720790.0565730.1763520.004511220.01844670.004518092.946
502017-005May24413.0mean03.282440.06120940.03547720.6983030.07802270.1956140.003917470.066013804.421
512017-005May28416.0mean01.7383400.03204770.4276690.03952940.1461050.008648310.0328970.001768142.427
522017-005MayGEO1425.0mean02.749010.1367140.002764010.3730350.032715700.001700310.03847250.02459273.359
532017-005May38428.0mean01.4246100.1520712.291140.2179390.1816030.01608220.032556204.316
542017-005May41441.0mean00.1305200.02249330.2579590.031990802.00975e-050.02636170.02265470.492
55CruiseDateStationNaNrepdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
562017-064JuneSI9.0mean00.18626500.09656130.5775160.4572050.7130540.08959570.065804302.186
572017-064June5923.0mean00.09644280.06757010.02908690.1791290.03537870.06769730.001452370.014242300.491
582017-064June10239.0mean00.4327130.09596440.249120.7026230.2658390.451360.03620030.08830460.02087532.343
592017-064June7554.0mean00.5543190.1483220.1509590.8541820.2968140.5731490.04487030.079660.02872532.731
\n", "
" ], "text/plain": [ " Cruise Date Station Sample# rep depth Diatoms-1 Diatoms-2 \\\n", "48 2017-005 May CPF1 398.0 mean 0 9.72624 1.18163 \n", "49 2017-005 May 22 410.0 mean 0 2.29192 0.0859758 \n", "50 2017-005 May 24 413.0 mean 0 3.28244 0.0612094 \n", "51 2017-005 May 28 416.0 mean 0 1.73834 0 \n", "52 2017-005 May GEO1 425.0 mean 0 2.74901 0.136714 \n", "53 2017-005 May 38 428.0 mean 0 1.42461 0 \n", "54 2017-005 May 41 441.0 mean 0 0.13052 0 \n", "55 Cruise Date Station NaN rep depth Diatoms-1 Diatoms-2 \n", "56 2017-064 June SI 9.0 mean 0 0.186265 0 \n", "57 2017-064 June 59 23.0 mean 0 0.0964428 0.0675701 \n", "58 2017-064 June 102 39.0 mean 0 0.432713 0.0959644 \n", "59 2017-064 June 75 54.0 mean 0 0.554319 0.148322 \n", "\n", " Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \\\n", "48 0.0900009 0.466654 0.12447 0 0 \n", "49 0.0356223 0.272079 0.056573 0.176352 0.00451122 \n", "50 0.0354772 0.698303 0.0780227 0.195614 0.00391747 \n", "51 0.0320477 0.427669 0.0395294 0.146105 0.00864831 \n", "52 0.00276401 0.373035 0.0327157 0 0.00170031 \n", "53 0.152071 2.29114 0.217939 0.181603 0.0160822 \n", "54 0.0224933 0.257959 0.0319908 0 2.00975e-05 \n", "55 Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \n", "56 0.0965613 0.577516 0.457205 0.713054 0.0895957 \n", "57 0.0290869 0.179129 0.0353787 0.0676973 0.00145237 \n", "58 0.24912 0.702623 0.265839 0.45136 0.0362003 \n", "59 0.150959 0.854182 0.296814 0.573149 0.0448703 \n", "\n", " Raphido Cyanobacteria TchlA \n", "48 0 0 11.589 \n", "49 0.0184467 0.00451809 2.946 \n", "50 0.0660138 0 4.421 \n", "51 0.032897 0.00176814 2.427 \n", "52 0.0384725 0.0245927 3.359 \n", "53 0.0325562 0 4.316 \n", "54 0.0263617 0.0226547 0.492 \n", "55 Raphido Cyanobacteria TchlA \n", "56 0.0658043 0 2.186 \n", "57 0.0142423 0 0.491 \n", "58 0.0883046 0.0208753 2.343 \n", "59 0.07966 0.0287253 2.731 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display rows 48 to 59 of the resulting table\n", "dfPhyto[48:60]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# now, drop any rows from the table that have NaN values in either of the columns\n", "# 'Index' or 'TchlA (ug/L)'\n", "# This is why we applied a function to the Index column to make sure all \n", "# non-numeric Index values would have a consistent NaN entry, making them easy to identify\n", "# and remove\n", "dfPhyto.dropna(subset=['Sample#', 'TchlA'],how='any',inplace=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# pandas creates its own index, and after dropping rows I like to reset it -\n", "# this is just for convenience\n", "dfPhyto.reset_index(drop=True,inplace=True)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# apply formatting function all rows in Cruise column to get year-3digit format\n", "dfPhyto['Cruise']=[fmtCruise(ii) for ii in dfPhyto['Cruise']]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CruiseDateStationSample#repdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
482017-005May24413.0mean03.282440.06120940.03547720.6983030.07802270.1956140.003917470.066013804.421
492017-005May28416.0mean01.7383400.03204770.4276690.03952940.1461050.008648310.0328970.001768142.427
502017-005MayGEO1425.0mean02.749010.1367140.002764010.3730350.032715700.001700310.03847250.02459273.359
512017-005May38428.0mean01.4246100.1520712.291140.2179390.1816030.01608220.032556204.316
522017-005May41441.0mean00.1305200.02249330.2579590.031990802.00975e-050.02636170.02265470.492
532017-064JuneSI9.0mean00.18626500.09656130.5775160.4572050.7130540.08959570.065804302.186
542017-064June5923.0mean00.09644280.06757010.02908690.1791290.03537870.06769730.001452370.014242300.491
552017-064June10239.0mean00.4327130.09596440.249120.7026230.2658390.451360.03620030.08830460.02087532.343
562017-064June7554.0mean00.5543190.1483220.1509590.8541820.2968140.5731490.04487030.079660.02872532.731
572017-064June7268.0mean0000.1597310.7568240.09627370.1285635.65439e-050.10455201.246
582017-064June6981.0mean000.06965120.1468230.516750.04942230.1141770.01034990.051194500.958368
592017-064JuneADCP92.0mean000.02290160.07609850.4056450.01809520.055100900.033158400.611
\n", "
" ], "text/plain": [ " Cruise Date Station Sample# rep depth Diatoms-1 Diatoms-2 \\\n", "48 2017-005 May 24 413.0 mean 0 3.28244 0.0612094 \n", "49 2017-005 May 28 416.0 mean 0 1.73834 0 \n", "50 2017-005 May GEO1 425.0 mean 0 2.74901 0.136714 \n", "51 2017-005 May 38 428.0 mean 0 1.42461 0 \n", "52 2017-005 May 41 441.0 mean 0 0.13052 0 \n", "53 2017-064 June SI 9.0 mean 0 0.186265 0 \n", "54 2017-064 June 59 23.0 mean 0 0.0964428 0.0675701 \n", "55 2017-064 June 102 39.0 mean 0 0.432713 0.0959644 \n", "56 2017-064 June 75 54.0 mean 0 0.554319 0.148322 \n", "57 2017-064 June 72 68.0 mean 0 0 0 \n", "58 2017-064 June 69 81.0 mean 0 0 0.0696512 \n", "59 2017-064 June ADCP 92.0 mean 0 0 0.0229016 \n", "\n", " Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \\\n", "48 0.0354772 0.698303 0.0780227 0.195614 0.00391747 \n", "49 0.0320477 0.427669 0.0395294 0.146105 0.00864831 \n", "50 0.00276401 0.373035 0.0327157 0 0.00170031 \n", "51 0.152071 2.29114 0.217939 0.181603 0.0160822 \n", "52 0.0224933 0.257959 0.0319908 0 2.00975e-05 \n", "53 0.0965613 0.577516 0.457205 0.713054 0.0895957 \n", "54 0.0290869 0.179129 0.0353787 0.0676973 0.00145237 \n", "55 0.24912 0.702623 0.265839 0.45136 0.0362003 \n", "56 0.150959 0.854182 0.296814 0.573149 0.0448703 \n", "57 0.159731 0.756824 0.0962737 0.128563 5.65439e-05 \n", "58 0.146823 0.51675 0.0494223 0.114177 0.0103499 \n", "59 0.0760985 0.405645 0.0180952 0.0551009 0 \n", "\n", " Raphido Cyanobacteria TchlA \n", "48 0.0660138 0 4.421 \n", "49 0.032897 0.00176814 2.427 \n", "50 0.0384725 0.0245927 3.359 \n", "51 0.0325562 0 4.316 \n", "52 0.0263617 0.0226547 0.492 \n", "53 0.0658043 0 2.186 \n", "54 0.0142423 0 0.491 \n", "55 0.0883046 0.0208753 2.343 \n", "56 0.07966 0.0287253 2.731 \n", "57 0.104552 0 1.246 \n", "58 0.0511945 0 0.958368 \n", "59 0.0331584 0 0.611 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display part of the table, confirming that non-data rows have been removed\n", "dfPhyto[48:60]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n", "# of the phytoplankton columns; \n", "# these lines correct the column names, removing the '.1':\n", "renameDict=dict()\n", "for colName in dfPhyto.keys():\n", " if colName.endswith('.1'):\n", " renameDict[colName]=colName.split('.1')[0]\n", "dfPhyto.rename(columns=renameDict,inplace=True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CruiseDateStationSample#repdepthDiatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
02017-001Feb102615.0mean10.1530310.00709060.09511620.2852360.0578340.15840600.017026701.77374
12017-001FebJF2627.0mean10.04665440.03694620.1335720.365220.07941220.1795470.003763030.060636601.90575
22017-001Feb59641.0mean10.1198540.1794110.06297630.1765990.023265400.005298690.012515301.57992
32017-001Feb56655.0mean10.3461020.2462370.1376050.3957980.0598730.06073720.01090770.04224040.009858422.30936
42017-001Feb46667.0mean20.3365880.2627660.1170030.334330.05323670.0470560.01412360.01946760.01222973.1968
...................................................
1142017-065Oct9260.0mean0000.06329290.10962700.40606400.02070890.03830720.638
1152017-065Oct12276.0mean01.295190.2920600.1985170.01816370.1023960.01077630.009048610.01084951.937
1162017-065Oct14291.0mean00.10013700.1146140.1697330.0001431270.28947900.05103490.02585850.751
1172017-065Oct16304.0mean00.3441550.0534020.01486340.15020500.0699920.009902130.01523990.007240860.665
1182017-065Oct22307.0mean01.525710.3463860.06096040.2612510.06405070.1808170.03773620.03970750.04738172.564
\n", "

119 rows × 16 columns

\n", "
" ], "text/plain": [ " Cruise Date Station Sample# rep depth Diatoms-1 Diatoms-2 \\\n", "0 2017-001 Feb 102 615.0 mean 1 0.153031 0.0070906 \n", "1 2017-001 Feb JF2 627.0 mean 1 0.0466544 0.0369462 \n", "2 2017-001 Feb 59 641.0 mean 1 0.119854 0.179411 \n", "3 2017-001 Feb 56 655.0 mean 1 0.346102 0.246237 \n", "4 2017-001 Feb 46 667.0 mean 2 0.336588 0.262766 \n", ".. ... ... ... ... ... ... ... ... \n", "114 2017-065 Oct 9 260.0 mean 0 0 0 \n", "115 2017-065 Oct 12 276.0 mean 0 1.29519 0.29206 \n", "116 2017-065 Oct 14 291.0 mean 0 0.100137 0 \n", "117 2017-065 Oct 16 304.0 mean 0 0.344155 0.053402 \n", "118 2017-065 Oct 22 307.0 mean 0 1.52571 0.346386 \n", "\n", " Prasinophytes Cryptophytes Dinoflagellates-1 Haptophytes Dictyo \\\n", "0 0.0951162 0.285236 0.057834 0.158406 0 \n", "1 0.133572 0.36522 0.0794122 0.179547 0.00376303 \n", "2 0.0629763 0.176599 0.0232654 0 0.00529869 \n", "3 0.137605 0.395798 0.059873 0.0607372 0.0109077 \n", "4 0.117003 0.33433 0.0532367 0.047056 0.0141236 \n", ".. ... ... ... ... ... \n", "114 0.0632929 0.109627 0 0.406064 0 \n", "115 0 0.198517 0.0181637 0.102396 0.0107763 \n", "116 0.114614 0.169733 0.000143127 0.289479 0 \n", "117 0.0148634 0.150205 0 0.069992 0.00990213 \n", "118 0.0609604 0.261251 0.0640507 0.180817 0.0377362 \n", "\n", " Raphido Cyanobacteria TchlA \n", "0 0.0170267 0 1.77374 \n", "1 0.0606366 0 1.90575 \n", "2 0.0125153 0 1.57992 \n", "3 0.0422404 0.00985842 2.30936 \n", "4 0.0194676 0.0122297 3.1968 \n", ".. ... ... ... \n", "114 0.0207089 0.0383072 0.638 \n", "115 0.00904861 0.0108495 1.937 \n", "116 0.0510349 0.0258585 0.751 \n", "117 0.0152399 0.00724086 0.665 \n", "118 0.0397075 0.0473817 2.564 \n", "\n", "[119 rows x 16 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfPhyto" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9416 This number should be zero: 0\n", "9213 This number should be zero: 0\n", "9217 This number should be zero: 0\n" ] } ], "source": [ "# sample number corrections\n", "iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)\n", "if len(dfPhyto.loc[iii])>0:\n", " dfPhyto.loc[iii,['Sample#']]=9416\n", "iii=(dfPhyto['Cruise']=='2017-005')&(dfPhyto['Station']==28)&(dfPhyto['Sample#']==416)\n", "print('9416 This number should be zero:',len(dfPhyto.loc[iii]))\n", "iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)\n", "if len(dfPhyto.loc[iii])>0:\n", " dfPhyto.loc[iii,['Sample#']]=9213\n", "iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']==2)&(dfPhyto['Sample#']==213)\n", "print('9213 This number should be zero:',len(dfPhyto.loc[iii]))\n", "#this ones was correct but test it just in case:\n", "iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)\n", "if len(dfPhyto.loc[iii])>0:\n", " dfPhyto.loc[iii,['Sample#']]=9217\n", "iii=(dfPhyto['Cruise']=='2017-064')&(dfPhyto['Station']=='CPF1')&(dfPhyto['Sample#']==217)\n", "print('9217 This number should be zero:',len(dfPhyto.loc[iii]))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# This is the important step- join the two tables ('left' and 'right'), \n", "# matching the cruise IDs and sample numbers\n", "# how='outer' -> all rows from both the left and the right tables will be included, \n", "# even if they cannot be matched; this makes it easy to check for \n", "# unmatched data later\n", "# left_on specifies the name of the column to match in the left table (dfbot) \n", "# right_on specifies the name of the column to match in the right table (dfPhyto)\n", "dfout = pd.merge(dfbot, dfPhyto, how='outer', \n", " left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER',\n", " 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST',\n", " 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number',\n", " 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]',\n", " 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n", " 'Salinity [PSS-78]', 'Temperature:Draw [deg C (ITS90)]',\n", " 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n", " 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n", " 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]',\n", " 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved',\n", " 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite',\n", " 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]',\n", " 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone',\n", " 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Transmissivity [*/metre]',\n", " 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n", " 'Number_of_bin_records', 'Date', 'Station', 'Sample#', 'rep', 'depth',\n", " 'Diatoms-1', 'Diatoms-2', 'Prasinophytes', 'Cryptophytes',\n", " 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n", " 'Cyanobacteria', 'TchlA'],\n", " dtype='object')" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the column names in the resulting table\n", "dfout.keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checks" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n", "dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n", "temp.columns = ['icount']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.unique(temp.icount)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for Phyto samples matched to multiple bottle samples:\n", "temp.loc[temp.icount>1]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for phyto samples not matched to bottle samples:\n", "temp.loc[temp.icount==0]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample_Number
2017-064-997
\n", "
" ], "text/plain": [ " icount\n", "Cruise Sample_Number \n", "2017-064 -99 7" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n", "temp2.columns = ['icount']\n", "# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n", "temp2.loc[temp2.icount>1]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FIL:START TIME YYYY/MM/DD HH:MM:SSLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTLOC:STATIONADM:PROJECTBottle_NumberSample_Number...Flag:PhosphateComments by sample_numbeRCruiseZoneADM:MISSIONBottle:Firing_SequenceTransmissivity [*/metre]Oxygen:Dissolved:SBE [mL/L]Oxygen:Dissolved:SBE [umol/kg]Number_of_bin_records
6472017-06-22 17:07:45748.50000-124.73450252Chandler P.102NaNNaN-99...NaNNaN2017-064UTC2017-64NaN35.56.13267.4241.0
6622017-06-22 20:56:061048.46950-124.54600220Chandler P.75NaNNaN-99...NaNNaN2017-064UTC2017-64NaN34.86.34276.683.0
7142017-06-23 12:35:062748.24317-122.97500153Chandler P.63NaNNaN-99...NaNNaN2017-064UTC2017-64NaN35.72.75119.7241.0
7442017-06-23 22:43:153448.77350-123.02900205Chandler P.56NaNNaN-99...NaNNaN2017-064UTC2017-64NaN36.95.59244.8241.0
8192017-06-24 19:30:306149.31816-123.79933342Chandler P.27NaNNaN-99...NaNNaN2017-064UTC2017-64NaN12.48.18359.1241.0
8352017-06-25 00:47:056849.40200-124.15434277Chandler P.2NaNNaN-99...NaNNaN2017-064UTC2017-64NaN23.97.26318.2241.0
9362017-06-26 01:21:439549.88317-124.99350316Chandler P.14NaNNaN-99...NaNNaN2017-064UTC2017-64NaN31.76.96304.5241.0
\n", "

7 rows × 40 columns

\n", "
" ], "text/plain": [ " FIL:START TIME YYYY/MM/DD HH:MM:SS LOC:EVENT_NUMBER LOC:LATITUDE \\\n", "647 2017-06-22 17:07:45 7 48.50000 \n", "662 2017-06-22 20:56:06 10 48.46950 \n", "714 2017-06-23 12:35:06 27 48.24317 \n", "744 2017-06-23 22:43:15 34 48.77350 \n", "819 2017-06-24 19:30:30 61 49.31816 \n", "835 2017-06-25 00:47:05 68 49.40200 \n", "936 2017-06-26 01:21:43 95 49.88317 \n", "\n", " LOC:LONGITUDE LOC:WATER DEPTH ADM:SCIENTIST LOC:STATION ADM:PROJECT \\\n", "647 -124.73450 252 Chandler P. 102 NaN \n", "662 -124.54600 220 Chandler P. 75 NaN \n", "714 -122.97500 153 Chandler P. 63 NaN \n", "744 -123.02900 205 Chandler P. 56 NaN \n", "819 -123.79933 342 Chandler P. 27 NaN \n", "835 -124.15434 277 Chandler P. 2 NaN \n", "936 -124.99350 316 Chandler P. 14 NaN \n", "\n", " Bottle_Number Sample_Number ... Flag:Phosphate \\\n", "647 NaN -99 ... NaN \n", "662 NaN -99 ... NaN \n", "714 NaN -99 ... NaN \n", "744 NaN -99 ... NaN \n", "819 NaN -99 ... NaN \n", "835 NaN -99 ... NaN \n", "936 NaN -99 ... NaN \n", "\n", " Comments by sample_numbeR Cruise Zone ADM:MISSION \\\n", "647 NaN 2017-064 UTC 2017-64 \n", "662 NaN 2017-064 UTC 2017-64 \n", "714 NaN 2017-064 UTC 2017-64 \n", "744 NaN 2017-064 UTC 2017-64 \n", "819 NaN 2017-064 UTC 2017-64 \n", "835 NaN 2017-064 UTC 2017-64 \n", "936 NaN 2017-064 UTC 2017-64 \n", "\n", " Bottle:Firing_Sequence Transmissivity [*/metre] \\\n", "647 NaN 35.5 \n", "662 NaN 34.8 \n", "714 NaN 35.7 \n", "744 NaN 36.9 \n", "819 NaN 12.4 \n", "835 NaN 23.9 \n", "936 NaN 31.7 \n", "\n", " Oxygen:Dissolved:SBE [mL/L] Oxygen:Dissolved:SBE [umol/kg] \\\n", "647 6.13 267.4 \n", "662 6.34 276.6 \n", "714 2.75 119.7 \n", "744 5.59 244.8 \n", "819 8.18 359.1 \n", "835 7.26 318.2 \n", "936 6.96 304.5 \n", "\n", " Number_of_bin_records \n", "647 241.0 \n", "662 83.0 \n", "714 241.0 \n", "744 241.0 \n", "819 241.0 \n", "835 241.0 \n", "936 241.0 \n", "\n", "[7 rows x 40 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfbot.loc[(dfbot.Cruise=='2017-064')&(dfbot.Sample_Number<0)]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FIL:START TIME YYYY/MM/DD HH:MM:SSLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTLOC:STATIONADM:PROJECTBottle_NumberSample_Number...Diatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
6472017-06-22 17:07:45748.50000-124.73450252Chandler P.102NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6482017-06-22 20:56:061048.46950-124.54600220Chandler P.75NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6492017-06-23 12:35:062748.24317-122.97500153Chandler P.63NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6502017-06-23 22:43:153448.77350-123.02900205Chandler P.56NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6512017-06-24 19:30:306149.31816-123.79933342Chandler P.27NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6522017-06-25 00:47:056849.40200-124.15434277Chandler P.2NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6532017-06-26 01:21:439549.88317-124.99350316Chandler P.14NaNNaN-99...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

7 rows × 55 columns

\n", "
" ], "text/plain": [ " FIL:START TIME YYYY/MM/DD HH:MM:SS LOC:EVENT_NUMBER LOC:LATITUDE \\\n", "647 2017-06-22 17:07:45 7 48.50000 \n", "648 2017-06-22 20:56:06 10 48.46950 \n", "649 2017-06-23 12:35:06 27 48.24317 \n", "650 2017-06-23 22:43:15 34 48.77350 \n", "651 2017-06-24 19:30:30 61 49.31816 \n", "652 2017-06-25 00:47:05 68 49.40200 \n", "653 2017-06-26 01:21:43 95 49.88317 \n", "\n", " LOC:LONGITUDE LOC:WATER DEPTH ADM:SCIENTIST LOC:STATION ADM:PROJECT \\\n", "647 -124.73450 252 Chandler P. 102 NaN \n", "648 -124.54600 220 Chandler P. 75 NaN \n", "649 -122.97500 153 Chandler P. 63 NaN \n", "650 -123.02900 205 Chandler P. 56 NaN \n", "651 -123.79933 342 Chandler P. 27 NaN \n", "652 -124.15434 277 Chandler P. 2 NaN \n", "653 -124.99350 316 Chandler P. 14 NaN \n", "\n", " Bottle_Number Sample_Number ... Diatoms-1 Diatoms-2 Prasinophytes \\\n", "647 NaN -99 ... NaN NaN NaN \n", "648 NaN -99 ... NaN NaN NaN \n", "649 NaN -99 ... NaN NaN NaN \n", "650 NaN -99 ... NaN NaN NaN \n", "651 NaN -99 ... NaN NaN NaN \n", "652 NaN -99 ... NaN NaN NaN \n", "653 NaN -99 ... NaN NaN NaN \n", "\n", " Cryptophytes Dinoflagellates-1 Haptophytes Dictyo Raphido \\\n", "647 NaN NaN NaN NaN NaN \n", "648 NaN NaN NaN NaN NaN \n", "649 NaN NaN NaN NaN NaN \n", "650 NaN NaN NaN NaN NaN \n", "651 NaN NaN NaN NaN NaN \n", "652 NaN NaN NaN NaN NaN \n", "653 NaN NaN NaN NaN NaN \n", "\n", " Cyanobacteria TchlA \n", "647 NaN NaN \n", "648 NaN NaN \n", "649 NaN NaN \n", "650 NaN NaN \n", "651 NaN NaN \n", "652 NaN NaN \n", "653 NaN NaN \n", "\n", "[7 rows x 55 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout.loc[(dfout.Cruise=='2017-064')&(dfout.Sample_Number<0)]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for phyto samples not matched to bottle samples:\n", "temp.loc[temp.icount==0]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1336, 119, 1336)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if the output table is longer than either of the input tables, some columns were not matched\n", "len(dfout), len(dfPhyto), len(dfbot)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(119, 119)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check that the number of cells with data in the 'Cyanobacteria' column is \n", "# the same for the input and output tables to show that no rows are missing:\n", "np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Series([], Name: Sample#, dtype: float64)\n" ] } ], "source": [ "# If there were data rows from the phytoplankton table that were not matched to \n", "# rows from the bottle table, their indices from the phytoplankton table would be \n", "# displayed below (the series [] would not be empty)\n", "print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Sample#'])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# drop repetetive/unecessary columns:\n", "dfout.drop(labels=['Date', 'Station', 'Sample#', 'rep',\n", " 'depth',],axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# truncate phyto group values to 3 decimal places:\n", "for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n", " 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n", " 'TchlA'):\n", " dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FIL:START TIME YYYY/MM/DD HH:MM:SSLOC:EVENT_NUMBERLOC:LATITUDELOC:LONGITUDELOC:WATER DEPTHADM:SCIENTISTLOC:STATIONADM:PROJECTBottle_NumberSample_Number...Diatoms-1Diatoms-2PrasinophytesCryptophytesDinoflagellates-1HaptophytesDictyoRaphidoCyanobacteriaTchlA
\n", "

0 rows × 50 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [FIL:START TIME YYYY/MM/DD HH:MM:SS, LOC:EVENT_NUMBER, LOC:LATITUDE, LOC:LONGITUDE, LOC:WATER DEPTH, ADM:SCIENTIST, LOC:STATION, ADM:PROJECT, Bottle_Number, Sample_Number, Pressure [decibar], Depth [metres], Temperature [deg C (ITS90)], Fluorescence:URU:Seapoint [mg/m^3], PAR [uE/m^2/sec], Salinity [PSS-78], Temperature:Draw [deg C (ITS90)], Salinity:Bottle [PSS-78], Flag:Salinity:Bottle, Chlorophyll:Extracted [mg/m^3], Flag:Chlorophyll:Extracted, Phaeo-Pigment:Extracted [mg/m^3], Oxygen:Dissolved [mL/L], Oxygen:Dissolved [umol/kg], Flag:Oxygen:Dissolved, Nitrate_plus_Nitrite [umol/L], Flag:Nitrate_plus_Nitrite, Silicate [umol/L], Flag:Silicate, Phosphate [umol/L], Flag:Phosphate, Comments by sample_numbeR, Cruise, Zone, ADM:MISSION, Bottle:Firing_Sequence, Transmissivity [*/metre], Oxygen:Dissolved:SBE [mL/L], Oxygen:Dissolved:SBE [umol/kg], Number_of_bin_records, Diatoms-1, Diatoms-2, Prasinophytes, Cryptophytes, Dinoflagellates-1, Haptophytes, Dictyo, Raphido, Cyanobacteria, TchlA]\n", "Index: []\n", "\n", "[0 rows x 50 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check for unexpected depths\n", "dfout.loc[((dfout['Pressure [decibar]']>10)|(dfout['Depth [metres]']>10))&(dfout['Diatoms-1']>=0)]#,\n", "# ['FIL:START TIME YYYY/MM/DD HH:MM:SS','Pressure [decibar]','Depth [metres]','Diatoms-1']]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 NaN\n", "1 NaN\n", "2 NaN\n", "3 NaN\n", "4 NaN\n", " ... \n", "1331 NaN\n", "1332 0.007\n", "1333 NaN\n", "1334 NaN\n", "1335 0.047\n", "Name: Cyanobacteria, Length: 1336, dtype: float64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout['Cyanobacteria']" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# now write the output table to a .csv file:\n", "dfout.to_csv(pathOut, index=False) " ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['FIL:START TIME YYYY/MM/DD HH:MM:SS', 'LOC:EVENT_NUMBER',\n", " 'LOC:LATITUDE', 'LOC:LONGITUDE', 'LOC:WATER DEPTH', 'ADM:SCIENTIST',\n", " 'LOC:STATION', 'ADM:PROJECT', 'Bottle_Number', 'Sample_Number',\n", " 'Pressure [decibar]', 'Depth [metres]', 'Temperature [deg C (ITS90)]',\n", " 'Fluorescence:URU:Seapoint [mg/m^3]', 'PAR [uE/m^2/sec]',\n", " 'Salinity [PSS-78]', 'Temperature:Draw [deg C (ITS90)]',\n", " 'Salinity:Bottle [PSS-78]', 'Flag:Salinity:Bottle',\n", " 'Chlorophyll:Extracted [mg/m^3]', 'Flag:Chlorophyll:Extracted',\n", " 'Phaeo-Pigment:Extracted [mg/m^3]', 'Oxygen:Dissolved [mL/L]',\n", " 'Oxygen:Dissolved [umol/kg]', 'Flag:Oxygen:Dissolved',\n", " 'Nitrate_plus_Nitrite [umol/L]', 'Flag:Nitrate_plus_Nitrite',\n", " 'Silicate [umol/L]', 'Flag:Silicate', 'Phosphate [umol/L]',\n", " 'Flag:Phosphate', 'Comments by sample_numbeR', 'Cruise', 'Zone',\n", " 'ADM:MISSION', 'Bottle:Firing_Sequence', 'Transmissivity [*/metre]',\n", " 'Oxygen:Dissolved:SBE [mL/L]', 'Oxygen:Dissolved:SBE [umol/kg]',\n", " 'Number_of_bin_records', 'Diatoms-1', 'Diatoms-2', 'Prasinophytes',\n", " 'Cryptophytes', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n", " 'Cyanobacteria', 'TchlA'],\n", " dtype='object')" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout.keys()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (py39)", "language": "python", "name": "py39" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 4 }