{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np # this module handles arrays, but here we need it for its NaN value\n", "import pandas as pd # this module contains a lot of tools for handling tabular data\n", "from matplotlib import pyplot as plt\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# define paths to the source files and eventual output file\n", "pathBottle='/ocean/eolson/MEOPAR/obs/NemcekHPLC//All 2015 SoG_bottle.xlsx'\n", "pathPhyto='/ocean/eolson/MEOPAR/obs/NemcekHPLC/2015-2018 Abs phyto groupsCorrected.xlsx'\n", "pathOut='/ocean/eolson/MEOPAR/obs/NemcekHPLC/bottlePhytoMerged2015.csv'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2015-17\n", "Reading sheet 2015-20\n", "Reading sheet 2015-18\n", "Reading sheet 2015-21\n", "Reading sheet 2015-19\n", "removed empty columns: {'Ammonium', 'Flag:Ammonium'}\n" ] } ], "source": [ "# load each sheet in the 2015 bottle Excel file and concatenate them together into one table\n", "dfbotlist=list()\n", "for sheet in ('2015-17','2015-20','2015-18','2015-21','2015-19'):\n", " df0=pd.read_excel(pathBottle,sheet_name=sheet,verbose=True,na_values=(-99,-99.9)) # read each sheet\n", " df0['Cruise']=sheet # create and populate Cruise column based on sheet name\n", " dfbotlist.append(df0) # append the sheet to a list\n", "dfbot=pd.concat(dfbotlist,ignore_index=True,sort=False) # concatenate the list into a single table\n", "# Drop columns with no data in them\n", "l1=set(dfbot.keys())\n", "dfbot.dropna(axis=1,how='all',inplace=True)\n", "print('removed empty columns:',l1-set(dfbot.keys()))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n", " 'LOC:EVENT_NUMBER', 'LOC:STATION', 'LOC:LATITUDE', 'LOC:LONGITUDE',\n", " 'LOC:WATER DEPTH', 'ADM:SCIENTIST', 'Sample_Number', 'Pressure',\n", " 'Temperature:Primary', 'Transmissivity', 'Fluorescence:URU:Seapoint',\n", " 'PAR', 'Salinity:T0:C0', 'Oxygen:Dissolved:SBE',\n", " 'Oxygen:Dissolved:SBE.1', 'pH:SBE:Nominal', 'Salinity:Bottle',\n", " 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n", " 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite',\n", " 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n", " 'Flag:Phosphate', 'Cruise', 'Temperature:Draw',\n", " 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Oxygen:Dissolved.1',\n", " 'Flag:Oxygen:Dissolved', 'Temperature:Secondary', 'Salinity:T1:C1',\n", " 'Number_of_bin_records'],\n", " dtype='object')\n" ] } ], "source": [ "# list the column names in the resulting table\n", "print(dfbot.keys())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def subval(idf,colList):\n", " # first value in colList should be the column you are going to keep\n", " # follow with other columns that will be used to fill in when that column is NaN\n", " # in order of precedence\n", " if len(colList)==2:\n", " idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n", " else r[colList[1]] for i,r in idf.iterrows()]\n", " elif len(colList)==3:\n", " idf[colList[0]]=[r[colList[0]] if not pd.isna(r[colList[0]]) \\\n", " else r[colList[1]] if not pd.isna(r[colList[1]]) \\\n", " else r[colList[2]] for i,r in idf.iterrows()]\n", " else:\n", " raise NotImplementedError('Add to code to handle this case')\n", " idf.drop(columns=list(colList[1:]),inplace=True)\n", " return idf" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dfbot=subval(dfbot,('Oxygen:Dissolved','Oxygen:Dissolved.1'))\n", "dfbot=subval(dfbot,('Temperature:Primary','Temperature:Secondary'))\n", "dfbot=subval(dfbot,('Salinity:T0:C0','Salinity:T1:C1'))\n", "dfbot=subval(dfbot,('Oxygen:Dissolved:SBE','Oxygen:Dissolved:SBE.1'))\n", "dfbot.rename(columns={'LOC:LATITUDE':'Lat',\n", " 'LOC:LONGITUDE':'Lon',\n", " 'Temperature:Primary':'Temperature',\n", " 'Oxygen:Dissolved:SBE':'Oxygen:Dissolved:CTD',\n", " 'Salinity:T0:C0':'Salinity',},inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# define a function that will be applied to the values in the index column;\n", "# this makes it easier to drop non-data rows later\n", "def convertIndex(val):\n", " try:\n", " x =int(val)\n", " except ValueError:\n", " x=np.nan\n", " return x" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading sheet 2015 CHEMTAX abs results\n" ] } ], "source": [ "# load the 2015 phytoplankton data with the following options:\n", "# sheet_name='2015 CHEMTAX abs results' -> choose the 2015 sheet\n", "# usecols='A:I,T:AC' -> read only columns A:I and T:AC from the Excel sheet\n", "# skiprows=2 -> start reading at the 3rd row of the sheet, \n", "# which contains the column headings\n", "# converters={'Index': convertIndex,} -> apply the function defined above to the Index column\n", "# verbose = True -> print extra information/ warnings/ errors\n", "dfPhyto=pd.read_excel(pathPhyto,sheet_name='2015 CHEMTAX abs results',usecols='A:I,T:AC',\n", " skiprows=2,converters={'Index': convertIndex,},\n", " verbose=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Bin #IndexSubgroupCruiseMonthStationSample#repdepthCyanobacteria.1Prasinophytes.1Cryptophytes.1Diatoms-1.1Diatoms-2.1Dinoflagellates-1.1Haptophytes.1Dictyo.1Raphido.1TchlA (ug/L)
481249.012015-17April22303A000.09155330.09331849.816341.172530.0063032900011.1801
491250.012015-17April22303B0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
50Absolute Pigment Compositions - Bin # 16NaNNaNNaNNaNFrom Sheet: OutR4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
51Tchl_aNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
52Bin #NaNSubgroupCruiseMonthStationSample#repdepthCyanobacteriaPrasinophytesCryptophytesDiatoms-1Diatoms-2Dinoflagellates-1HaptophytesDictyoRaphidoTchlA
531651.022015-20JuneCPF2396A00.06721120.104270.048996900.016357200.18435500.04222330.463414
541652.022015-20JuneCPF2396B0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
551653.022015-20JuneCPF1397A00.00435790.4956590.4626610.69259900.09549980.2464770.05654260.05693772.11073
561654.022015-20JuneCPF1397B0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
571655.022015-20June22398A00.01408080.1694150.0870690.3923010.03316263.15016e-060.084758500.008379010.789169
581656.022015-20June22398B0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
591657.022015-20June24399A00.02007840.1012760.06929010.4061070.002417220.0001526930.002133476.5546e-0500.60152
\n", "
" ], "text/plain": [ " Bin # Index Subgroup Cruise Month \\\n", "48 12 49.0 1 2015-17 April \n", "49 12 50.0 1 2015-17 April \n", "50 Absolute Pigment Compositions - Bin # 16 NaN NaN NaN NaN \n", "51 Tchl_a NaN NaN NaN NaN \n", "52 Bin # NaN Subgroup Cruise Month \n", "53 16 51.0 2 2015-20 June \n", "54 16 52.0 2 2015-20 June \n", "55 16 53.0 2 2015-20 June \n", "56 16 54.0 2 2015-20 June \n", "57 16 55.0 2 2015-20 June \n", "58 16 56.0 2 2015-20 June \n", "59 16 57.0 2 2015-20 June \n", "\n", " Station Sample# rep depth Cyanobacteria.1 Prasinophytes.1 \\\n", "48 22 303 A 0 0 0.0915533 \n", "49 22 303 B 0 NaN NaN \n", "50 From Sheet: OutR4 NaN NaN NaN NaN NaN \n", "51 NaN NaN NaN NaN NaN NaN \n", "52 Station Sample# rep depth Cyanobacteria Prasinophytes \n", "53 CPF2 396 A 0 0.0672112 0.10427 \n", "54 CPF2 396 B 0 NaN NaN \n", "55 CPF1 397 A 0 0.0043579 0.495659 \n", "56 CPF1 397 B 0 NaN NaN \n", "57 22 398 A 0 0.0140808 0.169415 \n", "58 22 398 B 0 NaN NaN \n", "59 24 399 A 0 0.0200784 0.101276 \n", "\n", " Cryptophytes.1 Diatoms-1.1 Diatoms-2.1 Dinoflagellates-1.1 Haptophytes.1 \\\n", "48 0.0933184 9.81634 1.17253 0.00630329 0 \n", "49 NaN NaN NaN NaN NaN \n", "50 NaN NaN NaN NaN NaN \n", "51 NaN NaN NaN NaN NaN \n", "52 Cryptophytes Diatoms-1 Diatoms-2 Dinoflagellates-1 Haptophytes \n", "53 0.0489969 0 0.0163572 0 0.184355 \n", "54 NaN NaN NaN NaN NaN \n", "55 0.462661 0.692599 0 0.0954998 0.246477 \n", "56 NaN NaN NaN NaN NaN \n", "57 0.087069 0.392301 0.0331626 3.15016e-06 0.0847585 \n", "58 NaN NaN NaN NaN NaN \n", "59 0.0692901 0.406107 0.00241722 0.000152693 0.00213347 \n", "\n", " Dictyo.1 Raphido.1 TchlA (ug/L) \n", "48 0 0 11.1801 \n", "49 NaN NaN NaN \n", "50 NaN NaN NaN \n", "51 NaN NaN NaN \n", "52 Dictyo Raphido TchlA \n", "53 0 0.0422233 0.463414 \n", "54 NaN NaN NaN \n", "55 0.0565426 0.0569377 2.11073 \n", "56 NaN NaN NaN \n", "57 0 0.00837901 0.789169 \n", "58 NaN NaN NaN \n", "59 6.5546e-05 0 0.60152 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display rows 48 to 59 of the resulting table\n", "dfPhyto[48:60]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# now, drop any rows from the table that have NaN values in either of the columns\n", "# 'Index' or 'TchlA (ug/L)'\n", "# This is why we applied a function to the Index column to make sure all \n", "# non-numeric Index values would have a consistent NaN entry, making them easy to identify\n", "# and remove\n", "dfPhyto.dropna(subset=['Index', 'TchlA (ug/L)'],how='any',inplace=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# pandas creates its own index, and after dropping rows I like to reset it -\n", "# this is just for convenience\n", "dfPhyto.reset_index(drop=True,inplace=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Bin #IndexSubgroupCruiseMonthStationSample#repdepthCyanobacteria.1Prasinophytes.1Cryptophytes.1Diatoms-1.1Diatoms-2.1Dinoflagellates-1.1Haptophytes.1Dictyo.1Raphido.1TchlA (ug/L)
48795.032015-18June2202A00.12010.08763950.1516120000.36045200.08617210.805976
49797.032015-18June3218A00.1087160.1006950.1133580000.43759700.009021260.769387
50799.032015-18June6249A00.08994460.02573130.06716020000.347255000.530091
517101.032015-18June9261A00.09296190.08735610.1020040.00641763000.279578000.568318
527103.032015-18June12277A00.07869770.13560.1683940000.44463400.02144130.848767
537105.032015-18June14292A00.05202650.1359790.1612250000.933363001.28259
547107.032015-18June16303A00.01625570.06590370.1313070000.79041001.00388
551109.042015-21Sept14327A00.1037250.4117550.5419980.6444630.1616440.07430510.7941150.04553060.04276932.82031
561111.042015-21Sept11328A00.4736660.8636160.9227170.1983150.03259980.1868881.378230.03346390.1066274.19612
571113.042015-21SeptCPF2329A00.2709051.433410.9104510.1954090.6187700.49640.1278370.3879244.4411
581115.042015-21SeptCPF1330A00.3692111.566420.7523090.2073750.3921540.0007386650.1420020.1308420.40173.96275
591117.042015-21Sept22331A00.216021.219560.4719390.4219890.2564690.001685250.4023570.08101560.1805233.25155
\n", "
" ], "text/plain": [ " Bin # Index Subgroup Cruise Month Station Sample# rep depth \\\n", "48 7 95.0 3 2015-18 June 2 202 A 0 \n", "49 7 97.0 3 2015-18 June 3 218 A 0 \n", "50 7 99.0 3 2015-18 June 6 249 A 0 \n", "51 7 101.0 3 2015-18 June 9 261 A 0 \n", "52 7 103.0 3 2015-18 June 12 277 A 0 \n", "53 7 105.0 3 2015-18 June 14 292 A 0 \n", "54 7 107.0 3 2015-18 June 16 303 A 0 \n", "55 1 109.0 4 2015-21 Sept 14 327 A 0 \n", "56 1 111.0 4 2015-21 Sept 11 328 A 0 \n", "57 1 113.0 4 2015-21 Sept CPF2 329 A 0 \n", "58 1 115.0 4 2015-21 Sept CPF1 330 A 0 \n", "59 1 117.0 4 2015-21 Sept 22 331 A 0 \n", "\n", " Cyanobacteria.1 Prasinophytes.1 Cryptophytes.1 Diatoms-1.1 Diatoms-2.1 \\\n", "48 0.1201 0.0876395 0.151612 0 0 \n", "49 0.108716 0.100695 0.113358 0 0 \n", "50 0.0899446 0.0257313 0.0671602 0 0 \n", "51 0.0929619 0.0873561 0.102004 0.00641763 0 \n", "52 0.0786977 0.1356 0.168394 0 0 \n", "53 0.0520265 0.135979 0.161225 0 0 \n", "54 0.0162557 0.0659037 0.131307 0 0 \n", "55 0.103725 0.411755 0.541998 0.644463 0.161644 \n", "56 0.473666 0.863616 0.922717 0.198315 0.0325998 \n", "57 0.270905 1.43341 0.910451 0.195409 0.61877 \n", "58 0.369211 1.56642 0.752309 0.207375 0.392154 \n", "59 0.21602 1.21956 0.471939 0.421989 0.256469 \n", "\n", " Dinoflagellates-1.1 Haptophytes.1 Dictyo.1 Raphido.1 TchlA (ug/L) \n", "48 0 0.360452 0 0.0861721 0.805976 \n", "49 0 0.437597 0 0.00902126 0.769387 \n", "50 0 0.347255 0 0 0.530091 \n", "51 0 0.279578 0 0 0.568318 \n", "52 0 0.444634 0 0.0214413 0.848767 \n", "53 0 0.933363 0 0 1.28259 \n", "54 0 0.79041 0 0 1.00388 \n", "55 0.0743051 0.794115 0.0455306 0.0427693 2.82031 \n", "56 0.186888 1.37823 0.0334639 0.106627 4.19612 \n", "57 0 0.4964 0.127837 0.387924 4.4411 \n", "58 0.000738665 0.142002 0.130842 0.4017 3.96275 \n", "59 0.00168525 0.402357 0.0810156 0.180523 3.25155 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# display part of the table, confirming that non-data rows have been removed\n", "dfPhyto[48:60]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# due to repeated column names in the original spreadsheet, '.1' was appended to the names\n", "# of the phytoplankton columns; \n", "# these lines correct the column names, removing the '.1':\n", "renameDict=dict()\n", "for colName in dfPhyto.keys():\n", " if colName.endswith('.1'):\n", " renameDict[colName]=colName.split('.1')[0]\n", "dfPhyto.rename(columns=renameDict,inplace=True)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Bin #IndexSubgroupCruiseMonthStationSample#repdepthCyanobacteriaPrasinophytesCryptophytesDiatoms-1Diatoms-2Dinoflagellates-1HaptophytesDictyoRaphidoTchlA (ug/L)
0121.012015-17AprilSI9A000024.47091.6068700.014371700.069561726.1618
1123.012015-17April5923A03.57707e-054.01063e-0500.3379570.03998400.04470332.42373e-050.0008042320.423549
2125.012015-17April10237A00.002644670.0211580.04509182.406750.6109750.004636610.02426010.001185090.01181633.12852
3127.012015-17April7551A00.004889750.005149940.06715813.172960.8818440.005639530.04385630.01512780.02242634.21906
4129.012015-17April7264A000.1301890.3262764.429560.9917160.506950006.38469
............................................................
8316165.052015-19Sept9248A00.09037420.8447920.153710.5528660.200320.0003786530.3142220.03421320.04651542.23739
8416167.052015-19Sept6261A00.08385460.813720.3086940.2915790.327578.07944e-050.5617290.07467830.05881622.52072
8516169.052015-19SeptBS11269A0000.12548510.14420000010.2697
8616171.052015-19Sept3285A00.127820.5151510.2625250.6342160.1705710.0004419820.487410.0334520.12692.35849
8716173.052015-19Sept2300A00.1118821.005040.1657121.528580.013451200.3966090.02521890.2166543.46315
\n", "

88 rows × 19 columns

\n", "
" ], "text/plain": [ " Bin # Index Subgroup Cruise Month Station Sample# rep depth \\\n", "0 12 1.0 1 2015-17 April SI 9 A 0 \n", "1 12 3.0 1 2015-17 April 59 23 A 0 \n", "2 12 5.0 1 2015-17 April 102 37 A 0 \n", "3 12 7.0 1 2015-17 April 75 51 A 0 \n", "4 12 9.0 1 2015-17 April 72 64 A 0 \n", ".. ... ... ... ... ... ... ... .. ... \n", "83 16 165.0 5 2015-19 Sept 9 248 A 0 \n", "84 16 167.0 5 2015-19 Sept 6 261 A 0 \n", "85 16 169.0 5 2015-19 Sept BS11 269 A 0 \n", "86 16 171.0 5 2015-19 Sept 3 285 A 0 \n", "87 16 173.0 5 2015-19 Sept 2 300 A 0 \n", "\n", " Cyanobacteria Prasinophytes Cryptophytes Diatoms-1 Diatoms-2 \\\n", "0 0 0 0 24.4709 1.60687 \n", "1 3.57707e-05 4.01063e-05 0 0.337957 0.039984 \n", "2 0.00264467 0.021158 0.0450918 2.40675 0.610975 \n", "3 0.00488975 0.00514994 0.0671581 3.17296 0.881844 \n", "4 0 0.130189 0.326276 4.42956 0.991716 \n", ".. ... ... ... ... ... \n", "83 0.0903742 0.844792 0.15371 0.552866 0.20032 \n", "84 0.0838546 0.81372 0.308694 0.291579 0.32757 \n", "85 0 0 0.125485 10.1442 0 \n", "86 0.12782 0.515151 0.262525 0.634216 0.170571 \n", "87 0.111882 1.00504 0.165712 1.52858 0.0134512 \n", "\n", " Dinoflagellates-1 Haptophytes Dictyo Raphido TchlA (ug/L) \n", "0 0 0.0143717 0 0.0695617 26.1618 \n", "1 0 0.0447033 2.42373e-05 0.000804232 0.423549 \n", "2 0.00463661 0.0242601 0.00118509 0.0118163 3.12852 \n", "3 0.00563953 0.0438563 0.0151278 0.0224263 4.21906 \n", "4 0.50695 0 0 0 6.38469 \n", ".. ... ... ... ... ... \n", "83 0.000378653 0.314222 0.0342132 0.0465154 2.23739 \n", "84 8.07944e-05 0.561729 0.0746783 0.0588162 2.52072 \n", "85 0 0 0 0 10.2697 \n", "86 0.000441982 0.48741 0.033452 0.1269 2.35849 \n", "87 0 0.396609 0.0252189 0.216654 3.46315 \n", "\n", "[88 rows x 19 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfPhyto" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# This is the important step- join the two tables ('left' and 'right'), \n", "# matching the cruise IDs and sample numbers\n", "# how='outer' -> all rows from both the left and the right tables will be included, \n", "# even if they cannot be matched; this makes it easy to check for \n", "# unmatched data later\n", "# left_on specifies the name of the column to match in the left table (dfbot) \n", "# right_on specifies the name of the column to match in the right table (dfPhyto)\n", "dfout = pd.merge(dfbot, dfPhyto, how='outer', \n", " left_on=['Cruise','Sample_Number'], right_on = ['Cruise','Sample#'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n", " 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH',\n", " 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature',\n", " 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity',\n", " 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle',\n", " 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n", " 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite',\n", " 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n", " 'Flag:Phosphate', 'Cruise', 'Temperature:Draw',\n", " 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved',\n", " 'Number_of_bin_records', 'Bin #', 'Index', 'Subgroup', 'Month',\n", " 'Station', 'Sample#', 'rep', 'depth', 'Cyanobacteria', 'Prasinophytes',\n", " 'Cryptophytes', 'Diatoms-1', 'Diatoms-2', 'Dinoflagellates-1',\n", " 'Haptophytes', 'Dictyo', 'Raphido', 'TchlA (ug/L)'],\n", " dtype='object')" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the column names in the resulting table\n", "dfout.keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checks" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Identify cases where phytoplankton data were matched to multiple samples in bottle data:\n", "dftest=pd.merge(dfbot, dfPhyto,how='right', left_on=['Cruise','Sample_Number'],right_on = ['Cruise','Sample#'])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "temp=dftest.groupby(['Cruise','Sample#']).agg({'Cruise':['count']})\n", "temp.columns = ['icount']" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.unique(temp.icount)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for Phyto samples matched to multiple bottle samples:\n", "temp.loc[temp.icount>1]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for phyto samples not matched to bottle samples:\n", "temp.loc[temp.icount==0]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample_Number
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp2=dfout.groupby(['Cruise','Sample_Number']).agg({'Cruise':['count']})\n", "temp2.columns = ['icount']\n", "# this will catch phyto matched to multiple bottle but also bottle with duplicate sample numbers per cruise:\n", "temp2.loc[temp2.icount>1]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
icount
CruiseSample#
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [icount]\n", "Index: []" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for phyto samples not matched to bottle samples:\n", "temp.loc[temp.icount==0]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(938, 88, 938)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if the output table is longer than either of the input tables, some columns were not matched\n", "len(dfout), len(dfPhyto), len(dfbot)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(88, 88)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check that the number of cells with data in the 'Cyanobacteria' column is \n", "# the same for the input and output tables:\n", "np.sum(dfPhyto['Cyanobacteria']>=0), np.sum(dfout['Cyanobacteria']>=0)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Series([], Name: Index, dtype: float64)\n" ] } ], "source": [ "# If there were data rows from the phytoplankton table that were not matched to \n", "# rows from the bottle table, their indices from the phytoplankton table would be \n", "# displayed below (the series [] would not be empty)\n", "print(dfout.loc[dfout['ADM:SCIENTIST'].isna()]['Index'])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# drop repetetive/unecessary columns:\n", "dfout.drop(labels=['Bin #', 'Index', 'Subgroup', 'Month', 'Station', 'Sample#', 'rep',\n", " 'depth',],axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# truncate phyto group values to 3 decimal places:\n", "for col in ('Cyanobacteria', 'Prasinophytes', 'Cryptophytes', 'Diatoms-1',\n", " 'Diatoms-2', 'Dinoflagellates-1', 'Haptophytes', 'Dictyo', 'Raphido',\n", " 'TchlA (ug/L)'):\n", " dfout[col]=[np.round(ii,decimals=3) for ii in dfout[col]] # use list comprehension to set values for entire column" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['File Name', 'Zone', 'FIL:START TIME YYYY/MM/DD HH:MM:SS',\n", " 'LOC:EVENT_NUMBER', 'LOC:STATION', 'Lat', 'Lon', 'LOC:WATER DEPTH',\n", " 'ADM:SCIENTIST', 'Sample_Number', 'Pressure', 'Temperature',\n", " 'Transmissivity', 'Fluorescence:URU:Seapoint', 'PAR', 'Salinity',\n", " 'Oxygen:Dissolved:CTD', 'pH:SBE:Nominal', 'Salinity:Bottle',\n", " 'Flag:Salinity:Bottle', 'Chlorophyll:Extracted',\n", " 'Flag:Chlorophyll:Extracted', 'Nitrate_plus_Nitrite',\n", " 'Flag:Nitrate_plus_Nitrite', 'Silicate', 'Flag:Silicate', 'Phosphate',\n", " 'Flag:Phosphate', 'Cruise', 'Temperature:Draw',\n", " 'Phaeo-Pigment:Extracted', 'Oxygen:Dissolved', 'Flag:Oxygen:Dissolved',\n", " 'Number_of_bin_records', 'Cyanobacteria', 'Prasinophytes',\n", " 'Cryptophytes', 'Diatoms-1', 'Diatoms-2', 'Dinoflagellates-1',\n", " 'Haptophytes', 'Dictyo', 'Raphido', 'TchlA (ug/L)'],\n", " dtype='object')" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfout.keys()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# now write the output table to a .csv file:\n", "dfout.to_csv(pathOut, index=False) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (py38_ct)", "language": "python", "name": "py38_ct" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 2 }