{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import re\n", "import os\n", "import glob" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "basepath='/ocean/shared/SalishSeaCastData/DFO/CTD/'\n", "# note: if further files requests are added, see createDBfromDFO_OPDB.py for how to manage multiple directories\n", "\n", "dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match('^\\.', x))]\n", "dirs1=list()\n", "for ireq in dirs0:\n", " dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \\\n", " if (os.path.isdir(os.path.join(ireq,x)) and not re.match('^\\.', x))]\n", "dirs1.sort()\n", "# create full list of filenames\n", "filenames1=list()\n", "bnamesAll=list() \n", "for cdirpath in dirs1:\n", " filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n", " bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", " # left over from nutrients version where multiple requests led to overlap; retain for future use\n", "filenames1.sort()\n", "filenames=filenames1 #contains full paths\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "directories to be processed:\n", "['/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2014 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2015 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2016 data a)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2016 data b)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2017 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2018 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD1', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD2', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 A', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 B', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 C']\n", "\n", "\n" ] } ], "source": [ "print('directories to be processed:\\n' + repr(dirs1) + '\\n\\n')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(True, False)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test={0:'car',1:'elephant'}\n", "'car' in test.values(),'horse' in test.values()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# create empty set to store variable names and dictionary to store units\n", "varlist=set()\n", "#unitsdict={} #There were multiple units for some variables, so better to record the units with the data\n", "# loop throught directories to get all variable names and units:\n", "for ifile in filenames:\n", " varNames={}\n", " varUnits={}\n", " with open(ifile, 'rt', encoding = \"ISO-8859-1\") as f:\n", " infile=False\n", " invars=False\n", " indetail=False\n", " inadmin=False\n", " inloc=False\n", " indata=False\n", " detformat=False\n", " for line in f:\n", " if infile:\n", " if re.match('\\s*\\$', line) or len(line)==0:\n", " infile=False\n", " if invars:\n", " if re.search('\\$END', line):\n", " invars=False\n", " else:\n", " test=re.findall(\"'.*?'\",line) # (.*? 
matches anything but chooses min len match - not greedy)\n", " for expr in test:\n", " line=re.sub(re.escape(expr),re.sub(' ','_',expr),line) # remove spaces from items in quotes\n", " splitline=re.split('\\s* \\s*',line.strip())\n", " if re.match('[0-9]', splitline[0]):\n", " varnum=int(splitline[0])\n", " cvar=splitline[1]\n", " cvar = re.sub('(?<=[0-9])*\\.(?=[0-9])','point',cvar) # decimal points -> point\n", " cvar = re.sub('\\-','',cvar) # remove - from column names\n", " cvar = re.sub('\\:','_',cvar) # replace : with _\n", " cvar = re.sub('\\>','gt',cvar) # replace > with gt\n", " cvar = re.sub('\\<','lt',cvar) # replace < with lt\n", " cvar = re.sub('(\\'|\\.)','',cvar) # remove special characters (' and .)\n", " cunits = splitline[2].strip()\n", " # some files have multiple variables of same name (eg Oxygen:Dissolved:SBE)\n", " # fix this:\n", " cvarbase=cvar\n", " xx=1\n", " while cvar in varNames.values():\n", " cvar=cvarbase+'_'+str(xx)\n", " xx=xx+1\n", " varNames[varnum]=cvar\n", " varUnits[varnum]=cunits\n", " varlist = varlist | {cvar}\n", " elif indetail:\n", " detcount+=1\n", " if re.search('\\$END', line):\n", " indetail=False\n", " elif (detcount==1 and re.match('\\s*\\!\\s*No\\s*Pad\\s*Start\\s*Width', line)):\n", " detformat=True\n", " elif inadmin:\n", " if len(line)==0:\n", " inadmin=False\n", " elif inloc:\n", " if len(line)==0:\n", " inloc=False\n", " if re.match('![- ]*$',line):\n", " tem=re.search('(?<=\\!)[- ]*$',line)\n", " splitline=re.split(r'\\s',tem.group(0))\n", " for ii in range(1, 1+len(splitline)):\n", " detformat=True\n", " if re.search('\\*FILE', line):\n", " infile=True\n", " if re.search('\\$TABLE\\: CHANNELS', line):\n", " invars=True\n", " if re.search('\\$TABLE\\: CHANNEL DETAIL', line):\n", " indetail=True\n", " detcount=0\n", " if re.search('\\*ADMINISTRATION', line):\n", " inadmin=True\n", " if re.search('\\*LOCATION', line):\n", " inloc=True\n", " inadmin=False\n", " if re.search('\\*END OF HEADER', line):\n", " indata=True\n", " inloc=False\n", " if re.search('\\$END',line):\n", " inloc=False" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{1: 'Pressure', 2: 'Temperature_Primary', 3: 'Salinity_T0_C0', 4: 'Number_of_bin_records'}\n" ] } ], "source": [ "print(varNames)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{1: 'decibar', 2: \"'deg_C_(ITS90)'\", 3: 'PSS-78', 4: 'n/a'}\n" ] } ], "source": [ "print(varUnits)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'pH_SBE_Nominal', 'Salinity_T1_C1', 'PAR', 'Oxygen_Dissolved_SBE', 'Temperature', 'Oxygen_Dissolved_Satuation_RBR', 'Number_of_bin_records_1', 'Speed_Sound', 'Turbidity_Wetlabs', 'Number_of_bin_records', 'Conductivity', 'Date', 'Temperature_Primary', 'Transmissivity', 'Salinity_T0_C0', 'Conductance_Specific', 'Record_Number', 'Oxygen_Dissolved_Saturation', 'Oxygen_Dissolved_Saturation_RBR', 'Pressure', 'Conductivity_Primary', 'Time', 'Fluorescence_URU', 'Fluorescence_URU_Seapoint', 'PAR1', 'Depth', 'Transmissivity2', 'Transmissivity_Green', 'PAR_1', 'PAR_Reference', 'Conductivity_Secondary', 'Fluorescence_URU_Wetlabs', 'Salinity', 'Density', 'Oxygen_Dissolved_SBE_1', 'Temperature_Secondary'}\n" ] } ], "source": [ "print(varlist)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ 
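"# Hand-picked subset of the parsed header variable names to keep; the print(choosevars-varlist) cell below confirms the difference is empty, i.e. every chosen name was found in at least one file header\n",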
"choosevars={'Fluorescence_URU_Wetlabs', 'Oxygen_Dissolved_SBE', 'Speed_Sound', 'PAR1',\n", " 'Conductivity_Primary', 'Temperature_Secondary', 'Depth', 'Salinity_T1_C1',\n", " 'Conductivity_Secondary', 'Transmissivity', 'PAR_Reference', 'Temperature_Primary',\n", " 'Salinity_T0_C0', 'Conductivity', 'Salinity', 'Number_of_bin_records',\n", " 'pH_SBE_Nominal', 'PAR_1', 'Pressure', 'Fluorescence_URU_Seapoint', 'Temperature',\n", " 'Conductance_Specific', 'Density', 'PAR'}\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "set()\n" ] } ], "source": [ "print(choosevars-varlist)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD2/lbb_20190123_20190810_0336m_L2.ctd'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ifile" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "no such group", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbasepath\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mifile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: no such group" ] } ], "source": [ "test=re.search(basepath,ifile).group(1)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'req20181116/EO UBC November 16, 2018 (2018 data)/2018_map.jpg'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.search(basepath+'(.*)',ifile).group(1)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "yes\n" ] } ], "source": [ "if re.match('.*jpg$', ifile):\n", " print('yes')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }