{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import re\n", "import os\n", "import glob" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "basepath='/ocean/shared/SalishSeaCastData/DFO/CTD/'\n", "# note: if further files requests are added, see createDBfromDFO_OPDB.py for how to manage multiple directories\n", "\n", "dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match('^\\.', x))]\n", "dirs1=list()\n", "for ireq in dirs0:\n", " dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \\\n", " if (os.path.isdir(os.path.join(ireq,x)) and not re.match('^\\.', x))]\n", "dirs1.sort()\n", "# create full list of filenames\n", "filenames1=list()\n", "bnamesAll=list() \n", "for cdirpath in dirs1:\n", " filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n", " bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", " # left over from nutrients version where multiple requests led to overlap; retain for future use\n", "filenames1.sort()\n", "filenames=filenames1 #contains full paths\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "directories to be processed:\n", "['/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2014 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2015 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2016 data a)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2016 data b)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2017 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20181116/EO UBC November 16, 2018 (2018 data)', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD1', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD2', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 A', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 B', '/ocean/shared/SalishSeaCastData/DFO/CTD/req20210618/2013 C']\n", "\n", "\n" ] } ], "source": [ "print('directories to be processed:\\n' + repr(dirs1) + '\\n\\n')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(True, False)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test={0:'car',1:'elephant'}\n", "'car' in test.values(),'horse' in test.values()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# create empty set to store variable names and dictionary to store units\n", "varlist=set()\n", "#unitsdict={} #There were multiple units for some variables, so better to record the units with the data\n", "# loop throught directories to get all variable names and units:\n", "for ifile in filenames:\n", " varNames={}\n", " varUnits={}\n", " with open(ifile, 'rt', encoding = \"ISO-8859-1\") as f:\n", " infile=False\n", " invars=False\n", " indetail=False\n", " inadmin=False\n", " inloc=False\n", " indata=False\n", " detformat=False\n", " for line in f:\n", " if infile:\n", " if re.match('\\s*\\$', line) or len(line)==0:\n", " infile=False\n", " if invars:\n", " if re.search('\\$END', line):\n", " invars=False\n", " else:\n", " test=re.findall(\"'.*?'\",line) # (.*? 
matches anything but chooses min len match - not greedy)\n", " for expr in test:\n", " line=re.sub(re.escape(expr),re.sub(' ','_',expr),line) # remove spaces from items in quotes\n", " splitline=re.split('\\s* \\s*',line.strip())\n", " if re.match('[0-9]', splitline[0]):\n", " varnum=int(splitline[0])\n", " cvar=splitline[1]\n", " cvar = re.sub('(?<=[0-9])*\\.(?=[0-9])','point',cvar) # decimal points -> point\n", " cvar = re.sub('\\-','',cvar) # remove - from column names\n", " cvar = re.sub('\\:','_',cvar) # replace : with _\n", " cvar = re.sub('\\>','gt',cvar) # replace > with gt\n", " cvar = re.sub('\\<','lt',cvar) # replace < with lt\n", " cvar = re.sub('(\\'|\\.)','',cvar) # remove special characters (' and .)\n", " cunits = splitline[2].strip()\n", " # some files have multiple variables of same name (eg Oxygen:Dissolved:SBE)\n", " # fix this:\n", " cvarbase=cvar\n", " xx=1\n", " while cvar in varNames.values():\n", " cvar=cvarbase+'_'+str(xx)\n", " xx=xx+1\n", " varNames[varnum]=cvar\n", " varUnits[varnum]=cunits\n", " varlist = varlist | {cvar}\n", " elif indetail:\n", " detcount+=1\n", " if re.search('\\$END', line):\n", " indetail=False\n", " elif (detcount==1 and re.match('\\s*\\!\\s*No\\s*Pad\\s*Start\\s*Width', line)):\n", " detformat=True\n", " elif inadmin:\n", " if len(line)==0:\n", " inadmin=False\n", " elif inloc:\n", " if len(line)==0:\n", " inloc=False\n", " if re.match('![- ]*$',line):\n", " tem=re.search('(?<=\\!)[- ]*$',line)\n", " splitline=re.split(r'\\s',tem.group(0))\n", " for ii in range(1, 1+len(splitline)):\n", " detformat=True\n", " if re.search('\\*FILE', line):\n", " infile=True\n", " if re.search('\\$TABLE\\: CHANNELS', line):\n", " invars=True\n", " if re.search('\\$TABLE\\: CHANNEL DETAIL', line):\n", " indetail=True\n", " detcount=0\n", " if re.search('\\*ADMINISTRATION', line):\n", " inadmin=True\n", " if re.search('\\*LOCATION', line):\n", " inloc=True\n", " inadmin=False\n", " if re.search('\\*END OF HEADER', line):\n", " indata=True\n", " inloc=False\n", " if re.search('\\$END',line):\n", " inloc=False" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{1: 'Pressure', 2: 'Temperature_Primary', 3: 'Salinity_T0_C0', 4: 'Number_of_bin_records'}\n" ] } ], "source": [ "print(varNames)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{1: 'decibar', 2: \"'deg_C_(ITS90)'\", 3: 'PSS-78', 4: 'n/a'}\n" ] } ], "source": [ "print(varUnits)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'pH_SBE_Nominal', 'Salinity_T1_C1', 'PAR', 'Oxygen_Dissolved_SBE', 'Temperature', 'Oxygen_Dissolved_Satuation_RBR', 'Number_of_bin_records_1', 'Speed_Sound', 'Turbidity_Wetlabs', 'Number_of_bin_records', 'Conductivity', 'Date', 'Temperature_Primary', 'Transmissivity', 'Salinity_T0_C0', 'Conductance_Specific', 'Record_Number', 'Oxygen_Dissolved_Saturation', 'Oxygen_Dissolved_Saturation_RBR', 'Pressure', 'Conductivity_Primary', 'Time', 'Fluorescence_URU', 'Fluorescence_URU_Seapoint', 'PAR1', 'Depth', 'Transmissivity2', 'Transmissivity_Green', 'PAR_1', 'PAR_Reference', 'Conductivity_Secondary', 'Fluorescence_URU_Wetlabs', 'Salinity', 'Density', 'Oxygen_Dissolved_SBE_1', 'Temperature_Secondary'}\n" ] } ], "source": [ "print(varlist)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ 
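"# Hand-picked subset of the parsed header variable names to keep; the print(choosevars-varlist) cell below confirms the difference is empty, i.e. every chosen name was found in at least one file header\n",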
"choosevars={'Fluorescence_URU_Wetlabs', 'Oxygen_Dissolved_SBE', 'Speed_Sound', 'PAR1',\n", " 'Conductivity_Primary', 'Temperature_Secondary', 'Depth', 'Salinity_T1_C1',\n", " 'Conductivity_Secondary', 'Transmissivity', 'PAR_Reference', 'Temperature_Primary',\n", " 'Salinity_T0_C0', 'Conductivity', 'Salinity', 'Number_of_bin_records',\n", " 'pH_SBE_Nominal', 'PAR_1', 'Pressure', 'Fluorescence_URU_Seapoint', 'Temperature',\n", " 'Conductance_Specific', 'Density', 'PAR'}\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "set()\n" ] } ], "source": [ "print(choosevars-varlist)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/ocean/shared/SalishSeaCastData/DFO/CTD/req20200928/CTD2/lbb_20190123_20190810_0336m_L2.ctd'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ifile" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "ename": "IndexError", "evalue": "no such group", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbasepath\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mifile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: no such group" ] } ], "source": [ "test=re.search(basepath,ifile).group(1)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'req20181116/EO UBC November 16, 2018 (2018 data)/2018_map.jpg'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.search(basepath+'(.*)',ifile).group(1)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "yes\n" ] } ], "source": [ "if re.match('.*jpg$', ifile):\n", " print('yes')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }