{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sqlalchemy import create_engine, Column, String, Boolean, Integer, Float, Numeric, MetaData, Table, type_coerce, ForeignKey\n", "from sqlalchemy.orm import mapper, create_session, relationship\n", "from sqlalchemy.ext.declarative import declarative_base\n", "import sqlalchemy.types as types\n", "# sqlalchemy interfaces between SQL databases and python with a syntax that I found relatively easy to adapt to...\n", "from sqlalchemy import create_engine, case, MetaData\n", "from sqlalchemy.orm import create_session, aliased\n", "from sqlalchemy.ext.automap import automap_base\n", "from sqlalchemy.sql import and_, or_, not_, func\n", "import re\n", "import os\n", "import glob\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fsw={k:False for k in ('infile','invars','indetail','inadmin','inloc','indata','detformat')}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fsw" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "basepath='/ocean/shared/SalishSeaCastData/DFO/BOT/'\n", "basedir=basepath\n", "dbname='DFO_OcProfDB'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#ctd version:\n", "dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match('^\\.', x))]\n", "dirs1=list()\n", "for ireq in dirs0:\n", " dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \\\n", " if (os.path.isdir(os.path.join(ireq,x)) and not re.match('^\\.', x))]\n", "dirs1.sort(reverse=True)\n", "# create full list of filenames\n", "filenames1=list()\n", "bnamesAll=set()\n", "for cdirpath in dirs1:\n", " filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \\\n", " if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n", " bnamesAll=bnamesAll.union(set([f for f in os.listdir(cdirpath)]))\n", " # left over from nutrients version where multiple requests led to overlap; retain for future use\n", "filenames1.sort()\n", "filenamesCTD=filenames1 #contains full paths\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dirs1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ " basepath='/ocean/eolson/MEOPAR/obs/'\n", " basedir=basepath + 'DFOOPDB/'\n", " basedir2=basepath + 'DFOOPDB2/'\n", " basedir3=basepath + 'DFOOPDB3/'\n", " basedir4=basepath + 'DFOOPDB4/'\n", " dbname='DFO_OcProfDB'\n", "\n", " fout=open(basedir+'createDBfromDFO_OPDB_log.txt','w')\n", " ferr=open(basedir+'createDBfromDFO_OPDB_errors.txt','w')\n", " fout.write('Files processed:\\n')\n", "\n", " dirs1=[os.path.join(basedir,x) for x in os.listdir(basedir) if (os.path.isdir(basedir+x) \\\n", " and not re.match('^\\.', x))]\n", " dirs1.sort()\n", " dirs2=[os.path.join(basedir2,x) for x in os.listdir(basedir2) if (os.path.isdir(basedir2+x) \\\n", " and not re.match('^\\.', x))]\n", " dirs2.sort(reverse=True)\n", " dirs3=[os.path.join(basedir3,x) for x in os.listdir(basedir3) if (os.path.isdir(basedir3+x) \\\n", " and not re.match('^\\.', x))]\n", " dirs3.sort()\n", " dirs4=[os.path.join(basedir4,x) for x in os.listdir(basedir4) if (os.path.isdir(basedir4+x) \\\n", " and not re.match('^\\.', x))]\n", " dirs4.sort()\n", " # create full list of 
filenames\n", " filenames1=list()\n", " filenames2=list()\n", " filenames3=list()\n", " filenames4=list()\n", " bnamesAll=list() # most files of same name present in DFOOPDB2 and DFOOPDB are identical but a few appear to have been updated\n", " # therefore use new version (DFOOPDB2)\n", " for cdirpath in dirs4:\n", " filenames4=filenames4+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n", " # some files are in both eo UBC March 27, 2017 and eo UBC Salish March 27, 2017; the duplicates are identical\n", " bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", " for cdirpath in dirs3:\n", " filenames3=filenames3+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n", " # some files are in both eo UBC March 27, 2017 and eo UBC Salish March 27, 2017; the duplicates are identical\n", " bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", " for cdirpath in dirs2:\n", " filenames2=filenames2+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n", " # some files are in both eo UBC March 27, 2017 and eo UBC Salish March 27, 2017; the duplicates are identical\n", " bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", " for cdirpath in dirs1:\n", " filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n", " filenames1.sort()\n", " filenames2.sort()\n", " filenames3.sort()\n", " filenames4.sort()\n", " filenames=filenames1+filenames2+filenames3+filenames4 #contains full paths" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dirs2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(filenamesCTD)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(filenames)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filenamesCTD[0],filenames[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "re.split('/',filenames[0])[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i,j in zip(filenamesCTD,filenames):\n", " if not re.split('/',i)[-1]==re.split('/',j)[-1]:\n", " print(i,'\\n',j)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "engine = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB.sqlite', echo = False)\n", "md=MetaData()\n", "md.reflect(engine)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "obsCols=[el.name for el in md.tables['ObsTBL'].columns]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "obsCols2=[el.name for el in md2.tables['ObsTBL'].columns]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Base = automap_base()\n", "Base.prepare(engine, reflect=True)\n", "StationTBL=Base.classes.StationTBL\n", "ObsTBL=Base.classes.ObsTBL\n", "session = create_session(bind = engine, autocommit = False, autoflush = True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "engine2 = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df=pd.DataFrame(session.execute('select * from ObsTBL'),columns=obsCols)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df2=pd.DataFrame(session2.execute('select * from ObsTBL'),columns=obsCols2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df2" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for icol in df.keys():\n", "    for i,j in zip(df[icol],df2[icol]):\n", "        if not str(i)==str(j):\n", "            print(icol,i,j)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "basepath='/ocean/shared/SalishSeaCastData/DFO/CTD/'\n", "basedir=basepath + 'req20181116/'\n", "dbname='DFO_CTD'\n", "# if more paths added later (additional data requests) see createDBfromDFO_OPDB.py for how to add\n", "fout=open(basepath+'createDBfromDFO_CTD_log.txt','w')\n", "ferr=open(basepath+'createDBfromDFO_CTD_errors.txt','w')\n", "fout.write('Files processed:\\n')\n", "\n", "dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match('^\\.', x))]\n", "dirs1=list()\n", "# create full list of filenames\n", "filenames1=list()\n", "bnamesAll=set()\n", "for cdirpath in dirs1:\n", "    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \\\n", "                           if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n", "    bnamesAll=bnamesAll.union(set([f for f in os.listdir(cdirpath)]))\n", "    # left over from nutrients version where multiple requests led to overlap; retain for future use\n", "filenames1.sort()\n", "filenames=filenames1 #contains full paths\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dirs0" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for ireq in dirs0:\n", "    dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \\\n", "                 if (os.path.isdir(os.path.join(ireq,x)) and not re.match('^\\.', x))]\n", "dirs1.sort()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dirs1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filenames1=list()\n", "bnamesAll=list() \n", "for cdirpath in dirs1:\n", "    filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n", "    bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", "    # left over from nutrients version where multiple requests led to overlap; retain for future use\n", "filenames1.sort()" ] },
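{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick consistency check on the CTD listing above, using only the variables already defined: every basename should appear once across the request directories, and `filenames1` should hold one path per kept file (jpg images and repeated basenames are excluded by the loop)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# consistency check on the CTD file listing above\n", "print(len(bnamesAll), 'basenames seen,', len(set(bnamesAll)), 'unique')\n", "print(len(filenames1), 'files kept after excluding jpg images and repeated basenames')" ] },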
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", " filenames1=list()\n", " bnamesAll=list()\n", " for cdirpath in dirs1:\n", " filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \\\n", " if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n", " bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n", " # left over from nutrients version where multiple requests led to overlap; retain for future use\n", " filenames1.sort()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(filenames1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filenames[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "testline=' # 0.250'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f=re.search('(?<=PathLength>).*(?=).*(?=).*(?=).*(?=).*(?=).*(?=',\n", "' # ',\n", "' # 953DR',\n", "' # Aug 9th 2017',\n", "' # 19.2982',\n", "' # -1.1000',\n", "' # 0.250',\n", "' # ')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for line in lines:\n", " for key, pattern in XmissPatterns.items():\n", " print(line)\n", " match=pattern.search(line)\n", " if match is None:\n", " continue\n", " print(key,match[0])\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for line in lines:\n", " for key, pattern in XmissPatterns.items():\n", " print(line)\n", " match=pattern.search(line)\n", " if match is not None:\n", " print(key,match[0])\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "import sqlalchemy\n", "from sqlalchemy import create_engine, Column, String, Boolean, Integer, Float, MetaData, Table, type_coerce, ForeignKey, case, update\n", "from sqlalchemy.orm import mapper, create_session, relationship, aliased, Session\n", "from sqlalchemy.ext.declarative import declarative_base\n", "import csv\n", "from sqlalchemy import case\n", "import numpy as np\n", "from sqlalchemy.ext.automap import automap_base\n", "import matplotlib.pyplot as plt\n", "import sqlalchemy.types as types\n", "import numbers\n", "from sqlalchemy.sql import and_, or_, not_, func\n", "from sqlalchemy.sql import select\n", "import os\n", "import glob\n", "import re\n", "from os.path import isfile\n", "import gsw\n", "import pandas as pd\n", "from sqlalchemy.sql import and_, or_, not_, func\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# definitions\n", "basedir='/ocean/shared/SalishSeaCastData/DFO/CTD/'\n", "dbname='DFO_CTD'\n", "# if db does not exist, exit\n", "# 1st, set Include=False for all CastAway data and duplicates\n", "engine0 = create_engine('sqlite:///' + basedir + dbname + '.sqlite', echo = False)\n", "md=MetaData()\n", "md.reflect(engine0)\n", "Base0 = automap_base(metadata=md)\n", "# reflect the tables in salish.sqlite:\n", "Base0.prepare()\n", "# mapped classes have been created\n", "# existing tables:\n", "StationTBL0=Base0.classes.StationTBL\n", "AncillaryTBL0=Base0.classes.AncillaryTBL\n", "ObsTBL0=Base0.classes.ObsTBL\n", "session0 = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temp=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,ObsTBL0.Pressure,\n", "                                 ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,\n", "                                 ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,ObsTBL0.PAR,ObsTBL0.PAR1,\n", "                                 ObsTBL0.sourceFile).select_from(StationTBL0).\\\n", "                  join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\\\n", "                  filter(StationTBL0.ID==120).all())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include).\\\n", "                  filter(StationTBL0.ID==120).all())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temp" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temp2=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,ObsTBL0.Pressure,\n", "                                  ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,\n", "                                  ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,ObsTBL0.PAR,ObsTBL0.PAR1,\n", "                                  ObsTBL0.sourceFile).select_from(StationTBL0).\\\n", "                   join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\\\n", "                   filter(StationTBL0.ID==135).all())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temp2" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for el in np.unique(temp['sourceFile']):\n", "    print(el)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(temp)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python (py38_ct)", "language": "python", "name": "py38_ct" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 2 }