{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sqlalchemy import create_engine, Column, String, Boolean, Integer, Float, Numeric, MetaData, Table, type_coerce, ForeignKey\n",
"from sqlalchemy.orm import mapper, create_session, relationship\n",
"from sqlalchemy.ext.declarative import declarative_base\n",
"import sqlalchemy.types as types\n",
"# sqlalchemy interfaces between SQL databases and python with a syntax that I found relatively easy to adapt to...\n",
"from sqlalchemy import create_engine, case, MetaData\n",
"from sqlalchemy.orm import create_session, aliased\n",
"from sqlalchemy.ext.automap import automap_base\n",
"from sqlalchemy.sql import and_, or_, not_, func\n",
"import re\n",
"import os\n",
"import glob\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fsw={k:False for k in ('infile','invars','indetail','inadmin','inloc','indata','detformat')}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fsw"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"basepath='/ocean/shared/SalishSeaCastData/DFO/BOT/'\n",
"basedir=basepath\n",
"dbname='DFO_OcProfDB'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#ctd version:\n",
"dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match('^\\.', x))]\n",
"dirs1=list()\n",
"for ireq in dirs0:\n",
" dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \\\n",
" if (os.path.isdir(os.path.join(ireq,x)) and not re.match('^\\.', x))]\n",
"dirs1.sort(reverse=True)\n",
"# create full list of filenames\n",
"filenames1=list()\n",
"bnamesAll=set()\n",
"for cdirpath in dirs1:\n",
" filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \\\n",
" if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n",
" bnamesAll=bnamesAll.union(set([f for f in os.listdir(cdirpath)]))\n",
" # left over from nutrients version where multiple requests led to overlap; retain for future use\n",
"filenames1.sort()\n",
"filenamesCTD=filenames1 #contains full paths\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dirs1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" basepath='/ocean/eolson/MEOPAR/obs/'\n",
" basedir=basepath + 'DFOOPDB/'\n",
" basedir2=basepath + 'DFOOPDB2/'\n",
" basedir3=basepath + 'DFOOPDB3/'\n",
" basedir4=basepath + 'DFOOPDB4/'\n",
" dbname='DFO_OcProfDB'\n",
"\n",
" fout=open(basedir+'createDBfromDFO_OPDB_log.txt','w')\n",
" ferr=open(basedir+'createDBfromDFO_OPDB_errors.txt','w')\n",
" fout.write('Files processed:\\n')\n",
"\n",
" dirs1=[os.path.join(basedir,x) for x in os.listdir(basedir) if (os.path.isdir(basedir+x) \\\n",
" and not re.match('^\\.', x))]\n",
" dirs1.sort()\n",
" dirs2=[os.path.join(basedir2,x) for x in os.listdir(basedir2) if (os.path.isdir(basedir2+x) \\\n",
" and not re.match('^\\.', x))]\n",
" dirs2.sort(reverse=True)\n",
" dirs3=[os.path.join(basedir3,x) for x in os.listdir(basedir3) if (os.path.isdir(basedir3+x) \\\n",
" and not re.match('^\\.', x))]\n",
" dirs3.sort()\n",
" dirs4=[os.path.join(basedir4,x) for x in os.listdir(basedir4) if (os.path.isdir(basedir4+x) \\\n",
" and not re.match('^\\.', x))]\n",
" dirs4.sort()\n",
" # create full list of filenames\n",
" filenames1=list()\n",
" filenames2=list()\n",
" filenames3=list()\n",
" filenames4=list()\n",
" bnamesAll=list() # most files of same name present in DFOOPDB2 and DFOOPDB are identical but a few appear to have been updated\n",
" # therefore use new version (DFOOPDB2)\n",
" for cdirpath in dirs4:\n",
" filenames4=filenames4+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n",
" # some files are in both eo UBC March 27, 2017 and eo UBC Salish March 27, 2017; the duplicates are identical\n",
" bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n",
" for cdirpath in dirs3:\n",
" filenames3=filenames3+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n",
" # some files are in both eo UBC March 27, 2017 and eo UBC Salish March 27, 2017; the duplicates are identical\n",
" bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n",
" for cdirpath in dirs2:\n",
" filenames2=filenames2+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n",
" # some files are in both eo UBC March 27, 2017 and eo UBC Salish March 27, 2017; the duplicates are identical\n",
" bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n",
" for cdirpath in dirs1:\n",
" filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if f not in bnamesAll]\n",
" filenames1.sort()\n",
" filenames2.sort()\n",
" filenames3.sort()\n",
" filenames4.sort()\n",
" filenames=filenames1+filenames2+filenames3+filenames4 #contains full paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dirs2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(filenamesCTD)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(filenames)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filenamesCTD[0],filenames[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"re.split('/',filenames[0])[-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for i,j in zip(filenamesCTD,filenames):\n",
" if not re.split('/',i)[-1]==re.split('/',j)[-1]:\n",
" print(i,'\\n',j)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engine = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB.sqlite', echo = False)\n",
"md=MetaData()\n",
"md.reflect(engine)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"obsCols=[el.name for el in md.tables['ObsTBL'].columns]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"obsCols2=[el.name for el in md2.tables['ObsTBL'].columns]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Base = automap_base()\n",
"Base.prepare(engine, reflect=True)\n",
"StationTBL=Base.classes.StationTBL\n",
"ObsTBL=Base.classes.ObsTBL\n",
"session = create_session(bind = engine, autocommit = False, autoflush = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engine2 = create_engine('sqlite:////ocean/shared/SalishSeaCastData/DFO/BOT/DFO_OcProfDB2.sqlite', echo = False)\n",
"md2=MetaData()\n",
"md2.reflect(engine2)\n",
"Base2 = automap_base()\n",
"Base2.prepare(engine2, reflect=True)\n",
"StationTBL2=Base2.classes.StationTBL\n",
"ObsTBL2=Base2.classes.ObsTBL\n",
"session2 = create_session(bind = engine2, autocommit = False, autoflush = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df=pd.DataFrame(session.execute('select * from ObsTBL'),columns=obsCols)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2=pd.DataFrame(session2.execute('select * from ObsTBL'),columns=obsCols2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for icol in df.keys():\n",
" for i,j in zip(df[icol],df2[icol]):\n",
" if not str(i)==str(j):\n",
" print(icol,i,j)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"basepath='/ocean/shared/SalishSeaCastData/DFO/CTD/'\n",
"basedir=basepath + 'req20181116/'\n",
"dbname='DFO_CTD'\n",
"# if more paths added later (additional data requests) see createDBfromDFO_OPDB.py for how to add\n",
"fout=open(basepath+'createDBfromDFO_CTD_log.txt','w')\n",
"ferr=open(basepath+'createDBfromDFO_CTD_errors.txt','w')\n",
"fout.write('Files processed:\\n')\n",
"\n",
"dirs0=[os.path.join(basepath,x) for x in os.listdir(basepath) if (os.path.isdir(basepath+x) and not re.match('^\\.', x))]\n",
"dirs1=list()\n",
" # create full list of filenames\n",
" filenames1=list()\n",
" bnamesAll=set()\n",
" for cdirpath in dirs1:\n",
" filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \\\n",
" if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n",
" bnamesAll=bnamesAll.union(set([f for f in os.listdir(cdirpath)]))\n",
" # left over from nutrients version where multiple requests led to overlap; retain for future use\n",
" filenames1.sort()\n",
" filenames=filenames1 #contains full paths\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dirs0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" for ireq in dirs0:\n",
" dirs1=dirs1+[os.path.join(ireq,x) for x in os.listdir(ireq) \\\n",
" if (os.path.isdir(os.path.join(ireq,x)) and not re.match('^\\.', x))]\n",
" dirs1.sort()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dirs1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filenames1=list()\n",
"bnamesAll=list() \n",
"for cdirpath in dirs1:\n",
" filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n",
" bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n",
" # left over from nutrients version where multiple requests led to overlap; retain for future use\n",
"filenames1.sort()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
" filenames1=list()\n",
" bnamesAll=list()\n",
" for cdirpath in dirs1:\n",
" filenames1=filenames1+[os.path.join(cdirpath,f) for f in os.listdir(cdirpath) \\\n",
" if ((f not in bnamesAll) and (not re.match('.*jpg$',f)))]\n",
" bnamesAll=bnamesAll+[f for f in os.listdir(cdirpath)]\n",
" # left over from nutrients version where multiple requests led to overlap; retain for future use\n",
" filenames1.sort()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(filenames1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filenames[-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"testline=' # 0.250'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f=re.search('(?<=PathLength>).*(?='"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f=re.search('Transmissometer,',testline)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"XmissPatterns={'xmiss_Serial':re.compile('(?<=SerialNumber>).*(?=).*(?=).*(?=).*(?=).*(?=',\n",
"' # ',\n",
"' # 953DR',\n",
"' # Aug 9th 2017',\n",
"' # 19.2982',\n",
"' # -1.1000',\n",
"' # 0.250',\n",
"' # ')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for line in lines:\n",
" for key, pattern in XmissPatterns.items():\n",
" print(line)\n",
" match=pattern.search(line)\n",
" if match is None:\n",
" continue\n",
" print(key,match[0])\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for line in lines:\n",
" for key, pattern in XmissPatterns.items():\n",
" print(line)\n",
" match=pattern.search(line)\n",
" if match is not None:\n",
" print(key,match[0])\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"import sqlalchemy\n",
"from sqlalchemy import create_engine, Column, String, Boolean, Integer, Float, MetaData, Table, type_coerce, ForeignKey, case, update\n",
"from sqlalchemy.orm import mapper, create_session, relationship, aliased, Session\n",
"from sqlalchemy.ext.declarative import declarative_base\n",
"import csv\n",
"from sqlalchemy import case\n",
"import numpy as np\n",
"from sqlalchemy.ext.automap import automap_base\n",
"import matplotlib.pyplot as plt\n",
"import sqlalchemy.types as types\n",
"import numbers\n",
"from sqlalchemy.sql import and_, or_, not_, func\n",
"from sqlalchemy.sql import select\n",
"import os\n",
"import glob\n",
"import re\n",
"from os.path import isfile\n",
"import gsw\n",
"import pandas as pd\n",
"from sqlalchemy.sql import and_, or_, not_, func\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# definitions\n",
"basedir='/ocean/shared/SalishSeaCastData/DFO/CTD/'\n",
"dbname='DFO_CTD'\n",
"# if db does not exist, exit\n",
"# 1st, set Include=False for all CastAway data and duplicates\n",
"engine0 = create_engine('sqlite:///' + basedir + dbname + '.sqlite', echo = False)\n",
"md=MetaData()\n",
"md.reflect(engine0)\n",
"Base0 = automap_base(metadata=md)\n",
"# reflect the tables in salish.sqlite:\n",
"Base0.prepare()\n",
"# mapped classes have been created\n",
"# existing tables:\n",
"StationTBL0=Base0.classes.StationTBL\n",
"AncillaryTBL0=Base0.classes.AncillaryTBL\n",
"ObsTBL0=Base0.classes.ObsTBL\n",
"session0 = create_session(bind = engine0, autocommit = False, autoflush = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"temp=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,ObsTBL0.Pressure,\n",
" ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,\n",
" ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,ObsTBL0.PAR,ObsTBL0.PAR1,\n",
" ObsTBL0.sourceFile).select_from(StationTBL0).\\\n",
" join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\\\n",
" filter(StationTBL0.ID==120).all())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include).\\\n",
" filter(StationTBL0.ID==120).all())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include).\\\n",
" filter(StationTBL0.ID==120).all())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"temp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"temp2=pd.DataFrame(session0.query(StationTBL0.ID,StationTBL0.Lat,StationTBL0.Lon,StationTBL0.Include,ObsTBL0.Pressure,\n",
" ObsTBL0.Salinity,ObsTBL0.Salinity_T0_C0,ObsTBL0.Salinity_T1_C1,\n",
" ObsTBL0.Temperature_Primary,ObsTBL0.Temperature_Secondary,ObsTBL0.Include,ObsTBL0.PAR,ObsTBL0.PAR1,\n",
" ObsTBL0.sourceFile).select_from(StationTBL0).\\\n",
" join(ObsTBL0,ObsTBL0.StationTBLID==StationTBL0.ID).\\\n",
" filter(StationTBL0.ID==135).all())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"temp2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for el in np.unique(temp['sourceFile']):\n",
" print(el)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(temp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (py38_ct)",
"language": "python",
"name": "py38_ct"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}