{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Elaborate statistics features for Silvereye\n", "\n", "This is the first of a series of notebooks to tease the statistics on AWRA and related data for Silvereye.\n", "\n", "The current notebook is for the initial trial-error.\n", "\n", "## Purpose\n", "\n", "Merge capabilities from:\n", "\n", "* Ben's code in the prototype app\n", "* Ramneek's stat on daily data.\n", "* possibly some of the princeton-chelsa blend code (optional)\n", "\n", "Plan is:\n", "\n", "* Subset AWRA daily data sets to a more manageable size for initial elaboration. ACT or Tassie.\n", "* Elaborate on use cases:\n", " * Existing features infered from BoM web served maps\n", " * compare periods (e.g. recent past ) to similar historical periods* Subset AWRA daily data sets to a more manageable size for initial elaboration. ACT or Tassie.\n", "\n", "## Dependencies imports\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import xarray as xr\n", "import os\n", "import sys\n", "import pandas as pd\n", "from functools import wraps\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns # noqa, pandas aware plotting library" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if ('SP_SRC' in os.environ):\n", " root_src_dir = os.environ['SP_SRC']\n", "elif sys.platform == 'win32':\n", " root_src_dir = r'C:\\src\\csiro\\stash\\silverpieces'\n", "else:\n", " root_src_dir = '/home/per202/src/csiro/stash/silverpieces'\n", "\n", "pkg_src_dir = root_src_dir\n", "sys.path.append(pkg_src_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if ('SP_DATA' in os.environ):\n", " root_data_dir = os.environ['SP_DATA']\n", "elif sys.platform == 'win32':\n", " root_data_dir = r'C:\\data\\silverpieces'\n", "else:\n", " root_data_dir = '/home/per202/data/silverpieces'\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#from silverpieces.blah import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "from siphon.catalog import TDSCatalog\n", "import requests\n", "# from distributed import Client, LocalCluster\n", "# from flask import Flask, redirect, url_for, request, render_template, jsonify, abort\n", "# from flask_cors import CORS\n", "# import intake\n", "# import regionmask\n", "# import random\n", "# from flask import jsonify, make_response\n", "# import json\n", "# import uuid\n", "# from flask.logging import default_handler\n", "# import geopandas as gpd\n", "# from owslib.wps import WebProcessingService\n", "# from owslib.wps import printInputOutput\n", "# from birdy import WPSClient\n", "# import birdy\n", "# import tarfile\n", "\n", "# import tempfile\n", "# import shutil\n", "# import urllib\n", "import urllib.request\n", "\n", "# from urllib.parse import urlparse\n", "# from datetime import datetime\n", "# from dateutil.relativedelta import relativedelta\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I tried to use [xarray-tips-and-tricks](https://rabernat.github.io/research_computing_2018/xarray-tips-and-tricks.html) by similarity but while this looked OK, retrieving values ended up in a `RuntimeError: NetCDF: Access failure`. Park this. This may be an issue with the thredds server serving awra data." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# #base_url = 'http://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis/surface/air.sig995'\n", "# base_url = 'http://data-mel.it.csiro.au/thredds/dodsC/catch_all/lw_oznome/AWRA-L_historical_data/qtot/qtot_avg_'\n", "# f = ['http://data-mel.it.csiro.au/thredds/dodsC/catch_all/lw_oznome/AWRA-L_historical_data/qtot/qtot_avg_' + str(yr) + '.nc' for yr in [2011, 2012]]\n", "# files = [f'{base_url}{year}.nc' for year in range(2010, 2015)]\n", "# files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ds = xr.open_mfdataset(files)\n", "# ds" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# x = ds.qtot_avg\n", "# x[1,1,1].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_url = 'http://data-mel.it.csiro.au/thredds/catalog/catch_all/lw_oznome/AWRA-L_historical_data/qtot/catalog.xml'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat = TDSCatalog(cat_url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ds_names = [str(x) for x in cat.datasets]\n", "m_indices = [i for i in range(len(ds_names)) if ds_names[i].startswith('qtot') and ds_names[i].endswith('nc')]\n", "# somehow sorted\n", "m_indices.reverse()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ds = [cat.datasets[i] for i in m_indices]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ds[0].access_urls" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dsets = [cds.remote_access(use_xarray=True).reset_coords(drop=True).chunk({'latitude': 681, 'longitude': 841}) # 731, latitude: 681, longitude: 841\n", " for cds in ds[:3]] # eventually want to use the whole catalog here\n", "#dsets = [cds.remote_access(use_xarray=True).reset_coords(drop=True)\n", "# for cds in ds[:3]] # eventually want to use the whole catalog here" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(dsets)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "blah = xr.auto_combine(dsets)\n", "blah" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x = blah.qtot_avg" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x['time'].values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`RuntimeError: NetCDF: Access failure`. That said, it worked one time out of ~10 so this is probably more a problem with the thredds server." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Random but frequently ends up RuntimeError: NetCDF: Access failure\n", "m = x[1,:,:].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.imshow(m)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time b_box = x.isel(latitude=slice(600,700), longitude=slice(650,750))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time b_box.isel(time=310).plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SILO data\n", "\n", "Starting from some of code J Yu had authored:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ds_desc = 'http://data-cbr.it.csiro.au/thredds/catalog/catch_all/Digiscape_Climate_Data_Portal/silo/climate/catalog.xml?dataset=allDatasetScan/Digiscape_Climate_Data_Portal/silo/climate/daily_rain.nc'\n", "catalog = TDSCatalog(ds_desc)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "catalog.datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset = catalog.datasets[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(dataset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "blah" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ncss = dataset.subset()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(ncss)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query = ncss.query()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "tt = pd.to_datetime(\"2014-01-01\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query.lonlat_box(north=-40, south=-44, east=149, west=144).time(tt)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query.accept('netcdf4')\n", "query.variables('daily_rain')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# With the following I may have bumped into https://github.com/Unidata/thredds-docker/issues/216 \n", "data = ncss.get_data(query)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try another approach:\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the dataset\n", "cat = catalog\n", "dataset_name = sorted(cat.datasets.keys())[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset = cat.datasets[dataset_name]\n", "ds = dataset.remote_access(service='OPENDAP')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from xarray.backends import NetCDF4DataStore" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ds = NetCDF4DataStore(ds)\n", "ds = xr.open_dataset(ds)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x = ds.daily_rain" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x.isel(time=22000).plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time b_box = x.isel(lat=slice(600,700), lon=slice(650,750))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time b_box.isel(time=22000).plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Appendix\n", "\n", "Below are attempts to use Numpy-ic ways to calculate quantile classes. No joy." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "convolve = np.vectorize(np.convolve, signature='(n),(m)->(k)')\n", "convolve(np.eye(2), [1, 2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "convolve = np.vectorize(np.convolve)\n", "convolve(np.eye(2), [1, 2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "convolve(np.eye(2), [1, 2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z = three_years_rains[-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z.searchsorted(y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def searchsorted2d(a,b):\n", " m,n = a.shape\n", " max_num = np.maximum(a.max() - a.min(), b.max() - b.min()) + 1\n", " r = max_num*np.arange(a.shape[0])[:,None]\n", " p = np.searchsorted( (a+r).ravel(), (b+r).ravel() ).reshape(m,-1)\n", " return p - n*(np.arange(m)[:,None])\n", "\n", "def searchsorted2d(mat,ensembles):\n", " m,n = mat.shape\n", " max_num = np.maximum(a.max() - a.min(), b.max() - b.min()) + 1\n", " r = max_num*np.arange(a.shape[0])[:,None]\n", " p = np.searchsorted( (a+r).ravel(), (b+r).ravel() ).reshape(m,-1)\n", " return p - n*(np.arange(m)[:,None])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.random.seed(123)\n", "mat = np.sort(np.arange(12)).reshape(3, 4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mat = mat.reshape(3, 4)\n", "mat" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mat[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "v_searchsorted = np.vectorize(np.searchsorted, signature='(n),(m)->(k)')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "v_searchsorted(mat, np.array([0.5, 5.5, 10.1]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "v_searchsorted = np.vectorize(np.searchsorted, signature='(n,m),(n,m)->(n)')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "v_searchsorted(mat, np.array([0.5, 5.5]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.searchsorted(np.array([-1, 0, 1, 2]), np.array([-1, 0, 1.1, 2]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y.groupby_bins(group, bins, right: bool = True, labels=None, precision: int = 3, include_lowest: bool = False, squeeze: bool = True, restore_coord_dims: Optional[bool] = None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z.shape, y.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "very_drier = z <= y[0,:,:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "very_drier.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "z[50,50], y[:,50, 50]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.argmin( (200 > y[:,50, 50].values) )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "3000 > y[:,50, 50].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.argmin(z[50,50].values > y[:,50, 50].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.argmin(z[50,50].values < y[:,50, 50].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(z[50,50].values > y[:,50, 50].values) , (z[50,50].values < y[:,50, 50].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.eye(4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.argmax(z[50,50].values > y[:,50, 50].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.argmax(z[50,50].values < y[:,50, 50].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.searchsorted(y[:,50, 50].values, z[50,50].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.searchsorted(np.arange(10), 23)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def searchsorted(a, b):\n", " func = lambda x, y: np.searchsorted(x[:,,].values, y[].values)\n", " return xr.apply_ufunc(func, a, b)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xr.apply_ufunc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dim = 'time'\n", "xr.apply_ufunc(np.searchsorted,\n", " y, z,\n", " input_core_dims=[[dim], []])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scipy.stats\n", "\n", "def earth_mover_distance(first_samples,\n", " second_samples,\n", " dim='ensemble'):\n", " return apply_ufunc(scipy.stats.wasserstein_distance,\n", " first_samples, second_samples,\n", " input_core_dims=[[dim], [dim]],\n", " vectorize=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Py3 (DCX)", "language": "python", "name": "dcx" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }