{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Authors: Mackenzie Blanusa, A.Radhakrishnan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import xarray as xr\n",
    "import cftime\n",
    "import nc_time_axis\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import intake, intake_esm\n",
    "from dask_gateway import Gateway\n",
    "import pandas as pd\n",
    "pd.set_option(\"display.max_colwidth\", None)\n",
    "#!pip install cmip6_preprocessing\n",
    "\n",
    "%pip install git+https://github.com/jbusecke/cmip6_preprocessing.git\n",
    "\n",
    "\n",
    "from cmip6_preprocessing.preprocessing import combined_preprocessing\n",
    "from cmip6_preprocessing.preprocessing import (correct_units,rename_cmip6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def latest_version(cat):\n",
    "    \"\"\"\n",
    "    Keep only the latest version of every dataset listed in a catalog.\n",
    "\n",
    "    The rows of ``cat.df`` are sorted by ``version`` (then ``path``) and, for each\n",
    "    combination of the identifying columns below, only the row that sorts last\n",
    "    is kept.\n",
    "\n",
    "    input\n",
    "        cat: esm datastore exposing its catalog table as ``cat.df``\n",
    "    output\n",
    "        pandas DataFrame (not an esm datastore) holding one row per combination\n",
    "        of the identifying columns; pass it to ``intake.open_esm_datastore``\n",
    "        to obtain a datastore again.\n",
    "    \"\"\"\n",
    "    # Columns that together identify one dataset irrespective of its version.\n",
    "    key_columns = ['temporal subset', 'model', 'mip_table', 'institute',\n",
    "                   'variable', 'ensemble_member', 'grid_label', 'experiment_id']\n",
    "    ordered = cat.df.sort_values(by=['version', 'path'])\n",
    "    return ordered.drop_duplicates(key_columns, keep='last')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fix_time(ds):\n",
    "    \"\"\"\n",
    "    Force the calendar of the time axis to noleap and decode the dataset.\n",
    "\n",
    "    Works on a copy of ``ds``. A dataset without a ``time`` dimension is\n",
    "    returned (as a copy) without decoding. Otherwise the ``calendar``\n",
    "    attribute of ``time`` is set to \"noleap\" unless it already is one of\n",
    "    \"noleap\", \"NOLEAP\" or \"365_day\", and the result of ``xr.decode_cf`` is\n",
    "    returned.\n",
    "    \"\"\"\n",
    "    ds = ds.copy()\n",
    "    if \"time\" not in ds.dims:\n",
    "        return ds\n",
    "\n",
    "    # A missing calendar attribute and an unexpected one are handled alike.\n",
    "    time_attrs = ds[\"time\"].attrs\n",
    "    if time_attrs.get(\"calendar\") not in (\"noleap\", \"NOLEAP\", \"365_day\"):\n",
    "        time_attrs.update({\"calendar\": \"noleap\"})\n",
    "\n",
    "    return xr.decode_cf(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fix_units(ds):\n",
    "    \"\"\"\n",
    "    Convert the ``lev`` coordinate from centimeters to meters.\n",
    "\n",
    "    ``ds`` is returned unchanged when it has no ``lev`` variable or when the\n",
    "    ``units`` attribute of ``lev`` is not \"cm\" or \"centimeters\". Otherwise a\n",
    "    copy is returned in which ``lev`` is divided by 100, keeps its other\n",
    "    attributes and has ``units`` set to \"m\"; the input dataset is not modified.\n",
    "    \"\"\"\n",
    "    if \"lev\" not in ds.variables:\n",
    "        return ds\n",
    "\n",
    "    lev = ds[\"lev\"]\n",
    "    if lev.attrs.get(\"units\") in (\"cm\", \"centimeters\"):\n",
    "        ds = ds.copy()\n",
    "        # Carry the attributes over so the converted axis still states its units.\n",
    "        lev_attrs = dict(lev.attrs, units=\"m\")\n",
    "        ds[\"lev\"] = xr.DataArray(lev.values / 100., dims=lev.dims, attrs=lev_attrs)\n",
    "    return ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pp_thetao(ds):\n",
    "    \"\"\"\n",
    "    Preprocess a thetao dataset; used as the ``preprocess`` callable of\n",
    "    ``to_dataset_dict`` in this notebook.\n",
    "\n",
    "    Works on a copy of ``ds`` and applies, in order, ``rename_cmip6``,\n",
    "    ``fix_time`` and ``correct_units``. Returns the processed dataset.\n",
    "    \"\"\"\n",
    "    ds = ds.copy()  # work on a copy so the dataset passed in is left alone\n",
    "    ds = rename_cmip6(ds)\n",
    "    ds = fix_time(ds)\n",
    "    # fix_units is deliberately not applied here; correct_units is used instead.\n",
    "    ds = correct_units(ds)\n",
    "    return ds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the catalog "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "col_url = \"https://cmip6-nc.s3.us-east-2.amazonaws.com/esgf-world.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "col = intake.open_esm_datastore(col_url)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "debug starts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = dict(experiment_id=['historical'],\n",
    " mip_table='Omon',\n",
    " ensemble_member=[\"r1i1p1f1\"],\n",
    " model=['IPSL-CM6A-LR'],\n",
    " grid_label=['gn'],\n",
    " variable=[\"thetao\"]\n",
    " )\n",
    "cat_T = col.search(**query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "WHAT DOES NOT WORK: \n",
    "\n",
    "the following misses olevel_bounds (renamed to lev_bnds) and other variables after preprocessing. \n",
    "Without preprocessing, the datasets have the old dim/var names as found in the original files/objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--> The keys in the returned dictionary of datasets are constructed as follows:\n",
      "\t'project.institute.model.experiment_id.mip_table'\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "
\n", " \n", " \n", " 100.00% [1/1 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Open the search results with pp_thetao as the preprocess callable;\n", "# chunks are requested along both 'time' and 'olevel'.\n", "dset_dict_T_orig = cat_T.to_dataset_dict(cdf_kwargs={'decode_times': False, 'chunks': {'time': 1,'olevel':1}},\n", " preprocess = pp_thetao,storage_options={'anon':True})" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon\n", "['ensemble_member', 'lev', 'time', 'x', 'y']\n" ] } ], "source": [ "# Print each dataset key followed by the names of its dimensions.\n", "for k, ds in dset_dict_T_orig.items():\n", " print(k)\n", " print(list(ds.dims))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset>\n",
       "Dimensions:          (ensemble_member: 1, lev: 75, time: 1980, x: 362, y: 332)\n",
       "Coordinates:\n",
       "    lat              (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray>\n",
       "    lon              (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray>\n",
       "  * lev              (lev) float32 0.50576 1.5558553 ... 5698.0605 5902.0576\n",
       "  * time             (time) object 1850-01-16 12:00:00 ... 2015-01-25 12:00:00\n",
       "  * ensemble_member  (ensemble_member) <U8 'r1i1p1f1'\n",
       "Dimensions without coordinates: x, y\n",
       "Data variables:\n",
       "    thetao           (ensemble_member, time, lev, y, x) float32 dask.array<chunksize=(1, 1, 1, 332, 362), meta=np.ndarray>\n",
       "Attributes:\n",
       "    title:                   IPSL-CM6A-LR model output prepared for CMIP6 / C...\n",
       "    intake_esm_varname:      ['thetao']\n",
       "    source:                  IPSL-CM6A-LR (2017):  atmos: LMDZ (NPv6, N96; 14...\n",
       "    institution_id:          IPSL\n",
       "    history:                 Sat Dec  1 12:16:38 2018: ncatted -O -a realizat...\n",
       "    physics_index:           [1]\n",
       "    parent_variant_label:    r1i1p1f1\n",
       "    parent_experiment_id:    piControl\n",
       "    branch_method:           standard\n",
       "    grid:                    native ocean tri-polar grid with 105 k ocean cells\n",
       "    realization_index:       [1]\n",
       "    parent_source_id:        IPSL-CM6A-LR\n",
       "    sub_experiment_id:       none\n",
       "    model_version:           6.1.5\n",
       "    variant_label:           r1i1p1f1\n",
       "    sub_experiment:          none\n",
       "    branch_time_in_parent:   [21914.]\n",
       "    forcing_index:           [1]\n",
       "    initialization_index:    [1]\n",
       "    dr2xml_md5sum:           f1e40c1fc5d8281f865f72fbf4e38f9d\n",
       "    license:                 CMIP6 model data produced by IPSL is licensed un...\n",
       "    EXPID:                   historical\n",
       "    grid_label:              gn\n",
       "    Conventions:             CF-1.7 CMIP-6.2\n",
       "    source_id:               IPSL-CM6A-LR\n",
       "    description:             CMIP6 historical\n",
       "    institution:             Institut Pierre Simon Laplace, Paris 75252, France\n",
       "    experiment:              all-forcing simulation of the recent past\n",
       "    frequency:               mon\n",
       "    activity_id:             CMIP\n",
       "    parent_activity_id:      CMIP\n",
       "    contact:                 ipsl-cmip6@listes.ipsl.fr\n",
       "    realm:                   ocean\n",
       "    source_type:             AOGCM BGC\n",
       "    data_specs_version:      01.00.21\n",
       "    further_info_url:        https://furtherinfo.es-doc.org/CMIP6.IPSL.IPSL-C...\n",
       "    dr2xml_version:          1.11\n",
       "    variable_id:             thetao\n",
       "    parent_time_units:       days since 1850-01-01 00:00:00\n",
       "    parent_mip_era:          CMIP6\n",
       "    CMIP6_CV_version:        cv=6.2.3.5-2-g63b123e\n",
       "    product:                 model-output\n",
       "    NCO:                     "4.6.0"\n",
       "    experiment_id:           historical\n",
       "    branch_time_in_child:    [0.]\n",
       "    nominal_resolution:      100 km\n",
       "    tracking_id:             hdl:21.14100/2357970e-3f77-4595-80d8-e3d5c69d0bd...\n",
       "    table_id:                Omon\n",
       "    external_variables:      areacello volcello\n",
       "    mip_era:                 CMIP6\n",
       "    name:                    /ccc/work/cont003/gencmip6/p86caub/IGCM_OUT/IPSL...\n",
       "    intake_esm_dataset_key:  CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon
" ], "text/plain": [ "\n", "Dimensions: (ensemble_member: 1, lev: 75, time: 1980, x: 362, y: 332)\n", "Coordinates:\n", " lat (y, x) float32 dask.array\n", " lon (y, x) float32 dask.array\n", " * lev (lev) float32 0.50576 1.5558553 ... 5698.0605 5902.0576\n", " * time (time) object 1850-01-16 12:00:00 ... 2015-01-25 12:00:00\n", " * ensemble_member (ensemble_member) \n", "Attributes:\n", " title: IPSL-CM6A-LR model output prepared for CMIP6 / C...\n", " intake_esm_varname: ['thetao']\n", " source: IPSL-CM6A-LR (2017): atmos: LMDZ (NPv6, N96; 14...\n", " institution_id: IPSL\n", " history: Sat Dec 1 12:16:38 2018: ncatted -O -a realizat...\n", " physics_index: [1]\n", " parent_variant_label: r1i1p1f1\n", " parent_experiment_id: piControl\n", " branch_method: standard\n", " grid: native ocean tri-polar grid with 105 k ocean cells\n", " realization_index: [1]\n", " parent_source_id: IPSL-CM6A-LR\n", " sub_experiment_id: none\n", " model_version: 6.1.5\n", " variant_label: r1i1p1f1\n", " sub_experiment: none\n", " branch_time_in_parent: [21914.]\n", " forcing_index: [1]\n", " initialization_index: [1]\n", " dr2xml_md5sum: f1e40c1fc5d8281f865f72fbf4e38f9d\n", " license: CMIP6 model data produced by IPSL is licensed un...\n", " EXPID: historical\n", " grid_label: gn\n", " Conventions: CF-1.7 CMIP-6.2\n", " source_id: IPSL-CM6A-LR\n", " description: CMIP6 historical\n", " institution: Institut Pierre Simon Laplace, Paris 75252, France\n", " experiment: all-forcing simulation of the recent past\n", " frequency: mon\n", " activity_id: CMIP\n", " parent_activity_id: CMIP\n", " contact: ipsl-cmip6@listes.ipsl.fr\n", " realm: ocean\n", " source_type: AOGCM BGC\n", " data_specs_version: 01.00.21\n", " further_info_url: https://furtherinfo.es-doc.org/CMIP6.IPSL.IPSL-C...\n", " dr2xml_version: 1.11\n", " variable_id: thetao\n", " parent_time_units: days since 1850-01-01 00:00:00\n", " parent_mip_era: CMIP6\n", " CMIP6_CV_version: cv=6.2.3.5-2-g63b123e\n", " product: 
model-output\n", " NCO: \"4.6.0\"\n", " experiment_id: historical\n", " branch_time_in_child: [0.]\n", " nominal_resolution: 100 km\n", " tracking_id: hdl:21.14100/2357970e-3f77-4595-80d8-e3d5c69d0bd...\n", " table_id: Omon\n", " external_variables: areacello volcello\n", " mip_era: CMIP6\n", " name: /ccc/work/cont003/gencmip6/p86caub/IGCM_OUT/IPSL...\n", " intake_esm_dataset_key: CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dset_dict_T_orig['CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon'] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "WHAT WORKS:\n", "\n", "for some reason the following works and includes all data variables. latest_version outputs a pandas dataframe which we then convert to esm datastore" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "--> The keys in the returned dictionary of datasets are constructed as follows:\n", "\t'project.institute.model.experiment_id.mip_table'\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " 100.00% [1/1 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# latest_version returns the de-duplicated catalog table; rebuild a datastore\n", "# from it, reusing the esmcol_data of the original collection.\n", "cat_T_gn_latest = latest_version(cat_T)\n", "esmcol_data = col.esmcol_data\n", "cat_T2 = intake.open_esm_datastore(cat_T_gn_latest,esmcol_data=esmcol_data)\n", "\n", "# Same preprocess callable as for dset_dict_T_orig, but chunking along 'time' only.\n", "dset_dict_T = cat_T2.to_dataset_dict(cdf_kwargs={'decode_times': False, 'chunks': {'time': 1}},\n", " preprocess = pp_thetao,storage_options={'anon':True})\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset>\n",
       "Dimensions:          (bnds: 2, ensemble_member: 1, lev: 75, time: 1980, vertex: 4, x: 362, y: 332)\n",
       "Coordinates:\n",
       "    lat              (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray>\n",
       "    lon              (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray>\n",
       "  * lev              (lev) float32 0.50576 1.5558553 ... 5698.0605 5902.0576\n",
       "  * time             (time) object 1850-01-16 12:00:00 ... 2015-01-25 12:00:00\n",
       "  * ensemble_member  (ensemble_member) <U8 'r1i1p1f1'\n",
       "Dimensions without coordinates: bnds, vertex, x, y\n",
       "Data variables:\n",
       "    lon_bounds       (y, x, vertex) float32 dask.array<chunksize=(332, 362, 4), meta=np.ndarray>\n",
       "    lat_bounds       (y, x, vertex) float32 dask.array<chunksize=(332, 362, 4), meta=np.ndarray>\n",
       "    area             (y, x) float32 dask.array<chunksize=(332, 362), meta=np.ndarray>\n",
       "    lev_bounds       (lev, bnds) float32 dask.array<chunksize=(75, 2), meta=np.ndarray>\n",
       "    time_bounds      (time, bnds) object dask.array<chunksize=(1, 2), meta=np.ndarray>\n",
       "    thetao           (ensemble_member, time, lev, y, x) float32 dask.array<chunksize=(1, 1, 75, 332, 362), meta=np.ndarray>\n",
       "Attributes:\n",
       "    title:                   IPSL-CM6A-LR model output prepared for CMIP6 / C...\n",
       "    intake_esm_varname:      ['thetao']\n",
       "    source:                  IPSL-CM6A-LR (2017):  atmos: LMDZ (NPv6, N96; 14...\n",
       "    institution_id:          IPSL\n",
       "    history:                 Sat Dec  1 12:16:38 2018: ncatted -O -a realizat...\n",
       "    physics_index:           [1]\n",
       "    parent_variant_label:    r1i1p1f1\n",
       "    parent_experiment_id:    piControl\n",
       "    branch_method:           standard\n",
       "    grid:                    native ocean tri-polar grid with 105 k ocean cells\n",
       "    realization_index:       [1]\n",
       "    parent_source_id:        IPSL-CM6A-LR\n",
       "    sub_experiment_id:       none\n",
       "    model_version:           6.1.5\n",
       "    variant_label:           r1i1p1f1\n",
       "    sub_experiment:          none\n",
       "    branch_time_in_parent:   [21914.]\n",
       "    forcing_index:           [1]\n",
       "    initialization_index:    [1]\n",
       "    dr2xml_md5sum:           f1e40c1fc5d8281f865f72fbf4e38f9d\n",
       "    license:                 CMIP6 model data produced by IPSL is licensed un...\n",
       "    EXPID:                   historical\n",
       "    grid_label:              gn\n",
       "    Conventions:             CF-1.7 CMIP-6.2\n",
       "    source_id:               IPSL-CM6A-LR\n",
       "    description:             CMIP6 historical\n",
       "    institution:             Institut Pierre Simon Laplace, Paris 75252, France\n",
       "    experiment:              all-forcing simulation of the recent past\n",
       "    frequency:               mon\n",
       "    activity_id:             CMIP\n",
       "    parent_activity_id:      CMIP\n",
       "    contact:                 ipsl-cmip6@listes.ipsl.fr\n",
       "    realm:                   ocean\n",
       "    source_type:             AOGCM BGC\n",
       "    data_specs_version:      01.00.21\n",
       "    further_info_url:        https://furtherinfo.es-doc.org/CMIP6.IPSL.IPSL-C...\n",
       "    dr2xml_version:          1.11\n",
       "    variable_id:             thetao\n",
       "    parent_time_units:       days since 1850-01-01 00:00:00\n",
       "    parent_mip_era:          CMIP6\n",
       "    CMIP6_CV_version:        cv=6.2.3.5-2-g63b123e\n",
       "    product:                 model-output\n",
       "    NCO:                     "4.6.0"\n",
       "    experiment_id:           historical\n",
       "    branch_time_in_child:    [0.]\n",
       "    nominal_resolution:      100 km\n",
       "    tracking_id:             hdl:21.14100/2357970e-3f77-4595-80d8-e3d5c69d0bd...\n",
       "    table_id:                Omon\n",
       "    external_variables:      areacello volcello\n",
       "    mip_era:                 CMIP6\n",
       "    name:                    /ccc/work/cont003/gencmip6/p86caub/IGCM_OUT/IPSL...\n",
       "    intake_esm_dataset_key:  CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon
" ], "text/plain": [ "\n", "Dimensions: (bnds: 2, ensemble_member: 1, lev: 75, time: 1980, vertex: 4, x: 362, y: 332)\n", "Coordinates:\n", " lat (y, x) float32 dask.array\n", " lon (y, x) float32 dask.array\n", " * lev (lev) float32 0.50576 1.5558553 ... 5698.0605 5902.0576\n", " * time (time) object 1850-01-16 12:00:00 ... 2015-01-25 12:00:00\n", " * ensemble_member (ensemble_member) \n", " lat_bounds (y, x, vertex) float32 dask.array\n", " area (y, x) float32 dask.array\n", " lev_bounds (lev, bnds) float32 dask.array\n", " time_bounds (time, bnds) object dask.array\n", " thetao (ensemble_member, time, lev, y, x) float32 dask.array\n", "Attributes:\n", " title: IPSL-CM6A-LR model output prepared for CMIP6 / C...\n", " intake_esm_varname: ['thetao']\n", " source: IPSL-CM6A-LR (2017): atmos: LMDZ (NPv6, N96; 14...\n", " institution_id: IPSL\n", " history: Sat Dec 1 12:16:38 2018: ncatted -O -a realizat...\n", " physics_index: [1]\n", " parent_variant_label: r1i1p1f1\n", " parent_experiment_id: piControl\n", " branch_method: standard\n", " grid: native ocean tri-polar grid with 105 k ocean cells\n", " realization_index: [1]\n", " parent_source_id: IPSL-CM6A-LR\n", " sub_experiment_id: none\n", " model_version: 6.1.5\n", " variant_label: r1i1p1f1\n", " sub_experiment: none\n", " branch_time_in_parent: [21914.]\n", " forcing_index: [1]\n", " initialization_index: [1]\n", " dr2xml_md5sum: f1e40c1fc5d8281f865f72fbf4e38f9d\n", " license: CMIP6 model data produced by IPSL is licensed un...\n", " EXPID: historical\n", " grid_label: gn\n", " Conventions: CF-1.7 CMIP-6.2\n", " source_id: IPSL-CM6A-LR\n", " description: CMIP6 historical\n", " institution: Institut Pierre Simon Laplace, Paris 75252, France\n", " experiment: all-forcing simulation of the recent past\n", " frequency: mon\n", " activity_id: CMIP\n", " parent_activity_id: CMIP\n", " contact: ipsl-cmip6@listes.ipsl.fr\n", " realm: ocean\n", " source_type: AOGCM BGC\n", " data_specs_version: 01.00.21\n", " 
further_info_url: https://furtherinfo.es-doc.org/CMIP6.IPSL.IPSL-C...\n", " dr2xml_version: 1.11\n", " variable_id: thetao\n", " parent_time_units: days since 1850-01-01 00:00:00\n", " parent_mip_era: CMIP6\n", " CMIP6_CV_version: cv=6.2.3.5-2-g63b123e\n", " product: model-output\n", " NCO: \"4.6.0\"\n", " experiment_id: historical\n", " branch_time_in_child: [0.]\n", " nominal_resolution: 100 km\n", " tracking_id: hdl:21.14100/2357970e-3f77-4595-80d8-e3d5c69d0bd...\n", " table_id: Omon\n", " external_variables: areacello volcello\n", " mip_era: CMIP6\n", " name: /ccc/work/cont003/gencmip6/p86caub/IGCM_OUT/IPSL...\n", " intake_esm_dataset_key: CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dset_dict_T['CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon'] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "debug ends" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:notebook] *", "language": "python", "name": "conda-env-notebook-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }