Authors: Mackenzie Blanusa, A.Radhakrishnan

In [None]:
from glob import glob
import xarray as xr
import cftime
import nc_time_axis
import numpy as np
import matplotlib.pyplot as plt
import intake, intake_esm
from dask_gateway import Gateway
import pandas as pd
pd.set_option("display.max_colwidth", None)
#!pip install cmip6_preprocessing

%pip install git+https://github.com/jbusecke/cmip6_preprocessing.git


from cmip6_preprocessing.preprocessing import combined_preprocessing
from cmip6_preprocessing.preprocessing import (correct_units,rename_cmip6)

In [3]:
def latest_version(cat):
    """
    Keep only the last-sorting version of every catalog entry.

    input
    cat: esmdatastore (any object exposing the catalog table as ``cat.df``)
    output
    pandas DataFrame holding, for each unique combination of the identifying
    columns, the single row that sorts last by ('version', 'path')
    """
    # Rows that agree on all of these columns are treated as the same entry
    # published under different versions.
    identity_columns = [
        'temporal subset', 'model', 'mip_table', 'institute',
        'variable', 'ensemble_member', 'grid_label', 'experiment_id',
    ]
    ordered = cat.df.sort_values(by=['version', 'path'])
    # The sort is ascending, so keep='last' retains the highest version of each group.
    latest_cat = ordered.drop_duplicates(subset=identity_columns, keep='last')
    return latest_cat

In [4]:
def fix_time(ds):
    """Force the calendar of the time axis to noleap and decode the dataset.

    Works on a copy of ``ds``. When ``ds`` has no ``time`` dimension that copy
    is returned as is. Otherwise the ``calendar`` attribute of ``time`` is set
    to "noleap" unless it already reads "noleap", "NOLEAP" or "365_day" (a
    missing attribute is overridden as well), and the output of
    ``xr.decode_cf`` is returned.
    """
    import xarray as xr

    accepted_calendars = ["noleap", "NOLEAP", "365_day"]

    ds = ds.copy()
    if "time" in ds.dims:
        time_attrs = ds["time"].attrs
        # A missing attribute yields None, which is not an accepted calendar,
        # so absent and non-matching calendars are both overwritten here.
        if time_attrs.get("calendar") not in accepted_calendars:
            time_attrs.update({"calendar": "noleap"})
        ds = xr.decode_cf(ds)
    return ds

In [5]:
def fix_units(ds):
    """Convert the ``lev`` coordinate from centimetres to metres when needed.

    If ``ds`` has a ``lev`` variable whose ``units`` attribute is "cm" or
    "centimeters", its values are divided by 100 and written back to
    ``ds["lev"]``; the other attributes of ``lev`` are kept and ``units``
    is set to "m". A dataset without ``lev``, without a ``units`` attribute
    on it, or with any other unit is returned unchanged.

    Note: ``ds`` is modified in place (no copy is taken) and also returned.
    """
    # Not every dataset carries a ``lev`` variable; nothing to convert then.
    if "lev" not in ds:
        return ds

    lev = ds["lev"]
    if lev.attrs.get("units") in ["cm", "centimeters"]:
        # Rebuilding the array would otherwise drop its metadata, so carry the
        # attributes over and record the new unit.
        converted_attrs = dict(lev.attrs, units="m")
        ds["lev"] = xr.DataArray(lev.values / 100., dims=lev.dims, attrs=converted_attrs)
    return ds

In [6]:
def pp_thetao(ds):
    """Preprocessing callback handed to ``to_dataset_dict`` via ``preprocess``.

    Operates on a copy of ``ds`` and applies, in this order, ``rename_cmip6``,
    ``fix_time`` and ``correct_units``; the processed dataset is returned.
    """
    # NOTE(review): fix_units is intentionally not part of the pipeline;
    # presumably correct_units covers the same conversion -- confirm.
    pipeline = (rename_cmip6, fix_time, correct_units)
    result = ds.copy()   # work on a copy so the caller's dataset is left alone
    for step in pipeline:
        result = step(result)
    return result

## Load the catalog 

In [9]:
# URL of the JSON catalog description that is opened as an ESM datastore below.
col_url = "https://cmip6-nc.s3.us-east-2.amazonaws.com/esgf-world.json"

In [10]:
# Open the catalog; `col` is searched below and its `esmcol_data` is reused
# when a datastore is rebuilt from a filtered DataFrame.
col = intake.open_esm_datastore(col_url)

debug starts

In [11]:
# Restrict the catalog to the single case used for debugging:
# IPSL-CM6A-LR / historical / r1i1p1f1 / Omon / thetao on grid label "gn".
query = {
    "experiment_id": ["historical"],
    "mip_table": "Omon",
    "ensemble_member": ["r1i1p1f1"],
    "model": ["IPSL-CM6A-LR"],
    "grid_label": ["gn"],
    "variable": ["thetao"],
}
cat_T = col.search(**query)

WHAT DOES NOT WORK: 

The following call is missing `olevel_bounds` (renamed to `lev_bnds`) and other variables after preprocessing.
Without preprocessing, the datasets keep the old dimension/variable names found in the original files/objects.

In [12]:
# Load the search result into a dictionary of datasets, passing pp_thetao as
# the preprocess callback. Times are not decoded on open (decode_times=False);
# fix_time, called from pp_thetao, runs xr.decode_cf after setting the calendar.
dset_dict_T_orig = cat_T.to_dataset_dict(
    cdf_kwargs={
        'decode_times': False,
        'chunks': {'time': 1, 'olevel': 1},
    },
    preprocess=pp_thetao,
    storage_options={'anon': True},
)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institute.model.experiment_id.mip_table'


In [13]:
# Show every dataset key followed by the names of that dataset's dimensions.
for k, ds in dset_dict_T_orig.items():
    print(k, list(ds.dims), sep="\n")

CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon
['ensemble_member', 'lev', 'time', 'x', 'y']


In [14]:
# Display the dataset loaded directly from the catalog search result.
dset_dict_T_orig['CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon'] 

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,71.39 GB,480.74 kB
Shape,"(1, 1980, 75, 332, 362)","(1, 1, 1, 332, 362)"
Count,445502 Tasks,148500 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 71.39 GB 480.74 kB Shape (1, 1980, 75, 332, 362) (1, 1, 1, 332, 362) Count 445502 Tasks 148500 Chunks Type float32 numpy.ndarray",1980  1  362  332  75,

Unnamed: 0,Array,Chunk
Bytes,71.39 GB,480.74 kB
Shape,"(1, 1980, 75, 332, 362)","(1, 1, 1, 332, 362)"
Count,445502 Tasks,148500 Chunks
Type,float32,numpy.ndarray


WHAT WORKS:

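For some reason the following works and includes all data variables. `latest_version` returns a pandas DataFrame, which we then convert back into an ESM datastore. Note that this call also chunks only along `time`, whereas the call above chunks along both `time` and `olevel`, so the two cases differ in more than the catalog filtering.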

In [15]:
# Reduce the search result to one row per entry (a DataFrame), then wrap that
# table in a new datastore that reuses the original catalog's esmcol_data.
cat_T_gn_latest = latest_version(cat_T)
esmcol_data = col.esmcol_data
cat_T2 = intake.open_esm_datastore(cat_T_gn_latest, esmcol_data=esmcol_data)

# Load with pp_thetao as the preprocess callback; only `time` is chunked here.
dset_dict_T = cat_T2.to_dataset_dict(
    cdf_kwargs={'decode_times': False, 'chunks': {'time': 1}},
    preprocess=pp_thetao,
    storage_options={'anon': True},
)



--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institute.model.experiment_id.mip_table'


In [16]:
# Display the dataset loaded through the datastore rebuilt from latest_version.
dset_dict_T['CMIP6.IPSL.IPSL-CM6A-LR.historical.Omon'] 

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.92 MB 1.92 MB Shape (332, 362, 4) (332, 362, 4) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",4  362  332,

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.92 MB 1.92 MB Shape (332, 362, 4) (332, 362, 4) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",4  362  332,

Unnamed: 0,Array,Chunk
Bytes,1.92 MB,1.92 MB
Shape,"(332, 362, 4)","(332, 362, 4)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 480.74 kB 480.74 kB Shape (332, 362) (332, 362) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",362  332,

Unnamed: 0,Array,Chunk
Bytes,480.74 kB,480.74 kB
Shape,"(332, 362)","(332, 362)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,600 B,600 B
Shape,"(75, 2)","(75, 2)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 600 B 600 B Shape (75, 2) (75, 2) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",2  75,

Unnamed: 0,Array,Chunk
Bytes,600 B,600 B
Shape,"(75, 2)","(75, 2)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,31.68 kB,16 B
Shape,"(1980, 2)","(1, 2)"
Count,5942 Tasks,1980 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 31.68 kB 16 B Shape (1980, 2) (1, 2) Count 5942 Tasks 1980 Chunks Type object numpy.ndarray",2  1980,

Unnamed: 0,Array,Chunk
Bytes,31.68 kB,16 B
Shape,"(1980, 2)","(1, 2)"
Count,5942 Tasks,1980 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,71.39 GB,36.06 MB
Shape,"(1, 1980, 75, 332, 362)","(1, 1, 75, 332, 362)"
Count,5942 Tasks,1980 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 71.39 GB 36.06 MB Shape (1, 1980, 75, 332, 362) (1, 1, 75, 332, 362) Count 5942 Tasks 1980 Chunks Type float32 numpy.ndarray",1980  1  362  332  75,

Unnamed: 0,Array,Chunk
Bytes,71.39 GB,36.06 MB
Shape,"(1, 1980, 75, 332, 362)","(1, 1, 75, 332, 362)"
Count,5942 Tasks,1980 Chunks
Type,float32,numpy.ndarray


debug ends