# Export Hetionet v1.0 to an xarray.Dataset

In [1]:
import os
import pickle
import gzip

import hetio.readwrite

from hetmech.xarray import graph_to_xarray

## Load Hetionet v1.0

In [2]:
# The URL embeds a full commit hash of the dhimmel/hetionet repository, so
# re-running the notebook always reads the same Hetionet v1.0 file.
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
# Read the hetnet directly from the URL (a bz2-compressed JSON file);
# presumably hetio handles the download and decompression -- TODO confirm.
graph = hetio.readwrite.read_graph(url)
# Keep a reference to the graph's metagraph; it is not used again in the
# cells visible in this notebook.
metagraph = graph.metagraph

## Create xarray.Dataset

In [3]:
# Convert the hetnet into an xarray.Dataset. Judging by the repr displayed
# below, each metaedge becomes a boolean variable (e.g. GpMF) whose two
# dimensions are the node types it connects.
dataset = graph_to_xarray(graph)

In [4]:
# Display the Dataset repr to inspect its dimensions, coordinates and variables
dataset


Dimensions: (Anatomy: 402, Biological Process: 11381, Cellular Component: 1391, Compound: 1552, Disease: 137, Gene: 20945, Molecular Function: 2884, Pathway: 1822, Pharmacologic Class: 345, Side Effect: 5734, Symptom: 438)
Coordinates:
 * Anatomy (Anatomy) G (Gene, Gene) bool False False False False False ...
 GpMF (Gene, Molecular Function) bool False False False ...
 GpPW (Gene, Pathway) bool False False False False False ...
 PCiC (Pharmacologic Class, Compound) bool False False ...

In [5]:
# Select a single variable by its metaedge abbreviation; the key must match
# the variable name exactly, including the '>' character.
dataset['Gr>G']


array(['GO:0000002', 'GO:0000012', 'GO:0000018', ..., 'GO:2001301',
 'GO:2001302', 'GO:2001303'], 
 dtype='G' (Gene: 20945)>
array([[False, False, False, ..., False, False, False],
 [False, False, False, ..., False, False, False],
 [False, False, False, ..., False, False, False],
 ..., 
 [False, False, False, ..., False, False, False],
 [False, False, False, ..., False, False, False],
 [False, False, False, ..., False, False, False]], dtype=bool)
Coordinates:
 * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 23 24 25 26 ...

In [7]:
# Inspect the Gene coordinate: the output shows integer (int64) identifiers,
# one per position along the Gene dimension.
dataset.Gene


array([ 1, 2, 9, ..., 105379874, 105379878, 105379886])
Coordinates:
 * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 23 24 25 26 ...

In [8]:
# Density of each metaedge: the variables are boolean arrays, so the mean of
# each one is the fraction of its entries that are True.
dataset.mean()


Dimensions: ()
Data variables:
 AdG float64 0.01214
 AeG float64 0.06252
 AuG float64 0.01162
 CrC float64 0.005385
 CpD float64 0.001834
 CtD float64 0.003551
 CbG float64 0.000356
 CdG float64 0.0006492
 CuG float64 0.000577
 CcSE float64 0.01561
 DlA float64 0.0654
 DrD float64 0.05786
 DaG float64 0.004399
 DdG float64 0.002657
 DuG float64 0.002694
 DpS float64 0.05594
 GpBP float64 0.002347
 GpCC float64 0.002525
 GcG float64 0.0002812
 GiG float64 0.0006709
 Gr>G float64 0.0006056
 GpMF float64 0.001609
 GpPW float64 0.002211
 PCiC float64 0.001922

## Dataset IO

In [9]:
# Exporting to NetCDF produced an extremely large file, so it is disabled; use the pickle export below instead.
# dataset.to_netcdf('xarray_dataset.nc')

In [10]:
# Output location: a gzip-compressed pickle inside the relative 'data' directory
path = os.path.join('data', 'xarray_dataset.pkl.gz')

In [11]:
# Save as pickle
with gzip.open(path, 'wb') as write_file:
 pickle.dump(dataset, write_file, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Read pickle
with gzip.open(path) as read_file:
 dataset = pickle.load(read_file)