{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Export Hetionet v1.0 to an xarray.Dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import os\n", "import pickle\n", "import gzip\n", "\n", "import hetio.readwrite\n", "\n", "from hetmech.xarray import graph_to_xarray" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Load Hetionet v1.0" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'\n", "graph = hetio.readwrite.read_graph(url)\n", "metagraph = graph.metagraph" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Create xarray.Dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "dataset = graph_to_xarray(graph)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: (Anatomy: 402, Biological Process: 11381, Cellular Component: 1391, Compound: 1552, Disease: 137, Gene: 20945, Molecular Function: 2884, Pathway: 1822, Pharmacologic Class: 345, Side Effect: 5734, Symptom: 438)\n", "Coordinates:\n", " * Anatomy (Anatomy) G (Gene, Gene) bool False False False False False ...\n", " GpMF (Gene, Molecular Function) bool False False False ...\n", " GpPW (Gene, Pathway) bool False False False False False ...\n", " PCiC (Pharmacologic Class, Compound) bool False False ..." 
] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": [ "\n", "array(['GO:0000002', 'GO:0000012', 'GO:0000018', ..., 'GO:2001301',\n", " 'GO:2001302', 'GO:2001303'], \n", " dtype='G' (Gene: 20945)>\n", "array([[False, False, False, ..., False, False, False],\n", " [False, False, False, ..., False, False, False],\n", " [False, False, False, ..., False, False, False],\n", " ..., \n", " [False, False, False, ..., False, False, False],\n", " [False, False, False, ..., False, False, False],\n", " [False, False, False, ..., False, False, False]], dtype=bool)\n", "Coordinates:\n", " * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 23 24 25 26 ..." ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['Gr>G']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": [ "\n", "array([ 1, 2, 9, ..., 105379874, 105379878, 105379886])\n", "Coordinates:\n", " * Gene (Gene) int64 1 2 9 10 12 13 14 15 16 18 19 20 21 22 23 24 25 26 ..." 
] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.Gene" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: ()\n", "Data variables:\n", " AdG float64 0.01214\n", " AeG float64 0.06252\n", " AuG float64 0.01162\n", " CrC float64 0.005385\n", " CpD float64 0.001834\n", " CtD float64 0.003551\n", " CbG float64 0.000356\n", " CdG float64 0.0006492\n", " CuG float64 0.000577\n", " CcSE float64 0.01561\n", " DlA float64 0.0654\n", " DrD float64 0.05786\n", " DaG float64 0.004399\n", " DdG float64 0.002657\n", " DuG float64 0.002694\n", " DpS float64 0.05594\n", " GpBP float64 0.002347\n", " GpCC float64 0.002525\n", " GcG float64 0.0002812\n", " GiG float64 0.0006709\n", " Gr>G float64 0.0006056\n", " GpMF float64 0.001609\n", " GpPW float64 0.002211\n", " PCiC float64 0.001922" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Density of each metaedge\n", "dataset.mean()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Dataset IO" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "# netcdf file was humongous. 
Avoid!\n", "# dataset.to_netcdf('xarray_dataset.nc')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "path = os.path.join('data', 'xarray_dataset.pkl.gz')\n", "# Create the output directory first so the save below also works on a fresh checkout\n", "os.makedirs(os.path.dirname(path), exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Save as a gzip-compressed pickle\n", "with gzip.open(path, 'wb') as write_file:\n", "    pickle.dump(dataset, write_file, protocol=pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "# Read the pickle back\n", "# NOTE: pickle.load can execute arbitrary code; only load files you created or trust\n", "with gzip.open(path, 'rb') as read_file:\n", "    dataset = pickle.load(read_file)" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:hetmech]", "language": "python", "name": "conda-env-hetmech-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }