{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Feature comparisons between allowing and excluding paths with duplicate nodes" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/dhimmels/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n", " warnings.warn(self.msg_depr % (key, alt_key))\n" ] } ], "source": [ "import re\n", "import functools\n", "\n", "import pandas\n", "import matplotlib.pyplot\n", "import seaborn\n", "import numpy\n", "import sklearn.metrics\n", "import qgrid" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "seaborn.set_style('whitegrid')\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Read feature and partition data\n", "feature_df = pandas.read_table('features.tsv.gz', nrows=None)\n", "part_df = pandas.read_table('../data/partition.tsv.gz')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "@functools.lru_cache(maxsize=None)\n", "def duplicate_metanodes(metapath):\n", " metanodes = re.split('[a-z<>]+', metapath)\n", " return len(set(metanodes)) < len(metanodes)\n", "\n", "# Restrict to metapaths that have duplicate metanodes\n", "feature_df = feature_df[feature_df.metapath.map(duplicate_metanodes)]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dfs = {}\n", "for key in 'PC', 'DWPC':\n", " # Create spread dataframe\n", " df = feature_df.pivot_table(values=key, index=['compound_id', 'disease_id', 'metapath'], columns='unique_nodes')\n", " df = df.dropna() # use if only reading a part of `dwpc.tsv.gz`\n", " \n", " # 
Check that the two methods of duplicate node exclusion produce the same results\n", "    assert all(numpy.isclose(df.nested, df.expanded) & numpy.isclose(df.expanded, df.labeled))\n", "    \n", "    # Remove duplicate columns and rename columns\n", "    df = df.drop(['nested', 'expanded'], axis=1)\n", "    # NOTE(review): the rename key is the string 'False' -- confirm the pivoted\n", "    # unique_nodes column labels are strings rather than booleans, otherwise\n", "    # this rename silently leaves the duplicate-allowing column unrenamed.\n", "    df = df.rename(columns={'False': key + '_dupl', 'labeled': key + '_nodupl'})\n", "    dfs[key] = df\n", "\n", "# Create a spread dataframe of Path Counts\n", "spread_df = pandas.concat(dfs.values(), axis=1).reset_index()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
| unique_nodes | \n", "compound_id | \n", "disease_id | \n", "metapath | \n", "DWPC_dupl | \n", "DWPC_nodupl | \n", "PC_dupl | \n", "PC_nodupl | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "DB00091 | \n", "DOID:1312 | \n", "CbG<kdG<kdGaD | \n", "0.000601 | \n", "0.000601 | \n", "18 | \n", "18 | \n", "
| 1 | \n", "DB00091 | \n", "DOID:1312 | \n", "CbG<kdG<kdGdD | \n", "0.000000 | \n", "0.000000 | \n", "0 | \n", "0 | \n", "