{ "metadata": { "name": "", "signature": "sha256:6aeb405fc37f743c8908ece04b2dd69619c2811988446073a919adb87f4b0e0c" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Edges weighted with the combined score generated by the STRING database will be useful for comparison against our own method and for testing the community detection analysis before the weighted edges generated using our method are ready.\n", "Two options exist to get these weightings:\n", "\n", "1. Use the STRING online service\n", "2. Parse the STRING summary features we have already extracted in [this notebook][stringnotes]\n", "\n", "Unfortunately, the online service produces a table that does not include the Entrez IDs that were originally submitted, so the output would have to be mapped back to Entrez IDs for our pipeline.\n", "The fastest way will be to use the pickled object created in the above notebook to generate features and take only the combined score for each pair:\n", "\n", "[stringnotes]: http://nbviewer.ipython.org/github/ggray1729/opencast-bio/blob/master/notebooks/Extracting%20STRING%20results-based%20features.ipynb" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cd ../../features" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/home/gavin/Documents/MRes/features\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import csv" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "ls" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\u001b[0m\u001b[40m\u001b[m\u001b[01;36mabundance.Entrez.full.txt\u001b[0m@ \u001b[40m\u001b[m\u001b[01;36mhead.training.nolabel.negative.Entrez.vectors.txt\u001b[0m@\r\n", "\u001b[40m\u001b[m\u001b[01;36mabundance.Entrez.traintest.txt\u001b[0m@ 
\u001b[40m\u001b[m\u001b[01;36mpulldown.edges.Entrez.txt\u001b[0m@\r\n", "\u001b[40m\u001b[m\u001b[00mautogit.log\u001b[0m \u001b[40m\u001b[m\u001b[01;36mpulldown.nolabel.Entrez.vectors.txt\u001b[0m@\r\n", "\u001b[40m\u001b[m\u001b[01;36mc2s.Entrez.full.txt\u001b[0m@ \u001b[40m\u001b[m\u001b[00mtraining.nolabel.negative.Entrez.vectors.txt\u001b[0m\r\n", "\u001b[40m\u001b[m\u001b[01;36mc2s.Entrez.traintest.txt\u001b[0m@ \u001b[40m\u001b[m\u001b[00mtraining.nolabel.positive.Entrez.vectors.txt\u001b[0m\r\n" ] } ], "prompt_number": 3 },
{ "cell_type": "code", "collapsed": false, "input": [ "import sys" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [ "sys.path.append(\"/home/gavin/Documents/MRes/opencast-bio/\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 },
{ "cell_type": "code", "collapsed": false, "input": [ "import ocbio.string" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false, "input": [ "import pickle" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 },
{ "cell_type": "code", "collapsed": false, "input": [
"# Load the STRING feature object pickled by the feature-extraction notebook.\n",
"# NOTE: pickle.load can execute arbitrary code; only load pickles we generated ourselves.\n",
"# Pickles must be read from a binary handle, and the context manager closes the\n",
"# file even if unpickling fails.\n",
"with open(\"../string/human.Entrez.string.pickle\", \"rb\") as f:\n",
"    stringfeatures = pickle.load(f)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 },
{ "cell_type": "code", "collapsed": false, "input": [
"# Input: tab-separated pulldown pairs; output: the same pairs plus their STRING combined score.\n",
"PULLDOWN_PAIRS_PATH = \"../forGAVIN/pulldown_data/pulldown.interactions.Entrez.tsv\"\n",
"STRING_EDGES_PATH = \"pulldown.string.edges.tsv\"\n",
"# Combined scores at or below this threshold are treated as zero and not written.\n",
"MIN_COMBINED_SCORE = 0.0000001\n",
"\n",
"# Context managers close both files (flushing the output) even if a lookup\n",
"# raises part-way through the pair list.\n",
"with open(PULLDOWN_PAIRS_PATH) as pulldownpairfile, open(STRING_EDGES_PATH, \"w\") as stringedgefile:\n",
"    cp = csv.reader(pulldownpairfile, delimiter=\"\\t\")\n",
"    cs = csv.writer(stringedgefile, delimiter=\"\\t\")\n",
"    for l in cp:\n",
"        # csv.reader yields an empty list for a blank line; there is no pair to look up\n",
"        if not l:\n",
"            continue\n",
"        # for each pair index the feature dictionary (keyed by the unordered pair)\n",
"        pair = frozenset(l)\n",
"        # the combined score is the last entry of the feature vector\n",
"        combinedscore = float(stringfeatures[pair][-1])\n",
"        # write the pairs that are non-zero\n",
"        if combinedscore > MIN_COMBINED_SCORE:\n",
"            cs.writerow(l + [combinedscore])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 32 },
{ "cell_type": "code", "collapsed": false, "input": [ "!head pulldown.string.edges.tsv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "114787\t9520\t198.0\r", "\r\n", "114787\t28964\t163.0\r", "\r\n", "114787\t546\t369.0\r", "\r\n", "114787\t7846\t283.0\r", "\r\n", "114787\t26052\t274.0\r", "\r\n", "114787\t9024\t231.0\r", "\r\n", "114787\t4983\t229.0\r", "\r\n", "114787\t6252\t154.0\r", "\r\n", "114787\t1212\t180.0\r", "\r\n", "114787\t10814\t206.0\r", "\r\n" ] } ], "prompt_number": 33 } ], "metadata": {} } ] }