{ "metadata": { "name": "", "signature": "sha256:14389e00b7edf4aa40aba602361b773f1547d950e8d011fff7d9f0f3032f7c15" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import shelve" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "sys.path.append(\"/home/gavin/Documents/MRes/opencast-bio/\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "import ocbio.extract" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "cd ~/Documents/MRes/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "/home/gavin/Documents/MRes\n" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "parser = ocbio.extract.ProteinPairParser(\"/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt\",\n", " \"/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db\",\n", " protindexes=(1,3),valindexes=[4])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 80 }, { "cell_type": "code", "collapsed": false, "input": [ "parser.regenerate(force=True, verbose=True)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Forcing regeneration of database /home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db from data file /home/gavin/Documents/MRes/HIPPIE/hippie_current.txt.\n", "Filling database.." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." 
] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." 
] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." ] }, { "output_type": "stream", "stream": "stdout", "text": [ "..." 
class test():
    '''Scratch wrapper that owns a ProteinPairDB stored in "test.db".

    Only item assignment is exposed; reads go through the ``db`` attribute
    directly.
    '''

    def __init__(self):
        # opens (or creates) the shelf in the current working directory
        pairdb = ocbio.extract.ProteinPairDB("test.db")
        self.db = pairdb

    def __setitem__(self, key, value):
        '''Store value under key in the wrapped database.'''
        self.db.__setitem__(key, value)
"prompt_number": 55 }, { "cell_type": "code", "collapsed": false, "input": [ "inst = test()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [ "for k in examplekeys:\n", " inst[k] = 1" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 57 }, { "cell_type": "code", "collapsed": false, "input": [ "inst.db.keys()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 58, "text": [ "[frozenset({'10054', '201626'}),\n", " frozenset({'10236', '3930'}),\n", " frozenset({'10320', '6605'}),\n", " frozenset({'10399', '8364'}),\n", " frozenset({'10474', '1499'}),\n", " frozenset({'1058', '1060'}),\n", " frozenset({'10289', '10857'}),\n", " frozenset({'11091110911109111091'}),\n", " frozenset({'112495', '9329'}),\n", " frozenset({'10379', '1822'}),\n", " frozenset({'something'})]" ] } ], "prompt_number": 58 }, { "cell_type": "code", "collapsed": false, "input": [ "inst.db.close()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 60 }, { "cell_type": "code", "collapsed": false, "input": [ "inst = test()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 61 }, { "cell_type": "code", "collapsed": false, "input": [ "inst.db.keys()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 62, "text": [ "[frozenset({'10054', '201626'}),\n", " frozenset({'10236', '3930'}),\n", " frozenset({'10320', '6605'}),\n", " frozenset({'10399', '8364'}),\n", " frozenset({'10474', '1499'}),\n", " frozenset({'1058', '1060'}),\n", " frozenset({'10289', '10857'}),\n", " frozenset({'11091110911109111091'}),\n", " frozenset({'112495', '9329'}),\n", " frozenset({'10379', '1822'}),\n", " frozenset({'something'})]" ] } ], "prompt_number": 62 }, { "cell_type": "code", "collapsed": false, "input": [ "%load ../opencast-bio/ocbio/extract.py" ], 
# Feature extraction code
# Header pending

import os
import time
import subprocess
import csv
import shelve
import re
import sys
import pickle
import pdb


def verbosecheck(verbose):
    '''Return a print function that honours the verbose flag.

    When verbose is true the returned function writes all of its arguments
    to stdout on one line, separated by spaces. Otherwise it does nothing.
    '''
    if verbose:
        def v_print(*args):
            '''Write the arguments to stdout, space separated.'''
            sys.stdout.write(" ".join(str(argument) for argument in args) + "\n")
    else:
        def v_print(*args):
            '''Discard the arguments.'''
            return None
    return v_print


def _pair_storage_keys(key):
    '''Return the string keys a protein pair may be stored under.

    The first entry is the canonical form used for writing: the identifiers
    in sorted order joined by a tab, with a self-pair written as
    "id<TAB>id". The remaining entries are older encodings that are still
    accepted when reading: the reversed order for a two-protein pair and the
    separator-less doubled identifier for a self-pair.
    '''
    members = sorted(key)
    if len(members) == 2 and members[0] == members[1]:
        # a list/tuple such as ("a", "a") is the same thing as frozenset(["a"])
        members = members[:1]
    if len(members) == 1:
        only = members[0]
        return [only + "\t" + only, only * 2]
    if len(members) == 2:
        first, second = members
        return [first + "\t" + second, second + "\t" + first]
    raise ValueError("protein pair keys must hold one or two identifiers, "
                     "got {0!r}".format(key))


class ProteinPairDB(shelve.DbfilenameShelf):
    '''A simple database for protein pairs using shelve.

    Keys are unordered pairs of protein identifiers, normally frozensets
    holding two strings (or one string for a protein paired with itself).
    shelve only accepts string keys, so every pair is stored as
    "first<TAB>second" with the identifiers in sorted order. That gives each
    pair exactly one stored form regardless of set iteration order.
    '''

    def __setitem__(self, key, value):
        '''Store value under the canonical string form of the pair.'''
        forms = _pair_storage_keys(key)
        # remove a copy written under an older encoding, otherwise the pair
        # would be listed twice by keys()
        for stale in forms[1:]:
            if shelve.DbfilenameShelf.__contains__(self, stale):
                shelve.DbfilenameShelf.__delitem__(self, stale)
        shelve.DbfilenameShelf.__setitem__(self, forms[0], value)
        return None

    def __getitem__(self, key):
        '''Return the value stored for the pair, in either order.

        Raises KeyError if the pair is not present under any known encoding.
        '''
        for stored in _pair_storage_keys(key):
            try:
                return shelve.DbfilenameShelf.__getitem__(self, stored)
            except KeyError:
                continue
        raise KeyError(key)

    def keys(self):
        '''Return every stored pair as a list of frozensets.'''
        pairs = []
        for stored in shelve.DbfilenameShelf.keys(self):
            if "\t" in stored:
                pairs.append(frozenset(stored.split("\t")))
            else:
                # legacy self-pair: the identifier was written twice with no
                # separator, so the first half is the identifier
                pairs.append(frozenset([stored[:len(stored) // 2]]))
        return pairs


class ProteinPairParser(object):
    '''Does simple parsing on data files to produce protein pair files with feature values.

    Arguments:
        datadir      -- path of the delimited text file to parse
        outdir       -- path of the ProteinPairDB file to fill
        protindexes  -- the two column indexes holding the protein identifiers
        valindexes   -- column indexes whose values are stored for each pair
        script       -- optional script run with python2 before parsing
        csvdelim     -- column delimiter of the data file
        ignoreheader -- 1, "1" or True to skip the first line of the data file
        generator    -- optional path of a pickle file; when given, lookups are
                        answered by the unpickled object and no database is opened
    '''
    def __init__(self,
                 datadir,
                 outdir,
                 protindexes=(0, 1),
                 valindexes=[2],
                 script=None,
                 csvdelim="\t",
                 ignoreheader=0,
                 generator=False):
        # first, store the initialisation
        self.datadir = datadir
        self.outdir = outdir
        self.protindexes = protindexes
        # the list default is never mutated: it is copied into a tuple here
        self.valindexes = tuple(valindexes)
        self.script = script
        self.csvdelim = csvdelim
        self.ignoreheader = ignoreheader
        if generator:
            # NOTE(review): unpickling runs arbitrary code from the file, so
            # only pass generator files that come from a trusted source.
            with open(generator, "rb") as picklefile:
                self.generator = pickle.load(picklefile)
            self.db = None
        else:
            # otherwise open the database, creating it if it does not exist
            self.generator = None
            self.db = openpairshelf(self.outdir)
        return None

    def _pairmtime(self):
        '''Return the database modification time, or 0 when it must be rebuilt.

        __init__ creates the database file if it is missing, so a database
        that exists but holds no entries is treated the same as a missing one.
        '''
        if not os.path.isfile(self.outdir):
            return 0
        try:
            if self.db is not None and len(self.db) == 0:
                return 0
        except (ValueError, TypeError):
            # shelf already closed, or backend without len(): trust the mtime
            pass
        return os.path.getmtime(self.outdir)

    def _skipheader(self):
        '''True when the first line of the data file should be skipped.'''
        return str(self.ignoreheader).strip().lower() in ("1", "true")

    def _resetdb(self):
        '''Replace self.db with a new, empty database at self.outdir.

        The old handle has to be closed before the file is deleted. Writing
        through a handle to a file that has been unlinked loses every entry
        as soon as that handle is closed.
        '''
        if self.db is not None:
            try:
                self.db.close()
            except ValueError:
                # the caller had already closed the shelf
                pass
        if os.path.isfile(self.outdir):
            os.remove(self.outdir)
        # 'n' always creates a new, empty database whatever files the
        # underlying dbm backend uses
        self.db = openpairshelf(self.outdir, flag='n')

    def regenerate(self, force=False, verbose=False):
        '''Regenerate the pair file from the data source
        if the data source is newer than the pair file.

        force   -- rebuild even when the database looks up to date
        verbose -- report progress on stdout
        '''
        v_print = verbosecheck(verbose)
        if self.generator is not None:
            v_print("Custom generator function, no database to regenerate.")
            return None

        # compare the ages of both files
        datamtime = os.path.getmtime(self.datadir)
        pairmtime = self._pairmtime()
        stale = datamtime > pairmtime
        if not (stale or force):
            return None

        if stale:
            if pairmtime == 0:
                v_print("Database file not found, regenerating at {0} from {1}.".format(self.outdir, self.datadir))
            else:
                v_print("Data file {0} is newer than processed database {1}, regenerating.".format(self.datadir, self.outdir))
        if force:
            v_print("Forcing regeneration of database {0} from data file {1}.".format(self.outdir, self.datadir))

        # if there's a script to run, run it before reading the data file
        if self.script is not None:
            v_print("Executing script: {0}.".format(self.script))
            # NOTE(review): the command goes through the shell, so the script
            # path must come from a trusted source.
            retcode = subprocess.call("python2 {0}".format(self.script), shell=True)
            v_print("Script returned: {0}".format(retcode))

        # start from an empty database
        self._resetdb()

        lcount = 0
        with open(self.datadir) as datafile:
            rows = csv.reader(datafile, delimiter=self.csvdelim)
            if self._skipheader():
                v_print("Ignoring header.")
                next(rows, None)

            if verbose:
                sys.stdout.write("Filling database")

            firstindex, secondindex = self.protindexes[0], self.protindexes[1]
            for line in rows:
                if not line:
                    # blank line, e.g. at the end of the file
                    continue
                # the protein pair is the key, the selected columns the value
                pair = frozenset([line[firstindex], line[secondindex]])
                self.db[pair] = [line[i] for i in self.valindexes]
                lcount = lcount + 1
                if verbose and lcount % 1000 == 0:
                    sys.stdout.write(".")

        # flush to disk so the entries do not depend on a later close()
        self.db.sync()

        if verbose:
            sys.stdout.write("\n")
            v_print("Parsed {0} lines.".format(lcount))

        return None

    def __getitem__(self, key):
        '''Look key up in the custom generator if there is one, else in the database.'''
        if self.generator is not None:
            return self.generator[key]
        return self.db[key]

    def close(self):
        '''Close the database; does nothing when a custom generator is in use.'''
        if self.db is not None:
            self.db.close()
        return None


def openpairshelf(filename, flag='c', protocol=None, writeback=False):
    """Returns a ProteinPairDB object, with similar functionality to shelve.open()"""
    return ProteinPairDB(filename, flag, protocol, writeback)
parser.db.keys()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[frozenset(['10054', '201626']), frozenset(['10236', '3930']), frozenset(['10320', '6605']), frozenset(['10399', '8364']), frozenset(['10474', '1499']), frozenset(['1058', '1060']), frozenset(['10857', '10289']), frozenset(['11091110911109111091']), frozenset(['112495', '9329']), frozenset(['10379', '1822'])]\n" ] } ], "prompt_number": 101 }, { "cell_type": "code", "collapsed": false, "input": [ "parser.close()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 102 }, { "cell_type": "code", "collapsed": false, "input": [ "parser.db = openpairshelf(\"/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 103 }, { "cell_type": "code", "collapsed": false, "input": [ "print parser.db.keys()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[frozenset(['10054', '201626']), frozenset(['10236', '3930']), frozenset(['10320', '6605']), frozenset(['10399', '8364']), frozenset(['10474', '1499']), frozenset(['1058', '1060']), frozenset(['10857', '10289']), frozenset(['11091110911109111091']), frozenset(['112495', '9329']), frozenset(['10379', '1822'])]\n" ] } ], "prompt_number": 104 } ], "metadata": {} } ] }