# Fixed-width field layout of the PDB record types handled here.
# Each field is (converter, first, last), where first/last are 0-based,
# inclusive character offsets into the line (PDB format column number - 1).
rec_types = {
    # single line
    # Classification starts at offset 10 (column 11); starting at 11 lost its
    # first letter ('NTITUMOR' instead of 'ANTITUMOR').
    'HEADER': [(str, 10, 49), (str, 50, 58), (str, 62, 65)],
    # multi_line
    'SOURCE': [(int, 7, 9), (str, 10, 78)],
    # multi_rec
    'LINK': [(str, 12, 15), (str, 16, 16), (str, 17, 19), (str, 21, 21), (int, 22, 25),
             (str, 26, 26), (str, 42, 45), (str, 46, 46), (str, 47, 49), (str, 51, 51),
             (int, 52, 55), (str, 56, 56), (str, 59, 64), (str, 66, 71), (float, 73, 77)],
    'HELIX': [(int, 7, 9), (str, 11, 13), (str, 15, 17), (str, 19, 19), (int, 21, 24),
              (str, 25, 25), (str, 27, 29), (str, 31, 31),
              (int, 33, 36), (str, 37, 37), (int, 38, 39), (str, 40, 69), (int, 71, 75)],
    # The first residue number spans offsets 22-25 (4 characters, like the end
    # residue number at 33-36); 22-24 truncated e.g. 110 to 11.
    'SHEET': [(int, 7, 9), (str, 11, 13), (int, 14, 15), (str, 17, 19), (str, 21, 21),
              (int, 22, 25), (str, 26, 26), (str, 28, 30),
              (str, 32, 32), (int, 33, 36), (str, 37, 37), (int, 38, 39), (str, 41, 44),
              (str, 45, 47), (str, 49, 49), (int, 50, 53), (str, 54, 54), (str, 56, 59),
              (str, 60, 62), (str, 64, 64), (int, 65, 68), (str, 69, 69)],
}


def parse_pdb(hdl):
    """Tokenize a PDB file line by line.

    Parameters:
        hdl: any iterable of text lines (an open file, a list of strings, ...).

    Yields:
        ``(record_name, fields)`` for lines whose record name is a key of
        ``rec_types``; ``fields`` is a list with one converted value per
        configured field, or ``None`` where the converter raised
        ``ValueError`` (e.g. a blank integer column).
        ``('UNKNOWN', line)`` for every other line, where ``line`` is the raw
        text without its line terminator.
    """
    for line in hdl:
        # Remove only the line terminator: trailing blanks belong to the
        # fixed-width layout, and blindly slicing off the last character would
        # eat real data on a final line that has no newline.
        line = line.rstrip('\r\n')
        # The record name occupies the first six columns; comparing the whole
        # name (not a prefix) keeps e.g. 'LINKR' from being parsed as 'LINK'.
        section = line[:6].rstrip()
        elements = rec_types.get(section)
        if elements is None:
            yield ('UNKNOWN', line)
            continue
        toks = []
        for fun, start, end in elements:
            try:
                toks.append(fun(line[start:end + 1]))
            except ValueError:
                toks.append(None)  # eg blank continuation number
        yield (section, toks)


# Record types whose text continues over several consecutive lines.
multi_lines = ['SOURCE']


# assume multi is just a string
def process_multi_lines(hdl):
    """Merge consecutive lines of each multi-line record into one record.

    Parameters:
        hdl: iterable of text lines, forwarded to ``parse_pdb``.

    Yields:
        The ``parse_pdb`` records unchanged, except that each run of
        consecutive records whose type is listed in ``multi_lines`` is
        replaced by a single ``(record_name, [text])``.  ``text`` is the
        stripped second field of every line in the run, each followed by one
        space.
    """
    current_multi = ''
    current_multi_name = None
    for rec_type, toks in parse_pdb(hdl):
        # A different record type ends the run being accumulated.
        if current_multi_name is not None and current_multi_name != rec_type:
            yield current_multi_name, [current_multi]
            current_multi = ''
            current_multi_name = None
        if rec_type in multi_lines:
            current_multi += toks[1].strip() + ' '
            current_multi_name = rec_type
        else:
            yield rec_type, toks
    # Flush a run that lasted until the end of the input.
    if current_multi_name is not None:
        yield current_multi_name, [current_multi]


def get_spec_list(my_str):
    """Parse a ``KEY: value; KEY: value`` specification list into a dict.

    Escape characters are ignored.  Empty elements (empty input, trailing or
    doubled ``;``) are skipped instead of raising ``IndexError``.  Only the
    first ``:`` of an element separates key from value, so values may contain
    colons.  An element without any ``:`` maps to an empty string.  When a key
    occurs more than once the last value wins.

    Parameters:
        my_str: the specification list text.

    Returns:
        dict mapping stripped keys to stripped values.
    """
    spec_list = {}
    for elem in my_str.split(';'):
        elem = elem.strip()
        if not elem:
            continue
        key, _, value = elem.partition(':')
        spec_list[key.strip()] = value.strip()
    return spec_list


# Per record type: one converter for each field of the merged record.
struct_types = {
    'SOURCE': [get_spec_list]
}


def process_struct_types(hdl):
    """Convert the fields of structured record types into Python objects.

    Parameters:
        hdl: iterable of text lines, forwarded to ``process_multi_lines``.

    Yields:
        ``(record_name, fields)``.  For record types listed in
        ``struct_types`` each field is passed through the converter at the
        same position; all other records are yielded unchanged.
    """
    for rec_type, toks in process_multi_lines(hdl):
        funs = struct_types.get(rec_type)
        if funs is None:
            yield rec_type, toks
        else:
            yield rec_type, [fun(tok) for tok, fun in zip(toks, funs)]
# Show only the structured SOURCE record(s) of the downloaded entry.
# The `with` block closes the file once the generator pipeline is exhausted;
# the previous bare open() left the handle open for the kernel's lifetime.
with open('pdb1tup.ent') as hdl:
    for rec in process_struct_types(hdl):
        if rec[0] != 'SOURCE':
            continue
        print(rec)