In [1]:
from collections import defaultdict

import requests

from Bio import ExPASy, SwissProt

In [2]:
# TODO: explain why this lookup does not go through Biopython.
# Base URL that do_request() prefixes to the entry ID and query parameters.
# NOTE(review): UniProt appears to have moved its programmatic API to
# rest.uniprot.org since this was written - confirm that this URL and the
# query parameters used with it still work.
server = 'http://www.uniprot.org/uniprot'
def do_request(server, ID='', **kwargs):
    """Send a GET request to ``server`` and return the response.

    Parameters
    ----------
    server : str
        Base URL of the service.
    ID : str, optional
        Path segment appended to ``server`` after a slash. With the default
        empty string the requested URL is ``server`` followed by ``/``.
    **kwargs
        Passed to ``requests.get`` as the query-string parameters.

    Returns
    -------
    The response object returned by ``requests.get``.

    Raises
    ------
    Whatever ``raise_for_status()`` raises for an unsuccessful response
    (``requests.HTTPError`` on 4xx/5xx status codes).
    """
    url = '%s/%s' % (server, ID)
    req = requests.get(url, params=kwargs)
    # raise_for_status() does nothing for a successful response, so it does
    # not need to be guarded by a separate ``ok`` check.
    req.raise_for_status()
    return req

In [3]:
# Query parameters for the search. The query string carries no organism
# restriction (an "AND organism:Human" clause was left disabled).
search_params = {
    'query': 'gene:p53 AND reviewed:yes',
    'format': 'tab',
    'columns': 'id,entry name,length,organism,organism-id,database(PDB),database(HGNC)',
    'limit': '50',
}
req = do_request(server, **search_params)
# We might revisit this for KEGG.

In [4]:
import io

import pandas as pd

# Parse the tab-separated response body into a DataFrame. io.StringIO replaces
# the Python 2-only ``StringIO`` module so this cell also runs on Python 3;
# it accepts the unicode text held in req.text on either version.
# 'Organism ID' is renamed to 'ID' so the column can be read as
# uniprot_list.ID; rename() returns a new frame, so re-running the cell is safe.
uniprot_list = pd.read_table(io.StringIO(req.text)).rename(
    columns={'Organism ID': 'ID'}
)
uniprot_list

Unnamed: 0,Entry,Entry name,Length,Organism,ID,Cross-reference (PDB),Cross-reference (HGNC)
0,Q9W678,P53_BARBU,369,Barbus barbus (Barbel) (Cyprinus barbus),40830,,
1,Q29537,P53_CANFA,381,Canis familiaris (Dog) (Canis lupus familiaris),9615,,
2,O09185,P53_CRIGR,393,Cricetulus griseus (Chinese hamster) (Cricetul...,10029,,
3,Q8SPZ3,P53_DELLE,387,Delphinapterus leucas (Beluga whale),9749,,
4,P79892,P53_HORSE,280,Equus caballus (Horse),9796,,
5,P04637,P53_HUMAN,393,Homo sapiens (Human),9606,1A1U;1AIE;1C26;1DT7;1GZH;1H26;1HS5;1JSP;1KZY;1...,11998;
6,O93379,P53_ICTPU,376,Ictalurus punctatus (Channel catfish) (Silurus...,7998,,
7,P56423,P53_MACFA,393,Macaca fascicularis (Crab-eating macaque) (Cyn...,9541,,
8,P61260,P53_MACFU,393,Macaca fuscata fuscata (Japanese macaque),9543,,
9,P56424,P53_MACMU,393,Macaca mulatta (Rhesus macaque),9544,,


In [5]:
# Entry value of the first row whose ID column equals 9606
# (the Homo sapiens row in the table displayed above).
p53_human = uniprot_list.loc[uniprot_list['ID'] == 9606, 'Entry'].iloc[0]

In [6]:
# Open a handle on the raw entry for the accession stored in p53_human;
# the handle is consumed by SwissProt.read() in the next cell.
handle = ExPASy.get_sprot_raw(p53_human)

In [7]:
# Parse the single record available on the handle, then release the handle
# whether or not parsing succeeded, so the underlying connection is not left open.
try:
    sp_rec = SwissProt.read(handle)
finally:
    handle.close()

In [8]:
# Summary of the parsed record: identification fields, description,
# organism plus the seqinfo tuple, and finally the sequence itself.
# NOTE: the recorded output shows the multi-argument print calls rendered as
# tuples, i.e. the cell was run with Python 2 print-statement semantics; under
# Python 3 the same calls print the values separated by spaces.
print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name)
print(sp_rec.description)
print(sp_rec.organism, sp_rec.seqinfo)
print(sp_rec.sequence)

('P53_HUMAN', 393, 'Name=TP53; Synonyms=P53;')
RecName: Full=Cellular tumor antigen p53; AltName: Full=Antigen NY-CO-13; AltName: Full=Phosphoprotein p53; AltName: Full=Tumor suppressor p53;
('Homo sapiens (Human).', (393, 43653, 'AD5C149FD8106131'))
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD


In [9]:
# Print the record's comments list, then its keywords list.
for record_field in (sp_rec.comments, sp_rec.keywords):
    print(record_field)

['FUNCTION: Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type. Involved in cell cycle regulation as a trans-activator that acts to negatively regulate cell division by controlling a set of genes required for this process. One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression. In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA- Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seem to have to effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when assoc

In [10]:
# Show the built-in help for the parsed record object, which lists the
# members available on a Bio.SwissProt Record.
help(sp_rec)

Help on Record in module Bio.SwissProt object:

class Record(__builtin__.object)
 |  Holds information from a SwissProt record.
 |  
 |  Members:
 |  
 |      - entry_name        Name of this entry, e.g. RL1_ECOLI.
 |      - data_class        Either 'STANDARD' or 'PRELIMINARY'.
 |      - molecule_type     Type of molecule, 'PRT',
 |      - sequence_length   Number of residues.
 |  
 |      - accessions        List of the accession numbers, e.g. ['P00321']
 |      - created           A tuple of (date, release).
 |      - sequence_update   A tuple of (date, release).
 |      - annotation_update A tuple of (date, release).
 |  
 |      - description       Free-format description.
 |      - gene_name         Gene name.  See userman.txt for description.
 |      - organism          The source of the sequence.
 |      - organelle         The origin of the sequence.
 |      - organism_classification  The taxonomy classification.  List of strings.
 |        (http://www.ncbi.nlm.nih.gov/Taxonomy

In [11]:
# --- Features: total count, then the first feature seen for each distinct
# --- value of the leading tuple field (e.g. 'CHAIN', 'DNA_BIND').
print(len(sp_rec.features))
done_features = set()
for feature in sp_rec.features:
    feature_key = feature[0]
    if feature_key not in done_features:
        done_features.add(feature_key)
        print(feature)

# --- Cross-references: total count, then group them by their first field.
print(len(sp_rec.cross_references))
per_source = defaultdict(list)
for xref in sp_rec.cross_references:
    # The first field identifies the source; the remaining fields are kept as the entry.
    source, payload = xref[0], xref[1:]
    per_source[source].append(payload)
print(per_source.keys())

# --- GO cross-references: total count, then the first annotation seen for
# --- each distinct leading character of the annotation's second field.
print(len(per_source['GO']))
done_GOs = set()
for annot in per_source['GO']:
    go_key = annot[1][0]
    if go_key not in done_GOs:
        done_GOs.add(go_key)
        print(annot)

1493
('CHAIN', 1, 393, 'Cellular tumor antigen p53.', 'PRO_0000185703')
('DNA_BIND', 102, 292, '', '')
('REGION', 1, 83, 'Interaction with HRMT1L2.', '')
('MOTIF', 17, 25, 'TADI.', '')
('METAL', 176, 176, 'Zinc.', '')
('SITE', 120, 120, 'Interaction with DNA.', '')
('MOD_RES', 9, 9, 'Phosphoserine; by HIPK4. {ECO:0000269|PubMed:18022393}.', '')
('CROSSLNK', 291, 291, 'Glycyl lysine isopeptide (Lys-Gly) (interchain with G-Cter in ubiquitin). {ECO:0000269|PubMed:19536131}.', '')
('VAR_SEQ', 1, 132, 'Missing (in isoform 7, isoform 8 and isoform 9). {ECO:0000303|PubMed:16131611}.', 'VSP_040833')
('VARIANT', 5, 5, 'Q -> H (in a sporadic cancer; somatic mutation; abolishes strongly phosphorylation).', 'VAR_044543')
('MUTAGEN', 15, 15, 'S->A: Loss of interaction with PPP2R5C, PPP2CA AND PPP2R1A. {ECO:0000269|PubMed:17967874}.', '')
('HELIX', 19, 23, '{ECO:0000244|PDB:3DAC}.', '')
('STRAND', 27, 29, '{ECO:0000244|PDB:2K8F}.', '')
('TURN', 105, 108, '{ECO:0000244|PDB:3D06}.', '')
604
['GeneRevi