In [None]:
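# Download the UniProt ID mapping file (the wget line below is commented out; uncomment it to fetch), then checksum the local copy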
#! wget --directory-prefix download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
! shasum download/idmapping.dat.gz

### idmapping.dat [documentation](ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README)

1) idmapping.dat
This file has three columns, delimited by tab:
1. UniProtKB-AC 
2. ID_type 
3. ID

where ID_type is the database name as appearing in UniProtKB cross-references, 
and as supported by the ID mapping tool on the UniProt web site, 
http://www.uniprot.org/mapping and where ID is the identifier in 
that cross-referenced database.

In [14]:
import os
import csv
import io
import gzip

In [15]:
def generate_idmapping(path):
    """Yield the rows of a gzip-compressed, tab-delimited file.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed mapping file
        (e.g. ``idmapping.dat.gz``).

    Yields
    ------
    list of str
        One list per line, holding the tab-separated fields of that line.

    The file is opened lazily (on first iteration) and is closed when the
    generator is exhausted, explicitly closed, garbage-collected, or when
    an error propagates out of it.
    """
    # Text-mode gzip handle inside a `with` so the stream is released even if
    # the consumer stops early; newline='' lets the csv module do its own
    # newline handling, as its documentation recommends.
    with gzip.open(path, 'rt', newline='') as text:
        yield from csv.reader(text, delimiter='\t')

In [16]:
# Relative location of the downloaded UniProt ID mapping archive.
path = os.path.join('download', 'idmapping.dat.gz')
# Lazy row generator over that archive: nothing is read until it is iterated,
# and it can be consumed only once (re-run this cell to get a fresh one).
mapping_generator = generate_idmapping(path)

In [17]:
# ID_type values (second column of the mapping file) to pull out.
extract = {'GeneID', 'HGNC'}

# One independent set of (accession, target_id) pairs per requested ID type.
mappings = {}
for target in extract:
    mappings[target] = set()

# Single pass over the mapping rows, keeping only the requested ID types.
for accession, target, target_id in mapping_generator:
    if target in extract:
        mappings[target].add((accession, target_id))

In [18]:
# Write one gzip-compressed TSV per extracted ID type, with a header row
# followed by the sorted (uniprot accession, target id) pairs.
for target, mapset in mappings.items():
    path = os.path.join('data', 'map', '{}.tsv.gz'.format(target))
    # Create the output directory if it is missing instead of failing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Text-mode gzip handle managed by `with`: closing it flushes the text
    # layer before the gzip stream is finalised, so no buffered rows are
    # lost. newline='' stops the text layer from rewriting the csv module's
    # own line terminators.
    with gzip.open(path, 'wt', newline='') as write_file:
        writer = csv.writer(write_file, delimiter='\t')
        writer.writerow(['uniprot', target])
        writer.writerows(sorted(mapset))