# Solution-1
This tutorial shows how to find proteins for a specific organism, calculate protein-protein interactions, and visualize the results.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.interactions import InteractionFilter, InteractionFingerprinter
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
import py3Dmol

#### Configure Spark

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Solution-1")
    .getOrCreate()
)

## Find protein structures for mouse

For our first task, we need to run a taxonomy query using SIFTS data. [See examples](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/PDBMetaDataDemo.ipynb) and [SIFTS demo](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/SiftsDataDemo.ipynb)

To figure out how to query for taxonomy, the command below lists the first 10 entries of the SIFTS taxonomy table. As you can see, we can use the `scientific_name` field to query for a specific organism.

In [3]:
# Preview query: fetch only the first 10 rows of the SIFTS chain taxonomy table
# to see which columns are available for filtering.
taxonomy_query = "SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10"
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
taxonomy.show()

+-----+-----+------+--------------------+----------------+
|pdbid|chain|tax_id|     scientific_name|structureChainId|
+-----+-----+------+--------------------+----------------+
| 101M|    A|  9755|               PHYMC|          101M.A|
| 101M|    A|  9755|    Physeter catodon|          101M.A|
| 101M|    A|  9755|Physeter catodon ...|          101M.A|
| 101M|    A|  9755|Physeter macrocep...|          101M.A|
| 101M|    A|  9755|Physeter macrocep...|          101M.A|
| 101M|    A|  9755|         Sperm whale|          101M.A|
| 101M|    A|  9755|         sperm whale|          101M.A|
| 102L|    A| 10665|                BPT4|          102L.A|
| 102L|    A| 10665|    Bacteriophage T4|          102L.A|
| 102L|    A| 10665|Enterobacteria ph...|          102L.A|
+-----+-----+------+--------------------+----------------+



### TODO-1: specify a taxonomy query where the scientific name is 'Mus musculus'

In [4]:
# Select every chain-level taxonomy record whose scientific name is exactly 'Mus musculus'.
# taxonomy_query is reused further down to filter the structures.
taxonomy_query = "SELECT * FROM sifts.pdb_chain_taxonomy WHERE scientific_name = 'Mus musculus'"
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
# Print only the first 10 matching rows.
taxonomy.show(10)

+-----+-----+------+---------------+----------------+
|pdbid|chain|tax_id|scientific_name|structureChainId|
+-----+-----+------+---------------+----------------+
| 12E8|    H| 10090|   Mus musculus|          12E8.H|
| 12E8|    L| 10090|   Mus musculus|          12E8.L|
| 12E8|    M| 10090|   Mus musculus|          12E8.M|
| 12E8|    P| 10090|   Mus musculus|          12E8.P|
| 15C8|    H| 10090|   Mus musculus|          15C8.H|
| 15C8|    L| 10090|   Mus musculus|          15C8.L|
| 1914|    A| 10090|   Mus musculus|          1914.A|
| 1A0Q|    H| 10090|   Mus musculus|          1A0Q.H|
| 1A0Q|    L| 10090|   Mus musculus|          1A0Q.L|
| 1A14|    H| 10090|   Mus musculus|          1A14.H|
+-----+-----+------+---------------+----------------+
only showing top 10 rows



In [5]:
# Relative path to the MMTF sample data.
path = "../resources/mmtf_full_sample/"

# Read structures from the sequence file(s) at `path`; fraction=0.1 presumably loads
# a 10% sample of the entries — TODO confirm against the mmtfReader documentation.
pdb = mmtfReader.read_sequence_file(path, fraction=0.1)

### TODO-2: Take the taxonomy query from above and use it to filter the PDB structures

In [6]:
# Keep only the structures matched by the taxonomy query, and cache the
# filtered set because the following cells reuse it.
pdb = (
    pdb
    .filter(PdbjMineSearch(taxonomy_query))
    .cache()
)

## Calculate polymer-polymer interactions for this subset of structures
Find protein-protein interactions with a 6 Å distance cutoff

In [7]:
# Distance cutoff for an interaction; also passed to the viewer further down.
distance_cutoff = 6.0
# NOTE(review): minInteractions=10 presumably discards chain pairs with fewer
# than 10 interacting residues — confirm against the InteractionFilter docs.
interactionFilter = InteractionFilter(distance_cutoff, minInteractions=10)

# Compute the polymer-polymer interactions for the filtered structures and
# cache them, since several later cells read this DataFrame.
interactions = (
    InteractionFingerprinter
    .get_polymer_interactions(pdb, interactionFilter)
    .cache()
)

In [8]:
# Derive the PDB ID from structureChainId by keeping the text before the first '.'
# (e.g. "4M48.A" -> "4M48"); the viewer needs the bare PDB ID to fetch a structure.
interactions = interactions.withColumn("structureId", substring_index(interactions.structureChainId, '.', 1)).cache()
# Preview only: apply the limit on the Spark side so that just 10 rows are sent to
# the driver, instead of converting the entire DataFrame to pandas and then slicing.
interactions.limit(10).toPandas()

Unnamed: 0,structureChainId,queryChainId,targetChainId,groupNumbers,sequenceIndices,sequence,structureId
0,4M48.A,H,A,"[337, 338, 498, 501, 502, 503, 504, 505, 506, ...","[70, 274, 275, 435, 438, 439, 440, 441, 442, 4...",MNSISDERETWSGKVDFLLSVIGFAVDLANVWRFPYLCYKNGGGAF...,4M48
1,4M48.H,A,H,"[100, 101, 102, 103, 31, 33, 50, 52, 53, 54, 5...","[49, 51, 68, 70, 71, 72, 73, 74, 75, 77, 117, ...",MNFGLRLVFLVLILKGVQCEVQLVESGGGLVKPGGSLKLSCAASGF...,4M48
2,4M48.L,H,L,"[1, 100, 101, 115, 117, 118, 119, 120, 121, 12...","[22, 53, 54, 56, 58, 60, 63, 64, 65, 66, 67, 6...",MDFQVQIFSFLLISASVAMSRGENVLTQSPAIMSTSPGEKVTMTCR...,4M48
3,4M48.H,L,H,"[100, 101, 102, 103, 104, 105, 106, 107, 108, ...","[53, 55, 57, 60, 61, 62, 63, 64, 65, 68, 77, 7...",MNFGLRLVFLVLILKGVQCEVQLVESGGGLVKPGGSLKLSCAASGF...,4M48
4,4NN5.A,C,A,"[126, 127, 129, 130, 131, 132, 133, 134, 136, ...","[11, 14, 15, 16, 19, 20, 23, 28, 30, 31, 32, 3...",YNFSNCNFTSITKIYCNIIFHDLTGDLKGAKFEQIEDCESKPACLL...,4NN5
5,4NN5.C,A,C,"[106, 107, 108, 109, 110, 112, 113, 143, 144, ...","[16, 41, 42, 68, 69, 70, 71, 73, 74, 86, 87, 8...",AAAVTSRGDVTVVCHDLETVEVTWGSGPDHHGANLSLEFRYGTGAL...,4NN5
6,2QDQ.A,B,A,"[2496, 2497, 2498, 2500, 2501, 2502, 2504, 250...","[4, 5, 6, 8, 9, 10, 12, 13, 15, 16, 17, 19, 20...",GAMVGGIAQIIAAQEEMLRKERELEEARKKLAQIRQQQYKFLPSEL...,2QDQ
7,2QDQ.B,A,B,"[2497, 2498, 2500, 2501, 2504, 2505, 2507, 250...","[5, 6, 8, 9, 12, 13, 15, 16, 17, 19, 20, 22, 2...",GAMVGGIAQIIAAQEEMLRKERELEEARKKLAQIRQQQYKFLPSEL...,2QDQ
8,4P3A.C,D,C,"[698, 701, 702, 704, 705, 706, 708, 709, 710, ...","[21, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 3...",GANLHLLRQKIEEQAAKYKHSVPKKCCYDGARVNFYETCEERVARV...,4P3A
9,4P3A.D,C,D,"[698, 701, 702, 704, 705, 706, 708, 709, 710, ...","[21, 24, 25, 27, 28, 29, 31, 32, 33, 34, 35, 3...",GANLHLLRQKIEEQAAKYKHSVPKKCCYDGARVNFYETCEERVARV...,4P3A


## Visualize the protein-protein interactions

#### Extract id columns as lists (required for visualization)

In [9]:
# Collect all four columns in ONE Spark job so the resulting lists are guaranteed to be
# row-aligned: index i in every list describes the same interaction.
# Column names use the exact casing of the DataFrame schema ("queryChainId",
# "targetChainId"), so the lookup also works when spark.sql.caseSensitive is enabled.
_id_rows = interactions.select("structureId", "queryChainId", "targetChainId", "groupNumbers").collect()

structure_ids = [row["structureId"] for row in _id_rows]
query_chain_ids = [row["queryChainId"] for row in _id_rows]
target_chain_ids = [row["targetChainId"] for row in _id_rows]
# One entry per interaction; each entry is that row's full groupNumbers value.
target_groups = [row["groupNumbers"] for row in _id_rows]

Optionally disable the scrollbar for the visualization below (the cell is commented out; uncomment both lines to apply it)

In [10]:
#%%javascript 
#IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

#### Show protein-protein interactions within cutoff distance (query = orange, target = blue)

In [11]:
def view_protein_protein_interactions(structure_ids, query_chain_ids, target_chain_ids, target_groups, distance=4.5):
    """Browse protein-protein interactions in an interactive py3Dmol viewer.

    A slider selects one interaction at a time. The four sequences are indexed
    in parallel: position i of each one describes the same interaction.

    Parameters
    ----------
    structure_ids : sequence of str
        PDB IDs; each is loaded by py3Dmol through a 'pdb:<id>' query.
    query_chain_ids : sequence
        Chain IDs of the query chain (drawn in orange).
    target_chain_ids : sequence
        Chain IDs of the target chain (drawn in light blue).
    target_groups : sequence
        Per-interaction residue selection ('resi') on the target chain,
        drawn as sticks.
    distance : float, optional
        Query-chain atoms within this distance of the target chain are drawn
        as spheres. Defaults to 4.5.

    Returns
    -------
    The value returned by ``interact`` for the slider-driven viewer.
    """
    orange = {'colorscheme': 'orangeCarbon'}
    light_blue = {'colorscheme': 'lightblueCarbon'}

    def view3d(i=0):
        # Render interaction number i; called by the slider on every change.
        print(f"PDB: {structure_ids[i]}, query: {query_chain_ids[i]}, target: {target_chain_ids[i]}")

        query_chain = query_chain_ids[i]
        target_chain = target_chain_ids[i]

        # Selections reused by the styling calls below.
        target = {'chain': target_chain, 'resi': target_groups[i]}
        query_near_target = {
            'chain': query_chain,
            'within': {'distance': distance, 'sel': {'chain': target_chain}},
        }

        viewer = py3Dmol.view(query='pdb:' + structure_ids[i], width=600, height=600)
        # Apply an empty style first, then add the chain-specific styles.
        viewer.setStyle({})

        # Query chain: lines everywhere, spheres where it is close to the target.
        viewer.setStyle({'chain': query_chain}, {'line': orange})
        viewer.setStyle(query_near_target, {'sphere': orange})

        # Target chain: lines everywhere, sticks for the interacting residues.
        viewer.setStyle({'chain': target_chain}, {'line': light_blue})
        viewer.setStyle(target, {'stick': light_blue})
        viewer.zoomTo(target)

        return viewer.show()

    # The slider ranges over every valid index into the parallel lists.
    last_index = len(structure_ids) - 1
    s_widget = IntSlider(min=0, max=last_index, description='Structure', continuous_update=False)
    return interact(view3d, i=s_widget)

In [12]:
# Use the same cutoff for highlighting as was used to compute the interactions.
# The trailing ';' suppresses the cell's return-value output.
view_protein_protein_interactions(
    structure_ids,
    query_chain_ids,
    target_chain_ids,
    target_groups,
    distance=distance_cutoff,
);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=47), Output()),…

In [13]:
# Stop the Spark session created in the configuration cell now that the analysis is done.
spark.stop()