{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Solution-2\n", "Analyze the unit cell parameters of proteins and protein-protein complexes in the PDB." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import Row, SparkSession\n", "from mmtfPyspark.filters import ContainsLProteinChain\n", "from mmtfPyspark.ml import pythonRDDToDataset\n", "from mmtfPyspark.io import mmtfReader\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Configure Spark Session and Spark Context" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "spark = SparkSession.builder.appName(\"Solution-2\").getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Read a sample of the PDB" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "path = \"../resources/mmtf_full_sample\"\n", "pdb = mmtfReader.read_sequence_file(path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-1 Restrict the analysis to proteins only" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pdb = pdb.filter(ContainsLProteinChain())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove structures without unit cell data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Identity test: `!= None` would dispatch to the value's own __ne__ and may not yield a plain bool\n", "pdb = pdb.filter(lambda t: t[1].unit_cell is not None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-2 Define method to create a Row with unit cell data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def calcProperties(s):\n", " \"\"\"Build a Row of unit cell data from one (structure id, structure) pair.\n", "\n", " s[0] is used as the structure id; s[1] must expose `space_group` and a\n", " `unit_cell` that unpacks into exactly six values.\n", "\n", " Returns a positional Row: structure id, space group, a, b, c, alpha, beta, gamma.\n", " \"\"\"\n", " structure_id = s[0]\n", " space_group = s[1].space_group\n", " # Unpacking raises if unit_cell is None or does not hold exactly six values\n", " a, b, c, alpha, beta, gamma = s[1].unit_cell\n", "\n", " return Row(structure_id, space_group, a, b, c, alpha, beta, gamma)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "### TODO-3: Map structures to properties" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "rows = pdb.map(lambda s: calcProperties(s))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-4: Create a dataset from the RDD" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "col_names = [\"structureId\", \"spaceGroup\", \"a\", \"b\", \"c\", \"alpha\", \"beta\", \"gamma\"]\n", "summary = pythonRDDToDataset.get_dataset(rows, col_names).cache()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Done: Show some details about this dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['structureId', 'spaceGroup', 'a', 'b', 'c', 'alpha', 'beta', 'gamma']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary.columns" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- structureId: string (nullable = false)\n", " |-- spaceGroup: string (nullable = false)\n", " |-- a: float (nullable = false)\n", " |-- b: float (nullable = false)\n", " |-- c: float (nullable = false)\n", " |-- alpha: float (nullable = false)\n", " |-- beta: float (nullable = false)\n", " |-- gamma: float (nullable = false)\n", "\n" ] } ], "source": [ "summary.printSchema()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureIdspaceGroupabcalphabetagamma
01LBUP 1 21 151.08000249.70000138.65000290.0100.59999890.0
11LC0P 21 21 2151.77999975.73000376.08000290.090.00000090.0
21LC5I 2 2 275.989998103.339996109.25000090.090.00000090.0
31LFPP 21 21 2143.79000165.34999873.77999990.090.00000090.0
41LFWP 21 21 2167.15100177.02500289.95500290.090.00000090.0
51LGHP 4 21 291.59999891.599998209.97000190.090.00000090.0
61LH0C 2 2 2105.570000154.22999652.59999890.090.00000090.0
71LJ8I 2 2 2102.096001103.188004106.19599990.090.00000090.0
81LKIP 21 21 2131.10000056.20000195.30000390.090.00000090.0
91LMIP 65 2 243.12900243.129002228.79800490.090.000000120.0
\n", "
" ], "text/plain": [ " structureId spaceGroup a b c alpha \\\n", "0 1LBU P 1 21 1 51.080002 49.700001 38.650002 90.0 \n", "1 1LC0 P 21 21 21 51.779999 75.730003 76.080002 90.0 \n", "2 1LC5 I 2 2 2 75.989998 103.339996 109.250000 90.0 \n", "3 1LFP P 21 21 21 43.790001 65.349998 73.779999 90.0 \n", "4 1LFW P 21 21 21 67.151001 77.025002 89.955002 90.0 \n", "5 1LGH P 4 21 2 91.599998 91.599998 209.970001 90.0 \n", "6 1LH0 C 2 2 2 105.570000 154.229996 52.599998 90.0 \n", "7 1LJ8 I 2 2 2 102.096001 103.188004 106.195999 90.0 \n", "8 1LKI P 21 21 21 31.100000 56.200001 95.300003 90.0 \n", "9 1LMI P 65 2 2 43.129002 43.129002 228.798004 90.0 \n", "\n", " beta gamma \n", "0 100.599998 90.0 \n", "1 90.000000 90.0 \n", "2 90.000000 90.0 \n", "3 90.000000 90.0 \n", "4 90.000000 90.0 \n", "5 90.000000 90.0 \n", "6 90.000000 90.0 \n", "7 90.000000 90.0 \n", "8 90.000000 90.0 \n", "9 90.000000 120.0 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary.toPandas().head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Group data by space group and count occurrences" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
spaceGroupcount
0P 21 21 212211
1P 1 21 11595
2C 1 2 11076
3P 21 21 2558
4C 2 2 21544
5P 1417
6P 41 21 2324
7P 43 21 2313
8P 32 2 1281
9P 31 2 1262
\n", "
" ], "text/plain": [ " spaceGroup count\n", "0 P 21 21 21 2211\n", "1 P 1 21 1 1595\n", "2 C 1 2 1 1076\n", "3 P 21 21 2 558\n", "4 C 2 2 21 544\n", "5 P 1 417\n", "6 P 41 21 2 324\n", "7 P 43 21 2 313\n", "8 P 32 2 1 281\n", "9 P 31 2 1 262" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = summary.groupBy(\"spaceGroup\")\\\n", " .count()\\\n", " .sort(\"count\", ascending=False)\\\n", " .toPandas()\n", "\n", "df.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot a bar chart of the top 10 space groups" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df.head(10).plot(x='spaceGroup', y='count', kind='bar');" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }