{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 4-CreateDatasets\n", "This tutorial shows a basic template to create a dataset computationally." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import Row, SparkSession\n", "from mmtfPyspark.ml import pythonRDDToDataset\n", "from mmtfPyspark.io import mmtfReader" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Configure Spark Session and Spark Context" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "spark = SparkSession.builder.appName(\"4-CreateDatasets\").getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read a 10% fraction of the sample file\n", "Reading a random fraction of the input file is a good strategy to test some new functionality." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "path = \"../resources/mmtf_full_sample\"\n", "pdb = mmtfReader.read_sequence_file(path, fraction=0.1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Creating a dataset in 3 simple steps" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 1: calculate properties for a structure and add it to a Row object" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def calcProperties(s):\n", "    \"\"\"Turn one (pdb id, mmtf structure record) pair into a Row.\n", "\n", "    s[0] is the pdb id and s[1] is the mmtf structure record. The Row holds\n", "    the id followed by the record's num_models, num_chains, num_groups,\n", "    num_atoms and num_bonds values, in that order.\n", "    \"\"\"\n", "    pdb_id = s[0]\n", "    structure = s[1]\n", "    counts = (\n", "        structure.num_models,\n", "        structure.num_chains,\n", "        structure.num_groups,\n", "        structure.num_atoms,\n", "        structure.num_bonds,\n", "    )\n", "    return Row(pdb_id, *counts)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 2: map structures to rows\n", "Here we use a lambda expression to calculate properties." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# One Row per (pdb id, structure) record read above\n", "rows = pdb.map(lambda record: calcProperties(record))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Step 3: convert RDD of Rows to a dataset" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Column names, in the same order as the values placed in each Row\n", "col_names = [\n", "    \"pdbId\",\n", "    \"models\",\n", "    \"chains\",\n", "    \"groups\",\n", "    \"atoms\",\n", "    \"bonds\",\n", "]\n", "summary = pythonRDDToDataset.get_dataset(rows, col_names)\n", "# Alternative method (converts int to long):\n", "# summary = spark.createDataFrame(rows, col_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Done: Show some details about this dataset" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pdbId', 'models', 'chains', 'groups', 'atoms', 'bonds']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary.columns" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- pdbId: string (nullable = false)\n", " |-- models: integer (nullable = false)\n", " |-- chains: integer (nullable = false)\n", " |-- groups: integer (nullable = false)\n", " |-- atoms: integer (nullable = false)\n", " |-- bonds: integer (nullable = false)\n", "\n" ] } ], "source": [ "summary.printSchema()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----+------+------+------+-----+-----+\n", "|pdbId|models|chains|groups|atoms|bonds|\n", "+-----+------+------+------+-----+-----+\n", "| 1LBU| 1| 3| 443| 1793| 1602|\n", "| 1LC0| 1| 5| 700| 2731| 2358|\n", "| 1LC5| 1| 4| 628| 3056| 2848|\n", "| 1LFP| 1| 2| 593| 2275| 1958|\n", "| 1LFW| 1| 5| 1041| 4238| 3750|\n", "| 1LGH| 1| 68| 512| 5436| 5526|\n", "| 1LH0| 1| 8| 701| 3596| 3375|\n", "| 1LJ8| 1| 3| 930| 
4310| 3965|\n", "| 1LKI| 1| 2| 222| 1386| 1364|\n", "| 1LMI| 1| 2| 303| 1139| 989|\n", "| 1LML| 1| 3| 678| 3738| 3616|\n", "| 1LO7| 1| 5| 316| 1375| 1229|\n", "| 1LQ9| 1| 5| 483| 2006| 1794|\n", "| 1LQV| 1| 30| 862| 4048| 3695|\n", "| 1LR0| 1| 5| 251| 1100| 992|\n", "| 1LR5| 1| 16| 1379| 6071| 5531|\n", "| 1LRI| 1| 4| 199| 861| 777|\n", "| 1LRZ| 1| 2| 718| 3631| 3399|\n", "| 1LS1| 1| 6| 577| 5396| 5073|\n", "| 1LTS| 1| 14| 1034| 6271| 6091|\n", "+-----+------+------+------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# Print the leading rows of the dataset as a text table\n", "summary.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Print statistics for the numerical columns" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
summarymodelschainsgroupsatomsbonds
0count97569756975697569756
1mean1.00030750307503088.567343173431734699.7203772037723510.46340713407153252.99651496515
2stddev0.017533967885444047.177280313219018437.739004080501332140.1503691700672015.1217534374905
3min1121154144
4max2913026999510077
\n", "
" ], "text/plain": [ " summary models chains groups \\\n", "0 count 9756 9756 9756 \n", "1 mean 1.0003075030750308 8.567343173431734 699.720377203772 \n", "2 stddev 0.01753396788544404 7.177280313219018 437.73900408050133 \n", "3 min 1 1 21 \n", "4 max 2 91 3026 \n", "\n", " atoms bonds \n", "0 9756 9756 \n", "1 3510.4634071340715 3252.99651496515 \n", "2 2140.150369170067 2015.1217534374905 \n", "3 154 144 \n", "4 9995 10077 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Leave out the first column (pdbId) and describe the remaining ones\n", "numeric_cols = col_names[1:]\n", "summary.describe(numeric_cols).toPandas()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Stop the Spark session created at the top of the notebook\n", "spark.stop()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 2 }