{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4-CreateDatasets\n",
    "This tutorial shows a basic template to create a dataset computationally."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import Row, SparkSession\n",
    "from mmtfPyspark.ml import pythonRDDToDataset\n",
    "from mmtfPyspark.io import mmtfReader"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Configure Spark Session and Spark Context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder.appName(\"4-CreateDatasets\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read a 10% fraction of the sample file\n",
    "Reading a random fraction of the input file is a good strategy to test some new functionality."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"../resources/mmtf_full_sample\"\n",
    "pdb = mmtfReader.read_sequence_file(path, fraction=0.1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating a dataset in 3 simple steps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: calculate properties for a structure and add it to a Row object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calcProperties(s):\n",
    "    # s[0] pdb id\n",
    "    # s[1] mmtf structure record\n",
    "    return Row(s[0], s[1].num_models, s[1].num_chains, s[1].num_groups, s[1].num_atoms, s[1].num_bonds)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: map structures to rows\n",
    "Here we use a lambda expression to calculate properties."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = pdb.map(lambda s: calcProperties(s))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: convert RDD of Rows to a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "col_names = [\"pdbId\", \"models\", \"chains\", \"groups\", \"atoms\", \"bonds\"]\n",
    "summary = pythonRDDToDataset.get_dataset(rows, col_names) \n",
    "# summary = spark.createDataFrame(rows, col_names) # alternative method, converts int to long"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Done: Show some details about this dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pdbId', 'models', 'chains', 'groups', 'atoms', 'bonds']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- pdbId: string (nullable = false)\n",
      " |-- models: integer (nullable = false)\n",
      " |-- chains: integer (nullable = false)\n",
      " |-- groups: integer (nullable = false)\n",
      " |-- atoms: integer (nullable = false)\n",
      " |-- bonds: integer (nullable = false)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "summary.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+------+------+------+-----+-----+\n",
      "|pdbId|models|chains|groups|atoms|bonds|\n",
      "+-----+------+------+------+-----+-----+\n",
      "| 1LBU|     1|     3|   443| 1793| 1602|\n",
      "| 1LC0|     1|     5|   700| 2731| 2358|\n",
      "| 1LC5|     1|     4|   628| 3056| 2848|\n",
      "| 1LFP|     1|     2|   593| 2275| 1958|\n",
      "| 1LFW|     1|     5|  1041| 4238| 3750|\n",
      "| 1LGH|     1|    68|   512| 5436| 5526|\n",
      "| 1LH0|     1|     8|   701| 3596| 3375|\n",
      "| 1LJ8|     1|     3|   930| 4310| 3965|\n",
      "| 1LKI|     1|     2|   222| 1386| 1364|\n",
      "| 1LMI|     1|     2|   303| 1139|  989|\n",
      "| 1LML|     1|     3|   678| 3738| 3616|\n",
      "| 1LO7|     1|     5|   316| 1375| 1229|\n",
      "| 1LQ9|     1|     5|   483| 2006| 1794|\n",
      "| 1LQV|     1|    30|   862| 4048| 3695|\n",
      "| 1LR0|     1|     5|   251| 1100|  992|\n",
      "| 1LR5|     1|    16|  1379| 6071| 5531|\n",
      "| 1LRI|     1|     4|   199|  861|  777|\n",
      "| 1LRZ|     1|     2|   718| 3631| 3399|\n",
      "| 1LS1|     1|     6|   577| 5396| 5073|\n",
      "| 1LTS|     1|    14|  1034| 6271| 6091|\n",
      "+-----+------+------+------+-----+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "summary.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Print statistics for the numerical columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>summary</th>\n",
       "      <th>models</th>\n",
       "      <th>chains</th>\n",
       "      <th>groups</th>\n",
       "      <th>atoms</th>\n",
       "      <th>bonds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>count</td>\n",
       "      <td>9756</td>\n",
       "      <td>9756</td>\n",
       "      <td>9756</td>\n",
       "      <td>9756</td>\n",
       "      <td>9756</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>mean</td>\n",
       "      <td>1.0003075030750308</td>\n",
       "      <td>8.567343173431734</td>\n",
       "      <td>699.720377203772</td>\n",
       "      <td>3510.4634071340715</td>\n",
       "      <td>3252.99651496515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>stddev</td>\n",
       "      <td>0.01753396788544404</td>\n",
       "      <td>7.177280313219018</td>\n",
       "      <td>437.73900408050133</td>\n",
       "      <td>2140.150369170067</td>\n",
       "      <td>2015.1217534374905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>min</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>21</td>\n",
       "      <td>154</td>\n",
       "      <td>144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>max</td>\n",
       "      <td>2</td>\n",
       "      <td>91</td>\n",
       "      <td>3026</td>\n",
       "      <td>9995</td>\n",
       "      <td>10077</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  summary               models             chains              groups  \\\n",
       "0   count                 9756               9756                9756   \n",
       "1    mean   1.0003075030750308  8.567343173431734    699.720377203772   \n",
       "2  stddev  0.01753396788544404  7.177280313219018  437.73900408050133   \n",
       "3     min                    1                  1                  21   \n",
       "4     max                    2                 91                3026   \n",
       "\n",
       "                atoms               bonds  \n",
       "0                9756                9756  \n",
       "1  3510.4634071340715    3252.99651496515  \n",
       "2   2140.150369170067  2015.1217534374905  \n",
       "3                 154                 144  \n",
       "4                9995               10077  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary.describe(col_names[1:]).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}