{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Solution-2\n", "Analyze the unit cell parameters of proteins and protein-protein complexes in the PDB." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import Row, SparkSession\n", "from mmtfPyspark.filters import ContainsLProteinChain\n", "from mmtfPyspark.ml import pythonRDDToDataset\n", "from mmtfPyspark.io import mmtfReader\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Configure Spark Session and Spark Context" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "spark = SparkSession.builder.appName(\"Solution-2\").getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Read a sample of the PDB" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "path = \"../resources/mmtf_full_sample\"\n", "pdb = mmtfReader.read_sequence_file(path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-1 Restrict the analysis to proteins only" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pdb = pdb.filter(ContainsLProteinChain())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove structures without unit cell data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "pdb = pdb.filter(lambda t: t[1].unit_cell != None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-2 Define method to create a Row with unit cell data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def calcProperties(s):\n", " structure_id = s[0]\n", " space_group = s[1].space_group\n", " a, b, c, alpha, beta, gamma = s[1].unit_cell\n", "\n", " return Row(structure_id, space_group, a, b, c, alpha, beta, gamma)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-3: Map structures to properties" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "rows = pdb.map(lambda s: calcProperties(s))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TODO-4: Create a dataset from the RDD" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "col_names = [\"structureId\", \"spaceGroup\", \"a\", \"b\", \"c\", \"alpha\", \"beta\", \"gamma\"]\n", "summary = pythonRDDToDataset.get_dataset(rows, col_names).cache()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Done: Show some details about this dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['structureId', 'spaceGroup', 'a', 'b', 'c', 'alpha', 'beta', 'gamma']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary.columns" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- structureId: string (nullable = false)\n", " |-- spaceGroup: string (nullable = false)\n", " |-- a: float (nullable = false)\n", " |-- b: float (nullable = false)\n", " |-- c: float (nullable = false)\n", " |-- alpha: float (nullable = false)\n", " |-- beta: float (nullable = false)\n", " |-- gamma: float (nullable = false)\n", "\n" ] } ], "source": [ "summary.printSchema()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
structureIdspaceGroupabcalphabetagamma
01LBUP 1 21 151.08000249.70000138.65000290.0100.59999890.0
11LC0P 21 21 2151.77999975.73000376.08000290.090.00000090.0
21LC5I 2 2 275.989998103.339996109.25000090.090.00000090.0
31LFPP 21 21 2143.79000165.34999873.77999990.090.00000090.0
41LFWP 21 21 2167.15100177.02500289.95500290.090.00000090.0
51LGHP 4 21 291.59999891.599998209.97000190.090.00000090.0
61LH0C 2 2 2105.570000154.22999652.59999890.090.00000090.0
71LJ8I 2 2 2102.096001103.188004106.19599990.090.00000090.0
81LKIP 21 21 2131.10000056.20000195.30000390.090.00000090.0
91LMIP 65 2 243.12900243.129002228.79800490.090.000000120.0
\n", "
" ], "text/plain": [ " structureId spaceGroup a b c alpha \\\n", "0 1LBU P 1 21 1 51.080002 49.700001 38.650002 90.0 \n", "1 1LC0 P 21 21 21 51.779999 75.730003 76.080002 90.0 \n", "2 1LC5 I 2 2 2 75.989998 103.339996 109.250000 90.0 \n", "3 1LFP P 21 21 21 43.790001 65.349998 73.779999 90.0 \n", "4 1LFW P 21 21 21 67.151001 77.025002 89.955002 90.0 \n", "5 1LGH P 4 21 2 91.599998 91.599998 209.970001 90.0 \n", "6 1LH0 C 2 2 2 105.570000 154.229996 52.599998 90.0 \n", "7 1LJ8 I 2 2 2 102.096001 103.188004 106.195999 90.0 \n", "8 1LKI P 21 21 21 31.100000 56.200001 95.300003 90.0 \n", "9 1LMI P 65 2 2 43.129002 43.129002 228.798004 90.0 \n", "\n", " beta gamma \n", "0 100.599998 90.0 \n", "1 90.000000 90.0 \n", "2 90.000000 90.0 \n", "3 90.000000 90.0 \n", "4 90.000000 90.0 \n", "5 90.000000 90.0 \n", "6 90.000000 90.0 \n", "7 90.000000 90.0 \n", "8 90.000000 90.0 \n", "9 90.000000 120.0 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary.toPandas().head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Group data by space group and count occurances" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
spaceGroupcount
0P 21 21 212211
1P 1 21 11595
2C 1 2 11076
3P 21 21 2558
4C 2 2 21544
5P 1417
6P 41 21 2324
7P 43 21 2313
8P 32 2 1281
9P 31 2 1262
\n", "
" ], "text/plain": [ " spaceGroup count\n", "0 P 21 21 21 2211\n", "1 P 1 21 1 1595\n", "2 C 1 2 1 1076\n", "3 P 21 21 2 558\n", "4 C 2 2 21 544\n", "5 P 1 417\n", "6 P 41 21 2 324\n", "7 P 43 21 2 313\n", "8 P 32 2 1 281\n", "9 P 31 2 1 262" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = summary.groupBy(\"spaceGroup\")\\\n", " .count()\\\n", " .sort(\"count\", ascending=False)\\\n", " .toPandas()\n", "\n", "df.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot histogram for the top 10 space groups" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAE1CAYAAADuwDd5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAHHpJREFUeJzt3XucXGWd5/HPlxCIQpBAMuGSaBDjQphcYEJEGRaSIAS8BEdAgYWISHQX5DK+djeKDhlFF0XZDMhlmRfhoggCCmSRAbMQlKuQcMllIhLZAB0ChMRFlGFI5Ld/1FNJpdKV7nRXnzqp5/t+verVVc85dc6vuqvrW+ec5zlHEYGZmeVnm1YXYGZmreEAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMrVtqwvYnMGDB8eIESNaXYaZ2VZlwYIFr0XEkK7mK3UAjBgxgvnz57e6DDOzrYqk57szn3cBmZllygFgZpYpB4CZWaZKfQzAzKwra9eupaOjg7feeqvVpRRuwIABDBs2jP79+/fo+Q4AM9uqdXR0MHDgQEaMGIGkVpdTmIhg9erVdHR0sNdee/VoGd4FZGZbtbfeeotdd901qw9/AEnsuuuuvdrycQCY2VYvtw//qt6+bgeAmVmmtvpjACNm/KLXy1h+4ceaUImZlUEzPhNqleHzYdasWUyfPp13v/vdTV2utwDMzEpu1qxZvPnmm01frgPAzKwJrr/+esaMGcPYsWM5+eSTWb58OZMmTWLMmDFMnjyZF154AYDPfe5z3Hrrreuft+OOOwJw//33c9hhh3Hssceyzz77cNJJJxERXHLJJbz00ktMnDiRiRMnNrXmrX4XkJlZqy1ZsoQLLriAhx9+mMGDB7NmzRqmTZu2/jZ79mzOOussbr/99s0u58knn2TJkiXsscceHHzwwTz00EOcddZZXHzxxcybN4/Bgwc3tW5vAZiZ9dJ9993Hcccdt/4DepddduGRRx7hxBNPBODkk0/mwQcf7HI5EyZMYNiwYWyzzTaMGzeO5cuX92XZDgAzsyJtu+22vPPOOwC88847vP322+unbb/99uvv9+vXj3Xr1vVpLQ4AM7NemjRpErfccgurV68GYM2aNXzkIx/hpptuAuCGG27gkEMOASqnuV+wYAEAc+bMYe3atV0uf+DAgbzxxhtNr9vHAMysrbSi2+Z+++3Heeedx6GHHkq/fv3Yf//9ufTSSzn11FO56KKLGDJkCNdccw0Ap59+OlOnTmXs2LFMmTKFHXbYocvlT58+nSlTprDHHnswb968ptWtiGjawppt/Pjx0dUFYTwOwCxvS5cuZd999211GS3T2euXtCAixnf1XO8CMjPLlAPAzCxTDgAz2+qVeVd2X+rt63YAmNlWbcCAAaxevTq7EKheD2DAgAE9XoZ7AZnZVm3YsGF0dHSwatWqVpdSuOoVwXrKAWBmW7X+/fv3+IpYufMuIDOzTDkAzMwy1WUASBouaZ6kf5W0RNLZqX0XSXMlPZt+DkrtknSJpGWSFko6oGZZ09L8z0qa1ncvy8zMutKdLYB1wFciYhRwEHCGpFHADODeiBgJ3JseAxwFjEy36cAVUAkM4HzgQ8AE4PxqaJiZWfG6DICIWBkRT6T7bwBLgT2BqcB1abbrgGPS/anA9VHxKLCzpN2BI4G5EbEmIv4AzAWmNPXVmJlZt23RMQBJI4D9gd8AQyNiZZr0MjA03d8TeLHmaR2prVF7/TqmS5ovaX6O3brMzIrS7QCQtCPwM+CciPhj7bSojMBoyiiMiLgqIsZHxPghQ4Y0Y5FmZtaJbgWApP5UPvxviIifp+ZX0q4d0s9XU/sKYHjN04eltkbtZmbWAt3pBSTgamBpRFxcM2kOUO3JMw24o6b9lNQb6CDg9bSr6B7gCEmD0sHfI1KbmZm1QHdGAh8MnAwskvRUavsacCFws6TTgOeB49O0u4CjgWXAm8CpABGxRtK3gMfTfN+MiDVNeRVmZrbFugyAiHgQUIPJkzuZP4AzGixrNjB7Swo0M7O+4ZHAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZprZtdQFtYeZ7mrCM13u/DDOzLeAtADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTHUZAJJmS3pV0uKatpmSVkh6Kt2Orpn2VUnLJD0j6cia9impbZmkGc1/KWZmtiW6swVwLTClk/b/GRHj0u0uAEmjgM8C+6XnXC6pn6R+wGXAUcAo4IQ0r5mZtUiX4wAi4teSRnRzeVOBmyLi34H/K2kZMCFNWxYRzwFIuinN+69bXLGZmTVFb44BnClpYdpFNCi17Qm8WDNPR2pr1L4JSdMlzZc0f9WqVb0oz8zMNqenAXAFsDcwDlgJ/KBZBUXEVRExPiLGDxkypFmLNTOzOj06FUREvFK9L+mfgTvTwxXA8JpZh6U2NtNuZmYt0KMtAEm71zz8FFDtITQH+Kyk7SXtBYwEHgMeB0ZK2kvSdlQOFM/pedlmZtZbXW4BSLoROAwYLKkDOB84TNI4IIDlwBcBImKJpJupHNxdB5wREX9JyzkTuAfoB8yOiCVNfzVmZtZt3ekFdEInzVdvZv5vA9/upP0u4K4tqs7MzPqMRwKbmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWVq21YXYM0z+rrRvV7GommLmlCJmW0NvAVgZpYpB4CZWaYcAGZmmeoyACTNlvSqpMU1bbtImivp2fRzUGqXpEskLZO0UNIBNc+ZluZ/VtK0vnk5ZmbWXd3ZArgWmFLXNgO4NyJGAvemxwBHASPTbTpwBVQCAzgf+BAwATi/GhpmZtYaXQZARPwaWFPXPBW4Lt2/Djimpv36qHgU2FnS7sCRwNyIWBMRfwDmsmmomJlZgXp6DGBoRKxM918Ghqb7ewIv1szXkdoatZuZWYv0+iBwRAQQTagFAEnTJc2XNH/VqlXNWqyZmdXpaQC8knbtkH6+mtpXAMNr5huW2hq1byIiroqI8RExfsiQIT0sz8zMutLTAJgDVHvyTAPuqGk/JfUGOgh4Pe0qugc4QtKgdPD3iNRmZmYt0uWpICTdCBwGDJbUQaU3z4XAzZJOA54Hjk+z3wUcDSwD3gROBYiINZK+BTye5vtmRNQfWDYzswJ1GQARcUKDSZM7mTeAMxosZzYwe4uqMzOzPuORwGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaa2bXUB1l6W7rNvr5ex72+XNqESM+uKA8Da0mVfuq9Xzz/jyklNqsSsvHq1C0jSckmLJD0laX5q20XSXEnPpp+DUrskXSJpmaSFkg5oxgswM7OeacYxgIkRMS4ixqfHM4B7I2IkcG96DHAUMDLdpgNXNGHdZmbWQ31xEHgqcF26fx1wTE379VHxKLCzpN37YP1mZtYNvQ2AAH4paYGk6altaESsTPdfBoam+3sCL9Y8tyO1bUTSdEnzJc1ftWpVL8szM7NGensQ+G8jYoWkvwLmSvpt7cSICEmxJQuMiKuAqwDGjx+/Rc81M7Pu69UWQESsSD9fBW4DJgCvVHftpJ+vptlXAMNrnj4stZmZWQv0OAAk7SBpYPU+cASwGJgDTEuzTQPuSPfnAKek3kAHAa/X7CoyM7OC9WYX0FDgNknV5fwkIu6W9Dhws6TTgOeB49P8dwFHA8uAN4FTe7FuMzPrpR4HQEQ8B4ztpH01MLmT9gDO6On6zMysuXwuIDOzTDkAzMwy5XMBmfWRH3zm471exld+emcTKjHrnLcAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTHkgmFmb65jxQK+eP+zCQ5pUiZWNA8DM+tzMmTNLsQzbmHcBmZllygFgZpYp7wIys2zce9/evXr+5Em/b1Il5eAtADOzTHkLwMysQLvNe6rXy3h54rgmVOItADOzbDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFOFB4CkKZKekbRM0oyi129mZhWFBoCkfsBlwFHAKOAESaOKrMHMzCqK3gKYACyLiOci4m3gJmBqwTWYmRmgiChuZdKxwJSI+EJ6fDLwoYg4s2ae6cD09PA/AM/0crWDgdd6uYxmKEMdZagBylFHGWqActRRhhqgHHWUoQbofR3vi4ghXc20bS9W0Cci4irgqmYtT9L8iBjfrOVtzXWUoYay1FGGGspSRxlqKEsdZaihyDqK3gW0Ahhe83hYajMzs4IVHQCPAyMl7SVpO+CzwJyCazAzMwreBRQR6ySdCdwD9ANmR8SSPl5t03Yn9VIZ6ihDDVCOOspQA5SjjjLUAOWooww1QEF1FHoQ2MzMysMjgc3MMuUAMDPLlAPAzCxTDgDrU5J2krR3J+1jWlGPgaT+nbQNbkUt1lpZBICkUhzZl7SoBDWcWuC6jgd+C/xM0hJJB9ZMvraoOhqR9C+trgFA0j8UtJ6JkjqAlZJ+KWlEzeRfFlFDqmO0pEclvSjpKkmDaqY9VlQdjZTofdHndZRuJHBPSdql0STg6ALr+LvN1LFbUXVsxj8C1xS0rq8BfxMRKyVNAH4k6asRcRuV30efk3RAo0nAuCJq6IYvAN8sYD3fA46MiCXptCxzJZ0cEY9S0N8juQKYCTxK5bU/KOmTEfF7YJOtk75QlvdFq+tomwAAVgHPs/EbOdLjvyqwjp8CN6R11xtQRAGSFjaaBAwtooakX0SsBIiIxyRNBO6UNJzOfz994XHgV3T+AbdzQTUg6Y+NJgHvKqiM7arjbiLiVklLgZ9L+u8U9/cAGBgRd6f735e0ALg7nRssq/dFq+topwB4DpgcES/UT5D0YoF1LAS+HxGLO6nj8IJqGAocCfyhvgTg4YJqAHhD0t7pmx1pS+Aw4HZgv4JqWAp8MSKerZ9Q8Pvi/wEHRsQrLaxjraTdIuJlgLQlMBm4E9jkOE1fkvSeiHg91TFP0qeBnwGNtuSbrSzvi5bW0U7HAGYBgxpM+16BdZwDNPq296mCargT2DEinq+7LQfuL6gGgP9M3TebiHgDmAJ8vqAaZtL4ff7lgmoAuB54X4NpPymohhnUbQFGRAdwKHBhQTUAfBfYt66OhcBk4OcF1TCTcrwvWlqHRwKbmWWqnbYA3OXQOiVpH0mTJe1Y1z6lVTWZlUHbBEDZuxxaa0g6C7iDyub0Ykm1V6D7TmuqMiuHdjoI3PIuh1ZKp1N5X/wp9Xu/VdKIiPgn/L6wzLXNFgB1XQ6BicDX0zfAUhzoKHIQVtkVONhmm4j4E0A6CH4YcJSkiykwACQNkHSOpB9K+qKkUn35KtFgyUIGxaV1lWKXcSvraKcAeKP2l5jC4DAqF50vqsthV/6xiJVIGi7pJkkPSPpa7dB/SbcXUUNa1wENbn9DcYNtXpG0fl0pDD5O5ZqrowuqAeA6YDywCDgK+EGB6wYqgyUb3HalwMGSXfhCESspyy7jVtdRqm8hvdRpl8N0oO/4ooooySCs2VT6VD8KnAb8StInImI1jbsi9oUyDLY5BVhX2xAR64BTJP2vgmoAGBURowEkXQ204pQHpRgsWZJBcWXZZdzSOtomACLi6Qbta6mMzC1KGQZhDYmIK9P9L0v6T8CvJX2SYneHtXywTern3mjaQ0XUkKytWe86qSWHH8oyWLIMg+LKMEq95XW0TQCUSHUQ1lP1EyTdX1AN/SUNiIi3ACLix5JepnIpzh0KqgHKM9imDMbWfPMV8K70WEBExE4F1FAdLLlJAFDsYMnqoLhNAoDiBsWVYZR6y+vwQLA2JOlc4ImI+FVd+/7A9yLio62pzKwcJI0F/hwRy+ra+wPHR0Qhew1aXYcDwPqUpH2APYHfVHvjpPYpNScEs4JI2onKLsLf17WPSadjKLKW/mkXbW3b4Ih4rcg6ctZOvYAaKrDLIZLGqMTnOi9SGQZhlaVHVBm0usdJTR0TVYLrElgbBUBJuhwCXE5l3/do4HdUznVe7Z5ayLnOS6Q6COsYKl1yvyHp7DStqKOgs6mcAO/LwO5UekTtmqYV2SOqDKo9TsYBp1LpcVI9QWGRR6Wr1yUYDFxF5boEB7Wgjuy100HgMnQ5hHKc67wsNhqElQ5u3SrpfRT3j16WHlFlUJaeL2W5LkH22mYLgA1dDifW34BC9ylKek/1fkTMAz4N/IgSfONUsaORyzAIq7+k9RfiiYgfA2dT6RG1e0E1lEVZBkuulbT+6ngpDCZT2XIeWUQBafTt/5D0I0kn1k27vIgaulLEruu2OQisyiXuFkXEM51MOyYiCtnfm95Mz6XL7NW2vxf4RkScXkQdjUh6ISLeW9C6hgHrqhcgqZt2cBH98N0jaoNW9zipWd/hwKr6sTvpi9OZEfHtAmr4GfAslcGSn6cyTuPEiPh3SU9ERKNLNTa7js1dEvLOiOjTLyltEwC2QRejkT8YEdsXWY9Z2Uh6Kh0LqT4+j8rpMD4JzC0wAP5C413XB0VEn46MbqdjAO5yuEEZRiObldn2kraJiHcAIuLbklYAvwZ23PxTm8qXhGyGMnQ5LJGyXBLSrKz+NzCptiEirgW+ArxdYB0z8SUhe0/SIuDDted9B34UEf8k6cmI2L+lBZqZlUzbbAFQkvO+b44KPNd52aXALmpdviRkFwoeLLmbpCskXSZpV0kzJS2SdLOkQnpmqeJ4Scel+5MlXSLpv0gq7HOx1XW00xbAfcDf156ETZWLbswGToqIfi0rbkM9hfXAKQNJf9doEnBlRAwpoIazgDOo7GsdB5wdEXekaYX19iiDVvc4qanjbuAXVE5MeCKVs/X+BDgGODwipm7m6c2q4XIqp8DeDvgjsD0wB/gY8EpEnL2Zp7dNHe0UAC3vcpjWtdlznUdEWx143xxJ1VNxd/YmOzYiBhZQg3cNJq3ucVJTx/rfe/2XovreOX1Yw6KIGJ26wL4M7B4Rb6cvjU9ERCFXBWt1HW3zYVSi8763/Fznkj4ADK1/3ZIOBl6uPxFYH1oIfD8iFndS4+EF1VCG0chl0fLrMyS1uzau38y0vrQOKtcLkfR4RLydHq+T9E5BNbS8jnY6BlAW1XOdd6aoc53PorI5We+PaVpRzmlQB8CnGrQ3WxlGI5fFTMpxfYY7qsdjIuLr1cb0xeV3BdXwck0N648FpRHKRfYCamkdbbMLyDZI3yQObDBtUaRLE+agLLsGbesgaQdgh4h4NYc6vAXQnjZ38ruirrkKgKQjJZ2mjU/5i6TPF7H+iOjo7MM/Tcv+wz91nmi5stQREX9u9Yd/kXVksQWQ4bfeG4H7IuKf69q/AHw0Ij5TUB3fAf4WeAL4BDArIi5N07LqgVMGnZwiRMAHgWcACjzwWYo6rI0OAnfR5XC3BtPa1TnAbZJOAhaktvFUupoVte8dKh/6+6cDWjOBn0h6f0ScS34HYMtgOZVjMhcA/0blb/AAlb9TjnVkr222AMrQ5TDVMQD4EvABYBFwdUSsK2LdndQyEfjr9HBJRBS6mS1paUTsW/O4H5ULgOwEjIqIvr/odXl6RJWCKheAOZdK76w5kp6LiPfnVkdZ/k9bXUc7BcACYFqDLocvRsTwgur4KZVTyz4AHAU8X9SgkrKRdCdwUSenYr4A+FpE9P1Ix0oNX42IRXXto4HvRER23zrTAcZvAXtTuULYsNzqKMv/aavraKcAOITKL++FTqaNj4j5BdWx/nhDGszxWK77uiW9CyAi/q2TaXtGxIoCanCPqAZUuT7Ah2PDFdOyqaMs/6etrqNtjgFExAObmVbIh3+ytma966R8d3V39sFfM63PP/yT0vSIKpuoXJDl6S5nbM86yvJ/2tI62mYLACpdDoFhwL3phHDV9s9HxOyCavgL8OfqQyofMm+m+xEROxVRh1WUpUeUlUtZ/k9bXUfbBIC7HFpnJA0FbqMyqnKTHlGNxgiY5aCdAmARG7oc7kzltAvPRMS5uZ30qyxa3cOhrpaW9ogqg7L8PcpSh7VXALS8y6FtrNU9HGxjZfl7lKUOa68AaHmXQ9tYq3s42MbK8vcoSx3WXucCOg54rL4xnW2wkDEAtomNeji0shADyvP3KEsd2WubLQArn1b3cLCNleXvUZY6zAFgZpatdtoFZGZmW6BtRgK7a5mZ2ZZpm11A7lpmZrZl2ikA3LXMzGwLtNMxAHctMzPbAu20BeCuZWZmW6BtAsDMzLZMO+0CMjOzLeAAMDPLlAPAzCxTDgCzHpK0raTvSHpW0lPpdl6r6zLrLgeAWc9dAOwBjI6IccAhQP/6mVTh/zUrHb8prS1I2kHSLyQ9LWmxpM9IWi7pe5IWSXpM0gfSvJ+Q9BtJT0r6P+mykUjaUdI1af6Fkj6d2o+Q9IikJyTdkuZ7N3A68OWIeAsgIt6IiJnpOSMkPSPpemAxMFzSCWnZiyV9t6b2P9XcP1bSten+tZKulDRf0u8kfbyI36XlwwFg7WIK8FJEjI2IvwbuTu2vpxHiPwRmpbYHgYPSZUJvAv5bav9Gdf6IGAPcJ2kw8HXg8DSyfD7w91TOOfVCRLyxmZpGApenq9GtBb4LTALGAQdKOqYbr2sEMAH4GHBlOueVWVM4AKxdLAI+Kum7kg6JiNdT+401Pz+c7g8D7knXkf6vQPVyoYcDl1UXGBF/AA4CRgEPSXoKmAa8r37lkk5NxwBelFS9ANHzEfFoun8gcH9ErEoj1W8A/mM3XtfNEfFORDwLPAfs043nmHWLA8DaQkT8DjiAShBcIOkfqpNqZ0s/LwV+mLYMvghs7lu1gLkRMS7dRkXEacAy4L2SBqb1X5OOA7wO9EvP/XPni9y0/Jr79bXUj9T0yE1rGgeAtQVJewBvRsSPgYuohAHAZ2p+PpLuvwdYke5Pq1nMXOCMmmUOAh4FDq45frCDpA9GxJvA1cAPq7tlJPUDtmtQ4mPAoZIGp/lOAKrXr35F0r7pQPGn6p53nKRtJO0NvB94phu/DrNucQBYuxgNPJZ205xPpYcOwCBJC4GzgXNT20zgFkkLgNdqlnFBmn+xpKeBiRGxCvgccGNaziNs2A1zHrASWCzpSSqnIr8OeKm+uIhYCcwA5gFPAwsi4o40eQZwJ/BwWl6tF6iEx78AX6oecDZrBp8LyNqWpOXA+Ih4rat5yyj1BrozIm5tdS3WnrwFYGaWKW8BmJllylsAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZ+v8TLtRE3nWJgAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df.head(10).plot(x='spaceGroup', y='count', kind='bar');" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }