{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting google-cloud-bigquery\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/72/e1/1ae3f8024e1d011bc567d54ec81e8c9afd08d107a326bd109e578475415d/google_cloud_bigquery-1.6.0-py2.py3-none-any.whl (83kB)\n", "\u001b[K 100% |████████████████████████████████| 92kB 4.7MB/s ta 0:00:011\n", "\u001b[?25hCollecting google-api-core<2.0.0dev,>=1.0.0 (from google-cloud-bigquery)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e4/3a/f9a5746a4d1c03e4ae6d4fdea0b4275cd80320dde7b2a44439cf9e913e33/google_api_core-1.5.0-py2.py3-none-any.whl (62kB)\n", "\u001b[K 100% |████████████████████████████████| 71kB 8.9MB/s ta 0:00:011\n", "\u001b[?25hCollecting google-resumable-media>=0.2.1 (from google-cloud-bigquery)\n", " Downloading https://files.pythonhosted.org/packages/77/95/2e4020a54366423ddba715f89fb7ca456c8f048b15cada6cd6a54cf10e8c/google_resumable_media-0.3.1-py2.py3-none-any.whl\n", "Collecting google-cloud-core<0.29dev,>=0.28.0 (from google-cloud-bigquery)\n", " Downloading https://files.pythonhosted.org/packages/0f/41/ae2418b4003a14cf21c1c46d61d1b044bf02cf0f8f91598af572b9216515/google_cloud_core-0.28.1-py2.py3-none-any.whl\n", "Requirement not upgraded as not directly required: googleapis-common-protos<2.0dev,>=1.5.3 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (1.5.3)\n", "Requirement not upgraded as not directly required: six>=1.10.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (1.11.0)\n", "Requirement not upgraded as not directly required: requests<3.0.0dev,>=2.18.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2.18.4)\n", "Requirement not upgraded as not directly required: pytz in 
/opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2018.5)\n", "Collecting google-auth<2.0.0dev,>=0.4.0 (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/58/cb/96dbb4e50e7a9d856e89cc9c8e36ab1055f9774f7d85f37e2156c1d79d9f/google_auth-1.5.1-py2.py3-none-any.whl (65kB)\n", "\u001b[K 100% |████████████████████████████████| 71kB 7.5MB/s ta 0:00:011\n", "\u001b[?25hRequirement not upgraded as not directly required: setuptools>=34.0.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (39.2.0)\n", "Requirement not upgraded as not directly required: protobuf>=3.4.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (3.6.1)\n", "Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (3.0.4)\n", "Requirement not upgraded as not directly required: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2.6)\n", "Requirement not upgraded as not directly required: urllib3<1.23,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (1.22)\n", "Requirement not upgraded as not directly required: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (2018.10.15)\n", "Collecting cachetools>=2.0.0 (from google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery)\n", " Downloading 
https://files.pythonhosted.org/packages/0a/58/cbee863250b31d80f47401d04f34038db6766f95dea1cc909ea099c7e571/cachetools-2.1.0-py2.py3-none-any.whl\n", "Requirement not upgraded as not directly required: rsa>=3.1.4 in /opt/conda/lib/python3.6/site-packages (from google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (4.0)\n", "Requirement not upgraded as not directly required: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (0.2.2)\n", "Requirement not upgraded as not directly required: pyasn1>=0.1.3 in /opt/conda/lib/python3.6/site-packages (from rsa>=3.1.4->google-auth<2.0.0dev,>=0.4.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery) (0.4.4)\n", "\u001b[31mmkl-random 1.0.1 requires cython, which is not installed.\u001b[0m\n", "Installing collected packages: cachetools, google-auth, google-api-core, google-resumable-media, google-cloud-core, google-cloud-bigquery\n", "Successfully installed cachetools-2.1.0 google-api-core-1.5.0 google-auth-1.5.1 google-cloud-bigquery-1.6.0 google-cloud-core-0.28.1 google-resumable-media-0.3.1\n", "\u001b[33mYou are using pip version 10.0.1, however version 18.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install --upgrade google-cloud-bigquery" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: cython in /opt/conda/lib/python3.6/site-packages (0.29)\n", "Requirement already satisfied: pandas-gbq in /opt/conda/lib/python3.6/site-packages (0.7.0)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (0.23.4)\n", "Requirement already satisfied: google-auth in 
/opt/conda/lib/python3.6/site-packages (from pandas-gbq) (1.5.1)\n", "Requirement already satisfied: google-cloud-bigquery>=0.32.0 in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (1.6.0)\n", "Requirement already satisfied: google-auth-oauthlib in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (0.2.0)\n", "Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from pandas-gbq) (39.2.0)\n", "Requirement already satisfied: python-dateutil>=2.5.0 in /opt/conda/lib/python3.6/site-packages (from pandas->pandas-gbq) (2.7.3)\n", "Requirement already satisfied: pytz>=2011k in /opt/conda/lib/python3.6/site-packages (from pandas->pandas-gbq) (2018.5)\n", "Requirement already satisfied: numpy>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from pandas->pandas-gbq) (1.15.2)\n", "Requirement already satisfied: cachetools>=2.0.0 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (2.1.0)\n", "Requirement already satisfied: rsa>=3.1.4 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (4.0)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (0.2.2)\n", "Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from google-auth->pandas-gbq) (1.11.0)\n", "Requirement already satisfied: google-api-core<2.0.0dev,>=1.0.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-bigquery>=0.32.0->pandas-gbq) (1.5.0)\n", "Requirement already satisfied: google-resumable-media>=0.2.1 in /opt/conda/lib/python3.6/site-packages (from google-cloud-bigquery>=0.32.0->pandas-gbq) (0.3.1)\n", "Requirement already satisfied: google-cloud-core<0.29dev,>=0.28.0 in /opt/conda/lib/python3.6/site-packages (from google-cloud-bigquery>=0.32.0->pandas-gbq) (0.28.1)\n", "Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.6/site-packages (from 
google-auth-oauthlib->pandas-gbq) (1.0.0)\n", "Requirement already satisfied: pyasn1>=0.1.3 in /opt/conda/lib/python3.6/site-packages (from rsa>=3.1.4->google-auth->pandas-gbq) (0.4.4)\n", "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.5.3 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (1.5.3)\n", "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (2.18.4)\n", "Requirement already satisfied: protobuf>=3.4.0 in /opt/conda/lib/python3.6/site-packages (from google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (3.6.1)\n", "Requirement already satisfied: oauthlib>=0.6.2 in /opt/conda/lib/python3.6/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib->pandas-gbq) (2.1.0)\n", "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (3.0.4)\n", "Requirement already satisfied: idna<2.7,>=2.5 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (2.6)\n", "Requirement already satisfied: urllib3<1.23,>=1.21.1 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (1.22)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.6/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2.0.0dev,>=1.0.0->google-cloud-bigquery>=0.32.0->pandas-gbq) (2018.10.15)\n", "\u001b[33mYou are using pip version 10.0.1, however version 18.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], 
"source": [ "import sys\n", "!{sys.executable} -m pip install cython pandas-gbq" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from google.cloud import bigquery\n", "client = bigquery.Client()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# One row per TCGA project with its number of male and female cases.\n",
"# Defined once so the client-library cell and the pandas-gbq cell below run\n",
"# exactly the same SQL instead of two copies that can drift apart.\n",
"GENDER_COUNT_QUERY = \"\"\"\n",
"\tWITH\n", "\t table1 AS (\n", "\t SELECT\n", "\t project_short_name,\n", "\t case_barcode,\n", "\t IF (gender = 'FEMALE',\n", "\t 1,\n", "\t 0) AS F,\n", "\t IF (gender = 'MALE',\n", "\t 1,\n", "\t 0) AS M\n", "\t FROM\n", "\t `isb-cgc.TCGA_bioclin_v0.Clinical`\n", "\t GROUP BY\n", "\t project_short_name,\n", "\t case_barcode,\n", "\t gender)\n", "\t -- \n", "\t --\n", "\tSELECT\n", "\t project_short_name,\n", "\t SUM(M) AS M_count,\n", "\t SUM(F) AS F_count\n", "\tFROM\n", "\t table1\n", "\tGROUP BY\n", "\t project_short_name\n",
"\t\"\"\"" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
"query_job = client.query(GENDER_COUNT_QUERY)\n",
"\n",
"# result() hands back a one-shot row iterator; keep the rows in a list so the\n",
"# cell that prints them can be re-run without re-issuing the query.\n",
"results = list(query_job.result())" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TCGA-UCEC : 548 : 0\n", "TCGA-CESC : 307 : 0\n", "TCGA-OV : 587 : 0\n", "TCGA-UCS : 57 : 0\n", "TCGA-BRCA : 1085 : 12\n", "TCGA-CHOL : 25 : 20\n", "TCGA-DLBC : 26 : 22\n", "TCGA-STAD : 158 : 285\n", "TCGA-LGG : 230 : 285\n", "TCGA-ACC : 60 : 32\n", "TCGA-SKCM : 180 : 290\n", "TCGA-UVM : 35 : 45\n", "TCGA-BLCA : 108 : 304\n", "TCGA-KICH : 51 : 62\n", "TCGA-THYM : 60 : 64\n", "TCGA-MESO : 16 : 71\n", "TCGA-PCPG : 101 : 78\n", "TCGA-KIRC : 191 : 346\n", "TCGA-READ : 78 : 92\n", "TCGA-PAAD : 83 : 102\n", "TCGA-LAML : 91 : 109\n", "TCGA-GBM : 230 : 366\n", "TCGA-LUSC : 131 : 373\n", "TCGA-SARC : 142 : 119\n", "TCGA-HNSC : 142 : 386\n", "TCGA-TGCT : 0 : 134\n", "TCGA-THCA : 371 : 136\n", "TCGA-ESCA : 27 : 158\n", "TCGA-KIRP : 77 : 214\n", "TCGA-LUAD : 280 : 242\n", "TCGA-COAD : 216 : 243\n", "TCGA-PRAD : 0 : 500\n", "TCGA-LIHC : 122 : 255\n" ] } ], "source": [
"# Columns are printed as: project : female count : male count.\n",
"for row in results:\n", " print(\"{} : {} : {}\".format(row.project_short_name, row.F_count, row.M_count))" ] },
{ "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "import pandas\n" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [
"# Same query as above, this time loaded straight into a DataFrame via pandas-gbq.\n",
"projectid = \"isb-cgc-02-0001\"\n",
"query = GENDER_COUNT_QUERY\n",
"data_frame = pandas.read_gbq(query, project_id=projectid, dialect='standard')" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(33, 3)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "data_frame.shape" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
project_short_nameM_countF_count
0TCGA-UCEC0548
1TCGA-CESC0307
2TCGA-OV0587
3TCGA-UCS057
4TCGA-BRCA121085
5TCGA-CHOL2025
6TCGA-DLBC2226
7TCGA-STAD285158
8TCGA-LGG285230
9TCGA-ACC3260
10TCGA-SKCM290180
11TCGA-UVM4535
12TCGA-BLCA304108
13TCGA-KICH6251
14TCGA-THYM6460
15TCGA-MESO7116
16TCGA-PCPG78101
17TCGA-KIRC346191
18TCGA-READ9278
19TCGA-PAAD10283
20TCGA-LAML10991
21TCGA-GBM366230
22TCGA-LUSC373131
23TCGA-SARC119142
24TCGA-HNSC386142
25TCGA-TGCT1340
26TCGA-THCA136371
27TCGA-ESCA15827
28TCGA-KIRP21477
29TCGA-LUAD242280
30TCGA-COAD243216
31TCGA-PRAD5000
32TCGA-LIHC255122
\n", "
" ], "text/plain": [ " project_short_name M_count F_count\n", "0 TCGA-UCEC 0 548\n", "1 TCGA-CESC 0 307\n", "2 TCGA-OV 0 587\n", "3 TCGA-UCS 0 57\n", "4 TCGA-BRCA 12 1085\n", "5 TCGA-CHOL 20 25\n", "6 TCGA-DLBC 22 26\n", "7 TCGA-STAD 285 158\n", "8 TCGA-LGG 285 230\n", "9 TCGA-ACC 32 60\n", "10 TCGA-SKCM 290 180\n", "11 TCGA-UVM 45 35\n", "12 TCGA-BLCA 304 108\n", "13 TCGA-KICH 62 51\n", "14 TCGA-THYM 64 60\n", "15 TCGA-MESO 71 16\n", "16 TCGA-PCPG 78 101\n", "17 TCGA-KIRC 346 191\n", "18 TCGA-READ 92 78\n", "19 TCGA-PAAD 102 83\n", "20 TCGA-LAML 109 91\n", "21 TCGA-GBM 366 230\n", "22 TCGA-LUSC 373 131\n", "23 TCGA-SARC 119 142\n", "24 TCGA-HNSC 386 142\n", "25 TCGA-TGCT 134 0\n", "26 TCGA-THCA 136 371\n", "27 TCGA-ESCA 158 27\n", "28 TCGA-KIRP 214 77\n", "29 TCGA-LUAD 242 280\n", "30 TCGA-COAD 243 216\n", "31 TCGA-PRAD 500 0\n", "32 TCGA-LIHC 255 122" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_frame" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD/CAYAAAD4xAEfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAGsVJREFUeJzt3X+UFOWd7/H31xkEFYI4jsQwmBk3ILqA6J2IK8lKJAcEXeHmylk1WflluCbEn2uWyV1vYNfdhGz2xGii5LBRFt2AZtGoR01cRIzxuhogIpAMK6gTmQvREZDoVRKB7/2jnsGapmemp7tnuofn8zqnTlc99a2qp7qr69v11I82d0dEROJzVKkrICIipaEEICISKSUAEZFIKQGIiERKCUBEJFJKACIikVICEBGJlBKAiEiklABERCKlBCAiEqnKUlegIyeeeKLX1taWuhoiIr3K+vXr33L36s7iyjoB1NbWsm7dulJXQ0SkVzGz3+YSpyYgEZFIKQGIiERKCUBEJFJlfQ5AROLxwQcf0NzczL59+0pdlV6jX79+1NTU0KdPn7ymVwIQkbLQ3NzMgAEDqK2txcxKXZ2y5+7s2rWL5uZm6urq8pqHmoBEpCzs27ePqqoq7fxzZGZUVVUVdMSkBCAiZUM7/64p9P1SAhARiZTOAXSHhQNT/XtLVw+RXqy24bGizq9p0UWdxpgZX/jCF7j33nsB2L9/PyeffDJjx47l0UcfLWp9ctXU1MRzzz3HFVdcUfR56whARCQ47rjj2Lx5M++//z4Aq1atYsiQISWtU1NTE8uXL++WeSsBiIikTJ48mcceS44+VqxYweWXX95h/LvvvsusWbMYNWoUo0eP5oEHHjg07ahRoxg5ciTz588/FN+/f/9D/StXrmTmzJkAzJw5k2uvvZbzzjuPU089lZUrVwLQ0NDAL37xC8aMGcOtt95azFVVAhARSbvsssu477772LdvHxs3bmTs2LEdxt9yyy0MHDiQTZs2sXHjRi644AJ27NjB/Pnzeeqpp9iwYQNr167loYce6nTZO3fu5Nlnn+XRRx+loaEBgEWLFvHpT3+aDRs2cMMNNxRlHVspAYiIpIwePZqmpiZWrFjBlClTOo1/8sknmTdv3qHhQYMGsXbtWsaPH091dTWVlZV8/vOf55lnnul0XtOmTeOoo47ijDPO4I033ihoPXKhk8AiIhkuueQSbrrpJp5++ml27drVYay7H3Y5pru3G5+OzbyGv2/fvjnNo1h0BCAikmH27Nl8/etfZ9SoUZ3GTpw4ke9///uHhvfs2cPYsWP5+c9/zltvvcWBAwdYsWIF559/PgCDBw+msbGRgwcP8pOf/KTT+Q8YMIB33nkn/5XpgI4ARKQs5XLZZnepqanhuuuuyyn25ptvZt68eYwcOZKKigoWLFjA5z73Ob75zW/ymc98BndnypQpTJ06FUja9C+++GKGDh3KyJEjeffddzuc/+jRo6msrOTMM89k5syZRT0PYD1xmJGv+vp675V/CKP7AES6rLGxkdNPP73U1eh1sr1vZrbe3es7m1ZNQCIikVITkIhIDpYuXcptt93WpmzcuHHccccdJapR4ZQARERyMGvWLGbNmlXqahRVp01AZna3mb1pZptTZSeY2Soz2xpeB4VyM7PbzWybmW00s7NT08wI8VvNbEb3rI6IiOQql3MA/wpcmFHWAKx292HA6jAMMBkYFrq5wGJIEgawABgLnAMsaE0aIiJSGp0mAHd/BtidUTwVWBb6lwHTUuX3eOJ54HgzOxmYBKxy993uvgdYxeFJRUREelC+VwENdvedAOH1pFA+BNieimsOZe2Vi4hIiRT7JHC2v6fxDsoPn4HZXJLmI0455ZTi1UxEepf0/TRFmV/n9+RUVF
S0ufv3oYceora2trj1yMHbb7/N8uXL+fKXv9yty8n3COCN0LRDeH0zlDcDQ1NxNcCODsoP4+5L3L3e3eurq6vzrJ6ISNcdc8wxbNiw4VBXip0/JAngzjvv7Pbl5JsAHgFar+SZATycKr8yXA10LrA3NBE9AUw0s0Hh5O/EUCYi0uscOHCAm2666dB/AHzve98DYPXq1Zx11lmMGjWK2bNn84c//AGA2tpa3nrrLQDWrVvH+PHjAVi4cCGzZ89m/PjxnHrqqdx+++1A8h8Ar7zyCmPGjOGrX/1qt61Hp01AZrYCGA+caGbNJFfzLAJ+bGZzgNeB6SH8cWAKsA14D5gF4O67zewWYG2I+3t3zzyxLCJSUu+//z5jxowBoK6urt2HtS1ZsoTXXnuNF198kcrKSnbv3s2+ffuYOXMmq1evZvjw4Vx55ZUsXryY66+/vsNlbtmyhTVr1vDOO+9w2mmn8aUvfYlFixaxefNmNmzYUPR1TOs0Abh7e3+HMyFLrAPzssTi7ncDd3epdiIiPai1CagzTz75JFdffTWVlcku9IQTTuCll16irq6O4cOHAzBjxgzuuOOOThPARRddRN++fenbty8nnXRSj/wPQCs9C0hEpIu6+h8AlZWVHDx4EOj4PwAqKirYv39/EWvaMSUAEZEumjhxIj/4wQ8O7ax3797NiBEjaGpqYtu2bQDce++9h/4DoLa2lvXr1wMc+s/gjnTnfwCk6VlAIlKeyvhR6ldddRUvv/wyo0ePpk+fPnzxi1/kK1/5CkuXLmX69Ons37+fT37yk1x99dUALFiwgDlz5vCNb3yj0/8YBqiqqmLcuHGMHDmSyZMn8+1vf7tb1kP/B9Ad9H8AIl2m/wPIj/4PQEREukxNQCIi7XjiiSeYP39+m7KOLg/tbZQARKRsZLu6ppQmTZrEpEmTSl2NdhXahK8mIBEpC/369WPXrl0F79Ri4e7s2rWLfv365T0PHQGISFmoqamhubmZlpaWUlel1+jXrx81NTV5T68EICJloU+fPtTV1ZW6GlFRE5CISKSUAEREIqUEICISKSUAEZFIKQGIiERKCUBEJFJKACIikVICEBGJlBKAiEiklABERCKlBCAiEiklABGRSCkBiIhESglARCRSSgAiIpFSAhARiZQSgIhIpJQAREQipQQgIhKpghKAmd1gZr82s81mtsLM+plZnZm9YGZbzex+Mzs6xPYNw9vC+NpirICIiOQn7wRgZkOAa4F6dx8JVACXAd8CbnX3YcAeYE6YZA6wx90/Adwa4kREpEQKbQKqBI4xs0rgWGAncAGwMoxfBkwL/VPDMGH8BDOzApcvIiJ5yjsBuPv/Bf4ZeJ1kx78XWA+87e77Q1gzMCT0DwG2h2n3h/iqfJcvIiKFKaQJaBDJr/o64GPAccDkLKHeOkkH49LznWtm68xsXUtLS77VExGRThTSBPRZ4DV3b3H3D4AHgfOA40OTEEANsCP0NwNDAcL4gcDuzJm6+xJ3r3f3+urq6gKqJyIiHSkkAbwOnGtmx4a2/AnAb4A1wKUhZgbwcOh/JAwTxj/l7ocdAYiISM8o5BzACyQnc38FbArzWgLMB240s20kbfx3hUnuAqpC+Y1AQwH1FhGRAlV2HtI+d18ALMgofhU4J0vsPmB6IcsTEZHi0Z3AIiKRUgIQEYmUEoCISKSUAEREIqUEICISKSUAEZFIKQGIiERKCUBEJFJKACIikVICEBGJlBKAiEiklABERCKlBCAiEiklABGRSCkBiIhESglARCRSSgAiIpFSAhARiZQSgIhIpJQAREQipQQgIhIpJQARkUgpAYiIREoJQEQkUkoAIiKRUgIQEYmUEoCISKSUAEREIqUEICISKSUAEZFIFZQAzOx4M1tpZlvMrNHM/szMTjCzVWa2NbwOCrFmZreb2TYz22hmZxdnFUREJB+FHgHcBvzM3UcAZwKNQAOw2t2HAavDMMBkYFjo5gKLC1y2iIgUIO8EYG
YfAf4cuAvA3f/o7m8DU4FlIWwZMC30TwXu8cTzwPFmdnLeNRcRkYIUcgRwKtACLDWzF83sh2Z2HDDY3XcChNeTQvwQYHtq+uZQJiIiJVBIAqgEzgYWu/tZwP/jw+aebCxLmR8WZDbXzNaZ2bqWlpYCqiciIh0pJAE0A83u/kIYXkmSEN5obdoJr2+m4oempq8BdmTO1N2XuHu9u9dXV1cXUD0REelI3gnA3X8HbDez00LRBOA3wCPAjFA2A3g49D8CXBmuBjoX2NvaVFRyCwcmnYhIRCoLnP4a4EdmdjTwKjCLJKn82MzmAK8D00Ps48AUYBvwXogVEZESKSgBuPsGoD7LqAlZYh2YV8jyRESkeHQnsIhIpJQAREQipQQgIhIpJQARkUgVehVQz0tfrrlwb+nqISLSy+kIQEQkUkoAIiKRUgIQEYmUEoCISKSUAEREIqUEICISKSUAEZFIKQGIiERKCUBEJFK9705gEZEjVG3DY4f6mxZd1O3L0xGAiEiklABERCKlBCAiEiklABGRSCkBiIhESglARCRSSgAiIpFSAhARiZQSgIhIpJQAREQipQQgIhIpJQARkUgpAYiIREoJQEQkUgUnADOrMLMXzezRMFxnZi+Y2VYzu9/Mjg7lfcPwtjC+ttBli4hI/opxBHAd0Jga/hZwq7sPA/YAc0L5HGCPu38CuDXEiYhIiRSUAMysBrgI+GEYNuACYGUIWQZMC/1TwzBh/IQQLyIiJVDoEcB3gb8BDobhKuBtd98fhpuBIaF/CLAdIIzfG+JFRKQE8k4AZnYx8Ka7r08XZwn1HMal5zvXzNaZ2bqWlpZ8qyciIp0o5AhgHHCJmTUB95E0/XwXON7MWv9ruAbYEfqbgaEAYfxAYHfmTN19ibvXu3t9dXV1AdUTEZGO5J0A3P1r7l7j7rXAZcBT7v55YA1waQibATwc+h8Jw4TxT7n7YUcAInJkq2147FAnpdUd9wHMB240s20kbfx3hfK7gKpQfiPQ0A3LFhGRHFV2HtI5d38aeDr0vwqckyVmHzC9GMsTEZHCFSUBiMiRK91U07ToohLWJAcLB6b695auHr2EHgUhEim1w4sSgIhIpJQAREQipQQgIhIpnQQW6Qa96sSpREtHACIikVICEBGJlBKAiEiklABERCKlBCAiEiklABGRSB3ZCWDhwLbPBhERkUOO7AQgvY6eTyPSc5QAREQipQQgIhIpPQpC8qbHHYj0bkoAR4DWHXFnO2HtsEUkTU1AIiLF0AuvOlQCEBGJlBKAiEiklABERCKlBCAi0gsV46ZJJQARkUgpAYiIREr3AYiI5KnNvTX9cowro3twdAQgIhIpHQHEqvWGlYV7S1sPKb30zUvaHqKiIwARkUjlnQDMbKiZrTGzRjP7tZldF8pPMLNVZrY1vA4K5WZmt5vZNjPbaGZnF2slRESk6wo5AtgP/LW7nw6cC8wzszOABmC1uw8DVodhgMnAsNDNBRYXsGwRkW7TrX9MVEbPDMo7Abj7Tnf/Veh/B2gEhgBTgWUhbBkwLfRPBe7xxPPA8WZ2ct41FxGRghTlHICZ1QJnAS8Ag919JyRJAjgphA0Btqcmaw5lIiJSAgUnADPrDzwAXO/uv+8oNEuZZ5nfXDNbZ2brWlpaCq2eiIi0o6AEYGZ9SHb+P3L3B0PxG61NO+H1zVDeDAxNTV4D7Micp7svcfd6d6+vrq4upHoiItKBQq4CMuAuoNHdv5Ma9QgwI/TPAB5OlV8ZrgY6F9jb2lQkckQro5N+ImmF3Ag2DvgrYJOZbQhl/wtYBPzYzOYArwPTw7jHgSnANuA9YFYByxYRkQLlnQDc/Vmyt+sDTMgS78C8fJcnZU53Fov0OroTWEQkUkoAIiKRUgIQEYmUngZ6JNFTHUU61Pp4h3J6Jn8p6QhARCRSSgAiIpFSAhCR4sv15jfdJFdSOgcgImWpK/+je6htv4P/5ZXD6QhARHKnX+xHFCUAEZFIqQlIRI
qiTZONmmJ6TgGXf+sIQEQkUjoCkPKkm9ryppudJFc6AhARiZQSgEi+eviKmNqGx9q0s4sUSk1AIiXUlWvdRYpNCUDkSKXzKNIJJQCR3kY7dikSJQCRLujWa921Yy8/R/hnogQggtriJU66CkhEJFJKANIr5XpJpC6dFGmfmoC6orU98AhsC2zXkbLOR3hbrkg+lACkd9OOXSRvSgDS7XSCVaQ86RyAiEikdAQgPUtNNiJlQwlADqM/9hCJgxJAKeX6a1i/mkWKS98poAQJwMwuBG4DKoAfuvuinq5DqR36w45Ofl3nGiciko8ePQlsZhXAHcBk4AzgcjM7oyfrINKpHn7Ov0ip9PQRwDnANnd/FcDM7gOmAr/p4XoUndrNRaS36enLQIcA21PDzaFMRER6mLl7zy3MbDowyd2vCsN/BZzj7tekYuYCc8PgacB/ZczmROCtHBeZa2xscaVcdrnHlXLZ5R5XymWXe1wpl50t7uPuXt3plO7eYx3wZ8ATqeGvAV/r4jzWFTs2trjeUEe9N+UX1xvqqPema11PNwGtBYaZWZ2ZHQ1cBjzSw3UQERF6+CSwu+83s68AT5BcBnq3u/+6J+sgIiKJHr8PwN0fBx4vYBZLuiE2trhSLrvc40q57HKPK+Wyyz2ulMvuSh3b6NGTwCIiUj70NFARkUgpAYiIRKrsHwZnZiNI7hYeAjiwA3jE3Rt7YNnnAO7ua8MjKy4EtoTzGO1Nc4+7X9nddctV6mqrHe7+pJldAZwHNAJL3P2DklZQREqmrM8BmNl84HLgPpK7hgFqSHZo93keD5ILCWUI8IK7v5sqv9Ddf5YaXkDyzKJKYBUwFnga+CzJvQz/aGaZl7Aa8BngKQB3v6SDenyK5NEYm939P1LlY4FGd/+9mR0DNABnkzwu4xvuvjfEXQv8xN23Hz73Nsv5UViHY4G3gf7Ag8AEks9/Rir2T4D/DgwF9gNbgRWtyxQpNjM7yd3fLOL8qtx9V7Hmd8TL9waCnuiAl4E+WcqPBrZ2YT6zwuu1JHcWPwQ0AVNTMb/KmGYTyaWqxwK/Bz4Syo8BNrZOA/wbMB44P7zuDP3nZ8zvl6n+LwIbgAXA/wEaUuN+DVSG/iXAd4FPhdgHU3F7SY6GfgF8GahuZ91b61oJvAFUhGFrHZd6b1YBNwPPAXcC/0iSeMaXelso0vZ0UpHnV1XqdUrVZSCwCNgC7ApdYyg7Psd5/DTV/xHgm8C9wBUZcXem+j8KLCZ5yGMVsDB8d34MnJwx3QkZXVX4Hg4CTkjFXZixXncBG4HlwODUuEXAiaG/HngV2Ab8Nv39C9/Tm4E/6WT964E14Ts9NHwf9pLcv3RWKq4/8Pfhu7oXaAGeB2ZmzK8S+J/Az0L9XwJ+ClxNlv1aB/VakuqvCPO8BRiXEXdzl7ebUm+4naz4FpJbmjPLPw78Vxfm83p43QT0D/21wDrgujD8YsY0L2brD8MbwutRwA1hQxkTyl5tpw7p+a0l7LCB44BNqXGN6Q0323Jb5xeWPzF8QVrChjYDGJCK20ySMAcB77R+0YB+GcvaxIfJ4Vjg6dB/Spb1186m/HY2TwDzgY9mvF/zgVWpsrPb6f4bsDMV90BY52kkN2s+APTN3C7DNncNyZHqxrC8U0LZwxl1PAi8ltF9EF5fTb+Hqf4fAv9A8p2/AXgovc2m+tcAnwz9w0ndHRvm/8/A68Avw3w+luUz+SXJUf/lJM8suzSUTwD+MxX3MDCTpDXiRuB/A8OAZSRH6a1xK0i213NDbE3oXwzc38n2mt5umzPej+XA9cB64Dvt7S9y+h52dYKe7Eja3LeRZM0loftZKLswI3ZjO90m4A8h5jcZ0/QP8/sOqZ1rGPcCcGzoPypjJ5G5Y64B/h34PiHZZFmXl0h2PlVk3LpN2+Tw73x4xLIUqE9t1Gvb+7CBPsAlYaNrSZXfQLKz+i3Jr/zVwL+E92VB+svEh1/wQcD61LjNGcvSzqb8djbt/i
BKjwMOkDRRrsnSvZ+Ky/w+/C3J0WpVxnuW3nZfz5gmcx43hc9wVPr9ylLfX3Uwj/SPoC18eLT8fEbcpnbm92mSo9vfhXWem+O6pMe9lDFubet+guQcYS6fycsZwwdIvqfp7bV1+I+puPRReyXJPvFBoC8ZP9Ry6Uq+k++0gsmbei7wP4BLQ39Flrg3gDEkX950V0tyApSw4Y/JmK4SuAc4kFHet536nJjegDPGXZT+UmaMa0p9oK8Sdp4kSSi9UQ8E/hV4hSQJfRDifw6cmW2DzLKsYzKGP0bYCQHHh/fxnIyY60h2qkvCF6s1CVUDz2TEamfTNq4cdjb/AfwNbY9aBpMkySdTZZuBYe18dttT/Y2kfviEshkkRyK/zVY/4B/ae19SZa0/lr4DDCDLETPJ+b4bgb8O276lxqV3gNeE9b6A5Gjwu8CfA38H3JvtM0mVVZD8wFyaKvtPkiPq6SQ/mKaF8vNpm+SfAz4V+v+Cts83S2//z4d5pX9AHgX8Jck5yHR9tgKn5PC5bMkyfgHJ9yXnZvFD03Z1gnLtSA7dP9XOuOWpje+j7cSM64565VDvY4G6LOUDgDNJfi0PzjJ+eDfU5U9JksOITuK0sym/nc0g4FskiWoPsDu8r9+ibZPXpcBp7Xwm01L9/wR8NkvMhekdDUnzVP8scZ8AVnawDf0FyQ7yd1nGLcjoWptLPwrckxE7HrifpEl0E8lTBuaSamMnuWAkl+3/TJKj258CI0j+ufDtsB2elxH3yzDu2db3k+TH0rWpuNpQtzdJzme+HPrvJ+M7D8wj9QMvY9w1qf5/I6P1I5RfBXyQy3q2ma6rE6hTl7Gz2Z2xsxmUiotlZ1OZiin2zmZ0xs5meChvs7MJZSNIrlLrn1Ge2Vw6gqSpKd+4yfnMLzOW5IKKkd1Ux3zjTu9CXC7v9ViSq/2qSC7muAmY0s42cQ4fNi2eQfLD5LDYXONy2g7zmUiduvY6QtNRucVl7GzKso6FxJHjFW5diLummHHdtOzumN+WIsYtIPnhsY7kIofVwNeBZ4C/zXhvMmOfyhaba1yuXbftCNTF2dHOSfByiesNdcwnjhyvcCtVXG+oYzfFdXgpeVdjuzLPXLqyvxNYyo+ZbWxvFMm5gJLG9YY6dsM6V3i4sdHdm8xsPLDSzD4eYksd1xvqWOy4/e5+AHjPzF5x99+Had43s4MZ702usV2ZZ6eUACQfg4FJJCcb04zkpGWp43pDHYsd9zszG+PuGwDc/V0zuxi4GxhVBnG9oY7FjvujmR3r7u+RXMwBgJkNJLlMmTxiuzLPznX1kEGdOnK44qqUcb2hjt0Ql9MVbqWK6w117Ia4nC8lzzW2K/PMpSvrZwGJiEj30eOgRUQipQQgIhIpJQARkUgpAYiIREoJQEQkUv8fMAOl/eqEipEAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Draw on an explicit Axes. A bare plt.figure() call here would only add an\n",
"# empty figure, because DataFrame.plot.bar() opens its own when no ax is given.\n",
"fig, ax = plt.subplots(figsize=(12, 5))\n",
"df2 = pandas.DataFrame(data_frame, columns=['M_count','F_count'])\n",
"df2.plot.bar(ax=ax)\n",
"# df2 keeps data_frame's row order, so the project names line up with the bars.\n",
"ax.set_xticklabels(data_frame['project_short_name'], rotation=90)\n",
"ax.set(title='TCGA cases by project and gender', xlabel='TCGA project', ylabel='Number of cases');" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pyspark\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5e/cb/d8ff49ba885e2c88b8cf2967edd84235ffa9ac301bffef657dfa5605a112/pyspark-2.3.2.tar.gz (211.9MB)\n", "\u001b[K 100% |████████████████████████████████| 211.9MB 174kB/s eta 0:00:01 36% |███████████▋ | 76.9MB 73.9MB/s eta 0:00:02\n", "\u001b[?25hCollecting findspark\n", " Downloading https://files.pythonhosted.org/packages/b1/c8/e6e1f6a303ae5122dc28d131b5a67c5eb87cbf8f7ac5b9f87764ea1b1e1e/findspark-1.3.0-py2.py3-none-any.whl\n", "Collecting py4j==0.10.7 (from pyspark)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)\n", "\u001b[K 100% |████████████████████████████████| 204kB 19.0MB/s ta 0:00:01\n", "\u001b[?25hBuilding wheels for collected packages: pyspark\n", " Running setup.py bdist_wheel for pyspark ... 
\u001b[?25ldone\n", "\u001b[?25h Stored in directory: /root/.cache/pip/wheels/be/7d/34/cd3cfbc75d8b6b6ae0658e5425348560b86d187fe3e53832cc\n", "Successfully built pyspark\n", "\u001b[31mmkl-random 1.0.1 requires cython, which is not installed.\u001b[0m\n", "Installing collected packages: py4j, pyspark, findspark\n", "Successfully installed findspark-1.3.0 py4j-0.10.7 pyspark-2.3.2\n", "\u001b[33mYou are using pip version 10.0.1, however version 18.1 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install pyspark findspark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import findspark\n", "findspark.init()\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from datetime import datetime\n", "from pyspark.context import SparkContext\n", "from pyspark.ml.linalg import Vectors\n", "from pyspark.ml.classification import LogisticRegression\n", "from pyspark.sql.session import SparkSession" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [
"def vector_from_inputs(r):\n",
"    \"\"\"Turn one input record into a ``(label, features)`` pair.\n",
"\n",
"    ``r`` is read by key: ``label`` becomes the float label, and the four\n",
"    feature columns listed below become a dense vector in that order.\n",
"    Every value is passed through ``float()``.\n",
"    \"\"\"\n",
"    # NOTE(review): \"EFGR\" looks like a transposition of the gene symbol EGFR;\n",
"    # confirm against the tcga_spark table schema before renaming the key.\n",
"    feature_columns = (\"EFGR\", \"TP53\", \"NOTCH1\", \"GATA3\")\n",
"    label = float(r[\"label\"])\n",
"    features = Vectors.dense(*[float(r[column]) for column in feature_columns])\n",
"    return (label, features)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"# Use Cloud Dataproc's automatically propagated configurations to get\n",
"# the Cloud Storage bucket and Google Cloud Platform project for this\n",
"# cluster.\n",
"# getOrCreate() reuses the context already running in this kernel, so the\n",
"# cell can be re-run; a second bare SparkContext() raises ValueError.\n",
"sc = SparkContext.getOrCreate()\n",
"spark = SparkSession(sc)\n" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dataproc-6b064c10-086c-44db-b3b5-f14e410e0c13-us\n", "isb-cgc-02-0001\n" ] } ], "source": [ "bucket = spark._jsc.hadoopConfiguration().get(\"fs.gs.system.bucket\")\n", 
"project = spark._jsc.hadoopConfiguration().get(\"fs.gs.project.id\")\n", "\n", "print(bucket)\n", "print(project)" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
"# Set an input directory for reading data from BigQuery.\n",
"# Re-run this cell before every BigQuery read: the connector refuses to export\n",
"# into a path that already exists, so each read needs a fresh timestamp.\n",
"todays_date = datetime.today().strftime(\"%Y-%m-%d-%H-%M-%S\")\n",
"# Keep the export under the cluster's existing bucket. Appending the timestamp\n",
"# directly to \"gs://qotm_oct_2018\" made it part of the bucket name itself.\n",
"# NOTE(review): assumes the cluster bucket is the intended home for these\n",
"# temporary exports; confirm if a dedicated qotm_oct_2018 bucket was meant.\n",
"input_directory = \"gs://{}/qotm_oct_2018/{}\".format(bucket, todays_date)" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'mapred.bq.project.id': 'isb-cgc-02-0001', 'mapred.bq.gcs.bucket': 'dataproc-6b064c10-086c-44db-b3b5-f14e410e0c13-us', 'mapred.bq.temp.gcs.path': 'gs://qotm_oct_20182018-10-29-23-47-53', 'mapred.bq.input.project.id': 'isb-cgc-02-0001', 'mapred.bq.input.dataset.id': 'spark_job', 'mapred.bq.input.table.id': 'tcga_spark'}\n" ] } ], "source": [ "# Set the configuration for importing data from BigQuery.\n", "# Specifically, make sure to set the project ID and bucket for Cloud Dataproc,\n", "# and the project ID, dataset, and table names for BigQuery.\n", "\n", "conf = {\n", " # Input Parameters\n", " \"mapred.bq.project.id\": project,\n", " \"mapred.bq.gcs.bucket\": bucket,\n", " \"mapred.bq.temp.gcs.path\": input_directory,\n", " \"mapred.bq.input.project.id\": project,\n", " \"mapred.bq.input.dataset.id\": \"spark_job\",\n", " \"mapred.bq.input.table.id\": \"tcga_spark\"\n", "}\n", "print(conf)\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [ { "ename": "Py4JJavaError", "evalue": "An error occurred while calling z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD.\n: java.io.IOException: Conflict occurred creating export directory. 
Path gs://qotm_oct_20182018-10-29-23-47-53 already exists\n\tat com.google.cloud.hadoop.io.bigquery.AbstractExportToCloudStorage.prepare(AbstractExportToCloudStorage.java:68)\n\tat com.google.cloud.hadoop.io.bigquery.AbstractBigQueryInputFormat.getSplits(AbstractBigQueryInputFormat.java:136)\n\tat org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:125)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:362)\n\tat org.apache.spark.rdd.RDD.take(RDD.scala:1327)\n\tat org.apache.spark.api.python.SerDeUtil$.pairRDDToPython(SerDeUtil.scala:203)\n\tat org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:596)\n\tat org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat 
py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\"org.apache.hadoop.io.LongWritable\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\"com.google.gson.JsonObject\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m conf=conf)\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/spark/python/pyspark/context.py\u001b[0m in \u001b[0;36mnewAPIHadoopRDD\u001b[0;34m(self, inputFormatClass, keyClass, valueClass, keyConverter, valueConverter, conf, batchSize)\u001b[0m\n\u001b[1;32m 700\u001b[0m jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,\n\u001b[1;32m 701\u001b[0m \u001b[0mvalueClass\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeyConverter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalueConverter\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 702\u001b[0;31m jconf, batchSize)\n\u001b[0m\u001b[1;32m 703\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mRDD\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjrdd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, 
*args)\u001b[0m\n\u001b[1;32m 1131\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1132\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1133\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1134\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1135\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 317\u001b[0m raise Py4JJavaError(\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 319\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 320\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 321\u001b[0m raise Py4JError(\n", "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD.\n: java.io.IOException: Conflict occurred creating export directory. 
Path gs://qotm_oct_20182018-10-29-23-47-53 already exists\n\tat com.google.cloud.hadoop.io.bigquery.AbstractExportToCloudStorage.prepare(AbstractExportToCloudStorage.java:68)\n\tat com.google.cloud.hadoop.io.bigquery.AbstractBigQueryInputFormat.getSplits(AbstractBigQueryInputFormat.java:136)\n\tat org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:125)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)\n\tat org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.rdd.RDD.partitions(RDD.scala:250)\n\tat org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat org.apache.spark.rdd.RDD.withScope(RDD.scala:362)\n\tat org.apache.spark.rdd.RDD.take(RDD.scala:1327)\n\tat org.apache.spark.api.python.SerDeUtil$.pairRDDToPython(SerDeUtil.scala:203)\n\tat org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:596)\n\tat org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat 
py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:748)\n" ] } ], "source": [
"# Pull the BigQuery table into Spark as an RDD, using the connector\n",
"# settings collected in `conf`.\n",
"table_data = spark.sparkContext.newAPIHadoopRDD(\n",
"    inputFormatClass=\"com.google.cloud.hadoop.io.bigquery.JsonTextBigQueryInputFormat\",\n",
"    keyClass=\"org.apache.hadoop.io.LongWritable\",\n",
"    valueClass=\"com.google.gson.JsonObject\",\n",
"    conf=conf,\n",
")\n",
"\n",
"# Keep only the second element of each record (the JSON string).\n",
"table_json = table_data.map(lambda record: record[1])" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [
"# Parse the JSON strings into a Spark DataFrame.\n",
"tcga_data = spark.read.json(table_json)\n",
"\n",
"# Register a temporary view so the data can be queried with Spark SQL.\n",
"tcga_data.createOrReplaceTempView(\"tcga_view\")" ] },
{ "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [
"# Keep only rows where the label and every listed column are non-NULL.\n",
"# NOTE(review): `EFGR` may be a typo for the gene symbol EGFR - confirm against\n",
"# the tcga_spark table schema before changing it.\n",
"sql_query = \"\"\"\n",
"SELECT *\n",
"from tcga_view\n",
"where label is not null\n",
"and EFGR is not null\n",
"and TP53 is not null\n",
"and GATA3 is not null\n",
"and NOTCH1 is not null\n",
"\"\"\"\n",
"clean_data = spark.sql(sql_query)" ] },
{ "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[label: double, features: vector]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Map every cleaned row through vector_from_inputs (defined in an earlier cell)\n",
"# and name the resulting columns for Spark ML.\n",
"training_data = clean_data.rdd.map(vector_from_inputs).toDF([\"label\", \"features\"])\n",
"training_data.cache()" ] },
{ "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "Coefficients:[50.29267918772197,0.0,0.16224745918590844,-0.31689142394240727]\n", "Intercept:-0.9932429393509908\n" ] } ], "source": [
"# Construct a LogisticRegression estimator and fit it to the training data.\n",
"# https://spark.apache.org/docs/latest/ml-classification-regression.html#binomial-logistic-regression\n",
"lr = LogisticRegression(maxIter=5, regParam=0.3, elasticNetParam=0.8)\n",
"lrModel = lr.fit(training_data)\n",
"\n",
"# Print the coefficients and intercept of the model fitted above.\n",
"# Read them from `lrModel`, the name this cell defines, so the cell does not\n",
"# depend on a `model` variable left over from some other cell.\n",
"print(\"Coefficients:\" + str(lrModel.coefficients))\n",
"print(\"Intercept:\" + str(lrModel.intercept))\n" ] },
{ "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "objectiveHistory:\n", "0.5835274717778136\n", "0.5801510529881112\n", "0.5608210301759466\n", "0.5600787243659968\n", "0.559400795893893\n", "0.559111570022316\n", "+--------------------+-------------------+\n", "| FPR| TPR|\n", "+--------------------+-------------------+\n", "| 0.0| 0.0|\n", "|0.001646090534979...|0.03111111111111111|\n", "|0.002469135802469...|0.06444444444444444|\n", "|0.003292181069958...|0.09777777777777778|\n", "|0.003292181069958...|0.13333333333333333|\n", "|0.004938271604938...|0.16444444444444445|\n", "|0.005761316872427984|0.19777777777777777|\n", "|0.007407407407407408| 0.2288888888888889|\n", "| 0.00905349794238683| 0.26|\n", "|0.009876543209876543|0.29333333333333333|\n", "|0.009876543209876543| 0.3288888888888889|\n", "|0.009876543209876543|0.36444444444444446|\n", "|0.010699588477366255| 0.3977777777777778|\n", "|0.011522633744855968| 0.4311111111111111|\n", "|0.012345679012345678|0.46444444444444444|\n", "|0.013991769547325103| 0.4955555555555556|\n", "| 0.01646090534979424| 0.5244444444444445|\n", "| 0.01728395061728395| 0.5577777777777778|\n", "|0.018930041152263374| 0.5888888888888889|\n", "| 0.02139917695473251| 0.6177777777777778|\n", "+--------------------+-------------------+\n", "only showing top 20 rows\n", "\n", 
"areaUnderROC: 0.9783191586648377\n" ] }, { "data": { "text/plain": [ "LogisticRegression_45c8b09097d92fa6fdcb" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Performance metrics of the fitted model, taken from its summary.\n",
"trainingSummary = lrModel.summary\n",
"\n",
"# Show the receiver-operating characteristic as a DataFrame, then print areaUnderROC.\n",
"trainingSummary.roc.show()\n",
"print(\"areaUnderROC: \" + str(trainingSummary.areaUnderROC))\n",
"\n",
"# Find the threshold whose F-Measure equals the maximum F-Measure.\n",
"fMeasure = trainingSummary.fMeasureByThreshold\n",
"maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()\n",
"bestThreshold = (\n",
"    fMeasure\n",
"    .where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)'])\n",
"    .select('threshold')\n",
"    .head()['threshold']\n",
")\n",
"# NOTE(review): setThreshold is called on the `lr` estimator, not on the\n",
"# already-fitted `lrModel` - confirm that is the intent.\n",
"lr.setThreshold(bestThreshold)\n" ] },
{ "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAFv9JREFUeJzt3X+MXfV55/H3M+NfBAxYnkkU/CMma1PVUGrSKUnkDaVKsiIE2X8EpWZF0lQ0VrMhkZI0Catufgj+2C5pt9oodInboDTRJoQQtYyIW9ptiSBRST0Ix43dJZolKR6cBcc1YMD41zz7x71zfefOvXeuPXPu9dzzfkkW99x75s5zZob5zPf7fc45kZlIkgQw0OsCJEnnDkNBklRjKEiSagwFSVKNoSBJqjEUJEk1hoIkqcZQkCTVGAqSpJpFvS7gTA0NDeW6det6XYYkLSiPP/74LzJzeLb9FlworFu3jrGxsV6XIUkLSkT8ayf7OX0kSaoxFCRJNYaCJKnGUJAk1RgKkqQaQ0GSVGMoSJJqCguFiLgnIp6LiB+3eD0i4osRMR4ReyLiTUXVslAceukYP9r/PIdeOjZv24/85CCP/OS5jj6miM/vtl9rv7Zz2x5/9gj3j+1n/NkjdEORJ699FfgS8LUWr78L2FD992bgf1b/29cOvXSMicNHWb3iPFZesLS2/eNnXuCO7+5j8cAAJyYnee+vrea+xyfmtP2Nf3qak5OVz7t4MLjp19e0/JijJ04SESxbNDhvn99tv9Z+bee2PfKGFXx//FDt98f737qW27f+SqG/oyIzi3vziHXAg5l5RZPXvgx8LzO/Wd1+Erg2M3/e7j1HRkZyIZ3RXB8C3x//BZ/+zp4ZPwCDEbx8/FSvS5W0APzvj13D+tctP+OPi4jHM3Nktv16eZmLVcD+uu2J6nMzQiEitgPbAdauXduV4ubDA7ufqYXA8VOnmEw4cSp5lcqf71977OkeVyhpodm9//mzCoVO9XKhOZo813TYkpk7MnMkM0eGh2e9ntM5YfzZI3zy/j28emKSI8dOcuxkcuJUcaMySeWwac3Fhb5/L0NhAlhTt70aONCjWubVA7uf4fovPsrxqQn9M3D+0kGWLR7g/W9dy7LFAyxfuuistxfVfXcXD0bbj1k0UNlnLp/P7c62/Vr7te10+23rV077/fD+t64tdJQAvV1TeDdwK3A9lQXmL2bm1bO957m8pnDopWPsPfAiH/zaGMeaBMKiARgcGGDJYHVNYWQ1942dXlT6zLs3csWqi2YsQs9le++BF4Hk8ksumvVjgDl/Prc72/Zr7de20+3xZ4+we//zbFpz8ZwCodM1hcJCISK+CVwLDAHPAp8DFgNk5t0REVS6k64DXgF+JzNn/W1/robC1PrBAMErJ2YuGi9ZNMAf3Xglm9cPtf0BkKQi9HyhOTNvmuX1BD5c1OfvpkMvHePT36msHzSzZDDY+ZF/X0v5+l/+Ky9YahhIOmcsuJvsnIv2HniBgZi5bv6aJYNMZnLne64sfB5QkuaDoTBHD+x+hk/d/yOOnZw+Dbd0UXD3zW+qzeVL0kJgKMzB1LTRzEAY4As3Xsk1l722R5VJ0tkxFOZg4vBRFg8M1E5GA3jN4kHuft+vcc1lC+N8Ckmq51VS5+D8JYMcOzV9cXmS5PJLLuxRRZI0N44UztJUC2pUW3qXLa7k653vudI1BEkLlqFwFpq1oE5OJjs/+ja7jCQtaE4fnYWptYR6SxcNeqVTSQueoXAWmq0lnJicrJ1iL0kLldNHZ8i1BEn9zFA4A64lSOp3Th+dgWaXs3AtQVI/caTQoVaXs3AtQVI/caTQgXaXs3AtQVI/caTQAS9nIaksHCl0YPWK8zgx6eUsJPU/Q6EDKy9Yymdu2M
iSRQO1eyg7bSSpHxkKHXhg9zPc8eA+Fg8EJ05W7qW8ZdOqXpclSfPOUJhF/bkJLx8/xfFTyR3f3cehl471ujRJmneGwiyanZuweGCAicNHe1SRJBXH7qM2PDdBUtk4UmjBcxMklZEjhRY8N0FSGTlSaMFzEySVkaHQwsoLlnLne65k2eIBli9d5LkJkkrB6aM2tmxaxeb1Q0wcPsrqFecZCJL6niMFSVKNI4U2pu6ytnhggBOTk9z5nis9k1lSX3Ok0EL9mcxHjp3k1ROTfOo7ezyTWVJfKzQUIuK6iHgyIsYj4rYmr6+NiIcj4omI2BMR1xdZz5mYakmt55nMkvpdYaEQEYPAXcC7gI3ATRGxsWG3/wLcl5lXAduAPy2qnjN1/pJBjp2a3pLqmcyS+l2RI4WrgfHMfCozjwP3Alsb9klgqvH/IuBAgfV07IHdz3DDl75PZOVs5mWLB2xJlVQKRS40rwL2121PAG9u2OfzwN9GxEeA84F3FFhPR+rXEqZMTiY7P/o21r9ueQ8rk6TiFTlSiCbPZcP2TcBXM3M1cD3w9YiYUVNEbI+IsYgYO3jwYAGlntZsLWHpokFePn6q0M8rSeeCIkNhAlhTt72amdNDtwD3AWTmPwLLgKHGN8rMHZk5kpkjw8PFXneo2eUtXEuQVBZFhsIuYENEXBoRS6gsJI827PM08HaAiPhlKqFQ7FBgFl7eQlKZFbamkJknI+JW4CFgELgnM/dGxO3AWGaOAp8A/iwiPkZlaukDmdk4xdR1WzatYuPrL2T3/ufZtOZi1xIklUahZzRn5k5gZ8Nzn617vA/YXGQNZ8MzmSWVlWc0N/BMZkllZig08ExmSWVmKDSw+0hSmRkKDVZesJTP3LCRJYsGOH/poN1HkkrFUGjwwO5nuOPBfSweCE6cnOQz797oIrOk0jAU6tQvMr98/BTHTyV3fHefi8ySSsNQqOMis6SyMxTquMgsqewMhQYfvnY9Sxd5iQtJ5eQ9mqvqz2KGZPs1b+Q/vnmtgSCpVBwpMPMs5mMnk7u+N97rsiSp6wwFXGCWpCmGAi4wS9IUQwHvoSBJU1xorvIeCpJkKNR4DwVJcvoI8B4KkjTFUMDuI0maYihg95EkTTEUsPtIkqa40Fy1ZdMqNq8fYuLwUVavOM9AkFRKjhQkSTWOFKpsSZUkRwqALamSNMVQwJZUSZpiKGBLqiRNMRSwJVWSprjQXOUF8STJUKix+0iSnD4C7D6SpCmFhkJEXBcRT0bEeETc1mKf90bEvojYGxHfKLKeVuw+kqSKwqaPImIQuAt4JzAB7IqI0czcV7fPBuA/A5sz83BEvLaoetqx+0iSKoocKVwNjGfmU5l5HLgX2NqwzweBuzLzMEBmPldgPS3ZfSRJFUUuNK8C9tdtTwBvbtjnMoCI+AEwCHw+M/+mwJpa2rx+iB3vGwGSyy+5yECQVEpFhkI0eS6bfP4NwLXAauDRiLgiM5+f9kYR24HtAGvXrp33Qu08kqSKIqePJoA1ddurgQNN9nkgM09k5k+BJ6mExDSZuSMzRzJzZHh4eF6LtPNIkk4rMhR2ARsi4tKIWAJsA0Yb9vkr4DcBImKIynTSUwXWNIOdR5J0WmGhkJkngVuBh4B/Ae7LzL0RcXtEbKnu9hBwKCL2AQ8Dn8zMQ0XV1IydR5J0WmQ2TvOf20ZGRnJsbGxe33N09zN8yjUFSX0sIh7PzJHZ9vMyF9h5JElTSh8Kdh5J0mmlvvaRnUeSNF2pQ8HOI0martShYOeRJE1X6lDwmkeSNN0ZLzRXr366LTP/VwH1dJ13XJOk01qGQkRcCHyYyoXtRoG/o3Iy2u8Du4G+CAW7jyTptHbTR18Hfgn4Z+B3gb8FbgS2ZmbjJbAXJLuPJGm6dtNHb8zMXwGIiD8HfgGszcwjXamsC6a6j17l9GLzVPeR6wqSyqjdSOHE1IPMPAX8tJ8CAew+kqRG7ULhVyPixYg4EhFHgCvrtl/sVoFFsv
tIkqZrOX2UmYPdLKRX7D6SpNPadR8tA34PWA/sAe6pXg67r9h9JEmntZs++gtghEr30fXAH3eloi6y+0iSpmvXfbSxrvvoK8A/daek7rH7SJKm67T7qO+mjcDuI0lq1C4UNlW7jV60+0iSyqHd9NGPMvOqrlXSI951TZJOaxcKC+vmzWfBziNJmq5dKLw2Ij7e6sXM/O8F1NM19Z1HUwvNn/rOHjavH3K0IKm02oXCIHABEF2qpavsPJKkmdqFws8z8/auVdJldh5J0kztuo/6coQwxc4jSZqp3Ujh7V2roke87pEkTdfugnj/1s1CesHuI0mart30UV/zukeSNFNpQ2Gq+6jeVPeRJJVVaUPB7iNJmqm0oWD3kSTN1K77aM4i4jrgf1A5Ee7PM/MPW+x3I/Bt4Nczc6zImupt2bSKzeuHmDh8lNUrzjMQJJVeYSOFiBgE7gLeBWwEboqIjU32Ww58FPhhUbVIkjpT5EjhamA8M58CiIh7ga3Avob97gDuBH6/wFqasiVVkqYrck1hFbC/bnui+lxNRFwFrMnMB9u9UURsj4ixiBg7ePDgvBRnS6okzVRkKDS7TEbtctwRMQD8CfCJ2d4oM3dk5khmjgwPD89LcbakStJMRYbCBLCmbns1cKBuezlwBfC9iPgZ8BZgNCJGCqzpdDG2pErSDEWGwi5gQ0RcGhFLgG3A6NSLmflCZg5l5rrMXAc8BmzpVveRLamSNFNhC82ZeTIibgUeotKSek9m7o2I24GxzBxt/w7F81ackjRdoecpZOZOYGfDc59tse+1RdbSyM4jSZqplGc023kkSc2VMhTsPJKk5koZCnYeSVJzpQwFO48kqblCF5rPZV4MT5JmKuVIQZLUXGlHCrakStJMpRwp2JIqSc2VMhRsSZWk5koZCrakSlJzpQwFW1IlqbnSLjTbkipJM5VypCBJaq60IwVbUiVpplKOFGxJlaTmShkKtqRKUnOlDAVbUiWpuVKGgi2pktRcaReabUmVpJlKOVKQJDVX2pGCLamSNFMpRwq2pEpSc6UMBVtSJam5UoaCLamS1FwpQ8GWVElqrrQLzVs2rWLj6y9k9/7n2bTmYta/bnmvS5KknittKNh9JEkzlXL6yO4jSWqulKFg95EkNVfKULD7SJKaKzQUIuK6iHgyIsYj4rYmr388IvZFxJ6I+PuIeEOR9Uyx+0iSmitsoTkiBoG7gHcCE8CuiBjNzH11uz0BjGTmKxHxIeBO4LeKqqne5vVD7HjfCJBcfslFBoIkUWz30dXAeGY+BRAR9wJbgVooZObDdfs/BtxcYD01dh5JUnNFTh+tAvbXbU9Un2vlFuCvm70QEdsjYiwixg4ePDinouw8kqTWigyFaPJcNt0x4mZgBPhCs9czc0dmjmTmyPDw8JyKsvNIklorcvpoAlhTt70aONC4U0S8A/gD4Dcys/A/1+08kqTWihwp7AI2RMSlEbEE2AaM1u8QEVcBXwa2ZOZzBdZSY+eRJLVW2EghM09GxK3AQ8AgcE9m7o2I24GxzBylMl10AfDtiAB4OjO3FFXTFK97JEnNFXrto8zcCexseO6zdY/fUeTnb8XuI0lqrnRnNNt9JEmtlS4U7D6SpNZKFwp2H0lSa6ULBbuPJKm1Ut5kx+seSVJzpQsFO48kqbVSTR/ZeSRJ7ZUqFOw8kqT2ShUKdh5JUnulCgU7jySpvdItNG/ZtIrN64eYOHyU1SvOMxAkqU6pRgqSpPZKN1KwJVWSWivVSMGWVElqr1ShYEuqJLVXqlCwJVWS2itVKNiSKkntlW6h2VtxSlJrpQsFu48kqbVSTR/ZfSRJ7ZUqFOw+kqT2ShUKdh9JUnulCgW7jySpvdItNHtBPElqrVQjBUlSe6UbKdiSKkmtlWqkYEuqJLVXqlCwJVWS2itVKNiSKkntFRoKEXFdRDwZEeMRcVuT15dGxLeqr/8wItYVWY8tqZLUXmELzRExCNwFvBOYAHZFxGhm7q
vb7RbgcGauj4htwH8DfquomgA2rx9ix/tGgOTySy4yECSpTpHdR1cD45n5FEBE3AtsBepDYSvw+erj+4EvRURkZhZRkJ1HktRekdNHq4D9ddsT1eea7pOZJ4EXgJVFFGPnkSTNrshQiCbPNY4AOtmHiNgeEWMRMXbw4MGzKsbOI0maXZGhMAGsqdteDRxotU9ELAIuAv6t8Y0yc0dmjmTmyPDw8FkVY+eRJM2uyFDYBWyIiEsjYgmwDRht2GcU+O3q4xuBfyhqPcHOI0maXWELzZl5MiJuBR4CBoF7MnNvRNwOjGXmKPAV4OsRMU5lhLCtqHrAi+FJ0myioD/MCzMyMpJjY2O9LkOSFpSIeDwzR2bbr1RnNEuS2jMUJEk1hoIkqcZQkCTVGAqSpBpDQZJUYyhIkmoW3HkKEXEQ+Nc5vs0Q8It5KGeh8Hj7X9mO2eM9c2/IzFmvE7TgQmE+RMRYJydx9AuPt/+V7Zg93uI4fSRJqjEUJEk1ZQ2FHb0uoMs83v5XtmP2eAtSyjUFSVJzZR0pSJKa6OtQiIjrIuLJiBiPiNuavL40Ir5Vff2HEbGu+1XOnw6O9+MRsS8i9kTE30fEG3pR53yZ7Xjr9rsxIjIiFnS3SifHGxHvrX6P90bEN7pd43zq4Od5bUQ8HBFPVH+mr+9FnfMlIu6JiOci4sctXo+I+GL167EnIt5USCGZ2Zf/qNzY5/8CbwSWAD8CNjbs85+Au6uPtwHf6nXdBR/vbwKvqT7+UL8fb3W/5cAjwGPASK/rLvj7uwF4AlhR3X5tr+su+Hh3AB+qPt4I/KzXdc/xmK8B3gT8uMXr1wN/TeXe9m8BflhEHf08UrgaGM/MpzLzOHAvsLVhn63AX1Qf3w+8PSKiizXOp1mPNzMfzsxXqpuPUblv9kLVyfcX4A7gTuDVbhZXgE6O94PAXZl5GCAzn+tyjfOpk+NN4MLq44uYeQ/4BSUzH6HJPerrbAW+lhWPARdHxOvnu45+DoVVwP667Ynqc033ycyTwAvAyq5UN/86Od56t1D5q2OhmvV4I+IqYE1mPtjNwgrSyff3MuCyiPhBRDwWEdd1rbr518nxfh64OSImgJ3AR7pTWs+c6f/jZ6WwezSfA5r9xd/YatXJPgtFx8cSETcDI8BvFFpRsdoeb0QMAH8CfKBbBRWsk+/vIipTSNdSGQU+GhFXZObzBddWhE6O9ybgq5n5xxHxVir3e78iMyeLL68nuvL7qp9HChPAmrrt1cwcXtb2iYhFVIag7YZv57JOjpeIeAfwB8CWzDzWpdqKMNvxLgeuAL4XET+jMgc7uoAXmzv9eX4gM09k5k+BJ6mExELUyfHeAtwHkJn/CCyjco2gftXR/+Nz1c+hsAvYEBGXRsQSKgvJow37jAK/XX18I/APWV3RWYBmPd7qdMqXqQTCQp5vhlmONzNfyMyhzFyXmeuorKFsycyx3pQ7Z538PP8VlWYCImKIynTSU12tcv50crxPA28HiIhfphIKB7taZXeNAu+vdiG9BXghM38+35+kb6ePMvNkRNwKPESlk+GezNwbEbcDY5k5CnyFypBznMoIYVvvKp6bDo/3C8AFwLer6+lPZ+aWnhU9Bx0eb9/o8HgfAv5DROwDTgGfzMxDvav67HV4vJ8A/iwiPkZlGuUDC/iPOiLim1Sm/oaq6ySfAxYDZObdVNZNrgfGgVeA3ymkjgX8NZQkzbN+nj6SJJ0hQ0GSVGMoSJJqDAVJUo2hIEmqMRSkDkXEqYjYXfdvXURcGxEvVK/U+S8R8bnqvvXP/5+I+KNe1y91om/PU5AKcDQzN9U/Ub3c+qOZeUNEnA/sjoipay1NPX8e8ERE/GVm/qC7JUtnxpGCNE8y82XgceDfNTx/FNhNARcvk+aboSB17ry6qaO/bHwxIlZSucbS3obnV1C5BtEj3SlTOntOH0mdmzF9VPW2iHgCmAT+sHo5hmurz+8Bfqn6/P/rYq3SWTEUpLl7NDNvaP
V8RFwGfL+6prC728VJZ8LpI6lgmfkT4L8Cn+51LdJsDAWpO+4GromIS3tdiNSOV0mVJNU4UpAk1RgKkqQaQ0GSVGMoSJJqDAVJUo2hIEmqMRQkSTWGgiSp5v8DBl4V60ypgZUAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
"import pandas\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Plot the training ROC curve (TPR against FPR).\n",
"# Create one figure/axes pair and hand the axes to pandas via `ax`, so the\n",
"# scatter is drawn on it instead of leaving a separate, empty figure behind.\n",
"fig, ax = plt.subplots()\n",
"trainingSummary.roc.toPandas().plot.scatter('FPR', 'TPR', ax=ax)\n",
"# Trailing semicolon suppresses the text repr of the returned object.\n",
"ax.set_title('Training ROC curve');\n" ] },
{ "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "sc.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }