{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "<pyspark.context.SparkContext at 0x102cea1d0>" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Spark Context\n", "sc" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Version mismatch. H2O is version 3.6.0.2, but the python package is version 3.7.0.99999.\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime: </td>\n", "<td>2 seconds 217 milliseconds </td></tr>\n", "<tr><td>H2O cluster version: </td>\n", "<td>3.6.0.2</td></tr>\n", "<tr><td>H2O cluster name: </td>\n", "<td>sparkling-water-nidhimehta</td></tr>\n", "<tr><td>H2O cluster total nodes: </td>\n", "<td>2</td></tr>\n", "<tr><td>H2O cluster total memory: </td>\n", "<td>3.83 GB</td></tr>\n", "<tr><td>H2O cluster total cores: </td>\n", "<td>16</td></tr>\n", "<tr><td>H2O cluster allowed cores: </td>\n", "<td>16</td></tr>\n", "<tr><td>H2O cluster healthy: </td>\n", "<td>True</td></tr>\n", "<tr><td>H2O Connection ip: </td>\n", "<td>172.16.2.98</td></tr>\n", "<tr><td>H2O Connection port: </td>\n", "<td>54329</td></tr></table></div>" ], "text/plain": [ "-------------------------- --------------------------\n", "H2O cluster uptime: 2 seconds 217 milliseconds\n", "H2O cluster version: 3.6.0.2\n", "H2O cluster name: sparkling-water-nidhimehta\n", "H2O cluster total nodes: 2\n", "H2O cluster total memory: 3.83 GB\n", "H2O cluster total cores: 16\n", "H2O cluster allowed cores: 16\n", "H2O cluster healthy: True\n", "H2O Connection ip: 172.16.2.98\n", "H2O Connection port: 54329\n", "-------------------------- --------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Start H2O Context\n", "from pysparkling import *\n", "sc\n", "hc= H2OContext(sc).start()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] }, { "data": { "text/plain": [] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# H2o Context\n", "hc" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import H2O Python library\n", "import h2o\n", "# View all the available H2O python functions\n", "#dir(h2o)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "(9999, 22)\n" ] }, { "data": { "text/html": [ "<table>\n", "<tr><th> </th><th>ID </th><th>Case Number </th><th>Date </th><th>Block </th><th>IUCR </th><th>Primary Type </th><th>Description </th><th>Location Description </th><th>Arrest </th><th>Domestic </th><th>Beat </th><th>District </th><th>Ward </th><th>Community Area </th><th>FBI Code </th><th>X Coordinate </th><th>Y Coordinate </th><th>Year </th><th>Updated On </th><th>Latitude </th><th>Longitude </th><th>Location </th></tr>\n", "<tr><td>type </td><td>int </td><td>string </td><td>string </td><td>enum </td><td>enum </td><td>enum </td><td>enum </td><td>enum </td><td>enum </td><td>enum </td><td>int </td><td>int </td><td>int </td><td>int </td><td>enum </td><td>int </td><td>int </td><td>int </td><td>enum </td><td>real </td><td>real </td><td>enum </td></tr>\n", "<tr><td>mins </td><td>21735.0 </td><td>NaN </td><td>NaN </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>111.0 </td><td>1.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>1100317.0 </td><td>1814255.0 </td><td>2015.0</td><td>0.0 </td><td>41.64507243 </td><td>-87.906463888 </td><td>0.0 </td></tr>\n", "<tr><td>mean </td><td>9931318.73737</td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>0.292829282928</td><td>0.152315231523</td><td>1159.61806181</td><td>11.3489885128</td><td>22.9540954095</td><td>37.4476447645 </td><td>NaN </td><td>1163880.59815 </td><td>1885916.14984 </td><td>2015.0</td><td>NaN </td><td>41.8425652247 </td><td>-87.6741405221 </td><td>NaN </td></tr>\n", "<tr><td>maxs </td><td>9962898.0 </td><td>NaN </td><td>NaN </td><td>6517.0 </td><td>212.0 </td><td>26.0 </td><td>198.0 </td><td>90.0 </td><td>1.0 </td><td>1.0 </td><td>2535.0 </td><td>25.0 </td><td>50.0 </td><td>77.0 </td><td>24.0 </td><td>1205069.0 </td><td>1951533.0 </td><td>2015.0</td><td>32.0 </td><td>42.022646183 </td><td>-87.524773286 </td><td>8603.0 </td></tr>\n", "<tr><td>sigma </td><td>396787.564221</td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>0.455083515588</td><td>0.35934414686 </td><td>695.76029875 </td><td>6.94547493301</td><td>13.6495661144</td><td>21.2748762223 </td><td>NaN </td><td>16496.4493681 </td><td>31274.0163199 </td><td>0.0 </td><td>NaN </td><td>0.0860186579358</td><td>0.0600357970653</td><td>NaN </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>3 </td><td>16 </td><td>11 </td><td>933 </td><td>19 </td><td>7071 </td><td>8476 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>16 </td><td>0 </td><td>0 </td><td>0 </td><td>603 </td><td>0 </td><td>0 </td><td>1 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>6 </td><td>0 </td><td>0 </td><td>0 </td><td>162 </td><td>0 </td><td>0 </td><td>0 </td><td>162 </td><td>162 </td><td>0 </td><td>0 </td><td>162 </td><td>162 </td><td>162 </td></tr>\n", "<tr><td>0 </td><td>9955810.0 </td><td>HY144797 </td><td>02/08/2015 11:43:40 PM</td><td>081XX S COLES AVE </td><td>1811 </td><td>NARCOTICS </td><td>POSS: CANNABIS 30GMS OR LESS</td><td>STREET </td><td>true </td><td>false </td><td>422.0 </td><td>4.0 </td><td>7.0 </td><td>46.0 </td><td>18 </td><td>1198273.0 </td><td>1851626.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.747693646 </td><td>-87.549035389 </td><td>(41.747693646, -87.549035389)</td></tr>\n", "<tr><td>1 </td><td>9955861.0 </td><td>HY144838 </td><td>02/08/2015 11:41:42 PM</td><td>118XX S STATE ST </td><td>0486 </td><td>BATTERY </td><td>DOMESTIC BATTERY SIMPLE </td><td>APARTMENT </td><td>true </td><td>true </td><td>522.0 </td><td>5.0 </td><td>34.0 </td><td>53.0 </td><td>08B </td><td>1178335.0 </td><td>1826581.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.679442289 </td><td>-87.622850758 </td><td>(41.679442289, -87.622850758)</td></tr>\n", "<tr><td>2 </td><td>9955801.0 </td><td>HY144779 </td><td>02/08/2015 11:30:22 PM</td><td>002XX S LARAMIE AVE </td><td>2026 </td><td>NARCOTICS </td><td>POSS: PCP </td><td>SIDEWALK </td><td>true </td><td>false </td><td>1522.0 </td><td>15.0 </td><td>29.0 </td><td>25.0 </td><td>18 </td><td>1141717.0 </td><td>1898581.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.87777333 </td><td>-87.755117993 </td><td>(41.87777333, -87.755117993) </td></tr>\n", "<tr><td>3 </td><td>9956197.0 </td><td>HY144787 </td><td>02/08/2015 11:30:23 PM</td><td>006XX E 67TH ST </td><td>1811 </td><td>NARCOTICS </td><td>POSS: CANNABIS 30GMS OR LESS</td><td>STREET </td><td>true </td><td>false </td><td>321.0 </td><td>nan </td><td>6.0 </td><td>42.0 </td><td>18 </td><td>nan </td><td>nan </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>nan </td><td>nan </td><td> </td></tr>\n", "<tr><td>4 </td><td>9955846.0 </td><td>HY144829 </td><td>02/08/2015 11:30:58 PM</td><td>0000X S MAYFIELD AVE</td><td>0610 </td><td>BURGLARY </td><td>FORCIBLE ENTRY </td><td>APARTMENT </td><td>false </td><td>false </td><td>1513.0 </td><td>15.0 </td><td>29.0 </td><td>25.0 </td><td>05 </td><td>1137239.0 </td><td>1899372.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.880025548 </td><td>-87.771541324 </td><td>(41.880025548, -87.771541324)</td></tr>\n", "<tr><td>5 </td><td>9955835.0 </td><td>HY144778 </td><td>02/08/2015 11:30:21 PM</td><td>010XX W 48TH ST </td><td>0486 </td><td>BATTERY </td><td>DOMESTIC BATTERY SIMPLE </td><td>APARTMENT </td><td>false </td><td>true </td><td>933.0 </td><td>9.0 </td><td>3.0 </td><td>61.0 </td><td>08B </td><td>1169986.0 </td><td>1873019.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.807059405 </td><td>-87.65206589 </td><td>(41.807059405, -87.65206589) </td></tr>\n", "<tr><td>6 </td><td>9955872.0 </td><td>HY144822 </td><td>02/08/2015 11:27:24 PM</td><td>015XX W ARTHUR AVE </td><td>1320 </td><td>CRIMINAL DAMAGE</td><td>TO VEHICLE </td><td>STREET </td><td>false </td><td>false </td><td>2432.0 </td><td>24.0 </td><td>40.0 </td><td>1.0 </td><td>14 </td><td>1164732.0 </td><td>1943222.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.999814056 </td><td>-87.669342967 </td><td>(41.999814056, -87.669342967)</td></tr>\n", "<tr><td>7 </td><td>21752.0 </td><td>HY144738 </td><td>02/08/2015 11:26:12 PM</td><td>060XX W GRAND AVE </td><td>0110 </td><td>HOMICIDE </td><td>FIRST DEGREE MURDER </td><td>STREET </td><td>true </td><td>false </td><td>2512.0 </td><td>25.0 </td><td>37.0 </td><td>19.0 </td><td>01A </td><td>1135910.0 </td><td>1914206.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.920755683 </td><td>-87.776067514 </td><td>(41.920755683, -87.776067514)</td></tr>\n", "<tr><td>8 </td><td>9955808.0 </td><td>HY144775 </td><td>02/08/2015 11:20:33 PM</td><td>001XX W WACKER DR </td><td>0460 </td><td>BATTERY </td><td>SIMPLE </td><td>OTHER </td><td>false </td><td>false </td><td>122.0 </td><td>1.0 </td><td>42.0 </td><td>32.0 </td><td>08B </td><td>1175384.0 </td><td>1902088.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.886707818 </td><td>-87.631396356 </td><td>(41.886707818, -87.631396356)</td></tr>\n", "<tr><td>9 </td><td>9958275.0 </td><td>HY146732 </td><td>02/08/2015 11:15:36 PM</td><td>001XX W WACKER DR </td><td>0460 </td><td>BATTERY </td><td>SIMPLE </td><td>HOTEL/MOTEL </td><td>false </td><td>false </td><td>122.0 </td><td>1.0 </td><td>42.0 </td><td>32.0 </td><td>08B </td><td>1175384.0 </td><td>1902088.0 </td><td>2015.0</td><td>02/15/2015 12:43:39 PM</td><td>41.886707818 </td><td>-87.631396356 </td><td>(41.886707818, -87.631396356)</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Parse Chicago Crime dataset into H2O\n", "column_type = ['Numeric','String','String','Enum','Enum','Enum','Enum','Enum','Enum','Enum','Numeric','Numeric','Numeric','Numeric','Enum','Numeric','Numeric','Numeric','Enum','Numeric','Numeric','Enum']\n", "f_crimes = h2o.import_file(path =\"../data/chicagoCrimes10k.csv\",col_types =column_type)\n", "\n", "print(f_crimes.shape)\n", "f_crimes.summary()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th>IUCR </th><th style=\"text-align: right;\"> Count</th></tr>\n", "<tr><td>0110 </td><td style=\"text-align: right;\"> 16</td></tr>\n", "<tr><td>0261 </td><td style=\"text-align: right;\"> 2</td></tr>\n", "<tr><td>0263 </td><td style=\"text-align: right;\"> 2</td></tr>\n", "<tr><td>0265 </td><td style=\"text-align: right;\"> 5</td></tr>\n", "<tr><td>0266 </td><td style=\"text-align: right;\"> 2</td></tr>\n", "<tr><td>0281 </td><td style=\"text-align: right;\"> 41</td></tr>\n", "<tr><td>0291 </td><td style=\"text-align: right;\"> 3</td></tr>\n", "<tr><td>0312 </td><td style=\"text-align: right;\"> 18</td></tr>\n", "<tr><td>0313 </td><td style=\"text-align: right;\"> 20</td></tr>\n", "<tr><td>031A </td><td style=\"text-align: right;\"> 136</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the distribution of IUCR column\n", "f_crimes[\"IUCR\"].table()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th>Arrest </th><th style=\"text-align: right;\"> Count</th></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\"> 7071</td></tr>\n", "<tr><td>true </td><td style=\"text-align: right;\"> 2928</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at distribution of Arrest column\n", "f_crimes[\"Arrest\"].table()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th style=\"text-align: right;\"> ID</th><th>Case_Number </th><th>Date </th><th>Block </th><th style=\"text-align: right;\"> IUCR</th><th>Primary_Type </th><th>Description </th><th>Location_Description </th><th>Arrest </th><th>Domestic </th><th style=\"text-align: right;\"> Beat</th><th style=\"text-align: right;\"> District</th><th style=\"text-align: right;\"> Ward</th><th style=\"text-align: right;\"> Community_Area</th><th>FBI_Code </th><th style=\"text-align: right;\"> X_Coordinate</th><th style=\"text-align: right;\"> Y_Coordinate</th><th style=\"text-align: right;\"> Year</th><th>Updated_On </th><th style=\"text-align: right;\"> Latitude</th><th style=\"text-align: right;\"> Longitude</th><th>Location </th></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95581e+06</td><td>HY144797 </td><td>02/08/2015 11:43:40 PM</td><td>081XX S COLES AVE </td><td style=\"text-align: right;\"> 1811</td><td>NARCOTICS </td><td>POSS: CANNABIS 30GMS OR LESS</td><td>STREET </td><td>true </td><td>false </td><td style=\"text-align: right;\"> 422</td><td style=\"text-align: right;\"> 4</td><td style=\"text-align: right;\"> 7</td><td style=\"text-align: right;\"> 46</td><td>18 </td><td style=\"text-align: right;\"> 1.19827e+06</td><td style=\"text-align: right;\"> 1.85163e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.7477</td><td style=\"text-align: right;\"> -87.549 </td><td>(41.747693646, -87.549035389)</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95586e+06</td><td>HY144838 </td><td>02/08/2015 11:41:42 PM</td><td>118XX S STATE ST </td><td style=\"text-align: right;\"> 0486</td><td>BATTERY </td><td>DOMESTIC BATTERY SIMPLE </td><td>APARTMENT </td><td>true </td><td>true </td><td style=\"text-align: right;\"> 522</td><td style=\"text-align: right;\"> 5</td><td style=\"text-align: right;\"> 34</td><td style=\"text-align: right;\"> 53</td><td>08B </td><td style=\"text-align: right;\"> 1.17834e+06</td><td style=\"text-align: right;\"> 1.82658e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.6794</td><td style=\"text-align: right;\"> -87.6229</td><td>(41.679442289, -87.622850758)</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.9558e+06 </td><td>HY144779 </td><td>02/08/2015 11:30:22 PM</td><td>002XX S LARAMIE AVE </td><td style=\"text-align: right;\"> 2026</td><td>NARCOTICS </td><td>POSS: PCP </td><td>SIDEWALK </td><td>true </td><td>false </td><td style=\"text-align: right;\"> 1522</td><td style=\"text-align: right;\"> 15</td><td style=\"text-align: right;\"> 29</td><td style=\"text-align: right;\"> 25</td><td>18 </td><td style=\"text-align: right;\"> 1.14172e+06</td><td style=\"text-align: right;\"> 1.89858e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.8778</td><td style=\"text-align: right;\"> -87.7551</td><td>(41.87777333, -87.755117993) </td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.9562e+06 </td><td>HY144787 </td><td>02/08/2015 11:30:23 PM</td><td>006XX E 67TH ST </td><td style=\"text-align: right;\"> 1811</td><td>NARCOTICS </td><td>POSS: CANNABIS 30GMS OR LESS</td><td>STREET </td><td>true </td><td>false </td><td style=\"text-align: right;\"> 321</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> 6</td><td style=\"text-align: right;\"> 42</td><td>18 </td><td style=\"text-align: right;\"> nan </td><td style=\"text-align: right;\"> nan </td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> nan </td><td style=\"text-align: right;\"> nan </td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95585e+06</td><td>HY144829 </td><td>02/08/2015 11:30:58 PM</td><td>0000X S MAYFIELD AVE</td><td style=\"text-align: right;\"> 0610</td><td>BURGLARY </td><td>FORCIBLE ENTRY </td><td>APARTMENT </td><td>false </td><td>false </td><td style=\"text-align: right;\"> 1513</td><td style=\"text-align: right;\"> 15</td><td style=\"text-align: right;\"> 29</td><td style=\"text-align: right;\"> 25</td><td>05 </td><td style=\"text-align: right;\"> 1.13724e+06</td><td style=\"text-align: right;\"> 1.89937e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.88 </td><td style=\"text-align: right;\"> -87.7715</td><td>(41.880025548, -87.771541324)</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95584e+06</td><td>HY144778 </td><td>02/08/2015 11:30:21 PM</td><td>010XX W 48TH ST </td><td style=\"text-align: right;\"> 0486</td><td>BATTERY </td><td>DOMESTIC BATTERY SIMPLE </td><td>APARTMENT </td><td>false </td><td>true </td><td style=\"text-align: right;\"> 933</td><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 3</td><td style=\"text-align: right;\"> 61</td><td>08B </td><td style=\"text-align: right;\"> 1.16999e+06</td><td style=\"text-align: right;\"> 1.87302e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.8071</td><td style=\"text-align: right;\"> -87.6521</td><td>(41.807059405, -87.65206589) </td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95587e+06</td><td>HY144822 </td><td>02/08/2015 11:27:24 PM</td><td>015XX W ARTHUR AVE </td><td style=\"text-align: right;\"> 1320</td><td>CRIMINAL DAMAGE</td><td>TO VEHICLE </td><td>STREET </td><td>false </td><td>false </td><td style=\"text-align: right;\"> 2432</td><td style=\"text-align: right;\"> 24</td><td style=\"text-align: right;\"> 40</td><td style=\"text-align: right;\"> 1</td><td>14 </td><td style=\"text-align: right;\"> 1.16473e+06</td><td style=\"text-align: right;\"> 1.94322e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.9998</td><td style=\"text-align: right;\"> -87.6693</td><td>(41.999814056, -87.669342967)</td></tr>\n", "<tr><td style=\"text-align: right;\">21752 </td><td>HY144738 </td><td>02/08/2015 11:26:12 PM</td><td>060XX W GRAND AVE </td><td style=\"text-align: right;\"> 0110</td><td>HOMICIDE </td><td>FIRST DEGREE MURDER </td><td>STREET </td><td>true </td><td>false </td><td style=\"text-align: right;\"> 2512</td><td style=\"text-align: right;\"> 25</td><td style=\"text-align: right;\"> 37</td><td style=\"text-align: right;\"> 19</td><td>01A </td><td style=\"text-align: right;\"> 1.13591e+06</td><td style=\"text-align: right;\"> 1.91421e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.9208</td><td style=\"text-align: right;\"> -87.7761</td><td>(41.920755683, -87.776067514)</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95581e+06</td><td>HY144775 </td><td>02/08/2015 11:20:33 PM</td><td>001XX W WACKER DR </td><td style=\"text-align: right;\"> 0460</td><td>BATTERY </td><td>SIMPLE </td><td>OTHER </td><td>false </td><td>false </td><td style=\"text-align: right;\"> 122</td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 42</td><td style=\"text-align: right;\"> 32</td><td>08B </td><td style=\"text-align: right;\"> 1.17538e+06</td><td style=\"text-align: right;\"> 1.90209e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.8867</td><td style=\"text-align: right;\"> -87.6314</td><td>(41.886707818, -87.631396356)</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9.95828e+06</td><td>HY146732 </td><td>02/08/2015 11:15:36 PM</td><td>001XX W WACKER DR </td><td style=\"text-align: right;\"> 0460</td><td>BATTERY </td><td>SIMPLE </td><td>HOTEL/MOTEL </td><td>false </td><td>false </td><td style=\"text-align: right;\"> 122</td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 42</td><td style=\"text-align: right;\"> 32</td><td>08B </td><td style=\"text-align: right;\"> 1.17538e+06</td><td style=\"text-align: right;\"> 1.90209e+06</td><td style=\"text-align: right;\"> 2015</td><td>02/15/2015 12:43:39 PM</td><td style=\"text-align: right;\"> 41.8867</td><td style=\"text-align: right;\"> -87.6314</td><td>(41.886707818, -87.631396356)</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Modify column names to replace blank spaces with underscore\n", "col_names = map(lambda s: s.replace(' ', '_'), f_crimes.col_names)\n", "f_crimes.set_names(col_names)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Set time zone to UTC for date manipulation\n", "h2o.set_timezone(\"Etc/UTC\")\n", "\n", "## Refine date column \n", "def refine_date_col(data, col, pattern):\n", " data[col] = data[col].as_date(pattern)\n", " data[\"Day\"] = data[col].day()\n", " data[\"Month\"] = data[col].month() # Since H2O indexes from 0\n", " data[\"Year\"] = data[col].year()\n", " data[\"WeekNum\"] = data[col].week()\n", " data[\"WeekDay\"] = data[col].dayOfWeek()\n", " data[\"HourOfDay\"] = data[col].hour()\n", " \n", " # Create weekend and season cols\n", " data[\"Weekend\"] = (data[\"WeekDay\"] == \"Sun\" or data[\"WeekDay\"] == \"Sat\").ifelse(1, 0)[0]\n", " data[\"Season\"] = data[\"Month\"].cut([0, 2, 5, 7, 10, 12], [\"Winter\", \"Spring\", \"Summer\", \"Autumn\", \"Winter\"])\n", " \n", "refine_date_col(f_crimes, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "f_crimes = f_crimes.drop(\"Date\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "[77, 9]\n" ] } ], "source": [ "# Parse Census data into H2O\n", "f_census = h2o.import_file(\"../data/chicagoCensus.csv\",header=1)\n", "\n", "## Update column names in the table\n", "col_names = map(lambda s: s.strip().replace(' ', '_'), f_census.col_names)\n", "f_census.set_names(col_names)\n", "f_census = f_census[1:78,:]\n", "print(f_census.dim)\n", "#f_census.summary()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "[5162, 6]\n" ] } ], "source": [ "# Parse Weather data into H2O\n", "f_weather = h2o.import_file(\"../data/chicagoAllWeather.csv\")\n", "f_weather = f_weather[1:]\n", "print(f_weather.dim)\n", "#f_weather.summary()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th style=\"text-align: right;\"> month</th><th style=\"text-align: right;\"> day</th><th style=\"text-align: right;\"> year</th><th style=\"text-align: right;\"> maxTemp</th><th style=\"text-align: right;\"> meanTemp</th><th style=\"text-align: right;\"> minTemp</th></tr>\n", "<tr><td style=\"text-align: right;\"> 6</td><td style=\"text-align: right;\"> 19</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 23</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 24</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 25</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 26</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 27</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 28</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 29</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 30</td><td style=\"text-align: right;\"> 2008</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "<tr><td style=\"text-align: right;\"> 3</td><td style=\"text-align: right;\"> 4</td><td style=\"text-align: right;\"> 2009</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td><td style=\"text-align: right;\"> nan</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at all the null entires in the Weather table\n", "f_weather[f_weather[\"meanTemp\"].isna()]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] }, { "data": { "text/html": [ "<table>\n", "<tr><th style=\"text-align: right;\"> month</th><th style=\"text-align: right;\"> day</th><th style=\"text-align: right;\"> year</th><th style=\"text-align: right;\"> maxTemp</th><th style=\"text-align: right;\"> meanTemp</th><th style=\"text-align: right;\"> minTemp</th></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 23</td><td style=\"text-align: right;\"> 14</td><td style=\"text-align: right;\"> 6</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 2</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 18</td><td style=\"text-align: right;\"> 12</td><td style=\"text-align: right;\"> 6</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 3</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 28</td><td style=\"text-align: right;\"> 18</td><td style=\"text-align: right;\"> 8</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 4</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 30</td><td style=\"text-align: right;\"> 24</td><td style=\"text-align: right;\"> 19</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 5</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 36</td><td style=\"text-align: right;\"> 30</td><td style=\"text-align: right;\"> 21</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 6</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 33</td><td style=\"text-align: right;\"> 26</td><td style=\"text-align: right;\"> 19</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 7</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 34</td><td style=\"text-align: right;\"> 28</td><td style=\"text-align: right;\"> 21</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 8</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 26</td><td style=\"text-align: right;\"> 20</td><td style=\"text-align: right;\"> 14</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 9</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 23</td><td style=\"text-align: right;\"> 16</td><td style=\"text-align: right;\"> 10</td></tr>\n", "<tr><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 10</td><td style=\"text-align: right;\"> 2001</td><td style=\"text-align: right;\"> 34</td><td style=\"text-align: right;\"> 26</td><td style=\"text-align: right;\"> 19</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the help on as_h2o_frame \n", "hc.as_data_frame?\n", "f_weather" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Copy data frames to Spark from H2O\n", "df_weather = hc.as_spark_frame(f_weather,)\n", "df_census = hc.as_spark_frame(f_census)\n", "df_crimes = hc.as_spark_frame(f_crimes)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----+---+----+-------+--------+-------+\n", "|month|day|year|maxTemp|meanTemp|minTemp|\n", "+-----+---+----+-------+--------+-------+\n", "| 1| 1|2001| 23| 14| 6|\n", "| 1| 2|2001| 18| 12| 6|\n", "+-----+---+----+-------+--------+-------+\n", "only showing top 2 rows\n", "\n" ] } ], "source": [ "# Look at the weather data as parsed in Spark\n", "df_weather.show(2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Join columns from Crime, Census and Weather DataFrames in Spark\n", "\n", "## Register DataFrames as tables in SQL context\n", "sqlContext.registerDataFrameAsTable(df_weather, \"chicagoWeather\")\n", "sqlContext.registerDataFrameAsTable(df_census, \"chicagoCensus\")\n", "sqlContext.registerDataFrameAsTable(df_crimes, \"chicagoCrime\")\n", "\n", "\n", "crimeWithWeather = sqlContext.sql(\"\"\"SELECT\n", "a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,\n", "a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,\n", "a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,\n", "b.minTemp, b.maxTemp, b.meanTemp,\n", "c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,\n", "c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,\n", "c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA\n", "FROM chicagoCrime a\n", "JOIN chicagoWeather b\n", "ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day\n", "JOIN chicagoCensus c\n", "ON a.Community_Area = c.Community_Area_Number\"\"\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-----+---+-------+---------+-------+------+-------+----+-----------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+\n", "|Year|Month|Day|WeekNum|HourOfDay|Weekend|Season|WeekDay|IUCR| Primary_Type|Location_Description|Community_Area|District|Arrest|Domestic|Beat|Ward|FBI_Code|minTemp|maxTemp|meanTemp|PERCENT_AGED_UNDER_18_OR_OVER_64|PER_CAPITA_INCOME|HARDSHIP_INDEX|PERCENT_OF_HOUSING_CROWDED|PERCENT_HOUSEHOLDS_BELOW_POVERTY|PERCENT_AGED_16__UNEMPLOYED|PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA|\n", "+----+-----+---+-------+---------+-------+------+-------+----+-----------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+\n", "|2015| 1| 23| 4| 22| 0|Winter| Fri|143A|WEAPONS VIOLATION| ALLEY| 31| 12| true| false|1234| 25| 15| 29| 31| 30| 32.6| 16444| 76| 9.600000000000001| 25.8| 15.8| 40.7|\n", "|2015| 1| 23| 4| 19| 0|Winter| Fri|4625| OTHER OFFENSE| SIDEWALK| 31| 10| true| false|1034| 25| 26| 29| 31| 30| 32.6| 16444| 76| 9.600000000000001| 25.8| 15.8| 40.7|\n", "+----+-----+---+-------+---------+-------+------+-------+----+-----------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+\n", "only showing top 2 rows\n", "\n" ] } ], "source": [ "# Print the crimeWithWeather data table from Spark\n", "crimeWithWeather.show(2)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] } ], "source": [ "#Copy table from Spark to H2O\n", "hc.as_h2o_frame?\n", "crimeWithWeatherHF = hc.as_h2o_frame(crimeWithWeather,framename=\"crimeWithWeather\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th> </th><th>Year </th><th>Month </th><th>Day </th><th>WeekNum </th><th>HourOfDay </th><th>Weekend </th><th>Season </th><th>WeekDay </th><th>IUCR </th><th>Primary_Type </th><th>Location_Description </th><th>Community_Area </th><th>District </th><th>Arrest </th><th>Domestic </th><th>Beat </th><th>Ward </th><th>FBI_Code </th><th>minTemp </th><th>maxTemp </th><th>meanTemp </th><th>PERCENT_AGED_UNDER_18_OR_OVER_64 </th><th>PER_CAPITA_INCOME </th><th>HARDSHIP_INDEX </th><th>PERCENT_OF_HOUSING_CROWDED </th><th>PERCENT_HOUSEHOLDS_BELOW_POVERTY </th><th>PERCENT_AGED_16__UNEMPLOYED </th><th>PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA </th></tr>\n", "<tr><td>type </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>string </td><td>string </td><td>string</td><td>string </td><td>string </td><td>int </td><td>int </td><td>string </td><td>string </td><td>int </td><td>int </td><td>string </td><td>int </td><td>int </td><td>int </td><td>real </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>real </td></tr>\n", "<tr><td>mins </td><td>2015.0</td><td>1.0 </td><td>1.0 </td><td>4.0 </td><td>0.0 </td><td>0.0 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>1.0 </td><td>1.0 </td><td>NaN </td><td>NaN </td><td>111.0 </td><td>1.0 </td><td>NaN </td><td>-2.0 </td><td>15.0 </td><td>7.0 </td><td>13.5 </td><td>8201.0 </td><td>1.0 </td><td>0.3 </td><td>3.3 </td><td>4.7 </td><td>2.5 </td></tr>\n", "<tr><td>mean </td><td>2015.0</td><td>1.41944194419 </td><td>17.6839683968</td><td>5.18081808181 </td><td>13.6319631963</td><td>0.159115911591</td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>37.4476447645 </td><td>11.3489885128</td><td>NaN </td><td>NaN </td><td>1159.61806181</td><td>22.9540954095</td><td>NaN </td><td>17.699669967 </td><td>31.7199719972</td><td>24.9408940894</td><td>35.0596759676 </td><td>25221.3057306 </td><td>54.4786478648 </td><td>5.43707370737 </td><td>24.600750075 </td><td>16.8288328833 </td><td>21.096639664 </td></tr>\n", "<tr><td>maxs </td><td>2015.0</td><td>2.0 </td><td>31.0 </td><td>6.0 </td><td>23.0 </td><td>1.0 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>77.0 </td><td>25.0 </td><td>NaN </td><td>NaN </td><td>2535.0 </td><td>50.0 </td><td>NaN </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>51.5 </td><td>88669.0 </td><td>98.0 </td><td>15.8 </td><td>56.5 </td><td>35.9 </td><td>54.8 </td></tr>\n", "<tr><td>sigma </td><td>0.0 </td><td>0.493492406787</td><td>11.1801043358</td><td>0.738929830409</td><td>6.47321735807</td><td>0.365802434041</td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>21.2748762223 </td><td>6.94547493301</td><td>NaN </td><td>NaN </td><td>695.76029875 </td><td>13.6495661144</td><td>NaN </td><td>8.96118136438</td><td>6.93809913472</td><td>7.46302527062</td><td>7.95653388237 </td><td>18010.0446225 </td><td>29.3247456472 </td><td>3.75289588494 </td><td>10.1450570661 </td><td>7.58926327988 </td><td>11.3868817911 </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>374 </td><td>8408 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>6 </td><td>0 </td><td>162 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>0 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>22.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>2820 </td><td>OTHER OFFENSE </td><td>APARTMENT </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1034.0 </td><td>25.0 </td><td>26 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>1 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>21.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>1310 </td><td>CRIMINAL DAMAGE </td><td>RESTAURANT </td><td>31.0 </td><td>12.0 </td><td>true </td><td>false </td><td>1233.0 </td><td>25.0 </td><td>14 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>2 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>18.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>1750 </td><td>OFFENSE INVOLVING CHILDREN</td><td>RESIDENCE </td><td>31.0 </td><td>12.0 </td><td>false </td><td>true </td><td>1235.0 </td><td>25.0 </td><td>20 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>3 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>18.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0460 </td><td>BATTERY </td><td>OTHER </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1023.0 </td><td>25.0 </td><td>08B </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>4 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>13.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0890 </td><td>THEFT </td><td>CURRENCY EXCHANGE </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1023.0 </td><td>25.0 </td><td>06 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>5 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>9.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0560 </td><td>ASSAULT </td><td>OTHER </td><td>31.0 </td><td>12.0 </td><td>false </td><td>false </td><td>1234.0 </td><td>25.0 </td><td>08A </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>6 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>8.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0486 </td><td>BATTERY </td><td>RESIDENCE </td><td>31.0 </td><td>12.0 </td><td>true </td><td>true </td><td>1235.0 </td><td>25.0 </td><td>08B </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>7 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>1.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0420 </td><td>BATTERY </td><td>SIDEWALK </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1034.0 </td><td>25.0 </td><td>04B </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>8 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>0.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>1320 </td><td>CRIMINAL DAMAGE </td><td>PARKING LOT/GARAGE(NON.RESID.)</td><td>31.0 </td><td>9.0 </td><td>false </td><td>false </td><td>912.0 </td><td>11.0 </td><td>14 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>9 </td><td>2015.0</td><td>1.0 </td><td>31.0 </td><td>5.0 </td><td>23.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0820 </td><td>THEFT </td><td>SIDEWALK </td><td>31.0 </td><td>12.0 </td><td>false </td><td>false </td><td>1234.0 </td><td>25.0 </td><td>06 </td><td>19.0 </td><td>36.0 </td><td>28.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "crimeWithWeatherHF.summary()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Assign column types to the CrimeWeatherHF data table in H2O\n", "crimeWithWeatherHF[\"Season\"]= crimeWithWeatherHF[\"Season\"].asfactor()\n", "crimeWithWeatherHF[\"WeekDay\"]= crimeWithWeatherHF[\"WeekDay\"].asfactor()\n", "crimeWithWeatherHF[\"IUCR\"]= crimeWithWeatherHF[\"IUCR\"].asfactor()\n", "crimeWithWeatherHF[\"Primary_Type\"]= crimeWithWeatherHF[\"Primary_Type\"].asfactor()\n", "crimeWithWeatherHF[\"Location_Description\"]= crimeWithWeatherHF[\"Location_Description\"].asfactor()\n", "crimeWithWeatherHF[\"Arrest\"]= crimeWithWeatherHF[\"Arrest\"].asfactor()\n", "crimeWithWeatherHF[\"Domestic\"]= crimeWithWeatherHF[\"Domestic\"].asfactor()\n", "crimeWithWeatherHF[\"FBI_Code\"]= crimeWithWeatherHF[\"FBI_Code\"].asfactor()\n", "crimeWithWeatherHF[\"Season\"]= crimeWithWeatherHF[\"Season\"].asfactor()\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th> </th><th>Year </th><th>Month </th><th>Day </th><th>WeekNum </th><th>HourOfDay </th><th>Weekend </th><th>Season </th><th>WeekDay </th><th>IUCR </th><th>Primary_Type </th><th>Location_Description </th><th>Community_Area </th><th>District </th><th>Arrest </th><th>Domestic </th><th>Beat </th><th>Ward </th><th>FBI_Code </th><th>minTemp </th><th>maxTemp </th><th>meanTemp </th><th>PERCENT_AGED_UNDER_18_OR_OVER_64 </th><th>PER_CAPITA_INCOME </th><th>HARDSHIP_INDEX </th><th>PERCENT_OF_HOUSING_CROWDED </th><th>PERCENT_HOUSEHOLDS_BELOW_POVERTY </th><th>PERCENT_AGED_16__UNEMPLOYED </th><th>PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA </th></tr>\n", "<tr><td>type </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>enum </td><td>enum </td><td>enum </td><td>enum </td><td>enum </td><td>int </td><td>int </td><td>enum </td><td>enum </td><td>int </td><td>int </td><td>enum </td><td>int </td><td>int </td><td>int </td><td>real </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>real </td></tr>\n", "<tr><td>mins </td><td>2015.0</td><td>1.0 </td><td>1.0 </td><td>4.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>0.0 </td><td>111.0 </td><td>1.0 </td><td>0.0 </td><td>-2.0 </td><td>15.0 </td><td>7.0 </td><td>13.5 </td><td>8201.0 </td><td>1.0 </td><td>0.3 </td><td>3.3 </td><td>4.7 </td><td>2.5 </td></tr>\n", "<tr><td>mean </td><td>2015.0</td><td>1.41944194419 </td><td>17.6839683968</td><td>5.18081808181 </td><td>13.6319631963</td><td>0.159115911591</td><td>0.0 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>37.4476447645 </td><td>11.3489885128</td><td>0.292829282928</td><td>0.152315231523</td><td>1159.61806181</td><td>22.9540954095</td><td>NaN </td><td>17.699669967 </td><td>31.7199719972</td><td>24.9408940894</td><td>35.0596759676 </td><td>25221.3057306 </td><td>54.4786478648 </td><td>5.43707370737 </td><td>24.600750075 </td><td>16.8288328833 </td><td>21.096639664 </td></tr>\n", "<tr><td>maxs </td><td>2015.0</td><td>2.0 </td><td>31.0 </td><td>6.0 </td><td>23.0 </td><td>1.0 </td><td>0.0 </td><td>6.0 </td><td>212.0 </td><td>26.0 </td><td>90.0 </td><td>77.0 </td><td>25.0 </td><td>1.0 </td><td>1.0 </td><td>2535.0 </td><td>50.0 </td><td>24.0 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>51.5 </td><td>88669.0 </td><td>98.0 </td><td>15.8 </td><td>56.5 </td><td>35.9 </td><td>54.8 </td></tr>\n", "<tr><td>sigma </td><td>0.0 </td><td>0.493492406787</td><td>11.1801043358</td><td>0.738929830409</td><td>6.47321735807</td><td>0.365802434041</td><td>0.0 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>NaN </td><td>21.2748762223 </td><td>6.94547493301</td><td>0.455083515588</td><td>0.35934414686 </td><td>695.76029875 </td><td>13.6495661144</td><td>NaN </td><td>8.96118136438</td><td>6.93809913472</td><td>7.46302527062</td><td>7.95653388237 </td><td>18010.0446225 </td><td>29.3247456472 </td><td>3.75289588494 </td><td>10.1450570661 </td><td>7.58926327988 </td><td>11.3868817911 </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>374 </td><td>8408 </td><td>9999 </td><td>1942 </td><td>16 </td><td>11 </td><td>19 </td><td>0 </td><td>0 </td><td>7071 </td><td>8476 </td><td>0 </td><td>0 </td><td>16 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>6 </td><td>0 </td><td>162 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>0 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>22.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>2820 </td><td>OTHER OFFENSE </td><td>APARTMENT </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1034.0 </td><td>25.0 </td><td>26 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>1 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>21.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>1310 </td><td>CRIMINAL DAMAGE </td><td>RESTAURANT </td><td>31.0 </td><td>12.0 </td><td>true </td><td>false </td><td>1233.0 </td><td>25.0 </td><td>14 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>2 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>18.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>1750 </td><td>OFFENSE INVOLVING CHILDREN</td><td>RESIDENCE </td><td>31.0 </td><td>12.0 </td><td>false </td><td>true </td><td>1235.0 </td><td>25.0 </td><td>20 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>3 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>18.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0460 </td><td>BATTERY </td><td>OTHER </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1023.0 </td><td>25.0 </td><td>08B </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>4 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>13.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0890 </td><td>THEFT </td><td>CURRENCY EXCHANGE </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1023.0 </td><td>25.0 </td><td>06 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>5 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>9.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0560 </td><td>ASSAULT </td><td>OTHER </td><td>31.0 </td><td>12.0 </td><td>false </td><td>false </td><td>1234.0 </td><td>25.0 </td><td>08A </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>6 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>8.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0486 </td><td>BATTERY </td><td>RESIDENCE </td><td>31.0 </td><td>12.0 </td><td>true </td><td>true </td><td>1235.0 </td><td>25.0 </td><td>08B </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>7 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>1.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0420 </td><td>BATTERY </td><td>SIDEWALK </td><td>31.0 </td><td>10.0 </td><td>false </td><td>false </td><td>1034.0 </td><td>25.0 </td><td>04B </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>8 </td><td>2015.0</td><td>1.0 </td><td>24.0 </td><td>4.0 </td><td>0.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>1320 </td><td>CRIMINAL DAMAGE </td><td>PARKING LOT/GARAGE(NON.RESID.)</td><td>31.0 </td><td>9.0 </td><td>false </td><td>false </td><td>912.0 </td><td>11.0 </td><td>14 </td><td>29.0 </td><td>43.0 </td><td>36.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "<tr><td>9 </td><td>2015.0</td><td>1.0 </td><td>31.0 </td><td>5.0 </td><td>23.0 </td><td>0.0 </td><td>Winter </td><td>Sat </td><td>0820 </td><td>THEFT </td><td>SIDEWALK </td><td>31.0 </td><td>12.0 </td><td>false </td><td>false </td><td>1234.0 </td><td>25.0 </td><td>06 </td><td>19.0 </td><td>36.0 </td><td>28.0 </td><td>32.6 </td><td>16444.0 </td><td>76.0 </td><td>9.6 </td><td>25.8 </td><td>15.8 </td><td>40.7 </td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "crimeWithWeatherHF.summary()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split final H2O data table into train test and validation sets\n", "ratios = [0.6,0.2]\n", "frs = crimeWithWeatherHF.split_frame(ratios,seed=12345)\n", "train = frs[0]\n", "train.frame_id = \"Train\"\n", "valid = frs[2]\n", "valid.frame_id = \"Validation\"\n", "test = frs[1]\n", "test.frame_id = \"Test\"" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import Model Builders from H2O Python\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Inspect the avialble gbm parameters\n", "H2OGradientBoostingEstimator?" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Define Preditors\n", "predictors = crimeWithWeatherHF.names[:]\n", "response = \"Arrest\"\n", "predictors.remove(response)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] } ], "source": [ "#Simple GBM model - Predict Arrest\n", "model_gbm = H2OGradientBoostingEstimator(ntrees =50,\n", " max_depth =6,\n", " learn_rate =0.1, \n", " #nfolds =2,\n", " distribution =\"bernoulli\")\n", "\n", "model_gbm.train(x =predictors,\n", " y =\"Arrest\",\n", " training_frame =train,\n", " validation_frame=valid\n", " )\n", "\n", "# Simple Deep Learning - Predict Arrest\n", "model_dl = H2ODeepLearningEstimator(variable_importances=True,\n", " loss =\"Automatic\")\n", "\n", "model_dl.train(x =predictors,\n", " y =\"Arrest\",\n", " training_frame =train,\n", " validation_frame=valid)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.335827722991:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>false</b></td>\n", "<td><b>true</b></td>\n", "<td><b>Error</b></td>\n", "<td><b>Rate</b></td></tr>\n", "<tr><td>false</td>\n", "<td>4125.0</td>\n", "<td>142.0</td>\n", "<td>0.0333</td>\n", "<td> (142.0/4267.0)</td></tr>\n", "<tr><td>true</td>\n", "<td>251.0</td>\n", "<td>1504.0</td>\n", "<td>0.143</td>\n", "<td> (251.0/1755.0)</td></tr>\n", "<tr><td>Total</td>\n", "<td>4376.0</td>\n", "<td>1646.0</td>\n", "<td>0.0653</td>\n", "<td> (393.0/6022.0)</td></tr></table></div>" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 4125 142 0.0333 (142.0/4267.0)\n", "true 251 1504 0.143 (251.0/1755.0)\n", "Total 4376 1646 0.0653 (393.0/6022.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.432844055866:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>false</b></td>\n", "<td><b>true</b></td>\n", "<td><b>Error</b></td>\n", "<td><b>Rate</b></td></tr>\n", "<tr><td>false</td>\n", "<td>1362.0</td>\n", "<td>61.0</td>\n", "<td>0.0429</td>\n", "<td> (61.0/1423.0)</td></tr>\n", "<tr><td>true</td>\n", "<td>150.0</td>\n", "<td>443.0</td>\n", "<td>0.253</td>\n", "<td> (150.0/593.0)</td></tr>\n", "<tr><td>Total</td>\n", "<td>1512.0</td>\n", "<td>504.0</td>\n", "<td>0.1047</td>\n", "<td> (211.0/2016.0)</td></tr></table></div>" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 1362 61 0.0429 (61.0/1423.0)\n", "true 150 443 0.253 (150.0/593.0)\n", "Total 1512 504 0.1047 (211.0/2016.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Print confusion matrices for the train and validation set\n", "print(model_gbm.confusion_matrix(train = True))\n", "print(model_gbm.confusion_matrix(valid = True))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.974667176776\n", "0.92596751276\n" ] } ], "source": [ "print(model_gbm.auc(train=True))\n", "print(model_gbm.auc(valid=True))\n", "model_gbm.plot(metric=\"AUC\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>variable</th>\n", " <th>relative_importance</th>\n", " <th>scaled_importance</th>\n", " <th>percentage</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>IUCR</td>\n", " <td>4280.939453</td>\n", " <td>1.000000e+00</td>\n", " <td>8.234218e-01</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Location_Description</td>\n", " <td>487.323059</td>\n", " <td>1.138355e-01</td>\n", " <td>9.373466e-02</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>WeekDay</td>\n", " <td>55.790558</td>\n", " <td>1.303232e-02</td>\n", " <td>1.073109e-02</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>HourOfDay</td>\n", " <td>55.419220</td>\n", " <td>1.294557e-02</td>\n", " <td>1.065967e-02</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>PERCENT_AGED_16__UNEMPLOYED</td>\n", " <td>34.422894</td>\n", " <td>8.040967e-03</td>\n", " <td>6.621107e-03</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Beat</td>\n", " <td>31.468222</td>\n", " <td>7.350775e-03</td>\n", " <td>6.052788e-03</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>PERCENT_HOUSEHOLDS_BELOW_POVERTY</td>\n", " <td>29.103352</td>\n", " <td>6.798356e-03</td>\n", " <td>5.597915e-03</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>PER_CAPITA_INCOME</td>\n", " <td>26.233143</td>\n", " <td>6.127894e-03</td>\n", " <td>5.045841e-03</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>PERCENT_AGED_UNDER_18_OR_OVER_64</td>\n", " <td>24.077402</td>\n", " <td>5.624327e-03</td>\n", " <td>4.631193e-03</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>Day</td>\n", " <td>23.472567</td>\n", " <td>5.483041e-03</td>\n", " <td>4.514855e-03</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>maxTemp</td>\n", " <td>11.300793</td>\n", " <td>2.639793e-03</td>\n", " <td>2.173663e-03</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>Community_Area</td>\n", " <td>10.252146</td>\n", " <td>2.394835e-03</td>\n", " <td>1.971960e-03</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>HARDSHIP_INDEX</td>\n", " <td>10.116072</td>\n", " <td>2.363049e-03</td>\n", " <td>1.945786e-03</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>Domestic</td>\n", " <td>9.294327</td>\n", " <td>2.171095e-03</td>\n", " <td>1.787727e-03</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>District</td>\n", " <td>8.304654</td>\n", " <td>1.939914e-03</td>\n", " <td>1.597367e-03</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>minTemp</td>\n", " <td>6.243027</td>\n", " <td>1.458331e-03</td>\n", " <td>1.200822e-03</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>WeekNum</td>\n", " <td>4.230102</td>\n", " <td>9.881246e-04</td>\n", " <td>8.136433e-04</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>FBI_Code</td>\n", " <td>2.363182</td>\n", " <td>5.520241e-04</td>\n", " <td>4.545486e-04</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>Month</td>\n", " <td>0.000018</td>\n", " <td>4.187325e-09</td>\n", " <td>3.447935e-09</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>Weekend</td>\n", " <td>0.000000</td>\n", " <td>0.000000e+00</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>25 rows × 4 columns</p>\n", "</div>" ], "text/plain": [ " variable relative_importance scaled_importance \\\n", "0 IUCR 4280.939453 1.000000e+00 \n", "1 Location_Description 487.323059 1.138355e-01 \n", "2 WeekDay 55.790558 1.303232e-02 \n", "3 HourOfDay 55.419220 1.294557e-02 \n", "4 PERCENT_AGED_16__UNEMPLOYED 34.422894 8.040967e-03 \n", "5 Beat 31.468222 7.350775e-03 \n", "6 PERCENT_HOUSEHOLDS_BELOW_POVERTY 29.103352 6.798356e-03 \n", "7 PER_CAPITA_INCOME 26.233143 6.127894e-03 \n", "8 PERCENT_AGED_UNDER_18_OR_OVER_64 24.077402 5.624327e-03 \n", "9 Day 23.472567 5.483041e-03 \n", ".. ... ... ... \n", "15 maxTemp 11.300793 2.639793e-03 \n", "16 Community_Area 10.252146 2.394835e-03 \n", "17 HARDSHIP_INDEX 10.116072 2.363049e-03 \n", "18 Domestic 9.294327 2.171095e-03 \n", "19 District 8.304654 1.939914e-03 \n", "20 minTemp 6.243027 1.458331e-03 \n", "21 WeekNum 4.230102 9.881246e-04 \n", "22 FBI_Code 2.363182 5.520241e-04 \n", "23 Month 0.000018 4.187325e-09 \n", "24 Weekend 0.000000 0.000000e+00 \n", "\n", " percentage \n", "0 8.234218e-01 \n", "1 9.373466e-02 \n", "2 1.073109e-02 \n", "3 1.065967e-02 \n", "4 6.621107e-03 \n", "5 6.052788e-03 \n", "6 5.597915e-03 \n", "7 5.045841e-03 \n", "8 4.631193e-03 \n", "9 4.514855e-03 \n", ".. ... \n", "15 2.173663e-03 \n", "16 1.971960e-03 \n", "17 1.945786e-03 \n", "18 1.787727e-03 \n", "19 1.597367e-03 \n", "20 1.200822e-03 \n", "21 8.136433e-04 \n", "22 4.545486e-04 \n", "23 3.447935e-09 \n", "24 0.000000e+00 \n", "\n", "[25 rows x 4 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Print variable importance\n", "model_gbm.varimp(True)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Details\n", "=============\n", "H2ODeepLearningEstimator : Deep Learning\n", "Model Key: DeepLearning_model_python_1446861372065_4\n", "\n", "Status of Neuron Layers: predicting Arrest, 2-class classification, bernoulli distribution, CrossEntropy loss, 118,802 weights/biases, 1.4 MB, 72,478 training samples, mini-batch size 1\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>layer</b></td>\n", "<td><b>units</b></td>\n", "<td><b>type</b></td>\n", "<td><b>dropout</b></td>\n", "<td><b>l1</b></td>\n", "<td><b>l2</b></td>\n", "<td><b>mean_rate</b></td>\n", "<td><b>rate_RMS</b></td>\n", "<td><b>momentum</b></td>\n", "<td><b>mean_weight</b></td>\n", "<td><b>weight_RMS</b></td>\n", "<td><b>mean_bias</b></td>\n", "<td><b>bias_RMS</b></td></tr>\n", "<tr><td></td>\n", "<td>1</td>\n", "<td>390</td>\n", "<td>Input</td>\n", "<td>0.0</td>\n", "<td></td>\n", "<td></td>\n", "<td></td>\n", "<td></td>\n", "<td></td>\n", "<td></td>\n", "<td></td>\n", "<td></td>\n", "<td></td></tr>\n", "<tr><td></td>\n", "<td>2</td>\n", "<td>200</td>\n", "<td>Rectifier</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.1</td>\n", "<td>0.3</td>\n", "<td>0.0</td>\n", "<td>-0.0</td>\n", "<td>0.1</td>\n", "<td>-0.0</td>\n", "<td>0.1</td></tr>\n", "<tr><td></td>\n", "<td>3</td>\n", "<td>200</td>\n", "<td>Rectifier</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.1</td>\n", "<td>0.2</td>\n", "<td>0.0</td>\n", "<td>-0.0</td>\n", "<td>0.1</td>\n", "<td>0.8</td>\n", "<td>0.2</td></tr>\n", "<tr><td></td>\n", "<td>4</td>\n", "<td>2</td>\n", "<td>Softmax</td>\n", "<td></td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.4</td>\n", "<td>-0.0</td>\n", "<td>0.0</td></tr></table></div>" ], "text/plain": [ " layer units type dropout l1 l2 mean_rate rate_RMS momentum mean_weight weight_RMS mean_bias bias_RMS\n", "-- ------- ------- --------- --------- ---- ---- ---------------- ---------------- ---------- ----------------- --------------- ------------------ ----------------\n", " 1 390 Input 0.0\n", " 2 200 Rectifier 0.0 0.0 0.0 0.125569459979 0.314496159554 0.0 -0.00145149060662 0.0583122819662 -0.0226196222717 0.106465905905\n", " 3 200 Rectifier 0.0 0.0 0.0 0.089552259468 0.167708098888 0.0 -0.0100148269118 0.0716536939144 0.77415192296 0.243278682232\n", " 4 2 Softmax 0.0 0.0 0.00265202885173 0.00204257667065 0.0 0.003976688287 0.384519815445 -0.000194062303944 0.00806335359812" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "ModelMetricsBinomial: deeplearning\n", "** Reported on train data. **\n", "\n", "MSE: 0.0737426129728\n", "R^2: 0.642891439669\n", "LogLoss: 0.242051500943\n", "AUC: 0.950131166302\n", "Gini: 0.900262332604\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.343997370612:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>false</b></td>\n", "<td><b>true</b></td>\n", "<td><b>Error</b></td>\n", "<td><b>Rate</b></td></tr>\n", "<tr><td>false</td>\n", "<td>4003.0</td>\n", "<td>264.0</td>\n", "<td>0.0619</td>\n", "<td> (264.0/4267.0)</td></tr>\n", "<tr><td>true</td>\n", "<td>358.0</td>\n", "<td>1397.0</td>\n", "<td>0.204</td>\n", "<td> (358.0/1755.0)</td></tr>\n", "<tr><td>Total</td>\n", "<td>4361.0</td>\n", "<td>1661.0</td>\n", "<td>0.1033</td>\n", "<td> (622.0/6022.0)</td></tr></table></div>" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 4003 264 0.0619 (264.0/4267.0)\n", "true 358 1397 0.204 (358.0/1755.0)\n", "Total 4361 1661 0.1033 (622.0/6022.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>metric</b></td>\n", "<td><b>threshold</b></td>\n", "<td><b>value</b></td>\n", "<td><b>idx</b></td></tr>\n", "<tr><td>max f1</td>\n", "<td>0.3</td>\n", "<td>0.8</td>\n", "<td>195.0</td></tr>\n", "<tr><td>max f2</td>\n", "<td>0.2</td>\n", "<td>0.9</td>\n", "<td>278.0</td></tr>\n", "<tr><td>max f0point5</td>\n", "<td>0.7</td>\n", "<td>0.9</td>\n", "<td>86.0</td></tr>\n", "<tr><td>max accuracy</td>\n", "<td>0.5</td>\n", "<td>0.9</td>\n", "<td>149.0</td></tr>\n", "<tr><td>max precision</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>max absolute_MCC</td>\n", "<td>0.3</td>\n", "<td>0.7</td>\n", "<td>195.0</td></tr>\n", "<tr><td>max min_per_class_accuracy</td>\n", "<td>0.2</td>\n", "<td>0.9</td>\n", "<td>247.0</td></tr></table></div>" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.343997 0.817916 195\n", "max f2 0.158173 0.856675 278\n", "max f0point5 0.740563 0.878082 86\n", "max accuracy 0.48931 0.897874 149\n", "max precision 0.999884 1 0\n", "max absolute_MCC 0.343997 0.746452 195\n", "max min_per_class_accuracy 0.217785 0.876923 247" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: deeplearning\n", "** Reported on validation data. **\n", "\n", "MSE: 0.0843305429737\n", "R^2: 0.593831388139\n", "LogLoss: 0.280203809486\n", "AUC: 0.930515181213\n", "Gini: 0.861030362427\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.493462351545:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>false</b></td>\n", "<td><b>true</b></td>\n", "<td><b>Error</b></td>\n", "<td><b>Rate</b></td></tr>\n", "<tr><td>false</td>\n", "<td>1361.0</td>\n", "<td>62.0</td>\n", "<td>0.0436</td>\n", "<td> (62.0/1423.0)</td></tr>\n", "<tr><td>true</td>\n", "<td>158.0</td>\n", "<td>435.0</td>\n", "<td>0.2664</td>\n", "<td> (158.0/593.0)</td></tr>\n", "<tr><td>Total</td>\n", "<td>1519.0</td>\n", "<td>497.0</td>\n", "<td>0.1091</td>\n", "<td> (220.0/2016.0)</td></tr></table></div>" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 1361 62 0.0436 (62.0/1423.0)\n", "true 158 435 0.2664 (158.0/593.0)\n", "Total 1519 497 0.1091 (220.0/2016.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>metric</b></td>\n", "<td><b>threshold</b></td>\n", "<td><b>value</b></td>\n", "<td><b>idx</b></td></tr>\n", "<tr><td>max f1</td>\n", "<td>0.5</td>\n", "<td>0.8</td>\n", "<td>137.0</td></tr>\n", "<tr><td>max f2</td>\n", "<td>0.1</td>\n", "<td>0.8</td>\n", "<td>303.0</td></tr>\n", "<tr><td>max f0point5</td>\n", "<td>0.7</td>\n", "<td>0.9</td>\n", "<td>82.0</td></tr>\n", "<tr><td>max accuracy</td>\n", "<td>0.7</td>\n", "<td>0.9</td>\n", "<td>91.0</td></tr>\n", "<tr><td>max precision</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>max absolute_MCC</td>\n", "<td>0.7</td>\n", "<td>0.7</td>\n", "<td>91.0</td></tr>\n", "<tr><td>max min_per_class_accuracy</td>\n", "<td>0.2</td>\n", "<td>0.8</td>\n", "<td>236.0</td></tr></table></div>" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.493462 0.798165 137\n", "max f2 0.0962013 0.830341 303\n", "max f0point5 0.740357 0.877695 82\n", "max accuracy 0.69578 0.893849 91\n", "max precision 0.999877 1 0\n", "max absolute_MCC 0.69578 0.738603 91\n", "max min_per_class_accuracy 0.207426 0.836425 236" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Scoring History:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>timestamp</b></td>\n", "<td><b>duration</b></td>\n", "<td><b>training_speed</b></td>\n", "<td><b>epochs</b></td>\n", "<td><b>samples</b></td>\n", "<td><b>training_MSE</b></td>\n", "<td><b>training_r2</b></td>\n", "<td><b>training_logloss</b></td>\n", "<td><b>training_AUC</b></td>\n", "<td><b>training_classification_error</b></td>\n", "<td><b>validation_MSE</b></td>\n", "<td><b>validation_r2</b></td>\n", "<td><b>validation_logloss</b></td>\n", "<td><b>validation_AUC</b></td>\n", "<td><b>validation_classification_error</b></td></tr>\n", "<tr><td></td>\n", "<td>2015-11-06 17:57:05</td>\n", "<td> 0.000 sec</td>\n", "<td>None</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td>\n", "<td>nan</td></tr>\n", "<tr><td></td>\n", "<td>2015-11-06 17:57:09</td>\n", "<td> 2.899 sec</td>\n", "<td>2594 rows/sec</td>\n", "<td>1.0</td>\n", "<td>6068.0</td>\n", "<td>0.1</td>\n", "<td>0.3</td>\n", "<td>0.6</td>\n", "<td>0.9</td>\n", "<td>0.1</td>\n", "<td>0.1</td>\n", "<td>0.3</td>\n", "<td>0.6</td>\n", "<td>0.9</td>\n", "<td>0.1</td></tr>\n", "<tr><td></td>\n", "<td>2015-11-06 17:57:15</td>\n", "<td> 9.096 sec</td>\n", "<td>5465 rows/sec</td>\n", "<td>7.3</td>\n", "<td>43742.0</td>\n", "<td>0.1</td>\n", "<td>0.6</td>\n", "<td>0.3</td>\n", "<td>0.9</td>\n", "<td>0.1</td>\n", "<td>0.1</td>\n", "<td>0.6</td>\n", "<td>0.3</td>\n", "<td>0.9</td>\n", "<td>0.1</td></tr>\n", "<tr><td></td>\n", "<td>2015-11-06 17:57:19</td>\n", "<td>12.425 sec</td>\n", "<td>6571 rows/sec</td>\n", "<td>12.0</td>\n", "<td>72478.0</td>\n", "<td>0.1</td>\n", "<td>0.6</td>\n", "<td>0.2</td>\n", "<td>1.0</td>\n", "<td>0.1</td>\n", "<td>0.1</td>\n", "<td>0.6</td>\n", "<td>0.3</td>\n", "<td>0.9</td>\n", "<td>0.1</td></tr></table></div>" ], "text/plain": [ " timestamp duration training_speed epochs samples training_MSE training_r2 training_logloss training_AUC training_classification_error validation_MSE validation_r2 validation_logloss validation_AUC validation_classification_error\n", "-- ------------------- ---------- ---------------- -------- --------- -------------- ------------- ------------------ -------------- ------------------------------- ---------------- --------------- -------------------- ---------------- ---------------------------------\n", " 2015-11-06 17:57:05 0.000 sec 0 0 nan nan nan nan nan nan nan nan nan nan\n", " 2015-11-06 17:57:09 2.899 sec 2594 rows/sec 1.00764 6068 0.146371 0.291179 0.646759 0.872778 0.146795 0.139313 0.329013 0.616018 0.880032 0.138393\n", " 2015-11-06 17:57:15 9.096 sec 5465 rows/sec 7.2637 43742 0.0872842 0.577315 0.288376 0.931927 0.110761 0.0929823 0.552161 0.302143 0.923719 0.109127\n", " 2015-11-06 17:57:19 12.425 sec 6571 rows/sec 12.0355 72478 0.0737426 0.642891 0.242052 0.950131 0.103288 0.0843305 0.593831 0.280204 0.930515 0.109127" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Variable Importances:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>variable</b></td>\n", "<td><b>relative_importance</b></td>\n", "<td><b>scaled_importance</b></td>\n", "<td><b>percentage</b></td></tr>\n", "<tr><td>Domestic.false</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>Primary_Type.NARCOTICS</td>\n", "<td>0.9</td>\n", "<td>0.9</td>\n", "<td>0.0</td></tr>\n", "<tr><td>IUCR.0860</td>\n", "<td>0.8</td>\n", "<td>0.8</td>\n", "<td>0.0</td></tr>\n", "<tr><td>FBI_Code.18</td>\n", "<td>0.8</td>\n", "<td>0.8</td>\n", "<td>0.0</td></tr>\n", "<tr><td>IUCR.4625</td>\n", "<td>0.7</td>\n", "<td>0.7</td>\n", "<td>0.0</td></tr>\n", "<tr><td>---</td>\n", "<td>---</td>\n", "<td>---</td>\n", "<td>---</td></tr>\n", "<tr><td>Location_Description.missing(NA)</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>Primary_Type.missing(NA)</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>FBI_Code.missing(NA)</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>WeekDay.missing(NA)</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>Domestic.missing(NA)</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr></table></div>" ], "text/plain": [ "variable relative_importance scaled_importance percentage\n", "-------------------------------- --------------------- ------------------- ----------------\n", "Domestic.false 1.0 1.0 0.00423868455641\n", "Primary_Type.NARCOTICS 0.868295669556 0.868295669556 0.00368043144494\n", "IUCR.0860 0.80918776989 0.80918776989 0.00342989170347\n", "FBI_Code.18 0.793890833855 0.793890833855 0.00336505281694\n", "IUCR.4625 0.744634568691 0.744634568691 0.00315627104648\n", "--- --- --- ---\n", "Location_Description.missing(NA) 0.0 0.0 0.0\n", "Primary_Type.missing(NA) 0.0 0.0 0.0\n", "FBI_Code.missing(NA) 0.0 0.0 0.0\n", "WeekDay.missing(NA) 0.0 0.0 0.0\n", "Domestic.missing(NA) 0.0 0.0 0.0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect Deep Learning model output\n", "model_dl" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th>predict </th><th style=\"text-align: right;\"> false</th><th style=\"text-align: right;\"> true</th></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.946415 </td><td style=\"text-align: right;\">0.0535847 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.862165 </td><td style=\"text-align: right;\">0.137835 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.938661 </td><td style=\"text-align: right;\">0.0613392 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.870186 </td><td style=\"text-align: right;\">0.129814 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.980488 </td><td style=\"text-align: right;\">0.0195118 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.972006 </td><td style=\"text-align: right;\">0.0279937 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.990995 </td><td style=\"text-align: right;\">0.00900489</td></tr>\n", "<tr><td>true </td><td style=\"text-align: right;\">0.0210692</td><td style=\"text-align: right;\">0.978931 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.693061 </td><td style=\"text-align: right;\">0.306939 </td></tr>\n", "<tr><td>false </td><td style=\"text-align: right;\">0.992097 </td><td style=\"text-align: right;\">0.00790253</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Predict on the test set using the gbm model\n", "predictions = model_gbm.predict(test)\n", "predictions.show()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: gbm\n", "** Reported on test data. **\n", "\n", "MSE: 0.0893676876445\n", "R^2: 0.57094394422\n", "LogLoss: 0.294019576922\n", "AUC: 0.922152238508\n", "Gini: 0.844304477016\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.365461652105:\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b></b></td>\n", "<td><b>false</b></td>\n", "<td><b>true</b></td>\n", "<td><b>Error</b></td>\n", "<td><b>Rate</b></td></tr>\n", "<tr><td>false</td>\n", "<td>1297.0</td>\n", "<td>84.0</td>\n", "<td>0.0608</td>\n", "<td> (84.0/1381.0)</td></tr>\n", "<tr><td>true</td>\n", "<td>153.0</td>\n", "<td>427.0</td>\n", "<td>0.2638</td>\n", "<td> (153.0/580.0)</td></tr>\n", "<tr><td>Total</td>\n", "<td>1450.0</td>\n", "<td>511.0</td>\n", "<td>0.1209</td>\n", "<td> (237.0/1961.0)</td></tr></table></div>" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 1297 84 0.0608 (84.0/1381.0)\n", "true 153 427 0.2638 (153.0/580.0)\n", "Total 1450 511 0.1209 (237.0/1961.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>metric</b></td>\n", "<td><b>threshold</b></td>\n", "<td><b>value</b></td>\n", "<td><b>idx</b></td></tr>\n", "<tr><td>max f1</td>\n", "<td>0.4</td>\n", "<td>0.8</td>\n", "<td>158.0</td></tr>\n", "<tr><td>max f2</td>\n", "<td>0.1</td>\n", "<td>0.8</td>\n", "<td>295.0</td></tr>\n", "<tr><td>max f0point5</td>\n", "<td>0.7</td>\n", "<td>0.9</td>\n", "<td>97.0</td></tr>\n", "<tr><td>max accuracy</td>\n", "<td>0.6</td>\n", "<td>0.9</td>\n", "<td>112.0</td></tr>\n", "<tr><td>max precision</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>max absolute_MCC</td>\n", "<td>0.6</td>\n", "<td>0.7</td>\n", "<td>112.0</td></tr>\n", "<tr><td>max min_per_class_accuracy</td>\n", "<td>0.2</td>\n", "<td>0.8</td>\n", "<td>235.0</td></tr></table></div>" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.365462 0.782768 158\n", "max f2 0.121697 0.815152 295\n", "max f0point5 0.657606 0.855615 97\n", "max accuracy 0.57297 0.885773 112\n", "max precision 0.991217 1 0\n", "max absolute_MCC 0.57297 0.717666 112\n", "max min_per_class_accuracy 0.211976 0.821144 235" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at performance on test set (if it includes true lables)\n", "test_performance = model_gbm.model_performance(test)\n", "test_performance" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Plots\n", "# Create Plots of Crime type vs Arrest Rate and Proportion of reported Crime" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----------+---------+\n", "| Crime_Type|crime_count|all_count|\n", "+--------------------+-----------+---------+\n", "| OTHER OFFENSE| 183| 720|\n", "| WEAPONS VIOLATION| 96| 118|\n", "| DECEPTIVE PRACTICE| 25| 445|\n", "| BURGLARY| 14| 458|\n", "| BATTERY| 432| 1851|\n", "| ROBBERY| 17| 357|\n", "| MOTOR VEHICLE THEFT| 17| 414|\n", "| PROSTITUTION| 106| 106|\n", "| CRIMINAL DAMAGE| 76| 1003|\n", "| KIDNAPPING| 1| 7|\n", "| GAMBLING| 3| 3|\n", "|LIQUOR LAW VIOLATION| 12| 12|\n", "+--------------------+-----------+---------+\n", "only showing top 12 rows\n", "\n" ] } ], "source": [ "# Create table to report Crimetype, Arrest count per crime, total reported count per Crime \n", "sqlContext.registerDataFrameAsTable(df_crimes, \"df_crimes\")\n", "allCrimes = sqlContext.sql(\"\"\"SELECT Primary_Type, count(*) as all_count FROM df_crimes GROUP BY Primary_Type\"\"\")\n", "crimesWithArrest = sqlContext.sql(\"SELECT Primary_Type, count(*) as crime_count FROM chicagoCrime WHERE Arrest = 'true' GROUP BY Primary_Type\")\n", "\n", "sqlContext.registerDataFrameAsTable(crimesWithArrest, \"crimesWithArrest\")\n", "sqlContext.registerDataFrameAsTable(allCrimes, \"allCrimes\")\n", "\n", "crime_type = sqlContext.sql(\"Select a.Primary_Type as Crime_Type, a.crime_count, b.all_count \\\n", "FROM crimesWithArrest a \\\n", "JOIN allCrimes b \\\n", "ON a.Primary_Type = b.Primary_Type \")\n", "\n", "crime_type.show(12)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Copy Crime_type table from Spark to H2O\n", "crime_typeHF = hc.as_h2o_frame(crime_type,framename=\"crime_type\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create Additional columns Arrest_rate and Crime_propotion \n", "crime_typeHF[\"Arrest_rate\"] = crime_typeHF[\"crime_count\"]/crime_typeHF[\"all_count\"]\n", "crime_typeHF[\"Crime_proportion\"] = crime_typeHF[\"all_count\"]/crime_typeHF[\"all_count\"].sum()\n", "crime_typeHF[\"Crime_Type\"] = crime_typeHF[\"Crime_Type\"].asfactor()\n", "# h2o.assign(crime_typeHF,crime_type)\n", "crime_typeHF.frame_id = \"Crime_type\"" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<tr><th>Crime_Type </th><th style=\"text-align: right;\"> crime_count</th><th style=\"text-align: right;\"> all_count</th><th style=\"text-align: right;\"> Arrest_rate</th><th style=\"text-align: right;\"> Crime_proportion</th></tr>\n", "<tr><td>OTHER OFFENSE </td><td style=\"text-align: right;\"> 183</td><td style=\"text-align: right;\"> 720</td><td style=\"text-align: right;\"> 0.254167 </td><td style=\"text-align: right;\"> 0.0721226 </td></tr>\n", "<tr><td>WEAPONS VIOLATION </td><td style=\"text-align: right;\"> 96</td><td style=\"text-align: right;\"> 118</td><td style=\"text-align: right;\"> 0.813559 </td><td style=\"text-align: right;\"> 0.0118201 </td></tr>\n", "<tr><td>DECEPTIVE PRACTICE </td><td style=\"text-align: right;\"> 25</td><td style=\"text-align: right;\"> 445</td><td style=\"text-align: right;\"> 0.0561798</td><td style=\"text-align: right;\"> 0.0445758 </td></tr>\n", "<tr><td>BURGLARY </td><td style=\"text-align: right;\"> 14</td><td style=\"text-align: right;\"> 458</td><td style=\"text-align: right;\"> 0.0305677</td><td style=\"text-align: right;\"> 0.045878 </td></tr>\n", "<tr><td>BATTERY </td><td style=\"text-align: right;\"> 432</td><td style=\"text-align: right;\"> 1851</td><td style=\"text-align: right;\"> 0.233387 </td><td style=\"text-align: right;\"> 0.185415 </td></tr>\n", "<tr><td>ROBBERY </td><td style=\"text-align: right;\"> 17</td><td style=\"text-align: right;\"> 357</td><td style=\"text-align: right;\"> 0.047619 </td><td style=\"text-align: right;\"> 0.0357608 </td></tr>\n", "<tr><td>MOTOR VEHICLE THEFT</td><td style=\"text-align: right;\"> 17</td><td style=\"text-align: right;\"> 414</td><td style=\"text-align: right;\"> 0.0410628</td><td style=\"text-align: right;\"> 0.0414705 </td></tr>\n", "<tr><td>PROSTITUTION </td><td style=\"text-align: right;\"> 106</td><td style=\"text-align: right;\"> 106</td><td style=\"text-align: right;\"> 1 </td><td style=\"text-align: right;\"> 0.0106181 </td></tr>\n", "<tr><td>CRIMINAL DAMAGE </td><td style=\"text-align: right;\"> 76</td><td style=\"text-align: right;\"> 1003</td><td style=\"text-align: right;\"> 0.0757727</td><td style=\"text-align: right;\"> 0.100471 </td></tr>\n", "<tr><td>KIDNAPPING </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 7</td><td style=\"text-align: right;\"> 0.142857 </td><td style=\"text-align: right;\"> 0.000701192</td></tr>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crime_typeHF" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] }, { "data": { "text/plain": [] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hc\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Plot the below in Flow \n", "plot (g) -> g(\n", " g.rect(\n", " g.position \"Crime_Type\", \"Arrest_rate\"\n", " g.fillColor g.value 'blue'\n", " g.fillOpacity g.value 0.75\n", " )\n", " g.rect(\n", " g.position \"Crime_Type\", \"Crime_proportion\"\n", " g.fillColor g.value 'red'\n", " g.fillOpacity g.value 0.65\n", " )\n", " g.from inspect \"data\", getFrame \"Crime_type\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#hc.stop()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }