{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Spark Context\n", "sc" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Version mismatch. H2O is version 3.6.0.2, but the python package is version 3.7.0.99999.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 2 seconds 217 milliseconds
H2O cluster version: 3.6.0.2
H2O cluster name: sparkling-water-nidhimehta
H2O cluster total nodes: 2
H2O cluster total memory: 3.83 GB
H2O cluster total cores: 16
H2O cluster allowed cores: 16
H2O cluster healthy: True
H2O Connection ip: 172.16.2.98
H2O Connection port: 54329
" ], "text/plain": [ "-------------------------- --------------------------\n", "H2O cluster uptime: 2 seconds 217 milliseconds\n", "H2O cluster version: 3.6.0.2\n", "H2O cluster name: sparkling-water-nidhimehta\n", "H2O cluster total nodes: 2\n", "H2O cluster total memory: 3.83 GB\n", "H2O cluster total cores: 16\n", "H2O cluster allowed cores: 16\n", "H2O cluster healthy: True\n", "H2O Connection ip: 172.16.2.98\n", "H2O Connection port: 54329\n", "-------------------------- --------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Start H2O Context\n", "from pysparkling import *\n", "sc\n", "hc= H2OContext(sc).start()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] }, { "data": { "text/plain": [] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# H2o Context\n", "hc" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import H2O Python library\n", "import h2o\n", "# View all the available H2O python functions\n", "#dir(h2o)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "(9999, 22)\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum enum enum enum enum enum enum int int int int enum int int int enum real real enum
mins 21735.0 NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 0.0 1100317.0 1814255.0 2015.00.0 41.64507243 -87.906463888 0.0
mean 9931318.73737NaN NaN NaN NaN NaN NaN NaN 0.2928292829280.1523152315231159.6180618111.348988512822.954095409537.4476447645 NaN 1163880.59815 1885916.14984 2015.0NaN 41.8425652247 -87.6741405221 NaN
maxs 9962898.0 NaN NaN 6517.0 212.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 24.0 1205069.0 1951533.0 2015.032.0 42.022646183 -87.524773286 8603.0
sigma 396787.564221NaN NaN NaN NaN NaN NaN NaN 0.4550835155880.35934414686 695.76029875 6.9454749330113.649566114421.2748762223 NaN 16496.4493681 31274.0163199 0.0 NaN 0.08601865793580.0600357970653NaN
zeros 0 0 0 3 16 11 933 19 7071 8476 0 0 0 0 16 0 0 0 603 0 0 1
missing0 0 0 0 0 0 0 6 0 0 0 162 0 0 0 162 162 0 0 162 162 162
0 9955810.0 HY144797 02/08/2015 11:43:40 PM081XX S COLES AVE 1811 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18 1198273.0 1851626.0 2015.002/15/2015 12:43:39 PM41.747693646 -87.549035389 (41.747693646, -87.549035389)
1 9955861.0 HY144838 02/08/2015 11:41:42 PM118XX S STATE ST 0486 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 08B 1178335.0 1826581.0 2015.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)
2 9955801.0 HY144779 02/08/2015 11:30:22 PM002XX S LARAMIE AVE 2026 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18 1141717.0 1898581.0 2015.002/15/2015 12:43:39 PM41.87777333 -87.755117993 (41.87777333, -87.755117993)
3 9956197.0 HY144787 02/08/2015 11:30:23 PM006XX E 67TH ST 1811 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18 nan nan 2015.002/15/2015 12:43:39 PMnan nan
4 9955846.0 HY144829 02/08/2015 11:30:58 PM0000X S MAYFIELD AVE0610 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 05 1137239.0 1899372.0 2015.002/15/2015 12:43:39 PM41.880025548 -87.771541324 (41.880025548, -87.771541324)
5 9955835.0 HY144778 02/08/2015 11:30:21 PM010XX W 48TH ST 0486 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 08B 1169986.0 1873019.0 2015.002/15/2015 12:43:39 PM41.807059405 -87.65206589 (41.807059405, -87.65206589)
6 9955872.0 HY144822 02/08/2015 11:27:24 PM015XX W ARTHUR AVE 1320 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14 1164732.0 1943222.0 2015.002/15/2015 12:43:39 PM41.999814056 -87.669342967 (41.999814056, -87.669342967)
7 21752.0 HY144738 02/08/2015 11:26:12 PM060XX W GRAND AVE 0110 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 01A 1135910.0 1914206.0 2015.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)
8 9955808.0 HY144775 02/08/2015 11:20:33 PM001XX W WACKER DR 0460 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 08B 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)
9 9958275.0 HY146732 02/08/2015 11:15:36 PM001XX W WACKER DR 0460 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 08B 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Parse Chicago Crime dataset into H2O\n", "column_type = ['Numeric','String','String','Enum','Enum','Enum','Enum','Enum','Enum','Enum','Numeric','Numeric','Numeric','Numeric','Enum','Numeric','Numeric','Numeric','Enum','Numeric','Numeric','Enum']\n", "f_crimes = h2o.import_file(path =\"../data/chicagoCrimes10k.csv\",col_types =column_type)\n", "\n", "print(f_crimes.shape)\n", "f_crimes.summary()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
IUCR Count
0110 16
0261 2
0263 2
0265 5
0266 2
0281 41
0291 3
0312 18
0313 20
031A 136
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the distribution of IUCR column\n", "f_crimes[\"IUCR\"].table()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
Arrest Count
false 7071
true 2928
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at distribution of Arrest column\n", "f_crimes[\"Arrest\"].table()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
IDCase_Number Date Block IUCRPrimary_Type Description Location_Description Arrest Domestic Beat District Ward Community_AreaFBI_Code X_Coordinate Y_Coordinate YearUpdated_On Latitude LongitudeLocation
9.95581e+06HY144797 02/08/2015 11:43:40 PM081XX S COLES AVE 1811NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422 4 7 4618 1.19827e+06 1.85163e+06 201502/15/2015 12:43:39 PM 41.7477 -87.549 (41.747693646, -87.549035389)
9.95586e+06HY144838 02/08/2015 11:41:42 PM118XX S STATE ST 0486BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522 5 34 5308B 1.17834e+06 1.82658e+06 201502/15/2015 12:43:39 PM 41.6794 -87.6229(41.679442289, -87.622850758)
9.9558e+06 HY144779 02/08/2015 11:30:22 PM002XX S LARAMIE AVE 2026NARCOTICS POSS: PCP SIDEWALK true false 1522 15 29 2518 1.14172e+06 1.89858e+06 201502/15/2015 12:43:39 PM 41.8778 -87.7551(41.87777333, -87.755117993)
9.9562e+06 HY144787 02/08/2015 11:30:23 PM006XX E 67TH ST 1811NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321 nan 6 4218 nan nan 201502/15/2015 12:43:39 PM nan nan
9.95585e+06HY144829 02/08/2015 11:30:58 PM0000X S MAYFIELD AVE 0610BURGLARY FORCIBLE ENTRY APARTMENT false false 1513 15 29 2505 1.13724e+06 1.89937e+06 201502/15/2015 12:43:39 PM 41.88 -87.7715(41.880025548, -87.771541324)
9.95584e+06HY144778 02/08/2015 11:30:21 PM010XX W 48TH ST 0486BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933 9 3 6108B 1.16999e+06 1.87302e+06 201502/15/2015 12:43:39 PM 41.8071 -87.6521(41.807059405, -87.65206589)
9.95587e+06HY144822 02/08/2015 11:27:24 PM015XX W ARTHUR AVE 1320CRIMINAL DAMAGETO VEHICLE STREET false false 2432 24 40 114 1.16473e+06 1.94322e+06 201502/15/2015 12:43:39 PM 41.9998 -87.6693(41.999814056, -87.669342967)
21752 HY144738 02/08/2015 11:26:12 PM060XX W GRAND AVE 0110HOMICIDE FIRST DEGREE MURDER STREET true false 2512 25 37 1901A 1.13591e+06 1.91421e+06 201502/15/2015 12:43:39 PM 41.9208 -87.7761(41.920755683, -87.776067514)
9.95581e+06HY144775 02/08/2015 11:20:33 PM001XX W WACKER DR 0460BATTERY SIMPLE OTHER false false 122 1 42 3208B 1.17538e+06 1.90209e+06 201502/15/2015 12:43:39 PM 41.8867 -87.6314(41.886707818, -87.631396356)
9.95828e+06HY146732 02/08/2015 11:15:36 PM001XX W WACKER DR 0460BATTERY SIMPLE HOTEL/MOTEL false false 122 1 42 3208B 1.17538e+06 1.90209e+06 201502/15/2015 12:43:39 PM 41.8867 -87.6314(41.886707818, -87.631396356)
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Modify column names to replace blank spaces with underscore\n", "col_names = map(lambda s: s.replace(' ', '_'), f_crimes.col_names)\n", "f_crimes.set_names(col_names)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Set time zone to UTC for date manipulation\n", "h2o.set_timezone(\"Etc/UTC\")\n", "\n", "## Refine date column \n", "def refine_date_col(data, col, pattern):\n", " data[col] = data[col].as_date(pattern)\n", " data[\"Day\"] = data[col].day()\n", " data[\"Month\"] = data[col].month() # Since H2O indexes from 0\n", " data[\"Year\"] = data[col].year()\n", " data[\"WeekNum\"] = data[col].week()\n", " data[\"WeekDay\"] = data[col].dayOfWeek()\n", " data[\"HourOfDay\"] = data[col].hour()\n", " \n", " # Create weekend and season cols\n", " data[\"Weekend\"] = (data[\"WeekDay\"] == \"Sun\" or data[\"WeekDay\"] == \"Sat\").ifelse(1, 0)[0]\n", " data[\"Season\"] = data[\"Month\"].cut([0, 2, 5, 7, 10, 12], [\"Winter\", \"Spring\", \"Summer\", \"Autumn\", \"Winter\"])\n", " \n", "refine_date_col(f_crimes, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "f_crimes = f_crimes.drop(\"Date\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "[77, 9]\n" ] } ], "source": [ "# Parse Census data into H2O\n", "f_census = h2o.import_file(\"../data/chicagoCensus.csv\",header=1)\n", "\n", "## Update column names in the table\n", "col_names = map(lambda s: s.strip().replace(' ', '_'), f_census.col_names)\n", "f_census.set_names(col_names)\n", "f_census = f_census[1:78,:]\n", "print(f_census.dim)\n", "#f_census.summary()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "[5162, 6]\n" ] } ], "source": [ "# Parse Weather data into H2O\n", "f_weather = h2o.import_file(\"../data/chicagoAllWeather.csv\")\n", "f_weather = f_weather[1:]\n", "print(f_weather.dim)\n", "#f_weather.summary()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
month day year maxTemp meanTemp minTemp
6 19 2008 nan nan nan
9 23 2008 nan nan nan
9 24 2008 nan nan nan
9 25 2008 nan nan nan
9 26 2008 nan nan nan
9 27 2008 nan nan nan
9 28 2008 nan nan nan
9 29 2008 nan nan nan
9 30 2008 nan nan nan
3 4 2009 nan nan nan
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at all the null entires in the Weather table\n", "f_weather[f_weather[\"meanTemp\"].isna()]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
month day year maxTemp meanTemp minTemp
1 1 2001 23 14 6
1 2 2001 18 12 6
1 3 2001 28 18 8
1 4 2001 30 24 19
1 5 2001 36 30 21
1 6 2001 33 26 19
1 7 2001 34 28 21
1 8 2001 26 20 14
1 9 2001 23 16 10
1 10 2001 34 26 19
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the help on as_h2o_frame \n", "hc.as_data_frame?\n", "f_weather" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Copy data frames to Spark from H2O\n", "df_weather = hc.as_spark_frame(f_weather,)\n", "df_census = hc.as_spark_frame(f_census)\n", "df_crimes = hc.as_spark_frame(f_crimes)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----+---+----+-------+--------+-------+\n", "|month|day|year|maxTemp|meanTemp|minTemp|\n", "+-----+---+----+-------+--------+-------+\n", "| 1| 1|2001| 23| 14| 6|\n", "| 1| 2|2001| 18| 12| 6|\n", "+-----+---+----+-------+--------+-------+\n", "only showing top 2 rows\n", "\n" ] } ], "source": [ "# Look at the weather data as parsed in Spark\n", "df_weather.show(2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Join columns from Crime, Census and Weather DataFrames in Spark\n", "\n", "## Register DataFrames as tables in SQL context\n", "sqlContext.registerDataFrameAsTable(df_weather, \"chicagoWeather\")\n", "sqlContext.registerDataFrameAsTable(df_census, \"chicagoCensus\")\n", "sqlContext.registerDataFrameAsTable(df_crimes, \"chicagoCrime\")\n", "\n", "\n", "crimeWithWeather = sqlContext.sql(\"\"\"SELECT\n", "a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,\n", "a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,\n", "a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,\n", "b.minTemp, b.maxTemp, b.meanTemp,\n", "c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,\n", "c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,\n", "c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA\n", "FROM chicagoCrime a\n", "JOIN chicagoWeather b\n", "ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day\n", "JOIN chicagoCensus c\n", "ON a.Community_Area = c.Community_Area_Number\"\"\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-----+---+-------+---------+-------+------+-------+----+-----------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+\n", "|Year|Month|Day|WeekNum|HourOfDay|Weekend|Season|WeekDay|IUCR| Primary_Type|Location_Description|Community_Area|District|Arrest|Domestic|Beat|Ward|FBI_Code|minTemp|maxTemp|meanTemp|PERCENT_AGED_UNDER_18_OR_OVER_64|PER_CAPITA_INCOME|HARDSHIP_INDEX|PERCENT_OF_HOUSING_CROWDED|PERCENT_HOUSEHOLDS_BELOW_POVERTY|PERCENT_AGED_16__UNEMPLOYED|PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA|\n", "+----+-----+---+-------+---------+-------+------+-------+----+-----------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+\n", "|2015| 1| 23| 4| 22| 0|Winter| Fri|143A|WEAPONS VIOLATION| ALLEY| 31| 12| true| false|1234| 25| 15| 29| 31| 30| 32.6| 16444| 76| 9.600000000000001| 25.8| 15.8| 40.7|\n", "|2015| 1| 23| 4| 19| 0|Winter| Fri|4625| OTHER OFFENSE| SIDEWALK| 31| 10| true| false|1034| 25| 26| 29| 31| 30| 32.6| 16444| 76| 9.600000000000001| 25.8| 15.8| 40.7|\n", "+----+-----+---+-------+---------+-------+------+-------+----+-----------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+\n", "only showing top 2 rows\n", "\n" ] } ], "source": [ "# Print the crimeWithWeather data table from Spark\n", "crimeWithWeather.show(2)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] } ], "source": [ "#Copy table from Spark to H2O\n", "hc.as_h2o_frame?\n", "crimeWithWeatherHF = hc.as_h2o_frame(crimeWithWeather,framename=\"crimeWithWeather\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Month Day WeekNum HourOfDay Weekend Season WeekDay IUCR Primary_Type Location_Description Community_Area District Arrest Domestic Beat Ward FBI_Code minTemp maxTemp meanTemp PERCENT_AGED_UNDER_18_OR_OVER_64 PER_CAPITA_INCOME HARDSHIP_INDEX PERCENT_OF_HOUSING_CROWDED PERCENT_HOUSEHOLDS_BELOW_POVERTY PERCENT_AGED_16__UNEMPLOYED PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
type int int int int int int string string stringstring string int int string string int int string int int int real int int real real real real
mins 2015.01.0 1.0 4.0 0.0 0.0 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN 111.0 1.0 NaN -2.0 15.0 7.0 13.5 8201.0 1.0 0.3 3.3 4.7 2.5
mean 2015.01.41944194419 17.68396839685.18081808181 13.63196319630.159115911591NaN NaN NaN NaN NaN 37.4476447645 11.3489885128NaN NaN 1159.6180618122.9540954095NaN 17.699669967 31.719971997224.940894089435.0596759676 25221.3057306 54.4786478648 5.43707370737 24.600750075 16.8288328833 21.096639664
maxs 2015.02.0 31.0 6.0 23.0 1.0 NaN NaN NaN NaN NaN 77.0 25.0 NaN NaN 2535.0 50.0 NaN 29.0 43.0 36.0 51.5 88669.0 98.0 15.8 56.5 35.9 54.8
sigma 0.0 0.49349240678711.18010433580.7389298304096.473217358070.365802434041NaN NaN NaN NaN NaN 21.2748762223 6.94547493301NaN NaN 695.76029875 13.6495661144NaN 8.961181364386.938099134727.463025270627.95653388237 18010.0446225 29.3247456472 3.75289588494 10.1450570661 7.58926327988 11.3868817911
zeros 0 0 0 0 374 8408 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 6 0 162 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 2015.01.0 24.0 4.0 22.0 0.0 Winter Sat 2820 OTHER OFFENSE APARTMENT 31.0 10.0 false false 1034.0 25.0 26 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
1 2015.01.0 24.0 4.0 21.0 0.0 Winter Sat 1310 CRIMINAL DAMAGE RESTAURANT 31.0 12.0 true false 1233.0 25.0 14 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
2 2015.01.0 24.0 4.0 18.0 0.0 Winter Sat 1750 OFFENSE INVOLVING CHILDRENRESIDENCE 31.0 12.0 false true 1235.0 25.0 20 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
3 2015.01.0 24.0 4.0 18.0 0.0 Winter Sat 0460 BATTERY OTHER 31.0 10.0 false false 1023.0 25.0 08B 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
4 2015.01.0 24.0 4.0 13.0 0.0 Winter Sat 0890 THEFT CURRENCY EXCHANGE 31.0 10.0 false false 1023.0 25.0 06 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
5 2015.01.0 24.0 4.0 9.0 0.0 Winter Sat 0560 ASSAULT OTHER 31.0 12.0 false false 1234.0 25.0 08A 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
6 2015.01.0 24.0 4.0 8.0 0.0 Winter Sat 0486 BATTERY RESIDENCE 31.0 12.0 true true 1235.0 25.0 08B 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
7 2015.01.0 24.0 4.0 1.0 0.0 Winter Sat 0420 BATTERY SIDEWALK 31.0 10.0 false false 1034.0 25.0 04B 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
8 2015.01.0 24.0 4.0 0.0 0.0 Winter Sat 1320 CRIMINAL DAMAGE PARKING LOT/GARAGE(NON.RESID.)31.0 9.0 false false 912.0 11.0 14 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
9 2015.01.0 31.0 5.0 23.0 0.0 Winter Sat 0820 THEFT SIDEWALK 31.0 12.0 false false 1234.0 25.0 06 19.0 36.0 28.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "crimeWithWeatherHF.summary()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Assign column types to the CrimeWeatherHF data table in H2O\n", "crimeWithWeatherHF[\"Season\"]= crimeWithWeatherHF[\"Season\"].asfactor()\n", "crimeWithWeatherHF[\"WeekDay\"]= crimeWithWeatherHF[\"WeekDay\"].asfactor()\n", "crimeWithWeatherHF[\"IUCR\"]= crimeWithWeatherHF[\"IUCR\"].asfactor()\n", "crimeWithWeatherHF[\"Primary_Type\"]= crimeWithWeatherHF[\"Primary_Type\"].asfactor()\n", "crimeWithWeatherHF[\"Location_Description\"]= crimeWithWeatherHF[\"Location_Description\"].asfactor()\n", "crimeWithWeatherHF[\"Arrest\"]= crimeWithWeatherHF[\"Arrest\"].asfactor()\n", "crimeWithWeatherHF[\"Domestic\"]= crimeWithWeatherHF[\"Domestic\"].asfactor()\n", "crimeWithWeatherHF[\"FBI_Code\"]= crimeWithWeatherHF[\"FBI_Code\"].asfactor()\n", "crimeWithWeatherHF[\"Season\"]= crimeWithWeatherHF[\"Season\"].asfactor()\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Month Day WeekNum HourOfDay Weekend Season WeekDay IUCR Primary_Type Location_Description Community_Area District Arrest Domestic Beat Ward FBI_Code minTemp maxTemp meanTemp PERCENT_AGED_UNDER_18_OR_OVER_64 PER_CAPITA_INCOME HARDSHIP_INDEX PERCENT_OF_HOUSING_CROWDED PERCENT_HOUSEHOLDS_BELOW_POVERTY PERCENT_AGED_16__UNEMPLOYED PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
type int int int int int int enum enum enum enum enum int int enum enum int int enum int int int real int int real real real real
mins 2015.01.0 1.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 111.0 1.0 0.0 -2.0 15.0 7.0 13.5 8201.0 1.0 0.3 3.3 4.7 2.5
mean 2015.01.41944194419 17.68396839685.18081808181 13.63196319630.1591159115910.0 NaN NaN NaN NaN 37.4476447645 11.34898851280.2928292829280.1523152315231159.6180618122.9540954095NaN 17.699669967 31.719971997224.940894089435.0596759676 25221.3057306 54.4786478648 5.43707370737 24.600750075 16.8288328833 21.096639664
maxs 2015.02.0 31.0 6.0 23.0 1.0 0.0 6.0 212.0 26.0 90.0 77.0 25.0 1.0 1.0 2535.0 50.0 24.0 29.0 43.0 36.0 51.5 88669.0 98.0 15.8 56.5 35.9 54.8
sigma 0.0 0.49349240678711.18010433580.7389298304096.473217358070.3658024340410.0 NaN NaN NaN NaN 21.2748762223 6.945474933010.4550835155880.35934414686 695.76029875 13.6495661144NaN 8.961181364386.938099134727.463025270627.95653388237 18010.0446225 29.3247456472 3.75289588494 10.1450570661 7.58926327988 11.3868817911
zeros 0 0 0 0 374 8408 9999 1942 16 11 19 0 0 7071 8476 0 0 16 0 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 6 0 162 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 2015.01.0 24.0 4.0 22.0 0.0 Winter Sat 2820 OTHER OFFENSE APARTMENT 31.0 10.0 false false 1034.0 25.0 26 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
1 2015.01.0 24.0 4.0 21.0 0.0 Winter Sat 1310 CRIMINAL DAMAGE RESTAURANT 31.0 12.0 true false 1233.0 25.0 14 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
2 2015.01.0 24.0 4.0 18.0 0.0 Winter Sat 1750 OFFENSE INVOLVING CHILDRENRESIDENCE 31.0 12.0 false true 1235.0 25.0 20 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
3 2015.01.0 24.0 4.0 18.0 0.0 Winter Sat 0460 BATTERY OTHER 31.0 10.0 false false 1023.0 25.0 08B 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
4 2015.01.0 24.0 4.0 13.0 0.0 Winter Sat 0890 THEFT CURRENCY EXCHANGE 31.0 10.0 false false 1023.0 25.0 06 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
5 2015.01.0 24.0 4.0 9.0 0.0 Winter Sat 0560 ASSAULT OTHER 31.0 12.0 false false 1234.0 25.0 08A 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
6 2015.01.0 24.0 4.0 8.0 0.0 Winter Sat 0486 BATTERY RESIDENCE 31.0 12.0 true true 1235.0 25.0 08B 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
7 2015.01.0 24.0 4.0 1.0 0.0 Winter Sat 0420 BATTERY SIDEWALK 31.0 10.0 false false 1034.0 25.0 04B 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
8 2015.01.0 24.0 4.0 0.0 0.0 Winter Sat 1320 CRIMINAL DAMAGE PARKING LOT/GARAGE(NON.RESID.)31.0 9.0 false false 912.0 11.0 14 29.0 43.0 36.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
9 2015.01.0 31.0 5.0 23.0 0.0 Winter Sat 0820 THEFT SIDEWALK 31.0 12.0 false false 1234.0 25.0 06 19.0 36.0 28.0 32.6 16444.0 76.0 9.6 25.8 15.8 40.7
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "crimeWithWeatherHF.summary()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split final H2O data table into train test and validation sets\n", "ratios = [0.6,0.2]\n", "frs = crimeWithWeatherHF.split_frame(ratios,seed=12345)\n", "train = frs[0]\n", "train.frame_id = \"Train\"\n", "valid = frs[2]\n", "valid.frame_id = \"Validation\"\n", "test = frs[1]\n", "test.frame_id = \"Test\"" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import Model Builders from H2O Python\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Inspect the avialble gbm parameters\n", "H2OGradientBoostingEstimator?" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Define Preditors\n", "predictors = crimeWithWeatherHF.names[:]\n", "response = \"Arrest\"\n", "predictors.remove(response)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] } ], "source": [ "#Simple GBM model - Predict Arrest\n", "model_gbm = H2OGradientBoostingEstimator(ntrees =50,\n", " max_depth =6,\n", " learn_rate =0.1, \n", " #nfolds =2,\n", " distribution =\"bernoulli\")\n", "\n", "model_gbm.train(x =predictors,\n", " y =\"Arrest\",\n", " training_frame =train,\n", " validation_frame=valid\n", " )\n", "\n", "# Simple Deep Learning - Predict Arrest\n", "model_dl = H2ODeepLearningEstimator(variable_importances=True,\n", " loss =\"Automatic\")\n", "\n", "model_dl.train(x =predictors,\n", " y =\"Arrest\",\n", " training_frame =train,\n", " validation_frame=valid)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.335827722991:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
falsetrueErrorRate
false4125.0142.00.0333 (142.0/4267.0)
true251.01504.00.143 (251.0/1755.0)
Total4376.01646.00.0653 (393.0/6022.0)
" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 4125 142 0.0333 (142.0/4267.0)\n", "true 251 1504 0.143 (251.0/1755.0)\n", "Total 4376 1646 0.0653 (393.0/6022.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.432844055866:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
falsetrueErrorRate
false1362.061.00.0429 (61.0/1423.0)
true150.0443.00.253 (150.0/593.0)
Total1512.0504.00.1047 (211.0/2016.0)
" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 1362 61 0.0429 (61.0/1423.0)\n", "true 150 443 0.253 (150.0/593.0)\n", "Total 1512 504 0.1047 (211.0/2016.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Print confusion matrices for the train and validation set\n", "print(model_gbm.confusion_matrix(train = True))\n", "print(model_gbm.confusion_matrix(valid = True))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.974667176776\n", "0.92596751276\n" ] } ], "source": [ "print(model_gbm.auc(train=True))\n", "print(model_gbm.auc(valid=True))\n", "model_gbm.plot(metric=\"AUC\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variablerelative_importancescaled_importancepercentage
0IUCR4280.9394531.000000e+008.234218e-01
1Location_Description487.3230591.138355e-019.373466e-02
2WeekDay55.7905581.303232e-021.073109e-02
3HourOfDay55.4192201.294557e-021.065967e-02
4PERCENT_AGED_16__UNEMPLOYED34.4228948.040967e-036.621107e-03
5Beat31.4682227.350775e-036.052788e-03
6PERCENT_HOUSEHOLDS_BELOW_POVERTY29.1033526.798356e-035.597915e-03
7PER_CAPITA_INCOME26.2331436.127894e-035.045841e-03
8PERCENT_AGED_UNDER_18_OR_OVER_6424.0774025.624327e-034.631193e-03
9Day23.4725675.483041e-034.514855e-03
...............
15maxTemp11.3007932.639793e-032.173663e-03
16Community_Area10.2521462.394835e-031.971960e-03
17HARDSHIP_INDEX10.1160722.363049e-031.945786e-03
18Domestic9.2943272.171095e-031.787727e-03
19District8.3046541.939914e-031.597367e-03
20minTemp6.2430271.458331e-031.200822e-03
21WeekNum4.2301029.881246e-048.136433e-04
22FBI_Code2.3631825.520241e-044.545486e-04
23Month0.0000184.187325e-093.447935e-09
24Weekend0.0000000.000000e+000.000000e+00
\n", "

25 rows × 4 columns

\n", "
" ], "text/plain": [ " variable relative_importance scaled_importance \\\n", "0 IUCR 4280.939453 1.000000e+00 \n", "1 Location_Description 487.323059 1.138355e-01 \n", "2 WeekDay 55.790558 1.303232e-02 \n", "3 HourOfDay 55.419220 1.294557e-02 \n", "4 PERCENT_AGED_16__UNEMPLOYED 34.422894 8.040967e-03 \n", "5 Beat 31.468222 7.350775e-03 \n", "6 PERCENT_HOUSEHOLDS_BELOW_POVERTY 29.103352 6.798356e-03 \n", "7 PER_CAPITA_INCOME 26.233143 6.127894e-03 \n", "8 PERCENT_AGED_UNDER_18_OR_OVER_64 24.077402 5.624327e-03 \n", "9 Day 23.472567 5.483041e-03 \n", ".. ... ... ... \n", "15 maxTemp 11.300793 2.639793e-03 \n", "16 Community_Area 10.252146 2.394835e-03 \n", "17 HARDSHIP_INDEX 10.116072 2.363049e-03 \n", "18 Domestic 9.294327 2.171095e-03 \n", "19 District 8.304654 1.939914e-03 \n", "20 minTemp 6.243027 1.458331e-03 \n", "21 WeekNum 4.230102 9.881246e-04 \n", "22 FBI_Code 2.363182 5.520241e-04 \n", "23 Month 0.000018 4.187325e-09 \n", "24 Weekend 0.000000 0.000000e+00 \n", "\n", " percentage \n", "0 8.234218e-01 \n", "1 9.373466e-02 \n", "2 1.073109e-02 \n", "3 1.065967e-02 \n", "4 6.621107e-03 \n", "5 6.052788e-03 \n", "6 5.597915e-03 \n", "7 5.045841e-03 \n", "8 4.631193e-03 \n", "9 4.514855e-03 \n", ".. ... \n", "15 2.173663e-03 \n", "16 1.971960e-03 \n", "17 1.945786e-03 \n", "18 1.787727e-03 \n", "19 1.597367e-03 \n", "20 1.200822e-03 \n", "21 8.136433e-04 \n", "22 4.545486e-04 \n", "23 3.447935e-09 \n", "24 0.000000e+00 \n", "\n", "[25 rows x 4 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Print variable importance\n", "model_gbm.varimp(True)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model Details\n", "=============\n", "H2ODeepLearningEstimator : Deep Learning\n", "Model Key: DeepLearning_model_python_1446861372065_4\n", "\n", "Status of Neuron Layers: predicting Arrest, 2-class classification, bernoulli distribution, CrossEntropy loss, 118,802 weights/biases, 1.4 MB, 72,478 training samples, mini-batch size 1\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
layerunitstypedropoutl1l2mean_raterate_RMSmomentummean_weightweight_RMSmean_biasbias_RMS
1390Input0.0
2200Rectifier0.00.00.00.10.30.0-0.00.1-0.00.1
3200Rectifier0.00.00.00.10.20.0-0.00.10.80.2
42Softmax0.00.00.00.00.00.00.4-0.00.0
" ], "text/plain": [ " layer units type dropout l1 l2 mean_rate rate_RMS momentum mean_weight weight_RMS mean_bias bias_RMS\n", "-- ------- ------- --------- --------- ---- ---- ---------------- ---------------- ---------- ----------------- --------------- ------------------ ----------------\n", " 1 390 Input 0.0\n", " 2 200 Rectifier 0.0 0.0 0.0 0.125569459979 0.314496159554 0.0 -0.00145149060662 0.0583122819662 -0.0226196222717 0.106465905905\n", " 3 200 Rectifier 0.0 0.0 0.0 0.089552259468 0.167708098888 0.0 -0.0100148269118 0.0716536939144 0.77415192296 0.243278682232\n", " 4 2 Softmax 0.0 0.0 0.00265202885173 0.00204257667065 0.0 0.003976688287 0.384519815445 -0.000194062303944 0.00806335359812" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "ModelMetricsBinomial: deeplearning\n", "** Reported on train data. **\n", "\n", "MSE: 0.0737426129728\n", "R^2: 0.642891439669\n", "LogLoss: 0.242051500943\n", "AUC: 0.950131166302\n", "Gini: 0.900262332604\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.343997370612:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
falsetrueErrorRate
false4003.0264.00.0619 (264.0/4267.0)
true358.01397.00.204 (358.0/1755.0)
Total4361.01661.00.1033 (622.0/6022.0)
" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 4003 264 0.0619 (264.0/4267.0)\n", "true 358 1397 0.204 (358.0/1755.0)\n", "Total 4361 1661 0.1033 (622.0/6022.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.30.8195.0
max f20.20.9278.0
max f0point50.70.986.0
max accuracy0.50.9149.0
max precision1.01.00.0
max absolute_MCC0.30.7195.0
max min_per_class_accuracy0.20.9247.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.343997 0.817916 195\n", "max f2 0.158173 0.856675 278\n", "max f0point5 0.740563 0.878082 86\n", "max accuracy 0.48931 0.897874 149\n", "max precision 0.999884 1 0\n", "max absolute_MCC 0.343997 0.746452 195\n", "max min_per_class_accuracy 0.217785 0.876923 247" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: deeplearning\n", "** Reported on validation data. **\n", "\n", "MSE: 0.0843305429737\n", "R^2: 0.593831388139\n", "LogLoss: 0.280203809486\n", "AUC: 0.930515181213\n", "Gini: 0.861030362427\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.493462351545:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
falsetrueErrorRate
false1361.062.00.0436 (62.0/1423.0)
true158.0435.00.2664 (158.0/593.0)
Total1519.0497.00.1091 (220.0/2016.0)
" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 1361 62 0.0436 (62.0/1423.0)\n", "true 158 435 0.2664 (158.0/593.0)\n", "Total 1519 497 0.1091 (220.0/2016.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.50.8137.0
max f20.10.8303.0
max f0point50.70.982.0
max accuracy0.70.991.0
max precision1.01.00.0
max absolute_MCC0.70.791.0
max min_per_class_accuracy0.20.8236.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.493462 0.798165 137\n", "max f2 0.0962013 0.830341 303\n", "max f0point5 0.740357 0.877695 82\n", "max accuracy 0.69578 0.893849 91\n", "max precision 0.999877 1 0\n", "max absolute_MCC 0.69578 0.738603 91\n", "max min_per_class_accuracy 0.207426 0.836425 236" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Scoring History:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
timestampdurationtraining_speedepochssamplestraining_MSEtraining_r2training_loglosstraining_AUCtraining_classification_errorvalidation_MSEvalidation_r2validation_loglossvalidation_AUCvalidation_classification_error
2015-11-06 17:57:05 0.000 secNone0.00.0nannannannannannannannannannan
2015-11-06 17:57:09 2.899 sec2594 rows/sec1.06068.00.10.30.60.90.10.10.30.60.90.1
2015-11-06 17:57:15 9.096 sec5465 rows/sec7.343742.00.10.60.30.90.10.10.60.30.90.1
2015-11-06 17:57:1912.425 sec6571 rows/sec12.072478.00.10.60.21.00.10.10.60.30.90.1
" ], "text/plain": [ " timestamp duration training_speed epochs samples training_MSE training_r2 training_logloss training_AUC training_classification_error validation_MSE validation_r2 validation_logloss validation_AUC validation_classification_error\n", "-- ------------------- ---------- ---------------- -------- --------- -------------- ------------- ------------------ -------------- ------------------------------- ---------------- --------------- -------------------- ---------------- ---------------------------------\n", " 2015-11-06 17:57:05 0.000 sec 0 0 nan nan nan nan nan nan nan nan nan nan\n", " 2015-11-06 17:57:09 2.899 sec 2594 rows/sec 1.00764 6068 0.146371 0.291179 0.646759 0.872778 0.146795 0.139313 0.329013 0.616018 0.880032 0.138393\n", " 2015-11-06 17:57:15 9.096 sec 5465 rows/sec 7.2637 43742 0.0872842 0.577315 0.288376 0.931927 0.110761 0.0929823 0.552161 0.302143 0.923719 0.109127\n", " 2015-11-06 17:57:19 12.425 sec 6571 rows/sec 12.0355 72478 0.0737426 0.642891 0.242052 0.950131 0.103288 0.0843305 0.593831 0.280204 0.930515 0.109127" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Variable Importances:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
variablerelative_importancescaled_importancepercentage
Domestic.false1.01.00.0
Primary_Type.NARCOTICS0.90.90.0
IUCR.08600.80.80.0
FBI_Code.180.80.80.0
IUCR.46250.70.70.0
------------
Location_Description.missing(NA)0.00.00.0
Primary_Type.missing(NA)0.00.00.0
FBI_Code.missing(NA)0.00.00.0
WeekDay.missing(NA)0.00.00.0
Domestic.missing(NA)0.00.00.0
" ], "text/plain": [ "variable relative_importance scaled_importance percentage\n", "-------------------------------- --------------------- ------------------- ----------------\n", "Domestic.false 1.0 1.0 0.00423868455641\n", "Primary_Type.NARCOTICS 0.868295669556 0.868295669556 0.00368043144494\n", "IUCR.0860 0.80918776989 0.80918776989 0.00342989170347\n", "FBI_Code.18 0.793890833855 0.793890833855 0.00336505281694\n", "IUCR.4625 0.744634568691 0.744634568691 0.00315627104648\n", "--- --- --- ---\n", "Location_Description.missing(NA) 0.0 0.0 0.0\n", "Primary_Type.missing(NA) 0.0 0.0 0.0\n", "FBI_Code.missing(NA) 0.0 0.0 0.0\n", "WeekDay.missing(NA) 0.0 0.0 0.0\n", "Domestic.missing(NA) 0.0 0.0 0.0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect Deep Learning model output\n", "model_dl" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
predict false true
false 0.946415 0.0535847
false 0.862165 0.137835
false 0.938661 0.0613392
false 0.870186 0.129814
false 0.980488 0.0195118
false 0.972006 0.0279937
false 0.990995 0.00900489
true 0.02106920.978931
false 0.693061 0.306939
false 0.992097 0.00790253
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Predict on the test set using the gbm model\n", "predictions = model_gbm.predict(test)\n", "predictions.show()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: gbm\n", "** Reported on test data. **\n", "\n", "MSE: 0.0893676876445\n", "R^2: 0.57094394422\n", "LogLoss: 0.294019576922\n", "AUC: 0.922152238508\n", "Gini: 0.844304477016\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.365461652105:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
falsetrueErrorRate
false1297.084.00.0608 (84.0/1381.0)
true153.0427.00.2638 (153.0/580.0)
Total1450.0511.00.1209 (237.0/1961.0)
" ], "text/plain": [ " false true Error Rate\n", "----- ------- ------ ------- --------------\n", "false 1297 84 0.0608 (84.0/1381.0)\n", "true 153 427 0.2638 (153.0/580.0)\n", "Total 1450 511 0.1209 (237.0/1961.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.8158.0
max f20.10.8295.0
max f0point50.70.997.0
max accuracy0.60.9112.0
max precision1.01.00.0
max absolute_MCC0.60.7112.0
max min_per_class_accuracy0.20.8235.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.365462 0.782768 158\n", "max f2 0.121697 0.815152 295\n", "max f0point5 0.657606 0.855615 97\n", "max accuracy 0.57297 0.885773 112\n", "max precision 0.991217 1 0\n", "max absolute_MCC 0.57297 0.717666 112\n", "max min_per_class_accuracy 0.211976 0.821144 235" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at performance on test set (if it includes true lables)\n", "test_performance = model_gbm.model_performance(test)\n", "test_performance" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Plots\n", "# Create Plots of Crime type vs Arrest Rate and Proportion of reported Crime" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----------+---------+\n", "| Crime_Type|crime_count|all_count|\n", "+--------------------+-----------+---------+\n", "| OTHER OFFENSE| 183| 720|\n", "| WEAPONS VIOLATION| 96| 118|\n", "| DECEPTIVE PRACTICE| 25| 445|\n", "| BURGLARY| 14| 458|\n", "| BATTERY| 432| 1851|\n", "| ROBBERY| 17| 357|\n", "| MOTOR VEHICLE THEFT| 17| 414|\n", "| PROSTITUTION| 106| 106|\n", "| CRIMINAL DAMAGE| 76| 1003|\n", "| KIDNAPPING| 1| 7|\n", "| GAMBLING| 3| 3|\n", "|LIQUOR LAW VIOLATION| 12| 12|\n", "+--------------------+-----------+---------+\n", "only showing top 12 rows\n", "\n" ] } ], "source": [ "# Create table to report Crimetype, Arrest count per crime, total reported count per Crime \n", "sqlContext.registerDataFrameAsTable(df_crimes, \"df_crimes\")\n", "allCrimes = sqlContext.sql(\"\"\"SELECT Primary_Type, count(*) as all_count FROM df_crimes GROUP BY Primary_Type\"\"\")\n", "crimesWithArrest = sqlContext.sql(\"SELECT Primary_Type, count(*) as crime_count FROM chicagoCrime WHERE Arrest = 'true' GROUP BY Primary_Type\")\n", "\n", "sqlContext.registerDataFrameAsTable(crimesWithArrest, \"crimesWithArrest\")\n", "sqlContext.registerDataFrameAsTable(allCrimes, \"allCrimes\")\n", "\n", "crime_type = sqlContext.sql(\"Select a.Primary_Type as Crime_Type, a.crime_count, b.all_count \\\n", "FROM crimesWithArrest a \\\n", "JOIN allCrimes b \\\n", "ON a.Primary_Type = b.Primary_Type \")\n", "\n", "crime_type.show(12)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Copy Crime_type table from Spark to H2O\n", "crime_typeHF = hc.as_h2o_frame(crime_type,framename=\"crime_type\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create Additional columns Arrest_rate and Crime_propotion \n", "crime_typeHF[\"Arrest_rate\"] = crime_typeHF[\"crime_count\"]/crime_typeHF[\"all_count\"]\n", "crime_typeHF[\"Crime_proportion\"] = crime_typeHF[\"all_count\"]/crime_typeHF[\"all_count\"].sum()\n", "crime_typeHF[\"Crime_Type\"] = crime_typeHF[\"Crime_Type\"].asfactor()\n", "# h2o.assign(crime_typeHF,crime_type)\n", "crime_typeHF.frame_id = \"Crime_type\"" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Crime_Type crime_count all_count Arrest_rate Crime_proportion
OTHER OFFENSE 183 720 0.254167 0.0721226
WEAPONS VIOLATION 96 118 0.813559 0.0118201
DECEPTIVE PRACTICE 25 445 0.0561798 0.0445758
BURGLARY 14 458 0.0305677 0.045878
BATTERY 432 1851 0.233387 0.185415
ROBBERY 17 357 0.047619 0.0357608
MOTOR VEHICLE THEFT 17 414 0.0410628 0.0414705
PROSTITUTION 106 106 1 0.0106181
CRIMINAL DAMAGE 76 1003 0.0757727 0.100471
KIDNAPPING 1 7 0.142857 0.000701192
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crime_typeHF" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "H2OContext: ip=172.16.2.98, port=54329\n" ] }, { "data": { "text/plain": [] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hc\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Plot the below in Flow \n", "plot (g) -> g(\n", " g.rect(\n", " g.position \"Crime_Type\", \"Arrest_rate\"\n", " g.fillColor g.value 'blue'\n", " g.fillOpacity g.value 0.75\n", " )\n", " g.rect(\n", " g.position \"Crime_Type\", \"Crime_proportion\"\n", " g.fillColor g.value 'red'\n", " g.fillOpacity g.value 0.65\n", " )\n", " g.from inspect \"data\", getFrame \"Crime_type\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#hc.stop()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }