{
 "metadata": {
  "name": "",
  "signature": "sha256:eba89a5303a3bfeb8cd3c5e2ffb4d81a96cbed610317fe5c18d05898ff4e3a35"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "import folium as fm\n",
      "import geopy\n",
      "\n",
      "from IPython.display import HTML\n",
      "\n",
      "from sklearn import linear_model, naive_bayes, feature_selection, metrics, tree\n",
      "from sklearn.cross_validation import train_test_split\n",
      "\n",
      "%matplotlib inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Shared configuration for every per-zipcode model and map in this notebook.\n",
      "TIPS_PICKLE_PATH = './dumps/tips_complete_features.pkl'\n",
      "ZIP_GEOJSON_PATH = './nyc-zip-code-tabulation-areas-polygons.geojson'\n",
      "\n",
      "# Positional column range handed to the classifier as features.\n",
      "# NOTE(review): presumably the per-tip adjective columns (they are bound to X_adjs) - confirm.\n",
      "FEATURE_COLS = slice(34, 289)\n",
      "\n",
      "# Zipcodes with fewer tips than this are left out of the 'threshold' maps.\n",
      "MIN_TIP_COUNT = 50\n",
      "\n",
      "# Appearance shared by all choropleth maps.\n",
      "MAP_CENTER = [40.714623, -74.006605]\n",
      "MAP_ZOOM = 11\n",
      "MAP_TILES = 'Stamen Toner'\n",
      "SCORE_THRESHOLD_SCALE = [0.50, 0.8, 0.85, 0.90, 0.95, 0.99]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# NOTE: unpickling can execute arbitrary code; only load dumps produced by this project.\n",
      "tips_adj_df = pd.read_pickle(TIPS_PICKLE_PATH)\n",
      "zipcode_dummies = pd.get_dummies(tips_adj_df['ZIPCODE'].astype(int))\n",
      "# join() appends the indicator columns on the right, so the positions of the\n",
      "# columns loaded from the pickle (and therefore FEATURE_COLS) are unchanged.\n",
      "tips_adj_df = tips_adj_df.join(zipcode_dummies)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 34
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def score_grade_by_zip(tips_df, zipcodes, grade_col, feature_cols=FEATURE_COLS):\n",
      "    \"\"\"Fit one MultinomialNB per zipcode and record its accuracy.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    tips_df : DataFrame\n",
      "        Holds the feature columns, `grade_col`, and one indicator column\n",
      "        per zipcode (rows belonging to a zipcode have the value 1).\n",
      "    zipcodes : iterable\n",
      "        Labels of the indicator columns to loop over.\n",
      "    grade_col : str\n",
      "        Name of the target column, e.g. 'grade_A'.\n",
      "    feature_cols : slice\n",
      "        Positional column slice used as model input.\n",
      "\n",
      "    Returns\n",
      "    -------\n",
      "    list of dict\n",
      "        One record per zipcode with keys 'zipcode', 'score' and 'tip_count'.\n",
      "\n",
      "    Notes\n",
      "    -----\n",
      "    The score is computed on the same rows the model was fitted on (there is\n",
      "    no hold-out split), so it is an in-sample accuracy.\n",
      "    \"\"\"\n",
      "    records = []\n",
      "    for zipcode in zipcodes:\n",
      "        subset = tips_df[tips_df[zipcode] == 1]\n",
      "\n",
      "        # Explicit positional slice; .iloc replaces the removed .ix indexer.\n",
      "        X_adjs = subset.iloc[:, feature_cols]\n",
      "        y = subset[grade_col]\n",
      "\n",
      "        clf = naive_bayes.MultinomialNB().fit(X_adjs, y)\n",
      "        records.append({'zipcode': zipcode,\n",
      "                        'score': clf.score(X_adjs, y),\n",
      "                        'tip_count': len(subset)})\n",
      "    return records\n",
      "\n",
      "\n",
      "def save_zip_choropleth(scores_df, fill_color, path):\n",
      "    \"\"\"Draw a zipcode choropleth of the 'score' column and write it to `path`.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    scores_df : DataFrame\n",
      "        Needs 'zipcode' and 'score' columns; zipcodes are matched against\n",
      "        feature.properties.postalCode in the GeoJSON file.\n",
      "    fill_color : str\n",
      "        Colour-scale name passed through to folium (e.g. 'RdPu').\n",
      "    path : str\n",
      "        Output HTML file.\n",
      "    \"\"\"\n",
      "    # Local name on purpose: binding the map to `map` would shadow the builtin.\n",
      "    zip_map = fm.Map(location=MAP_CENTER, zoom_start=MAP_ZOOM, tiles=MAP_TILES)\n",
      "    zip_map.geo_json(geo_path=ZIP_GEOJSON_PATH,\n",
      "                     data=scores_df, columns=['zipcode', 'score'],\n",
      "                     key_on='feature.properties.postalCode',\n",
      "                     threshold_scale=SCORE_THRESHOLD_SCALE,\n",
      "                     fill_color=fill_color, fill_opacity=0.65, line_opacity=0.5)\n",
      "    zip_map.create_map(path=path)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Predicting Grade \"A\" Restaurants by zipcode"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "zip_score_list_A = score_grade_by_zip(tips_adj_df, zipcode_dummies.columns.values, 'grade_A')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tip_by_zip_df_A = pd.DataFrame(zip_score_list_A)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 36
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "save_zip_choropleth(tip_by_zip_df_A, fill_color='RdPu', path='foursquare_zips_all_A.html')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 37
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# NOTE(review): the HTML payload is empty, so this cell renders nothing;\n",
      "# presumably it was meant to embed foursquare_zips_all_A.html - confirm.\n",
      "HTML('')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        ""
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 38,
       "text": [
        ""
       ]
      }
     ],
     "prompt_number": 38
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tip_threshold_by_zip_A_df = tip_by_zip_df_A[tip_by_zip_df_A['tip_count'] >= MIN_TIP_COUNT]\n",
      "\n",
      "save_zip_choropleth(tip_threshold_by_zip_A_df, fill_color='RdPu',\n",
      "                    path='foursquare_zips_threshold_A.html')\n",
      "\n",
      "# NOTE(review): empty HTML payload; the map itself is written to foursquare_zips_threshold_A.html.\n",
      "HTML('')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        ""
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 39,
       "text": [
        ""
       ]
      }
     ],
     "prompt_number": 39
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Predicting Grade \"C\" Restaurants by zipcode"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "zip_score_list_C = score_grade_by_zip(tips_adj_df, zipcode_dummies.columns.values, 'grade_C')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 40
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tip_by_zip_C_df = pd.DataFrame(zip_score_list_C)\n",
      "\n",
      "save_zip_choropleth(tip_by_zip_C_df, fill_color='BuPu', path='foursquare_zips_all_C.html')\n",
      "\n",
      "# NOTE(review): empty HTML payload; the map itself is written to foursquare_zips_all_C.html.\n",
      "HTML('')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        ""
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 41,
       "text": [
        ""
       ]
      }
     ],
     "prompt_number": 41
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "tip_threshold_by_zip_C_df = tip_by_zip_C_df[tip_by_zip_C_df['tip_count'] >= MIN_TIP_COUNT]\n",
      "\n",
      "save_zip_choropleth(tip_threshold_by_zip_C_df, fill_color='BuPu',\n",
      "                    path='foursquare_zips_threshold_C.html')\n",
      "\n",
      "# NOTE(review): empty HTML payload; the map itself is written to foursquare_zips_threshold_C.html.\n",
      "HTML('')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        ""
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 42,
       "text": [
        ""
       ]
      }
     ],
     "prompt_number": 42
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}