{ "metadata": { "name": "", "signature": "sha256:0e9113f47ebd4a9ade123a18e34592ab50c13c4007c50ea2010eb2802c6ef853" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "import folium as fm\n", "import geopy\n", "\n", "from sklearn import linear_model, naive_bayes, feature_selection, metrics, tree\n", "from sklearn.cross_validation import train_test_split\n", "\n", "%matplotlib inline" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "tips_with_adjectives_1 = pd.read_pickle('./dumps/tips_with_adjectives.pkl')\n", "adj_dummies = pd.read_pickle('./dumps/adjective_dataframe.pkl')\n", "adj_df = pd.read_csv('./dumps/adjective_count_list.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "print len(adj_dummies)\n", "print len(tips_with_adjectives_1)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "15527\n", "15527\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "adjective_list = list(adj_df[adj_df['count'] > 10]['word'])\n", "\n", "print \"Number of tips: \", len(tips_with_adjectives_1)\n", "print \"Number of adjectives: \", len(adj_df)\n", "print \"Number of significant adjectives (appears in more than 10 tips): \", len(adjective_list)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Number of tips: 15527\n", "Number of adjectives: 1838\n", "Number of significant adjectives (appears in more than 10 tips): 255\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# tips_with_adjectives['address'] = tips_with_adjectives.apply(lambda x: \"{0} {1} {2}, {3}, New York, NY\".format(x['BUILDING'].strip(), x['STREET'].strip(), int(x['ZIPCODE']), x['BORO']), axis=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "latlong_df = pd.read_pickle('./dumps/with_lat_long.pkl')[['foursquare_id', 'lat_long']]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "tips_adj_df = tips_with_adjectives_1.join(adj_dummies)\n", "tips_adj_df.drop_duplicates(['foursquare_id', 'description'], inplace=True)\n", "tips_adj_df = tips_adj_df.merge(latlong_df, on='foursquare_id', how='left')\n", "len(tips_adj_df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "10762" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# desired_columns = [\n", "# 'foursquare_id',\n", "# 'DBA',\n", "# 'description', \n", "# 'tip_words', \n", "# 'tip_adjs', \n", "# 'adj_string', \n", "# 'foursquare_rating',\n", "# 'foursquare_num_of_users',\n", "# 'foursquare_price_tier',\n", "# 'grade_A', \n", "# 'grade_C',\n", "# 'GRADE',\n", "# 'lat_long'\n", "# ]\n", "\n", "# tips_df = tips_with_adjectives[desired_columns]\n", "# len(tips_df)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "def score_and_predict(model, x_features, y_targets, columns, model_type):\n", " score = 
{ "cell_type": "code", "collapsed": false, "input": [ "def score_and_predict(model, x_features, y_targets, columns, model_type):\n", "    # report a fitted model's score and ROC AUC, then list the adjectives\n", "    # whose univariate ANOVA p-values fall below 0.05\n", "    score = model.score(x_features, y_targets)\n", "    y_pred = model.predict(x_features)\n", "    # note: AUC is computed from hard predictions rather than probabilities,\n", "    # so classifiers tend to land near 0.5 here\n", "    auc = metrics.roc_auc_score(y_targets, y_pred)\n", "    \n", "    # f_classif returns a (F-statistics, p-values) pair; keep the p-values\n", "    f_values, p_values = feature_selection.f_classif(x_features, y_targets)\n", "    \n", "    if model_type == 'naive-bayes' or model_type == 'logistic':\n", "        # exponentiate log-space coefficients before display\n", "        coef_list = [np.exp(round(x, 4)) for x in model.coef_[0]]\n", "    elif model_type == 'linear':\n", "        coef_list = [round(x, 4) for x in model.coef_]\n", "    \n", "    df_dict = {'adjective': columns, 'p-value': p_values, 'coef': coef_list}\n", "    model_df = pd.DataFrame(df_dict)\n", "    model_df.sort(['p-value', 'coef'], ascending=[1, 0], inplace=True)\n", "    \n", "    print 'MODEL: ', model\n", "    print 'SCORE: ', score\n", "    print 'AUC: ', auc\n", "    print '\\n'\n", "    \n", "    print 'TOP PREDICTORS (p-value < 0.05):'\n", "    print model_df[model_df['p-value'] <= 0.05]\n", "    print '\\n'\n", "    \n", "    # return model_df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# for index, column in enumerate(tips_adj_df.columns.values):\n", "# print column, index" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# adjective dummy columns run from column 34 through the second-to-last column;\n", "# they mark which adjectives appear in each tip's description\n", "X_adjs = tips_adj_df.ix[:, 34:-1]\n", "\n", "# based on ratings, number of users, and price tier\n", "X_foursquare_info = tips_adj_df[['foursquare_rating', 'foursquare_num_of_users', 'foursquare_price_tier']].dropna(axis=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting Grade \"A\" restaurants" ] }, { "cell_type": "code", "collapsed": false, "input": [ "y = tips_adj_df['grade_A']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_multi_nb = naive_bayes.MultinomialNB()\n", "clf_multi_nb = clf_multi_nb.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.901251393879\n", "AUC: 0.502431233668\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "163 other 0.005634 0.000050\n", "26 cant 0.009299 0.000081\n", "73 epic 0.000679 0.000087\n", "210 spacious 0.000679 0.000087\n", "250 yellow 0.000679 0.000087\n", "87 flavorful 0.001290 0.000174\n", "5 affordable 0.001901 0.000261\n", "96 general 0.002579 0.001122\n", "209 solid 0.001968 0.001918\n", "122 irish 0.001358 0.003553\n", "215 steak 0.006109 0.004008\n", "95 garlic 0.004277 0.004226\n", "46 cozy 0.001833 0.006104\n", "115 hot 0.012015 
0.006348\n", "227 terrible 0.002647 0.007124\n", "158 olive 0.000747 0.008315\n", "237 unbelievable 0.000747 0.008315\n", "38 clean 0.004548 0.008604\n", "138 long 0.005430 0.009899\n", "84 first 0.004209 0.011209\n", "56 dish 0.002987 0.013767\n", "140 magic 0.000611 0.014504\n", "151 natural 0.000611 0.014504\n", "191 sad 0.000611 0.014504\n", "216 stellar 0.000611 0.014504\n", "119 indian 0.001765 0.020247\n", "157 old 0.004005 0.020831\n", "223 sweet 0.007738 0.023556\n", "118 incredible 0.002104 0.025013\n", "169 personal 0.001154 0.029040\n", "170 phenomenal 0.001154 0.029040\n", "228 terrific 0.001154 0.029040\n", "107 grilled 0.005974 0.031371\n", "81 fat 0.000815 0.033534\n", "127 kid 0.000815 0.033534\n", "162 original 0.000815 0.033534\n", "178 public 0.000815 0.033534\n", "239 usual 0.000815 0.033534\n", "54 different 0.002308 0.033618\n", "82 favorite 0.006652 0.038926\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.888888888889\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "83 few 0.003665 0.000000\n", "56 dish 0.002987 0.000000\n", "14 baked 0.002715 0.000000\n", "93 full 0.002579 0.000000\n", "120 inexpensive 0.001290 0.000000\n", "35 chinese 0.006517 0.002318\n", "179 quick 0.003462 0.004851\n", "130 large 0.003937 0.006621\n", "114 horrible 0.003326 0.006621\n", "154 next 0.003326 0.006621\n", "124 ive 0.008349 0.009112\n", "113 high 0.002715 0.012537\n", "118 incredible 0.002104 0.012537\n", "68 eggplant 0.001833 0.012537\n", "146 mediocre 0.001222 0.012537\n", "149 much 0.008077 0.014883\n", "10 available 0.002104 0.015660\n", "47 crazy 0.001968 0.015660\n", "199 short 0.001901 0.015660\n", "25 busy 0.001629 0.015660\n", "104 greasy 0.001154 0.015660\n", "133 later 0.001154 0.015660\n", "163 other 0.005634 0.020173\n", "116 huge 0.004141 0.025169\n", "254 yummy 0.006992 0.040728\n", "131 last 0.002579 0.045608\n", "54 different 0.002308 0.045608\n", "23 bubble 0.002240 0.045608\n", "5 affordable 0.001901 0.045608\n", "51 decent 0.005023 0.049499\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.898160193273\n", "AUC: 0.50176809245\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "230 tiny 0.001968 0.000085\n", "151 natural 0.000611 0.000490\n", "3 addictive 0.000543 0.000490\n", "228 terrific 0.001154 0.000981\n", "238 unique 0.001086 0.000981\n", "169 personal 0.001154 0.002052\n", "54 different 0.002308 0.002205\n", "163 other 0.005634 0.002261\n", "90 fresh 0.014595 0.002581\n", "13 bad 0.008077 0.003905\n", "66 eat 0.009299 0.007615\n", "29 casual 0.000407 0.007943\n", "224 swiss 0.000407 0.007943\n", "9 authentic 0.003190 0.008253\n", "180 quiet 0.001968 0.009322\n", "5 affordable 0.001901 0.009322\n", "40 close 0.001901 0.009322\n", "58 dont 0.025929 0.009461\n", "182 real 0.003734 0.010006\n", "95 garlic 0.004277 0.010360\n", "1 accept 0.001154 0.010862\n", "30 central 0.001154 0.010862\n", "56 dish 0.002987 0.011253\n", "140 magic 0.000611 0.015087\n", "191 sad 0.000611 0.015087\n", "204 slow 0.007195 0.016624\n", "157 old 0.004005 0.018804\n", "79 fantastic 0.004887 0.020690\n", "38 clean 0.004548 0.023304\n", "119 indian 0.001765 0.024510\n", "202 simple 0.001697 0.024510\n", "20 black 0.002851 0.029148\n", "57 
dive 0.001018 0.030205\n", "143 many 0.003055 0.032713\n", "130 large 0.003937 0.036375\n", "251 youll 0.003937 0.036375\n", "26 cant 0.009299 0.037648\n", "118 incredible 0.002104 0.038944\n", "110 healthy 0.002308 0.039847\n", "209 solid 0.001968 0.039847\n", "45 cool 0.004277 0.043770\n", "234 turkish 0.000815 0.044770\n", "193 saltfish 0.000475 0.045706\n", "178 public 0.000815 0.046227\n", "210 spacious 0.000679 0.046227\n", "15 basic 0.000475 0.046227\n", "\n", "\n" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_linear = linear_model.LinearRegression()\n", "clf_linear = clf_linear.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0306220058522\n", "AUC: 0.65453453789\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "163 other -0.0005 0.000050\n", "26 cant 0.0110 0.000081\n", "73 epic 0.0196 0.000087\n", "250 yellow 0.0143 0.000087\n", "210 spacious 0.0123 0.000087\n", "87 flavorful 0.0218 0.000174\n", "5 affordable 0.0061 0.000261\n", "96 general 0.0292 0.001122\n", "209 solid 0.0167 0.001918\n", "122 irish -0.0082 0.003553\n", "215 steak -0.0008 0.004008\n", "95 garlic -0.0077 0.004226\n", "46 cozy -0.0200 0.006104\n", "115 hot 0.0042 0.006348\n", "227 terrible -0.0021 0.007124\n", "158 olive 0.0464 0.008315\n", "237 unbelievable 0.0079 0.008315\n", "38 clean 0.0037 0.008604\n", "138 long -0.0036 0.009899\n", "84 first -0.0020 0.011209\n", "56 dish 0.0091 0.013767\n", "151 natural 0.0361 0.014504\n", "191 sad 0.0085 0.014504\n", "216 stellar -0.0123 0.014504\n", "140 magic -0.0354 0.014504\n", "119 indian -0.0069 0.020247\n", "157 old -0.0030 0.020831\n", "223 sweet 0.0028 0.023556\n", "118 incredible 0.0127 0.025013\n", "228 terrific 0.0222 0.029040\n", "169 personal -0.0176 0.029040\n", "170 phenomenal -0.0186 0.029040\n", "107 grilled 0.0041 0.031371\n", "239 usual 0.0336 0.033534\n", "127 kid 0.0176 0.033534\n", "162 original 0.0168 0.033534\n", "81 fat 0.0163 0.033534\n", "178 public 0.0145 0.033534\n", "54 different -0.0040 0.033618\n", "82 favorite 0.0016 0.038926\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: -0.0270734866002\n", "AUC: 0.514883782061\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "120 inexpensive 0.1140 0.000000\n", "93 full 0.0438 0.000000\n", "14 baked 0.0327 0.000000\n", "83 few 0.0294 0.000000\n", "56 dish 0.0091 0.000000\n", "35 chinese -0.0829 0.002318\n", "179 quick -0.0168 0.004851\n", "154 next 0.0210 0.006621\n", "130 large -0.0158 0.006621\n", "114 horrible -0.0324 0.006621\n", "124 ive -0.0264 0.009112\n", "113 high 0.0792 0.012537\n", "118 incredible 0.0127 0.012537\n", "68 eggplant -0.0432 0.012537\n", "146 mediocre -0.1384 0.012537\n", "149 much 0.0396 0.014883\n", "199 short 0.1068 
0.015660\n", "47 crazy 0.0316 0.015660\n", "25 busy -0.0378 0.015660\n", "104 greasy -0.0779 0.015660\n", "10 available -0.0937 0.015660\n", "133 later -0.1298 0.015660\n", "163 other -0.0005 0.020173\n", "116 huge -0.0639 0.025169\n", "254 yummy 0.0469 0.040728\n", "23 bubble 0.0391 0.045608\n", "5 affordable 0.0061 0.045608\n", "54 different -0.0040 0.045608\n", "131 last -0.0163 0.045608\n", "51 decent 0.0553 0.049499\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0153673116839\n", "AUC: 0.616453415498\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "230 tiny -0.0225 0.000085\n", "3 addictive 0.0950 0.000490\n", "151 natural 0.0361 0.000490\n", "228 terrific 0.0222 0.000981\n", "238 unique -0.0312 0.000981\n", "169 personal -0.0176 0.002052\n", "54 different -0.0040 0.002205\n", "163 other -0.0005 0.002261\n", "90 fresh 0.0131 0.002581\n", "13 bad -0.0062 0.003905\n", "66 eat -0.0031 0.007615\n", "224 swiss 0.0856 0.007943\n", "29 casual -0.0786 0.007943\n", "9 authentic 0.0092 0.008253\n", "180 quiet 0.0412 0.009322\n", "5 affordable 0.0061 0.009322\n", "40 close -0.0286 0.009322\n", "58 dont 0.0163 0.009461\n", "182 real 0.0094 0.010006\n", "95 garlic -0.0077 0.010360\n", "1 accept 0.0405 0.010862\n", "30 central 0.0343 0.010862\n", "56 dish 0.0091 0.011253\n", "191 sad 0.0085 0.015087\n", "140 magic -0.0354 0.015087\n", "204 slow -0.0116 0.016624\n", "157 old -0.0030 0.018804\n", "79 fantastic -0.0188 0.020690\n", "38 clean 0.0037 0.023304\n", "202 simple 0.0226 0.024510\n", "119 indian -0.0069 0.024510\n", "20 black 0.0394 0.029148\n", "57 dive -0.0270 0.030205\n", "143 many 0.0036 0.032713\n", "130 large -0.0158 0.036375\n", "251 youll -0.0307 0.036375\n", "26 cant 0.0110 0.037648\n", "118 incredible 0.0127 0.038944\n", "110 healthy 0.0384 0.039847\n", "209 solid 0.0167 0.039847\n", "45 cool 0.0238 0.043770\n", "234 turkish -0.0607 0.044770\n", "193 saltfish -0.0485 0.045706\n", "15 basic 0.1099 0.046227\n", "178 public 0.0145 0.046227\n", "210 spacious 0.0123 0.046227\n", "\n", "\n" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_logistic = linear_model.LogisticRegression()\n", "clf_logistic.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.901003593111\n", "AUC: 0.500625\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "163 other 1.057386 0.000050\n", "26 cant 1.184831 0.000081\n", "73 epic 1.098779 0.000087\n", "250 yellow 1.097571 0.000087\n", "210 spacious 1.052954 0.000087\n", "87 flavorful 1.143622 0.000174\n", "5 affordable 1.044564 0.000261\n", "96 general 1.248696 0.001122\n", "209 solid 1.152807 0.001918\n", "122 irish 0.962232 
0.003553\n", "215 steak 0.986887 0.004008\n", "95 garlic 0.938474 0.004226\n", "46 cozy 0.893776 0.006104\n", "115 hot 1.031176 0.006348\n", "227 terrible 1.015316 0.007124\n", "158 olive 1.137463 0.008315\n", "237 unbelievable 1.029219 0.008315\n", "38 clean 1.051376 0.008604\n", "138 long 0.981474 0.009899\n", "84 first 0.957337 0.011209\n", "56 dish 1.086107 0.013767\n", "151 natural 1.067479 0.014504\n", "191 sad 1.002704 0.014504\n", "216 stellar 0.941576 0.014504\n", "140 magic 0.860020 0.014504\n", "119 indian 0.969282 0.020247\n", "157 old 0.971514 0.020831\n", "223 sweet 1.015316 0.023556\n", "118 incredible 1.085239 0.025013\n", "228 terrific 1.048332 0.029040\n", "169 personal 0.903481 0.029040\n", "170 phenomenal 0.886300 0.029040\n", "107 grilled 1.022244 0.031371\n", "239 usual 1.144651 0.033534\n", "81 fat 1.101199 0.033534\n", "162 original 1.086650 0.033534\n", "127 kid 1.086107 0.033534\n", "178 public 1.072186 0.033534\n", "54 different 0.943933 0.033618\n", "82 favorite 1.031589 0.038926\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.888888888889\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "120 inexpensive 2.360091 0.000000\n", "93 full 1.620282 0.000000\n", "83 few 1.384307 0.000000\n", "14 baked 1.355134 0.000000\n", "56 dish 1.086107 0.000000\n", "35 chinese 0.505807 0.002318\n", "179 quick 0.853679 0.004851\n", "154 next 1.204061 0.006621\n", "130 large 0.860708 0.006621\n", "114 horrible 0.753294 0.006621\n", "124 ive 0.785920 0.009112\n", "113 high 2.387866 0.012537\n", "118 incredible 1.085239 0.012537\n", "68 eggplant 0.727894 0.012537\n", "146 mediocre 0.431538 0.012537\n", "149 much 1.625800 0.014883\n", "199 short 2.895622 0.015660\n", "47 crazy 1.344874 0.015660\n", "25 busy 0.810098 0.015660\n", "104 greasy 0.628449 0.015660\n", "10 available 0.507987 0.015660\n", "133 later 0.476589 0.015660\n", "163 other 1.057386 0.020173\n", "116 huge 0.606652 0.025169\n", "254 yummy 1.751548 0.040728\n", "23 bubble 1.441234 0.045608\n", "5 affordable 1.044564 0.045608\n", "54 different 0.943933 0.045608\n", "131 last 0.852740 0.045608\n", "51 decent 1.715321 0.049499\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.897974354209\n", "AUC: 0.500454959054\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "230 tiny 0.858216 0.000085\n", "3 addictive 1.555816 0.000490\n", "151 natural 1.067479 0.000490\n", "228 terrific 1.048332 0.000981\n", "238 unique 0.815136 0.000981\n", "169 personal 0.903481 0.002052\n", "54 different 0.943933 0.002205\n", "163 other 1.057386 0.002261\n", "90 fresh 1.188509 0.002581\n", "13 bad 0.933607 0.003905\n", "66 eat 0.951800 0.007615\n", "224 swiss 1.352156 0.007943\n", "29 casual 0.736460 0.007943\n", "9 authentic 1.049171 0.008253\n", "180 quiet 1.439218 0.009322\n", "5 affordable 1.044564 0.009322\n", "40 close 0.794613 0.009322\n", "58 dont 1.234542 0.009461\n", "182 real 1.113268 0.010006\n", "95 garlic 0.938474 0.010360\n", "1 accept 1.302128 0.010862\n", "30 central 1.269979 0.010862\n", "56 dish 1.086107 
0.011253\n", "191 sad 1.002704 0.015087\n", "140 magic 0.860020 0.015087\n", "204 slow 0.896999 0.016624\n", "157 old 0.971514 0.018804\n", "79 fantastic 0.846623 0.020690\n", "38 clean 1.051376 0.023304\n", "202 simple 1.174568 0.024510\n", "119 indian 0.969282 0.024510\n", "20 black 1.402000 0.029148\n", "57 dive 0.859590 0.030205\n", "143 many 1.043938 0.032713\n", "130 large 0.860708 0.036375\n", "251 youll 0.745575 0.036375\n", "26 cant 1.184831 0.037648\n", "118 incredible 1.085239 0.038944\n", "110 healthy 1.474177 0.039847\n", "209 solid 1.152807 0.039847\n", "45 cool 1.258222 0.043770\n", "234 turkish 0.737713 0.044770\n", "193 saltfish 0.823576 0.045706\n", "15 basic 1.494662 0.046227\n", "178 public 1.072186 0.046227\n", "210 spacious 1.052954 0.046227\n", "\n", "\n" ] } ], "prompt_number": 19 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting Grade \"C\" restaurants" ] }, { "cell_type": "code", "collapsed": false, "input": [ "y = tips_adj_df['grade_C']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_multi_nb = naive_bayes.MultinomialNB()\n", "clf_multi_nb = clf_multi_nb.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.987981662743\n", "AUC: 0.499937304075\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "53 delish 0.004338 0.000169\n", "74 excellent 0.006507 0.000341\n", "52 delicious 0.015184 0.001837\n", "106 green 0.004338 0.003959\n", "249 wrong 0.004338 0.005746\n", "22 breakfast 0.006507 0.006458\n", "0 20500daily 0.002169 0.012036\n", "24 bushy 0.002169 0.012036\n", "117 iconic 0.002169 0.012036\n", "176 priceless 0.002169 0.012036\n", "253 yous 0.002169 0.012036\n", "153 new 0.008677 0.017581\n", "107 grilled 0.004338 0.018048\n", "102 good 0.036876 0.023534\n", "171 pic 0.002169 0.024075\n", "192 salad 0.008677 0.029159\n", "19 big 0.004338 0.034574\n", "67 efficient 0.002169 0.036117\n", "99 ginormous 0.002169 0.036117\n", "196 separate 0.002169 0.036117\n", "203 sized 0.002169 0.036117\n", "221 superior 0.002169 0.036117\n", "84 first 0.004338 0.046109\n", "188 royal 0.002169 0.048163\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.990338164251\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "43 complimentary 0.004338 0.009753\n", "190 russian 0.004338 0.009753\n", "44 congested 0.002169 0.009753\n", "48 creative 0.002169 0.009753\n", "67 efficient 0.002169 0.009753\n", "69 empty 0.002169 0.009753\n", "117 iconic 
0.002169 0.009753\n", "151 natural 0.002169 0.009753\n", "169 personal 0.002169 0.009753\n", "171 pic 0.002169 0.009753\n", "188 royal 0.002169 0.009753\n", "193 saltfish 0.002169 0.009753\n", "196 separate 0.002169 0.009753\n", "198 several 0.002169 0.009753\n", "236 unbeatable 0.002169 0.009753\n", "243 weak 0.002169 0.009753\n", "161 organic 0.004338 0.019512\n", "210 spacious 0.004338 0.019512\n", "228 terrific 0.004338 0.019512\n", "237 unbelievable 0.004338 0.019512\n", "2 actual 0.002169 0.019512\n", "31 certain 0.002169 0.019512\n", "39 clear 0.002169 0.019512\n", "103 gorgeous 0.002169 0.019512\n", "140 magic 0.002169 0.019512\n", "144 massive 0.002169 0.019512\n", "187 rosemary 0.002169 0.019512\n", "191 sad 0.002169 0.019512\n", "201 similar 0.002169 0.019512\n", "203 sized 0.002169 0.019512\n", ".. ... ... ...\n", "63 earth 0.002169 0.039054\n", "119 indian 0.002169 0.039054\n", "127 kid 0.002169 0.039054\n", "145 mean 0.002169 0.039054\n", "152 nearby 0.002169 0.039054\n", "174 poor 0.002169 0.039054\n", "207 social 0.002169 0.039054\n", "214 standard 0.002169 0.039054\n", "224 swiss 0.002169 0.039054\n", "155 nice 0.013016 0.041094\n", "64 east 0.004338 0.048836\n", "16 bean 0.002169 0.048836\n", "70 english 0.002169 0.048836\n", "72 entire 0.002169 0.048836\n", "73 epic 0.002169 0.048836\n", "112 helpful 0.002169 0.048836\n", "121 interesting 0.002169 0.048836\n", "129 korean 0.002169 0.048836\n", "134 light 0.002169 0.048836\n", "139 low 0.002169 0.048836\n", "148 modern 0.002169 0.048836\n", "156 normal 0.002169 0.048836\n", "177 private 0.002169 0.048836\n", "186 ridiculous 0.002169 0.048836\n", "197 serious 0.002169 0.048836\n", "200 sicilian 0.002169 0.048836\n", "216 stellar 0.002169 0.048836\n", "218 strong 0.002169 0.048836\n", "235 typical 0.002169 0.048836\n", "244 weird 0.002169 0.048836\n", "\n", "[83 rows x 3 columns]\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.988570897603\n", "AUC: 0.499953007519\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "130 large 0.004338 0.003783\n", "251 youll 0.004338 0.003783\n", "84 first 0.004338 0.004120\n", "157 old 0.004338 0.004120\n", "41 cold 0.002169 0.007306\n", "123 italian 0.004338 0.011308\n", "0 20500daily 0.002169 0.011465\n", "24 bushy 0.002169 0.011465\n", "44 congested 0.002169 0.011465\n", "176 priceless 0.002169 0.011465\n", "253 yous 0.002169 0.011465\n", "13 bad 0.006507 0.022223\n", "150 music 0.004338 0.022223\n", "117 iconic 0.002169 0.022932\n", "53 delish 0.004338 0.024571\n", "106 green 0.004338 0.031070\n", "114 horrible 0.004338 0.031512\n", "99 ginormous 0.002169 0.034402\n", "171 pic 0.002169 0.034402\n", "221 superior 0.002169 0.034402\n", "102 good 0.036876 0.036221\n", "192 salad 0.008677 0.036831\n", "204 slow 0.004338 0.045644\n", "67 efficient 0.002169 0.045873\n", "196 separate 0.002169 0.045873\n", "154 next 0.004338 0.048159\n", "\n", "\n" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_linear = linear_model.LinearRegression()\n", "clf_linear = clf_linear.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_linear, 
X_adjs.values, y.values, X_adjs.columns, 'linear')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0265341264606\n", "AUC: 0.813025731452\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "53 delish 0.0025 0.000169\n", "74 excellent 0.0006 0.000341\n", "52 delicious 0.0004 0.001837\n", "106 green 0.0031 0.003959\n", "249 wrong 0.0009 0.005746\n", "22 breakfast -0.0010 0.006458\n", "117 iconic 0.0080 0.012036\n", "24 bushy -0.0028 0.012036\n", "0 20500daily -0.0091 0.012036\n", "176 priceless -0.0091 0.012036\n", "253 yous -0.0103 0.012036\n", "153 new 0.0012 0.017581\n", "107 grilled 0.0003 0.018048\n", "102 good 0.0016 0.023534\n", "171 pic -0.0208 0.024075\n", "192 salad -0.0021 0.029159\n", "19 big 0.0016 0.034574\n", "203 sized 0.0032 0.036117\n", "196 separate 0.0028 0.036117\n", "221 superior -0.0084 0.036117\n", "67 efficient -0.0124 0.036117\n", "99 ginormous -0.0202 0.036117\n", "84 first 0.0046 0.046109\n", "188 royal -0.0046 0.048163\n", "\n", "\n", "Using testing set\n", "MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: -0.0391889077113\n", "AUC: 0.464410448838\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "190 russian 0.0802 0.009753\n", "43 complimentary 0.0609 0.009753\n", "117 iconic 0.0080 0.009753\n", "196 separate 0.0028 0.009753\n", "44 congested 0.0000 0.009753\n", "198 several -0.0005 0.009753\n", "188 royal -0.0046 0.009753\n", "236 unbeatable -0.0051 0.009753\n", "193 saltfish -0.0058 0.009753\n", "243 weak -0.0068 0.009753\n", "69 empty -0.0101 0.009753\n", "151 natural -0.0121 0.009753\n", "67 efficient -0.0124 0.009753\n", "48 creative -0.0159 0.009753\n", "169 personal -0.0189 0.009753\n", "171 pic -0.0208 0.009753\n", "210 spacious 0.0875 0.019512\n", "237 unbelievable 0.0850 0.019512\n", "161 organic 0.0622 0.019512\n", "228 terrific 0.0401 0.019512\n", "203 sized 0.0032 0.019512\n", "140 magic -0.0048 0.019512\n", "252 young -0.0055 0.019512\n", "31 certain -0.0061 0.019512\n", "103 gorgeous -0.0062 0.019512\n", "229 thin -0.0064 0.019512\n", "2 actual -0.0072 0.019512\n", "191 sad -0.0095 0.019512\n", "39 clear -0.0099 0.019512\n", "239 usual -0.0128 0.019512\n", ".. ... ... 
...\n", "207 social -0.0083 0.039054\n", "30 central -0.0092 0.039054\n", "174 poor -0.0092 0.039054\n", "119 indian -0.0101 0.039054\n", "145 mean -0.0110 0.039054\n", "152 nearby -0.0124 0.039054\n", "28 caribbean -0.0149 0.039054\n", "63 earth -0.0177 0.039054\n", "214 standard -0.0345 0.039054\n", "155 nice 0.0044 0.041094\n", "64 east 0.0230 0.048836\n", "139 low -0.0045 0.048836\n", "72 entire -0.0055 0.048836\n", "218 strong -0.0068 0.048836\n", "148 modern -0.0078 0.048836\n", "235 typical -0.0092 0.048836\n", "70 english -0.0093 0.048836\n", "156 normal -0.0102 0.048836\n", "200 sicilian -0.0102 0.048836\n", "197 serious -0.0103 0.048836\n", "177 private -0.0112 0.048836\n", "134 light -0.0129 0.048836\n", "73 epic -0.0130 0.048836\n", "186 ridiculous -0.0146 0.048836\n", "129 korean -0.0153 0.048836\n", "121 interesting -0.0164 0.048836\n", "244 weird -0.0174 0.048836\n", "112 helpful -0.0176 0.048836\n", "16 bean -0.0188 0.048836\n", "216 stellar -0.0217 0.048836\n", "\n", "[83 rows x 3 columns]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0125850437395\n", "AUC: 0.738506486503\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "130 large 0.0080 0.003783\n", "251 youll 0.0069 0.003783\n", "157 old 0.0081 0.004120\n", "84 first 0.0046 0.004120\n", "41 cold -0.0127 0.007306\n", "123 italian 0.0008 0.011308\n", "44 congested 0.0000 0.011465\n", "24 bushy -0.0028 0.011465\n", "0 20500daily -0.0091 0.011465\n", "176 priceless -0.0091 0.011465\n", "253 yous -0.0103 0.011465\n", "13 bad 0.0046 0.022223\n", "150 music -0.0060 0.022223\n", "117 iconic 0.0080 0.022932\n", "53 delish 0.0025 0.024571\n", "106 green 0.0031 0.031070\n", "114 horrible 0.0104 0.031512\n", "221 superior -0.0084 0.034402\n", "99 ginormous -0.0202 0.034402\n", "171 pic -0.0208 0.034402\n", "102 good 0.0016 0.036221\n", "192 salad -0.0021 0.036831\n", "204 slow -0.0001 0.045644\n", "196 separate 0.0028 0.045873\n", "67 efficient -0.0124 0.045873\n", "154 next 0.0127 0.048159\n", "\n", "\n" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_logistic = linear_model.LogisticRegression()\n", "clf_logistic = clf_logistic.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.988105563127\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "53 delish 1.041227 0.000169\n", "74 excellent 0.962809 0.000341\n", "52 delicious 0.975505 0.001837\n", "106 green 1.059291 0.003959\n", "249 wrong 0.952657 0.005746\n", "22 breakfast 0.981376 0.006458\n", "24 bushy 0.992131 0.012036\n", "117 iconic 0.990644 0.012036\n", "0 
20500daily 0.989060 0.012036\n", "176 priceless 0.989060 0.012036\n", "253 yous 0.988665 0.012036\n", "153 new 1.037486 0.017581\n", "107 grilled 1.029425 0.018048\n", "102 good 1.020814 0.023534\n", "171 pic 0.962424 0.024075\n", "192 salad 0.970543 0.029159\n", "19 big 0.932674 0.034574\n", "203 sized 0.980787 0.036117\n", "196 separate 0.975017 0.036117\n", "221 superior 0.968507 0.036117\n", "67 efficient 0.962713 0.036117\n", "99 ginormous 0.961655 0.036117\n", "84 first 1.126257 0.046109\n", "188 royal 0.962617 0.048163\n", "\n", "\n", "Using testing set\n", "MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.990338164251\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "43 complimentary 1.939247 0.009753\n", "190 russian 1.936341 0.009753\n", "44 congested 1.000000 0.009753\n", "117 iconic 0.990644 0.009753\n", "196 separate 0.975017 0.009753\n", "67 efficient 0.962713 0.009753\n", "188 royal 0.962617 0.009753\n", "171 pic 0.962424 0.009753\n", "198 several 0.942895 0.009753\n", "193 saltfish 0.939507 0.009753\n", "236 unbeatable 0.934354 0.009753\n", "243 weak 0.933233 0.009753\n", "151 natural 0.912926 0.009753\n", "69 empty 0.881615 0.009753\n", "48 creative 0.845354 0.009753\n", "169 personal 0.800675 0.009753\n", "237 unbelievable 2.080275 0.019512\n", "210 spacious 2.074873 0.019512\n", "161 organic 2.036230 0.019512\n", "228 terrific 1.697913 0.019512\n", "203 sized 0.980787 0.019512\n", "144 massive 0.949424 0.019512\n", "31 certain 0.945917 0.019512\n", "201 similar 0.939037 0.019512\n", "103 gorgeous 0.937536 0.019512\n", "2 actual 0.925334 0.019512\n", "187 rosemary 0.920351 0.019512\n", "191 sad 0.920351 0.019512\n", "140 magic 0.915395 0.019512\n", "39 clear 0.907012 0.019512\n", ".. ... ... 
...\n", "28 caribbean 0.918696 0.039054\n", "127 kid 0.888607 0.039054\n", "63 earth 0.877306 0.039054\n", "174 poor 0.857358 0.039054\n", "152 nearby 0.850952 0.039054\n", "30 central 0.842653 0.039054\n", "145 mean 0.819058 0.039054\n", "214 standard 0.796602 0.039054\n", "119 indian 0.760332 0.039054\n", "155 nice 1.240234 0.041094\n", "64 east 1.565492 0.048836\n", "72 entire 0.947337 0.048836\n", "156 normal 0.939695 0.048836\n", "177 private 0.916769 0.048836\n", "186 ridiculous 0.904204 0.048836\n", "73 epic 0.903662 0.048836\n", "148 modern 0.894849 0.048836\n", "216 stellar 0.888963 0.048836\n", "235 typical 0.882938 0.048836\n", "197 serious 0.873366 0.048836\n", "200 sicilian 0.871534 0.048836\n", "139 low 0.857443 0.048836\n", "244 weird 0.855474 0.048836\n", "129 korean 0.825472 0.048836\n", "121 interesting 0.819468 0.048836\n", "70 english 0.816850 0.048836\n", "218 strong 0.816115 0.048836\n", "112 helpful 0.786707 0.048836\n", "16 bean 0.784036 0.048836\n", "134 light 0.686053 0.048836\n", "\n", "[83 rows x 3 columns]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.988663817134\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "130 large 1.219938 0.003783\n", "251 youll 1.153268 0.003783\n", "157 old 1.202016 0.004120\n", "84 first 1.126257 0.004120\n", "41 cold 0.602480 0.007306\n", "123 italian 1.073367 0.011308\n", "44 congested 1.000000 0.011465\n", "24 bushy 0.992131 0.011465\n", "0 20500daily 0.989060 0.011465\n", "176 priceless 0.989060 0.011465\n", "253 yous 0.988665 0.011465\n", "13 bad 1.179039 0.022223\n", "150 music 0.842400 0.022223\n", "117 iconic 0.990644 0.022932\n", "53 delish 1.041227 0.024571\n", "106 green 1.059291 0.031070\n", "114 horrible 1.266301 0.031512\n", "221 superior 0.968507 0.034402\n", "171 pic 0.962424 0.034402\n", "99 ginormous 0.961655 0.034402\n", "102 good 1.020814 0.036221\n", "192 salad 0.970543 0.036831\n", "204 slow 0.879502 0.045644\n", "196 separate 0.975017 0.045873\n", "67 efficient 0.962713 0.045873\n", "154 next 1.290978 0.048159\n", "\n", "\n" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)\n", "\n", "print 'Using training set'\n", "clf_tree = clf_tree.fit(X_train, y_train)\n", "score = clf_tree.score(X_train, y_train)\n", "y_pred = clf_tree.predict(X_train)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_train, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_train, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_train, y_pred), \"\\n\"\n", "\n", "print 'Using testing set'\n", "score = clf_tree.score(X_test, y_test)\n", "y_pred = clf_tree.predict(X_test)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_test, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_test, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_test, y_pred), \"\\n\"\n", "\n", "print 'Using all data'\n", "score = clf_tree.score(X_adjs.values, y.values)\n", "y_pred = 
{ "cell_type": "code", "collapsed": false, "input": [ "clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)\n", "\n", "print 'Using training set'\n", "clf_tree = clf_tree.fit(X_train, y_train)\n", "score = clf_tree.score(X_train, y_train)\n", "y_pred = clf_tree.predict(X_train)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_train, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_train, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_train, y_pred), \"\\n\"\n", "\n", "print 'Using testing set'\n", "score = clf_tree.score(X_test, y_test)\n", "y_pred = clf_tree.predict(X_test)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_test, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_test, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_test, y_pred), \"\\n\"\n", "\n", "print 'Using all data'\n", "score = clf_tree.score(X_adjs.values, y.values)\n", "y_pred = clf_tree.predict(X_adjs.values)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y.values, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y.values, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y.values, y_pred), \"\\n\"\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "Accuracy:0.991 \n", "\n", "Classification report\n", " precision recall f1-score support\n", "\n", " 0.0 0.99 1.00 1.00 1631\n", " 1.0 0.00 0.00 0.00 15\n", "\n", "avg / total 0.98 0.99 0.99 1646\n", "\n", "\n", "Confusion matrix\n", "[[1631 0]\n", " [ 15 0]] \n", "\n", "Using testing set\n", "Accuracy:0.993 \n", "\n", "Classification report\n", " precision recall f1-score support\n", "\n", " 0.0 0.99 1.00 1.00 545\n", " 1.0 0.00 0.00 0.00 4\n", "\n", "avg / total 0.99 0.99 0.99 549\n", "\n", "\n", "Confusion matrix\n", "[[545 0]\n", " [ 4 0]] \n", "\n", "Using all data\n", "Accuracy:0.991 \n", "\n", "Classification report\n", " precision recall f1-score support\n", "\n", " 0.0 0.99 1.00 1.00 2176\n", " 1.0 0.00 0.00 0.00 19\n", "\n", "avg / total 0.98 0.99 0.99 2195\n", "\n", "\n", "Confusion matrix\n", "[[2176 0]\n", " [ 19 0]] \n", "\n" ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "# restrict to price tier 1 (cheapest) restaurants and refit the NB model\n", "y = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]['grade_C']\n", "cheap_df = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]\n", "X_adjs = cheap_df.ix[:, 34:-1]\n", "X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)\n", "\n", "clf_multi_nb = naive_bayes.MultinomialNB()\n", "clf_multi_nb.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.990886998785\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "10 available 0.003484 0.009191\n", "11 average 0.003484 0.009191\n", "24 bushy 0.003484 0.009191\n", "31 certain 0.003484 0.009191\n", "67 efficient 0.003484 0.009191\n", "73 epic 0.003484 0.009191\n", "86 flat 0.003484 0.009191\n", "111 heavy 0.003484 0.009191\n", "126 key 0.003484 0.009191\n", "148 modern 0.003484 0.009191\n", "151 natural 0.003484 0.009191\n", "161 organic 0.003484 0.009191\n", "171 pic 0.003484 0.009191\n", "172 pleasant 0.003484 0.009191\n", "186 ridiculous 0.003484 0.009191\n", "190 russian 0.003484 0.009191\n", "193 saltfish 0.003484 0.009191\n", "200 sicilian 0.003484 0.009191\n", "201 similar 0.003484 0.009191\n", "203 sized 0.003484 0.009191\n", "216 stellar 0.003484 0.009191\n", "234 turkish 0.003484 0.009191\n", "236 unbeatable 0.003484 0.009191\n", "238 unique 0.003484 0.009191\n", "5 affordable 0.003484 0.018394\n", "7 asian 0.003484 0.018394\n", "8 attentive 0.003484 0.018394\n", "15 basic 0.003484 0.018394\n", "30 central 0.003484 0.018394\n", "48 creative 0.003484 0.018394\n", ".. ... ... 
...\n", "224 swiss 0.003484 0.027608\n", "230 tiny 0.003484 0.027608\n", "233 true 0.003484 0.027608\n", "252 young 0.003484 0.027608\n", "102 good 0.010453 0.028718\n", "12 awful 0.003484 0.036833\n", "18 beautiful 0.003484 0.036833\n", "47 crazy 0.003484 0.036833\n", "57 dive 0.003484 0.036833\n", "72 entire 0.003484 0.036833\n", "101 goat 0.003484 0.036833\n", "104 greasy 0.003484 0.036833\n", "113 high 0.003484 0.036833\n", "158 olive 0.003484 0.036833\n", "165 overall 0.003484 0.036833\n", "166 own 0.003484 0.036833\n", "202 simple 0.003484 0.036833\n", "211 spanish 0.003484 0.036833\n", "217 straight 0.003484 0.036833\n", "219 such 0.003484 0.036833\n", "232 traditional 0.003484 0.036833\n", "16 bean 0.003484 0.046069\n", "40 close 0.003484 0.046069\n", "55 dirty 0.003484 0.046069\n", "65 easy 0.003484 0.046069\n", "81 fat 0.003484 0.046069\n", "87 flavorful 0.003484 0.046069\n", "183 reasonable 0.003484 0.046069\n", "235 typical 0.003484 0.046069\n", "244 weird 0.003484 0.046069\n", "\n", "[116 rows x 3 columns]\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.992714025501\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "75 expensive 0.006969 0.007326\n", "77 fabulous 0.006969 0.007326\n", "3 addictive 0.003484 0.007326\n", "5 affordable 0.003484 0.007326\n", "6 american 0.003484 0.007326\n", "7 asian 0.003484 0.007326\n", "8 attentive 0.003484 0.007326\n", "10 available 0.003484 0.007326\n", "11 average 0.003484 0.007326\n", "14 baked 0.003484 0.007326\n", "23 bubble 0.003484 0.007326\n", "29 casual 0.003484 0.007326\n", "31 certain 0.003484 0.007326\n", "46 cozy 0.003484 0.007326\n", "55 dirty 0.003484 0.007326\n", "56 dish 0.003484 0.007326\n", "61 dry 0.003484 0.007326\n", "65 easy 0.003484 0.007326\n", "68 eggplant 0.003484 0.007326\n", "69 empty 0.003484 0.007326\n", "73 epic 0.003484 0.007326\n", "86 flat 0.003484 0.007326\n", "87 flavorful 0.003484 0.007326\n", "89 french 0.003484 0.007326\n", "93 full 0.003484 0.007326\n", "94 funny 0.003484 0.007326\n", "97 generous 0.003484 0.007326\n", "99 ginormous 0.003484 0.007326\n", "101 goat 0.003484 0.007326\n", "112 helpful 0.003484 0.007326\n", ".. ... ... 
...\n", "106 green 0.003484 0.029467\n", "132 late 0.003484 0.029467\n", "152 nearby 0.003484 0.029467\n", "157 old 0.003484 0.029467\n", "181 ready 0.003484 0.029467\n", "184 red 0.003484 0.029467\n", "83 few 0.006969 0.036902\n", "242 want 0.006969 0.036902\n", "12 awful 0.003484 0.036902\n", "20 black 0.003484 0.036902\n", "25 busy 0.003484 0.036902\n", "49 cute 0.003484 0.036902\n", "51 decent 0.003484 0.036902\n", "53 delish 0.003484 0.036902\n", "85 fish 0.003484 0.036902\n", "96 general 0.003484 0.036902\n", "182 real 0.003484 0.036902\n", "215 steak 0.003484 0.036902\n", "13 bad 0.006969 0.044365\n", "38 clean 0.003484 0.044365\n", "74 excellent 0.003484 0.044365\n", "80 fast 0.003484 0.044365\n", "100 give 0.003484 0.044365\n", "108 happy 0.003484 0.044365\n", "116 huge 0.003484 0.044365\n", "168 perfect 0.003484 0.044365\n", "185 regular 0.003484 0.044365\n", "205 small 0.003484 0.044365\n", "209 solid 0.003484 0.044365\n", "251 youll 0.003484 0.044365\n", "\n", "[144 rows x 3 columns]\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.991343963554\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "3 addictive 0.003484 0.008728\n", "24 bushy 0.003484 0.008728\n", "29 casual 0.003484 0.008728\n", "67 efficient 0.003484 0.008728\n", "99 ginormous 0.003484 0.008728\n", "148 modern 0.003484 0.008728\n", "161 organic 0.003484 0.008728\n", "164 outstanding 0.003484 0.008728\n", "171 pic 0.003484 0.008728\n", "173 polish 0.003484 0.008728\n", "186 ridiculous 0.003484 0.008728\n", "187 rosemary 0.003484 0.008728\n", "190 russian 0.003484 0.008728\n", "193 saltfish 0.003484 0.008728\n", "200 sicilian 0.003484 0.008728\n", "203 sized 0.003484 0.008728\n", "214 standard 0.003484 0.008728\n", "216 stellar 0.003484 0.008728\n", "238 unique 0.003484 0.008728\n", "10 available 0.003484 0.017463\n", "11 average 0.003484 0.017463\n", "15 basic 0.003484 0.017463\n", "30 central 0.003484 0.017463\n", "31 certain 0.003484 0.017463\n", "48 creative 0.003484 0.017463\n", "73 epic 0.003484 0.017463\n", "78 famous 0.003484 0.017463\n", "86 flat 0.003484 0.017463\n", "121 interesting 0.003484 0.017463\n", "126 key 0.003484 0.017463\n", ".. ... ... 
...\n", "72 entire 0.003484 0.034959\n", "94 funny 0.003484 0.034959\n", "97 generous 0.003484 0.034959\n", "119 indian 0.003484 0.034959\n", "120 inexpensive 0.003484 0.034959\n", "127 kid 0.003484 0.034959\n", "133 later 0.003484 0.034959\n", "139 low 0.003484 0.034959\n", "158 olive 0.003484 0.034959\n", "162 original 0.003484 0.034959\n", "178 public 0.003484 0.034959\n", "180 quiet 0.003484 0.034959\n", "211 spanish 0.003484 0.034959\n", "217 straight 0.003484 0.034959\n", "232 traditional 0.003484 0.034959\n", "243 weak 0.003484 0.034959\n", "252 young 0.003484 0.034959\n", "16 bean 0.003484 0.043719\n", "33 cheesy 0.003484 0.043719\n", "98 giant 0.003484 0.043719\n", "101 goat 0.003484 0.043719\n", "113 high 0.003484 0.043719\n", "118 incredible 0.003484 0.043719\n", "122 irish 0.003484 0.043719\n", "140 magic 0.003484 0.043719\n", "183 reasonable 0.003484 0.043719\n", "195 second 0.003484 0.043719\n", "202 simple 0.003484 0.043719\n", "206 smile 0.003484 0.043719\n", "218 strong 0.003484 0.043719\n", "\n", "[102 rows x 3 columns]\n", "\n", "\n" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "# tips_adj_df.to_pickle('./dumps/tips_complete_features.pkl')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 } ], "metadata": {} } ] }