{ "metadata": { "name": "", "signature": "sha256:0e9113f47ebd4a9ade123a18e34592ab50c13c4007c50ea2010eb2802c6ef853" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "import folium as fm\n", "import geopy\n", "\n", "from sklearn import linear_model, naive_bayes, feature_selection, metrics, tree\n", "from sklearn.cross_validation import train_test_split\n", "\n", "%matplotlib inline" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "tips_with_adjectives_1 = pd.read_pickle('./dumps/tips_with_adjectives.pkl')\n", "adj_dummies = pd.read_pickle('./dumps/adjective_dataframe.pkl')\n", "adj_df = pd.read_csv('./dumps/adjective_count_list.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "print len(adj_dummies)\n", "print len(tips_with_adjectives_1)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "15527\n", "15527\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "adjective_list = list(adj_df[adj_df['count'] > 10]['word'])\n", "\n", "print \"Number of tips: \", len(tips_with_adjectives_1)\n", "print \"Number of adjectives: \", len(adj_df)\n", "print \"Number of significant adjectives (appears in more than 10 tips): \", len(adjective_list)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Number of tips: 15527\n", "Number of adjectives: 1838\n", "Number of significant adjectives (appears in more than 10 tips): 255\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# tips_with_adjectives['address'] = tips_with_adjectives.apply(lambda x: \"{0} {1} {2}, {3}, New York, NY\".format(x['BUILDING'].strip(), x['STREET'].strip(), int(x['ZIPCODE']), x['BORO']), axis=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "latlong_df = pd.read_pickle('./dumps/with_lat_long.pkl')[['foursquare_id', 'lat_long']]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "tips_adj_df = tips_with_adjectives_1.join(adj_dummies)\n", "tips_adj_df.drop_duplicates(['foursquare_id', 'description'], inplace=True)\n", "tips_adj_df = tips_adj_df.merge(latlong_df, on='foursquare_id', how='left')\n", "len(tips_adj_df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "10762" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# desired_columns = [\n", "# 'foursquare_id',\n", "# 'DBA',\n", "# 'description', \n", "# 'tip_words', \n", "# 'tip_adjs', \n", "# 'adj_string', \n", "# 'foursquare_rating',\n", "# 'foursquare_num_of_users',\n", "# 'foursquare_price_tier',\n", "# 'grade_A', \n", "# 'grade_C',\n", "# 'GRADE',\n", "# 'lat_long'\n", "# ]\n", "\n", "# tips_df = tips_with_adjectives[desired_columns]\n", "# len(tips_df)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "def score_and_predict(model, x_features, y_targets, columns, model_type):\n", " score = 
{ "cell_type": "code", "collapsed": false, "input": [ "def score_and_predict(model, x_features, y_targets, columns, model_type):\n", "    # report a fitted model's score and ROC AUC, then list the adjectives\n", "    # whose univariate ANOVA p-values fall below 0.05\n", "    score = model.score(x_features, y_targets)\n", "    y_pred = model.predict(x_features)\n", "    # note: AUC is computed from hard predictions rather than probabilities,\n", "    # so classifiers tend to land near 0.5 here\n", "    auc = metrics.roc_auc_score(y_targets, y_pred)\n", "    \n", "    # f_classif returns a (F-statistics, p-values) pair; keep the p-values\n", "    f_values, p_values = feature_selection.f_classif(x_features, y_targets)\n", "    \n", "    if model_type == 'naive-bayes' or model_type == 'logistic':\n", "        # exponentiate log-space coefficients before display\n", "        coef_list = [np.exp(round(x, 4)) for x in model.coef_[0]]\n", "    elif model_type == 'linear':\n", "        coef_list = [round(x, 4) for x in model.coef_]\n", "    \n", "    df_dict = {'adjective': columns, 'p-value': p_values, 'coef': coef_list}\n", "    model_df = pd.DataFrame(df_dict)\n", "    model_df.sort(['p-value', 'coef'], ascending=[1, 0], inplace=True)\n", "    \n", "    print 'MODEL: ', model\n", "    print 'SCORE: ', score\n", "    print 'AUC: ', auc\n", "    print '\\n'\n", "    \n", "    print 'TOP PREDICTORS (p-value < 0.05):'\n", "    print model_df[model_df['p-value'] <= 0.05]\n", "    print '\\n'\n", "    \n", "    # return model_df" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# for index, column in enumerate(tips_adj_df.columns.values):\n", "# print column, index" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# adjective dummy columns run from column 34 through the second-to-last column;\n", "# they mark which adjectives appear in each tip's description\n", "X_adjs = tips_adj_df.ix[:, 34:-1]\n", "\n", "# based on ratings, number of users, and price tier\n", "X_foursquare_info = tips_adj_df[['foursquare_rating', 'foursquare_num_of_users', 'foursquare_price_tier']].dropna(axis=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting Grade \"A\" restaurants" ] }, { "cell_type": "code", "collapsed": false, "input": [ "y = tips_adj_df['grade_A']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_multi_nb = naive_bayes.MultinomialNB()\n", "clf_multi_nb = clf_multi_nb.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.901251393879\n", "AUC: 0.502431233668\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "163 other 0.005634 0.000050\n", "26 cant 0.009299 0.000081\n", "73 epic 0.000679 0.000087\n", "210 spacious 0.000679 0.000087\n", "250 yellow 0.000679 0.000087\n", "87 flavorful 0.001290 0.000174\n", "5 affordable 0.001901 0.000261\n", "96 general 0.002579 0.001122\n", "209 solid 0.001968 0.001918\n", "122 irish 0.001358 0.003553\n", "215 steak 0.006109 0.004008\n", "95 garlic 0.004277 0.004226\n", "46 cozy 0.001833 0.006104\n", "115 hot 0.012015 
0.006348\n", "227 terrible 0.002647 0.007124\n", "158 olive 0.000747 0.008315\n", "237 unbelievable 0.000747 0.008315\n", "38 clean 0.004548 0.008604\n", "138 long 0.005430 0.009899\n", "84 first 0.004209 0.011209\n", "56 dish 0.002987 0.013767\n", "140 magic 0.000611 0.014504\n", "151 natural 0.000611 0.014504\n", "191 sad 0.000611 0.014504\n", "216 stellar 0.000611 0.014504\n", "119 indian 0.001765 0.020247\n", "157 old 0.004005 0.020831\n", "223 sweet 0.007738 0.023556\n", "118 incredible 0.002104 0.025013\n", "169 personal 0.001154 0.029040\n", "170 phenomenal 0.001154 0.029040\n", "228 terrific 0.001154 0.029040\n", "107 grilled 0.005974 0.031371\n", "81 fat 0.000815 0.033534\n", "127 kid 0.000815 0.033534\n", "162 original 0.000815 0.033534\n", "178 public 0.000815 0.033534\n", "239 usual 0.000815 0.033534\n", "54 different 0.002308 0.033618\n", "82 favorite 0.006652 0.038926\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.888888888889\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "83 few 0.003665 0.000000\n", "56 dish 0.002987 0.000000\n", "14 baked 0.002715 0.000000\n", "93 full 0.002579 0.000000\n", "120 inexpensive 0.001290 0.000000\n", "35 chinese 0.006517 0.002318\n", "179 quick 0.003462 0.004851\n", "130 large 0.003937 0.006621\n", "114 horrible 0.003326 0.006621\n", "154 next 0.003326 0.006621\n", "124 ive 0.008349 0.009112\n", "113 high 0.002715 0.012537\n", "118 incredible 0.002104 0.012537\n", "68 eggplant 0.001833 0.012537\n", "146 mediocre 0.001222 0.012537\n", "149 much 0.008077 0.014883\n", "10 available 0.002104 0.015660\n", "47 crazy 0.001968 0.015660\n", "199 short 0.001901 0.015660\n", "25 busy 0.001629 0.015660\n", "104 greasy 0.001154 0.015660\n", "133 later 0.001154 0.015660\n", "163 other 0.005634 0.020173\n", "116 huge 0.004141 0.025169\n", "254 yummy 0.006992 0.040728\n", "131 last 0.002579 0.045608\n", "54 different 0.002308 0.045608\n", "23 bubble 0.002240 0.045608\n", "5 affordable 0.001901 0.045608\n", "51 decent 0.005023 0.049499\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.898160193273\n", "AUC: 0.50176809245\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "230 tiny 0.001968 0.000085\n", "151 natural 0.000611 0.000490\n", "3 addictive 0.000543 0.000490\n", "228 terrific 0.001154 0.000981\n", "238 unique 0.001086 0.000981\n", "169 personal 0.001154 0.002052\n", "54 different 0.002308 0.002205\n", "163 other 0.005634 0.002261\n", "90 fresh 0.014595 0.002581\n", "13 bad 0.008077 0.003905\n", "66 eat 0.009299 0.007615\n", "29 casual 0.000407 0.007943\n", "224 swiss 0.000407 0.007943\n", "9 authentic 0.003190 0.008253\n", "180 quiet 0.001968 0.009322\n", "5 affordable 0.001901 0.009322\n", "40 close 0.001901 0.009322\n", "58 dont 0.025929 0.009461\n", "182 real 0.003734 0.010006\n", "95 garlic 0.004277 0.010360\n", "1 accept 0.001154 0.010862\n", "30 central 0.001154 0.010862\n", "56 dish 0.002987 0.011253\n", "140 magic 0.000611 0.015087\n", "191 sad 0.000611 0.015087\n", "204 slow 0.007195 0.016624\n", "157 old 0.004005 0.018804\n", "79 fantastic 0.004887 0.020690\n", "38 clean 0.004548 0.023304\n", "119 indian 0.001765 0.024510\n", "202 simple 0.001697 0.024510\n", "20 black 0.002851 0.029148\n", "57 
dive 0.001018 0.030205\n", "143 many 0.003055 0.032713\n", "130 large 0.003937 0.036375\n", "251 youll 0.003937 0.036375\n", "26 cant 0.009299 0.037648\n", "118 incredible 0.002104 0.038944\n", "110 healthy 0.002308 0.039847\n", "209 solid 0.001968 0.039847\n", "45 cool 0.004277 0.043770\n", "234 turkish 0.000815 0.044770\n", "193 saltfish 0.000475 0.045706\n", "178 public 0.000815 0.046227\n", "210 spacious 0.000679 0.046227\n", "15 basic 0.000475 0.046227\n", "\n", "\n" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_linear = linear_model.LinearRegression()\n", "clf_linear = clf_linear.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_linear, X_adjs.values, y.values, X_adjs.columns, 'linear')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0306220058522\n", "AUC: 0.65453453789\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "163 other -0.0005 0.000050\n", "26 cant 0.0110 0.000081\n", "73 epic 0.0196 0.000087\n", "250 yellow 0.0143 0.000087\n", "210 spacious 0.0123 0.000087\n", "87 flavorful 0.0218 0.000174\n", "5 affordable 0.0061 0.000261\n", "96 general 0.0292 0.001122\n", "209 solid 0.0167 0.001918\n", "122 irish -0.0082 0.003553\n", "215 steak -0.0008 0.004008\n", "95 garlic -0.0077 0.004226\n", "46 cozy -0.0200 0.006104\n", "115 hot 0.0042 0.006348\n", "227 terrible -0.0021 0.007124\n", "158 olive 0.0464 0.008315\n", "237 unbelievable 0.0079 0.008315\n", "38 clean 0.0037 0.008604\n", "138 long -0.0036 0.009899\n", "84 first -0.0020 0.011209\n", "56 dish 0.0091 0.013767\n", "151 natural 0.0361 0.014504\n", "191 sad 0.0085 0.014504\n", "216 stellar -0.0123 0.014504\n", "140 magic -0.0354 0.014504\n", "119 indian -0.0069 0.020247\n", "157 old -0.0030 0.020831\n", "223 sweet 0.0028 0.023556\n", "118 incredible 0.0127 0.025013\n", "228 terrific 0.0222 0.029040\n", "169 personal -0.0176 0.029040\n", "170 phenomenal -0.0186 0.029040\n", "107 grilled 0.0041 0.031371\n", "239 usual 0.0336 0.033534\n", "127 kid 0.0176 0.033534\n", "162 original 0.0168 0.033534\n", "81 fat 0.0163 0.033534\n", "178 public 0.0145 0.033534\n", "54 different -0.0040 0.033618\n", "82 favorite 0.0016 0.038926\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: -0.0270734866002\n", "AUC: 0.514883782061\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "120 inexpensive 0.1140 0.000000\n", "93 full 0.0438 0.000000\n", "14 baked 0.0327 0.000000\n", "83 few 0.0294 0.000000\n", "56 dish 0.0091 0.000000\n", "35 chinese -0.0829 0.002318\n", "179 quick -0.0168 0.004851\n", "154 next 0.0210 0.006621\n", "130 large -0.0158 0.006621\n", "114 horrible -0.0324 0.006621\n", "124 ive -0.0264 0.009112\n", "113 high 0.0792 0.012537\n", "118 incredible 0.0127 0.012537\n", "68 eggplant -0.0432 0.012537\n", "146 mediocre -0.1384 0.012537\n", "149 much 0.0396 0.014883\n", "199 short 0.1068 
0.015660\n", "47 crazy 0.0316 0.015660\n", "25 busy -0.0378 0.015660\n", "104 greasy -0.0779 0.015660\n", "10 available -0.0937 0.015660\n", "133 later -0.1298 0.015660\n", "163 other -0.0005 0.020173\n", "116 huge -0.0639 0.025169\n", "254 yummy 0.0469 0.040728\n", "23 bubble 0.0391 0.045608\n", "5 affordable 0.0061 0.045608\n", "54 different -0.0040 0.045608\n", "131 last -0.0163 0.045608\n", "51 decent 0.0553 0.049499\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0153673116839\n", "AUC: 0.616453415498\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "230 tiny -0.0225 0.000085\n", "3 addictive 0.0950 0.000490\n", "151 natural 0.0361 0.000490\n", "228 terrific 0.0222 0.000981\n", "238 unique -0.0312 0.000981\n", "169 personal -0.0176 0.002052\n", "54 different -0.0040 0.002205\n", "163 other -0.0005 0.002261\n", "90 fresh 0.0131 0.002581\n", "13 bad -0.0062 0.003905\n", "66 eat -0.0031 0.007615\n", "224 swiss 0.0856 0.007943\n", "29 casual -0.0786 0.007943\n", "9 authentic 0.0092 0.008253\n", "180 quiet 0.0412 0.009322\n", "5 affordable 0.0061 0.009322\n", "40 close -0.0286 0.009322\n", "58 dont 0.0163 0.009461\n", "182 real 0.0094 0.010006\n", "95 garlic -0.0077 0.010360\n", "1 accept 0.0405 0.010862\n", "30 central 0.0343 0.010862\n", "56 dish 0.0091 0.011253\n", "191 sad 0.0085 0.015087\n", "140 magic -0.0354 0.015087\n", "204 slow -0.0116 0.016624\n", "157 old -0.0030 0.018804\n", "79 fantastic -0.0188 0.020690\n", "38 clean 0.0037 0.023304\n", "202 simple 0.0226 0.024510\n", "119 indian -0.0069 0.024510\n", "20 black 0.0394 0.029148\n", "57 dive -0.0270 0.030205\n", "143 many 0.0036 0.032713\n", "130 large -0.0158 0.036375\n", "251 youll -0.0307 0.036375\n", "26 cant 0.0110 0.037648\n", "118 incredible 0.0127 0.038944\n", "110 healthy 0.0384 0.039847\n", "209 solid 0.0167 0.039847\n", "45 cool 0.0238 0.043770\n", "234 turkish -0.0607 0.044770\n", "193 saltfish -0.0485 0.045706\n", "15 basic 0.1099 0.046227\n", "178 public 0.0145 0.046227\n", "210 spacious 0.0123 0.046227\n", "\n", "\n" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_logistic = linear_model.LogisticRegression()\n", "clf_logistic.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.901003593111\n", "AUC: 0.500625\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "163 other 1.057386 0.000050\n", "26 cant 1.184831 0.000081\n", "73 epic 1.098779 0.000087\n", "250 yellow 1.097571 0.000087\n", "210 spacious 1.052954 0.000087\n", "87 flavorful 1.143622 0.000174\n", "5 affordable 1.044564 0.000261\n", "96 general 1.248696 0.001122\n", "209 solid 1.152807 0.001918\n", "122 irish 0.962232 
0.003553\n", "215 steak 0.986887 0.004008\n", "95 garlic 0.938474 0.004226\n", "46 cozy 0.893776 0.006104\n", "115 hot 1.031176 0.006348\n", "227 terrible 1.015316 0.007124\n", "158 olive 1.137463 0.008315\n", "237 unbelievable 1.029219 0.008315\n", "38 clean 1.051376 0.008604\n", "138 long 0.981474 0.009899\n", "84 first 0.957337 0.011209\n", "56 dish 1.086107 0.013767\n", "151 natural 1.067479 0.014504\n", "191 sad 1.002704 0.014504\n", "216 stellar 0.941576 0.014504\n", "140 magic 0.860020 0.014504\n", "119 indian 0.969282 0.020247\n", "157 old 0.971514 0.020831\n", "223 sweet 1.015316 0.023556\n", "118 incredible 1.085239 0.025013\n", "228 terrific 1.048332 0.029040\n", "169 personal 0.903481 0.029040\n", "170 phenomenal 0.886300 0.029040\n", "107 grilled 1.022244 0.031371\n", "239 usual 1.144651 0.033534\n", "81 fat 1.101199 0.033534\n", "162 original 1.086650 0.033534\n", "127 kid 1.086107 0.033534\n", "178 public 1.072186 0.033534\n", "54 different 0.943933 0.033618\n", "82 favorite 1.031589 0.038926\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.888888888889\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "120 inexpensive 2.360091 0.000000\n", "93 full 1.620282 0.000000\n", "83 few 1.384307 0.000000\n", "14 baked 1.355134 0.000000\n", "56 dish 1.086107 0.000000\n", "35 chinese 0.505807 0.002318\n", "179 quick 0.853679 0.004851\n", "154 next 1.204061 0.006621\n", "130 large 0.860708 0.006621\n", "114 horrible 0.753294 0.006621\n", "124 ive 0.785920 0.009112\n", "113 high 2.387866 0.012537\n", "118 incredible 1.085239 0.012537\n", "68 eggplant 0.727894 0.012537\n", "146 mediocre 0.431538 0.012537\n", "149 much 1.625800 0.014883\n", "199 short 2.895622 0.015660\n", "47 crazy 1.344874 0.015660\n", "25 busy 0.810098 0.015660\n", "104 greasy 0.628449 0.015660\n", "10 available 0.507987 0.015660\n", "133 later 0.476589 0.015660\n", "163 other 1.057386 0.020173\n", "116 huge 0.606652 0.025169\n", "254 yummy 1.751548 0.040728\n", "23 bubble 1.441234 0.045608\n", "5 affordable 1.044564 0.045608\n", "54 different 0.943933 0.045608\n", "131 last 0.852740 0.045608\n", "51 decent 1.715321 0.049499\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.897974354209\n", "AUC: 0.500454959054\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "230 tiny 0.858216 0.000085\n", "3 addictive 1.555816 0.000490\n", "151 natural 1.067479 0.000490\n", "228 terrific 1.048332 0.000981\n", "238 unique 0.815136 0.000981\n", "169 personal 0.903481 0.002052\n", "54 different 0.943933 0.002205\n", "163 other 1.057386 0.002261\n", "90 fresh 1.188509 0.002581\n", "13 bad 0.933607 0.003905\n", "66 eat 0.951800 0.007615\n", "224 swiss 1.352156 0.007943\n", "29 casual 0.736460 0.007943\n", "9 authentic 1.049171 0.008253\n", "180 quiet 1.439218 0.009322\n", "5 affordable 1.044564 0.009322\n", "40 close 0.794613 0.009322\n", "58 dont 1.234542 0.009461\n", "182 real 1.113268 0.010006\n", "95 garlic 0.938474 0.010360\n", "1 accept 1.302128 0.010862\n", "30 central 1.269979 0.010862\n", "56 dish 1.086107 
0.011253\n", "191 sad 1.002704 0.015087\n", "140 magic 0.860020 0.015087\n", "204 slow 0.896999 0.016624\n", "157 old 0.971514 0.018804\n", "79 fantastic 0.846623 0.020690\n", "38 clean 1.051376 0.023304\n", "202 simple 1.174568 0.024510\n", "119 indian 0.969282 0.024510\n", "20 black 1.402000 0.029148\n", "57 dive 0.859590 0.030205\n", "143 many 1.043938 0.032713\n", "130 large 0.860708 0.036375\n", "251 youll 0.745575 0.036375\n", "26 cant 1.184831 0.037648\n", "118 incredible 1.085239 0.038944\n", "110 healthy 1.474177 0.039847\n", "209 solid 1.152807 0.039847\n", "45 cool 1.258222 0.043770\n", "234 turkish 0.737713 0.044770\n", "193 saltfish 0.823576 0.045706\n", "15 basic 1.494662 0.046227\n", "178 public 1.072186 0.046227\n", "210 spacious 1.052954 0.046227\n", "\n", "\n" ] } ], "prompt_number": 19 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting Grade \"C\" restaurants" ] }, { "cell_type": "code", "collapsed": false, "input": [ "y = tips_adj_df['grade_C']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_multi_nb = naive_bayes.MultinomialNB()\n", "clf_multi_nb = clf_multi_nb.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.987981662743\n", "AUC: 0.499937304075\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "53 delish 0.004338 0.000169\n", "74 excellent 0.006507 0.000341\n", "52 delicious 0.015184 0.001837\n", "106 green 0.004338 0.003959\n", "249 wrong 0.004338 0.005746\n", "22 breakfast 0.006507 0.006458\n", "0 20500daily 0.002169 0.012036\n", "24 bushy 0.002169 0.012036\n", "117 iconic 0.002169 0.012036\n", "176 priceless 0.002169 0.012036\n", "253 yous 0.002169 0.012036\n", "153 new 0.008677 0.017581\n", "107 grilled 0.004338 0.018048\n", "102 good 0.036876 0.023534\n", "171 pic 0.002169 0.024075\n", "192 salad 0.008677 0.029159\n", "19 big 0.004338 0.034574\n", "67 efficient 0.002169 0.036117\n", "99 ginormous 0.002169 0.036117\n", "196 separate 0.002169 0.036117\n", "203 sized 0.002169 0.036117\n", "221 superior 0.002169 0.036117\n", "84 first 0.004338 0.046109\n", "188 royal 0.002169 0.048163\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.990338164251\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "43 complimentary 0.004338 0.009753\n", "190 russian 0.004338 0.009753\n", "44 congested 0.002169 0.009753\n", "48 creative 0.002169 0.009753\n", "67 efficient 0.002169 0.009753\n", "69 empty 0.002169 0.009753\n", "117 iconic 
0.002169 0.009753\n", "151 natural 0.002169 0.009753\n", "169 personal 0.002169 0.009753\n", "171 pic 0.002169 0.009753\n", "188 royal 0.002169 0.009753\n", "193 saltfish 0.002169 0.009753\n", "196 separate 0.002169 0.009753\n", "198 several 0.002169 0.009753\n", "236 unbeatable 0.002169 0.009753\n", "243 weak 0.002169 0.009753\n", "161 organic 0.004338 0.019512\n", "210 spacious 0.004338 0.019512\n", "228 terrific 0.004338 0.019512\n", "237 unbelievable 0.004338 0.019512\n", "2 actual 0.002169 0.019512\n", "31 certain 0.002169 0.019512\n", "39 clear 0.002169 0.019512\n", "103 gorgeous 0.002169 0.019512\n", "140 magic 0.002169 0.019512\n", "144 massive 0.002169 0.019512\n", "187 rosemary 0.002169 0.019512\n", "191 sad 0.002169 0.019512\n", "201 similar 0.002169 0.019512\n", "203 sized 0.002169 0.019512\n", ".. ... ... ...\n", "63 earth 0.002169 0.039054\n", "119 indian 0.002169 0.039054\n", "127 kid 0.002169 0.039054\n", "145 mean 0.002169 0.039054\n", "152 nearby 0.002169 0.039054\n", "174 poor 0.002169 0.039054\n", "207 social 0.002169 0.039054\n", "214 standard 0.002169 0.039054\n", "224 swiss 0.002169 0.039054\n", "155 nice 0.013016 0.041094\n", "64 east 0.004338 0.048836\n", "16 bean 0.002169 0.048836\n", "70 english 0.002169 0.048836\n", "72 entire 0.002169 0.048836\n", "73 epic 0.002169 0.048836\n", "112 helpful 0.002169 0.048836\n", "121 interesting 0.002169 0.048836\n", "129 korean 0.002169 0.048836\n", "134 light 0.002169 0.048836\n", "139 low 0.002169 0.048836\n", "148 modern 0.002169 0.048836\n", "156 normal 0.002169 0.048836\n", "177 private 0.002169 0.048836\n", "186 ridiculous 0.002169 0.048836\n", "197 serious 0.002169 0.048836\n", "200 sicilian 0.002169 0.048836\n", "216 stellar 0.002169 0.048836\n", "218 strong 0.002169 0.048836\n", "235 typical 0.002169 0.048836\n", "244 weird 0.002169 0.048836\n", "\n", "[83 rows x 3 columns]\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.988570897603\n", "AUC: 0.499953007519\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "130 large 0.004338 0.003783\n", "251 youll 0.004338 0.003783\n", "84 first 0.004338 0.004120\n", "157 old 0.004338 0.004120\n", "41 cold 0.002169 0.007306\n", "123 italian 0.004338 0.011308\n", "0 20500daily 0.002169 0.011465\n", "24 bushy 0.002169 0.011465\n", "44 congested 0.002169 0.011465\n", "176 priceless 0.002169 0.011465\n", "253 yous 0.002169 0.011465\n", "13 bad 0.006507 0.022223\n", "150 music 0.004338 0.022223\n", "117 iconic 0.002169 0.022932\n", "53 delish 0.004338 0.024571\n", "106 green 0.004338 0.031070\n", "114 horrible 0.004338 0.031512\n", "99 ginormous 0.002169 0.034402\n", "171 pic 0.002169 0.034402\n", "221 superior 0.002169 0.034402\n", "102 good 0.036876 0.036221\n", "192 salad 0.008677 0.036831\n", "204 slow 0.004338 0.045644\n", "67 efficient 0.002169 0.045873\n", "196 separate 0.002169 0.045873\n", "154 next 0.004338 0.048159\n", "\n", "\n" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_linear = linear_model.LinearRegression()\n", "clf_linear = clf_linear.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_linear, X_train, y_train, X_adjs.columns, 'linear')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_linear, X_test, y_test, X_adjs.columns, 'linear')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_linear, 
X_adjs.values, y.values, X_adjs.columns, 'linear')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0265341264606\n", "AUC: 0.813025731452\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "53 delish 0.0025 0.000169\n", "74 excellent 0.0006 0.000341\n", "52 delicious 0.0004 0.001837\n", "106 green 0.0031 0.003959\n", "249 wrong 0.0009 0.005746\n", "22 breakfast -0.0010 0.006458\n", "117 iconic 0.0080 0.012036\n", "24 bushy -0.0028 0.012036\n", "0 20500daily -0.0091 0.012036\n", "176 priceless -0.0091 0.012036\n", "253 yous -0.0103 0.012036\n", "153 new 0.0012 0.017581\n", "107 grilled 0.0003 0.018048\n", "102 good 0.0016 0.023534\n", "171 pic -0.0208 0.024075\n", "192 salad -0.0021 0.029159\n", "19 big 0.0016 0.034574\n", "203 sized 0.0032 0.036117\n", "196 separate 0.0028 0.036117\n", "221 superior -0.0084 0.036117\n", "67 efficient -0.0124 0.036117\n", "99 ginormous -0.0202 0.036117\n", "84 first 0.0046 0.046109\n", "188 royal -0.0046 0.048163\n", "\n", "\n", "Using testing set\n", "MODEL: LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: -0.0391889077113\n", "AUC: 0.464410448838\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "190 russian 0.0802 0.009753\n", "43 complimentary 0.0609 0.009753\n", "117 iconic 0.0080 0.009753\n", "196 separate 0.0028 0.009753\n", "44 congested 0.0000 0.009753\n", "198 several -0.0005 0.009753\n", "188 royal -0.0046 0.009753\n", "236 unbeatable -0.0051 0.009753\n", "193 saltfish -0.0058 0.009753\n", "243 weak -0.0068 0.009753\n", "69 empty -0.0101 0.009753\n", "151 natural -0.0121 0.009753\n", "67 efficient -0.0124 0.009753\n", "48 creative -0.0159 0.009753\n", "169 personal -0.0189 0.009753\n", "171 pic -0.0208 0.009753\n", "210 spacious 0.0875 0.019512\n", "237 unbelievable 0.0850 0.019512\n", "161 organic 0.0622 0.019512\n", "228 terrific 0.0401 0.019512\n", "203 sized 0.0032 0.019512\n", "140 magic -0.0048 0.019512\n", "252 young -0.0055 0.019512\n", "31 certain -0.0061 0.019512\n", "103 gorgeous -0.0062 0.019512\n", "229 thin -0.0064 0.019512\n", "2 actual -0.0072 0.019512\n", "191 sad -0.0095 0.019512\n", "39 clear -0.0099 0.019512\n", "239 usual -0.0128 0.019512\n", ".. ... ... 
...\n", "207 social -0.0083 0.039054\n", "30 central -0.0092 0.039054\n", "174 poor -0.0092 0.039054\n", "119 indian -0.0101 0.039054\n", "145 mean -0.0110 0.039054\n", "152 nearby -0.0124 0.039054\n", "28 caribbean -0.0149 0.039054\n", "63 earth -0.0177 0.039054\n", "214 standard -0.0345 0.039054\n", "155 nice 0.0044 0.041094\n", "64 east 0.0230 0.048836\n", "139 low -0.0045 0.048836\n", "72 entire -0.0055 0.048836\n", "218 strong -0.0068 0.048836\n", "148 modern -0.0078 0.048836\n", "235 typical -0.0092 0.048836\n", "70 english -0.0093 0.048836\n", "156 normal -0.0102 0.048836\n", "200 sicilian -0.0102 0.048836\n", "197 serious -0.0103 0.048836\n", "177 private -0.0112 0.048836\n", "134 light -0.0129 0.048836\n", "73 epic -0.0130 0.048836\n", "186 ridiculous -0.0146 0.048836\n", "129 korean -0.0153 0.048836\n", "121 interesting -0.0164 0.048836\n", "244 weird -0.0174 0.048836\n", "112 helpful -0.0176 0.048836\n", "16 bean -0.0188 0.048836\n", "216 stellar -0.0217 0.048836\n", "\n", "[83 rows x 3 columns]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LinearRegression(copy_X=True, fit_intercept=True, normalize=False)\n", "SCORE: 0.0125850437395\n", "AUC: 0.738506486503\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "130 large 0.0080 0.003783\n", "251 youll 0.0069 0.003783\n", "157 old 0.0081 0.004120\n", "84 first 0.0046 0.004120\n", "41 cold -0.0127 0.007306\n", "123 italian 0.0008 0.011308\n", "44 congested 0.0000 0.011465\n", "24 bushy -0.0028 0.011465\n", "0 20500daily -0.0091 0.011465\n", "176 priceless -0.0091 0.011465\n", "253 yous -0.0103 0.011465\n", "13 bad 0.0046 0.022223\n", "150 music -0.0060 0.022223\n", "117 iconic 0.0080 0.022932\n", "53 delish 0.0025 0.024571\n", "106 green 0.0031 0.031070\n", "114 horrible 0.0104 0.031512\n", "221 superior -0.0084 0.034402\n", "99 ginormous -0.0202 0.034402\n", "171 pic -0.0208 0.034402\n", "102 good 0.0016 0.036221\n", "192 salad -0.0021 0.036831\n", "204 slow -0.0001 0.045644\n", "196 separate 0.0028 0.045873\n", "67 efficient -0.0124 0.045873\n", "154 next 0.0127 0.048159\n", "\n", "\n" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_logistic = linear_model.LogisticRegression()\n", "clf_logistic = clf_logistic.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_logistic, X_train, y_train, X_adjs.columns, 'logistic')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_logistic, X_test, y_test, X_adjs.columns, 'logistic')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_logistic, X_adjs.values, y.values, X_adjs.columns, 'logistic')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.988105563127\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "53 delish 1.041227 0.000169\n", "74 excellent 0.962809 0.000341\n", "52 delicious 0.975505 0.001837\n", "106 green 1.059291 0.003959\n", "249 wrong 0.952657 0.005746\n", "22 breakfast 0.981376 0.006458\n", "24 bushy 0.992131 0.012036\n", "117 iconic 0.990644 0.012036\n", "0 
20500daily 0.989060 0.012036\n", "176 priceless 0.989060 0.012036\n", "253 yous 0.988665 0.012036\n", "153 new 1.037486 0.017581\n", "107 grilled 1.029425 0.018048\n", "102 good 1.020814 0.023534\n", "171 pic 0.962424 0.024075\n", "192 salad 0.970543 0.029159\n", "19 big 0.932674 0.034574\n", "203 sized 0.980787 0.036117\n", "196 separate 0.975017 0.036117\n", "221 superior 0.968507 0.036117\n", "67 efficient 0.962713 0.036117\n", "99 ginormous 0.961655 0.036117\n", "84 first 1.126257 0.046109\n", "188 royal 0.962617 0.048163\n", "\n", "\n", "Using testing set\n", "MODEL: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.990338164251\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "43 complimentary 1.939247 0.009753\n", "190 russian 1.936341 0.009753\n", "44 congested 1.000000 0.009753\n", "117 iconic 0.990644 0.009753\n", "196 separate 0.975017 0.009753\n", "67 efficient 0.962713 0.009753\n", "188 royal 0.962617 0.009753\n", "171 pic 0.962424 0.009753\n", "198 several 0.942895 0.009753\n", "193 saltfish 0.939507 0.009753\n", "236 unbeatable 0.934354 0.009753\n", "243 weak 0.933233 0.009753\n", "151 natural 0.912926 0.009753\n", "69 empty 0.881615 0.009753\n", "48 creative 0.845354 0.009753\n", "169 personal 0.800675 0.009753\n", "237 unbelievable 2.080275 0.019512\n", "210 spacious 2.074873 0.019512\n", "161 organic 2.036230 0.019512\n", "228 terrific 1.697913 0.019512\n", "203 sized 0.980787 0.019512\n", "144 massive 0.949424 0.019512\n", "31 certain 0.945917 0.019512\n", "201 similar 0.939037 0.019512\n", "103 gorgeous 0.937536 0.019512\n", "2 actual 0.925334 0.019512\n", "187 rosemary 0.920351 0.019512\n", "191 sad 0.920351 0.019512\n", "140 magic 0.915395 0.019512\n", "39 clear 0.907012 0.019512\n", ".. ... ... 
...\n", "28 caribbean 0.918696 0.039054\n", "127 kid 0.888607 0.039054\n", "63 earth 0.877306 0.039054\n", "174 poor 0.857358 0.039054\n", "152 nearby 0.850952 0.039054\n", "30 central 0.842653 0.039054\n", "145 mean 0.819058 0.039054\n", "214 standard 0.796602 0.039054\n", "119 indian 0.760332 0.039054\n", "155 nice 1.240234 0.041094\n", "64 east 1.565492 0.048836\n", "72 entire 0.947337 0.048836\n", "156 normal 0.939695 0.048836\n", "177 private 0.916769 0.048836\n", "186 ridiculous 0.904204 0.048836\n", "73 epic 0.903662 0.048836\n", "148 modern 0.894849 0.048836\n", "216 stellar 0.888963 0.048836\n", "235 typical 0.882938 0.048836\n", "197 serious 0.873366 0.048836\n", "200 sicilian 0.871534 0.048836\n", "139 low 0.857443 0.048836\n", "244 weird 0.855474 0.048836\n", "129 korean 0.825472 0.048836\n", "121 interesting 0.819468 0.048836\n", "70 english 0.816850 0.048836\n", "218 strong 0.816115 0.048836\n", "112 helpful 0.786707 0.048836\n", "16 bean 0.784036 0.048836\n", "134 light 0.686053 0.048836\n", "\n", "[83 rows x 3 columns]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)\n", "SCORE: 0.988663817134\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "130 large 1.219938 0.003783\n", "251 youll 1.153268 0.003783\n", "157 old 1.202016 0.004120\n", "84 first 1.126257 0.004120\n", "41 cold 0.602480 0.007306\n", "123 italian 1.073367 0.011308\n", "44 congested 1.000000 0.011465\n", "24 bushy 0.992131 0.011465\n", "0 20500daily 0.989060 0.011465\n", "176 priceless 0.989060 0.011465\n", "253 yous 0.988665 0.011465\n", "13 bad 1.179039 0.022223\n", "150 music 0.842400 0.022223\n", "117 iconic 0.990644 0.022932\n", "53 delish 1.041227 0.024571\n", "106 green 1.059291 0.031070\n", "114 horrible 1.266301 0.031512\n", "221 superior 0.968507 0.034402\n", "171 pic 0.962424 0.034402\n", "99 ginormous 0.961655 0.034402\n", "102 good 1.020814 0.036221\n", "192 salad 0.970543 0.036831\n", "204 slow 0.879502 0.045644\n", "196 separate 0.975017 0.045873\n", "67 efficient 0.962713 0.045873\n", "154 next 1.290978 0.048159\n", "\n", "\n" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)\n", "\n", "print 'Using training set'\n", "clf_tree = clf_tree.fit(X_train, y_train)\n", "score = clf_tree.score(X_train, y_train)\n", "y_pred = clf_tree.predict(X_train)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_train, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_train, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_train, y_pred), \"\\n\"\n", "\n", "print 'Using testing set'\n", "score = clf_tree.score(X_test, y_test)\n", "y_pred = clf_tree.predict(X_test)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_test, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_test, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_test, y_pred), \"\\n\"\n", "\n", "print 'Using all data'\n", "score = clf_tree.score(X_adjs.values, y.values)\n", "y_pred = 
{ "cell_type": "code", "collapsed": false, "input": [ "clf_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)\n", "\n", "print 'Using training set'\n", "clf_tree = clf_tree.fit(X_train, y_train)\n", "score = clf_tree.score(X_train, y_train)\n", "y_pred = clf_tree.predict(X_train)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_train, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_train, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_train, y_pred), \"\\n\"\n", "\n", "print 'Using testing set'\n", "score = clf_tree.score(X_test, y_test)\n", "y_pred = clf_tree.predict(X_test)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y_test, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y_test, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y_test, y_pred), \"\\n\"\n", "\n", "print 'Using all data'\n", "score = clf_tree.score(X_adjs.values, y.values)\n", "y_pred = clf_tree.predict(X_adjs.values)\n", "\n", "print \"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y.values, y_pred)), \"\\n\"\n", "print \"Classification report\"\n", "print metrics.classification_report(y.values, y_pred), \"\\n\"\n", " \n", "print \"Confusion matrix\"\n", "print metrics.confusion_matrix(y.values, y_pred), \"\\n\"\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "Accuracy:0.991 \n", "\n", "Classification report\n", " precision recall f1-score support\n", "\n", " 0.0 0.99 1.00 1.00 1631\n", " 1.0 0.00 0.00 0.00 15\n", "\n", "avg / total 0.98 0.99 0.99 1646\n", "\n", "\n", "Confusion matrix\n", "[[1631 0]\n", " [ 15 0]] \n", "\n", "Using testing set\n", "Accuracy:0.993 \n", "\n", "Classification report\n", " precision recall f1-score support\n", "\n", " 0.0 0.99 1.00 1.00 545\n", " 1.0 0.00 0.00 0.00 4\n", "\n", "avg / total 0.99 0.99 0.99 549\n", "\n", "\n", "Confusion matrix\n", "[[545 0]\n", " [ 4 0]] \n", "\n", "Using all data\n", "Accuracy:0.991 \n", "\n", "Classification report\n", " precision recall f1-score support\n", "\n", " 0.0 0.99 1.00 1.00 2176\n", " 1.0 0.00 0.00 0.00 19\n", "\n", "avg / total 0.98 0.99 0.99 2195\n", "\n", "\n", "Confusion matrix\n", "[[2176 0]\n", " [ 19 0]] \n", "\n" ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "# restrict to price tier 1 (cheapest) restaurants and refit the NB model\n", "y = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]['grade_C']\n", "cheap_df = tips_adj_df[tips_adj_df['foursquare_price_tier'] == 1]\n", "X_adjs = cheap_df.ix[:, 34:-1]\n", "X_train, X_test, y_train, y_test = train_test_split(X_adjs, y, random_state=8)\n", "\n", "clf_multi_nb = naive_bayes.MultinomialNB()\n", "clf_multi_nb.fit(X_train, y_train)\n", "\n", "print 'Using training set'\n", "score_and_predict(clf_multi_nb, X_train, y_train, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using testing set'\n", "score_and_predict(clf_multi_nb, X_test, y_test, X_adjs.columns, 'naive-bayes')\n", "\n", "print 'Using all data'\n", "score_and_predict(clf_multi_nb, X_adjs.values, y.values, X_adjs.columns, 'naive-bayes')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Using training set\n", "MODEL: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.990886998785\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "10 available 0.003484 0.009191\n", "11 average 0.003484 0.009191\n", "24 bushy 0.003484 0.009191\n", "31 certain 0.003484 0.009191\n", "67 efficient 0.003484 0.009191\n", "73 epic 0.003484 0.009191\n", "86 flat 0.003484 0.009191\n", "111 heavy 0.003484 0.009191\n", "126 key 0.003484 0.009191\n", "148 modern 0.003484 0.009191\n", "151 natural 0.003484 0.009191\n", "161 organic 0.003484 0.009191\n", "171 pic 0.003484 0.009191\n", "172 pleasant 0.003484 0.009191\n", "186 ridiculous 0.003484 0.009191\n", "190 russian 0.003484 0.009191\n", "193 saltfish 0.003484 0.009191\n", "200 sicilian 0.003484 0.009191\n", "201 similar 0.003484 0.009191\n", "203 sized 0.003484 0.009191\n", "216 stellar 0.003484 0.009191\n", "234 turkish 0.003484 0.009191\n", "236 unbeatable 0.003484 0.009191\n", "238 unique 0.003484 0.009191\n", "5 affordable 0.003484 0.018394\n", "7 asian 0.003484 0.018394\n", "8 attentive 0.003484 0.018394\n", "15 basic 0.003484 0.018394\n", "30 central 0.003484 0.018394\n", "48 creative 0.003484 0.018394\n", ".. ... ... 
...\n", "224 swiss 0.003484 0.027608\n", "230 tiny 0.003484 0.027608\n", "233 true 0.003484 0.027608\n", "252 young 0.003484 0.027608\n", "102 good 0.010453 0.028718\n", "12 awful 0.003484 0.036833\n", "18 beautiful 0.003484 0.036833\n", "47 crazy 0.003484 0.036833\n", "57 dive 0.003484 0.036833\n", "72 entire 0.003484 0.036833\n", "101 goat 0.003484 0.036833\n", "104 greasy 0.003484 0.036833\n", "113 high 0.003484 0.036833\n", "158 olive 0.003484 0.036833\n", "165 overall 0.003484 0.036833\n", "166 own 0.003484 0.036833\n", "202 simple 0.003484 0.036833\n", "211 spanish 0.003484 0.036833\n", "217 straight 0.003484 0.036833\n", "219 such 0.003484 0.036833\n", "232 traditional 0.003484 0.036833\n", "16 bean 0.003484 0.046069\n", "40 close 0.003484 0.046069\n", "55 dirty 0.003484 0.046069\n", "65 easy 0.003484 0.046069\n", "81 fat 0.003484 0.046069\n", "87 flavorful 0.003484 0.046069\n", "183 reasonable 0.003484 0.046069\n", "235 typical 0.003484 0.046069\n", "244 weird 0.003484 0.046069\n", "\n", "[116 rows x 3 columns]\n", "\n", "\n", "Using testing set\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.992714025501\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "75 expensive 0.006969 0.007326\n", "77 fabulous 0.006969 0.007326\n", "3 addictive 0.003484 0.007326\n", "5 affordable 0.003484 0.007326\n", "6 american 0.003484 0.007326\n", "7 asian 0.003484 0.007326\n", "8 attentive 0.003484 0.007326\n", "10 available 0.003484 0.007326\n", "11 average 0.003484 0.007326\n", "14 baked 0.003484 0.007326\n", "23 bubble 0.003484 0.007326\n", "29 casual 0.003484 0.007326\n", "31 certain 0.003484 0.007326\n", "46 cozy 0.003484 0.007326\n", "55 dirty 0.003484 0.007326\n", "56 dish 0.003484 0.007326\n", "61 dry 0.003484 0.007326\n", "65 easy 0.003484 0.007326\n", "68 eggplant 0.003484 0.007326\n", "69 empty 0.003484 0.007326\n", "73 epic 0.003484 0.007326\n", "86 flat 0.003484 0.007326\n", "87 flavorful 0.003484 0.007326\n", "89 french 0.003484 0.007326\n", "93 full 0.003484 0.007326\n", "94 funny 0.003484 0.007326\n", "97 generous 0.003484 0.007326\n", "99 ginormous 0.003484 0.007326\n", "101 goat 0.003484 0.007326\n", "112 helpful 0.003484 0.007326\n", ".. ... ... 
...\n", "106 green 0.003484 0.029467\n", "132 late 0.003484 0.029467\n", "152 nearby 0.003484 0.029467\n", "157 old 0.003484 0.029467\n", "181 ready 0.003484 0.029467\n", "184 red 0.003484 0.029467\n", "83 few 0.006969 0.036902\n", "242 want 0.006969 0.036902\n", "12 awful 0.003484 0.036902\n", "20 black 0.003484 0.036902\n", "25 busy 0.003484 0.036902\n", "49 cute 0.003484 0.036902\n", "51 decent 0.003484 0.036902\n", "53 delish 0.003484 0.036902\n", "85 fish 0.003484 0.036902\n", "96 general 0.003484 0.036902\n", "182 real 0.003484 0.036902\n", "215 steak 0.003484 0.036902\n", "13 bad 0.006969 0.044365\n", "38 clean 0.003484 0.044365\n", "74 excellent 0.003484 0.044365\n", "80 fast 0.003484 0.044365\n", "100 give 0.003484 0.044365\n", "108 happy 0.003484 0.044365\n", "116 huge 0.003484 0.044365\n", "168 perfect 0.003484 0.044365\n", "185 regular 0.003484 0.044365\n", "205 small 0.003484 0.044365\n", "209 solid 0.003484 0.044365\n", "251 youll 0.003484 0.044365\n", "\n", "[144 rows x 3 columns]\n", "\n", "\n", "Using all data\n", "MODEL: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)\n", "SCORE: 0.991343963554\n", "AUC: 0.5\n", "\n", "\n", "TOP PREDICTORS (p-value < 0.05):\n", " adjective coef p-value\n", "3 addictive 0.003484 0.008728\n", "24 bushy 0.003484 0.008728\n", "29 casual 0.003484 0.008728\n", "67 efficient 0.003484 0.008728\n", "99 ginormous 0.003484 0.008728\n", "148 modern 0.003484 0.008728\n", "161 organic 0.003484 0.008728\n", "164 outstanding 0.003484 0.008728\n", "171 pic 0.003484 0.008728\n", "173 polish 0.003484 0.008728\n", "186 ridiculous 0.003484 0.008728\n", "187 rosemary 0.003484 0.008728\n", "190 russian 0.003484 0.008728\n", "193 saltfish 0.003484 0.008728\n", "200 sicilian 0.003484 0.008728\n", "203 sized 0.003484 0.008728\n", "214 standard 0.003484 0.008728\n", "216 stellar 0.003484 0.008728\n", "238 unique 0.003484 0.008728\n", "10 available 0.003484 0.017463\n", "11 average 0.003484 0.017463\n", "15 basic 0.003484 0.017463\n", "30 central 0.003484 0.017463\n", "31 certain 0.003484 0.017463\n", "48 creative 0.003484 0.017463\n", "73 epic 0.003484 0.017463\n", "78 famous 0.003484 0.017463\n", "86 flat 0.003484 0.017463\n", "121 interesting 0.003484 0.017463\n", "126 key 0.003484 0.017463\n", ".. ... ... 
...\n", "72 entire 0.003484 0.034959\n", "94 funny 0.003484 0.034959\n", "97 generous 0.003484 0.034959\n", "119 indian 0.003484 0.034959\n", "120 inexpensive 0.003484 0.034959\n", "127 kid 0.003484 0.034959\n", "133 later 0.003484 0.034959\n", "139 low 0.003484 0.034959\n", "158 olive 0.003484 0.034959\n", "162 original 0.003484 0.034959\n", "178 public 0.003484 0.034959\n", "180 quiet 0.003484 0.034959\n", "211 spanish 0.003484 0.034959\n", "217 straight 0.003484 0.034959\n", "232 traditional 0.003484 0.034959\n", "243 weak 0.003484 0.034959\n", "252 young 0.003484 0.034959\n", "16 bean 0.003484 0.043719\n", "33 cheesy 0.003484 0.043719\n", "98 giant 0.003484 0.043719\n", "101 goat 0.003484 0.043719\n", "113 high 0.003484 0.043719\n", "118 incredible 0.003484 0.043719\n", "122 irish 0.003484 0.043719\n", "140 magic 0.003484 0.043719\n", "183 reasonable 0.003484 0.043719\n", "195 second 0.003484 0.043719\n", "202 simple 0.003484 0.043719\n", "206 smile 0.003484 0.043719\n", "218 strong 0.003484 0.043719\n", "\n", "[102 rows x 3 columns]\n", "\n", "\n" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "# tips_adj_df.to_pickle('./dumps/tips_complete_features.pkl')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 } ], "metadata": {} } ] }