{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "#Where will a new guest book their first travel experience?" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAACJCAMAAADUiEkNAAAAgVBMVEX/WmD/////V13/SlH/Rk3/\nVVv/dHn/T1b/1NX/Uln/+fn/XmT/wsT/W2H/cHX+Q0r/u73/Zmv+a3D/hor+9PT+lZj+yMn+zc7+\n2Nn+ra/+m57+4eL+trj+7+/+6uv+vb/+qqz+5OX+foL+io3+pKb+PET+j5L+eHz+gYT+n6L+Nj8X\nUVrJAAAOXElEQVR4nO1deX/iLBA2QIx4odazHtVqr/f7f8A3amYYEhKwJrtry/PH7s8mIjwZhrkg\nrVZAQEBAQEBAQEBAQEBAQEBAQEBAwI2IpRBJLLnikv3tvvx8JOpjs+5GKbadU0/Ef7s/PxsyWW0j\ngvW7CjLeGJh6jvI4TPjf7tZPRSwPBbpTrNTf7tjPRDzu2uiOok4gvAHEYzvbKZaB8NrBGGV4t17T\ndXMRCK8bSuvu+StXQig20386JX+7fz8MfIMmYMwzozt+e0KNPg5mYZ1gqLw31OCWcgdmYdAodUKt\nkW7j74yDGn+Wf6lrPxHxESyRt9wV1squdIPbUx8EqI1ihCqZZZdWYcmsCxLc+JlFaaCqCQqlLqjM\nDNnZFkU2yfgeBY1SD5KvjNFPa/RVzLPLLNiEtYB3K912XDLnQcDrQLLK+PwoSS7wl+yGXhDwGoDi\nLcrukH9UwFns+VhZDKhXDrDZRpJbCXjy5S473jJpXsATMTx+cJ8HG388ZTi26+xX74jtNjFcEN5O\nqXjrexYV99SCWF4fbb/lli2+hBhE9FKjb8A+sNmoAQH3EG+i4hsWcJLyaDvNfd5BXkZ18t1ukm8M\ne/crp7CIXDq+FkiSYXJGJB+Sb0/bAwV82KSA87kearR2PdpH5JvFXuKdCni1jV4PeEThsj4fkW8U\n75ZjcBJ80DIjvQbErwbfM8cvPSDfbOAp3tpKXzcn4PG7wffesWI+IN+8nzXsDo1gDLE5AY+fDL6f\nf5x8o3j79FdNGxdwGVG4bM/H4xvtAR93DjX4a2MCLrQHE0Vb13N9OL5Z76buikzAmzNRGC05cuqt\nh+Mbtbdfb+XMl4lvQ2oN/uzs1KPxjeLtm5kEDd5gbYTsXbN3u6G7T4/GN781b/MHBLzFxORrdRoK\njy49GN9onGy8O6uyYpQGTZS0X1L67WN5ML5vMk6uiEHA7YnOP4zH4hu1t794aw3+TxQoPxbflcYJ\nK0lSSfC4v6nBWcKV4Ok/PPlenDFOhEi4UOc5meebybTxJEkvJh69k2lL556I/M0FvuNLkzy9/Y79\neqi9C8YJS1Qy/DweX8dcFH7gLg2eiI/VfHeOw0yXmw91HgyXgOsvsdLPZ4Zj9flySL/e3XUGeb6Z\nGJzmlzKx7XL0LnmecmyJs3NZZPzVP1x6sn6ZXf6kbzT5lupptD5P6+5hvh+q7xY9ofbONSBV+wWK\n26LlaZAzFHQU5eZHzePNNCLYvStGUmKX6GvSx8/9ixz08POaM/W+pbcbfCe9BW08WlyfJ0KzOO0x\n3pubN4+F7c6U71g9G53enpJvqS5M65jaOxYno/l0mK8m4yDgt2pwKc0hXtpuEX/SzvdA3zBpHSLj\ndsr3f5soj90rTViwIV44vRVvXmitQvnm423h1lFh6ngA494G3eJpWmg+6gzoFIi/Z4OL92LDKcjQ\nnHybwXGT740p3BnmRDUTvmdLy73dNjBB+f60NTv9vDmpiCVThnirvq35KDpSi/FbiR5l5cOAk+/C\n7YTvEkzHKCqE7xK8J0W+S7C5dfVKRtk3qRHC12Xt78kDRQ1+Q6q+vGWNJvgmWX4339Gr9OX75v1j\nSfY9areKQ3n7JyLhYINX1auYUB50N8R3NIy9+c7iGj5830g41pwQzUynfLczWm36B9L+Ud8pTzAU\nTwFXPrw0xTfsIPDhe8G9+Y5Wt5T2QR6FZC0TvXN+8Zqa9ylU8oyWIX0yoMEXfj+JGySq0RTfB+HN\n97XkxY/vW0IayT77DgkMYinCdqjP4JAKZJlWsmGix2uPoLmPNsW0s9nvRwUbwY/v9eq9PWy/r9Zj\nC9/b+f60mh9yf/2SVr7ns/ZkeNwY9t7VTS3wvdyc9qNObrP71N9IAW5fiHiDhlmb1mWC3STaQ2V/\n8hJwYZomm3HqRcvUmZYz0/b04XvFhIwZY7E8OwU5vkdjdfZWuYpnptFs4/s54ZeGEkVn31YU+d4d\nubh0WAxN8817P5OEWikin7CAFnaUSKheJLXIWGw1cAu4kSKLllLHTaR6oZfcfB8G5ggNvg9j7ZTH\nelqecbF5Db7XUrfEqWdwfjQm3yftpqZu6SHK3ewD2K1DGMTKj6KK4EAuEWaoRXlxCzinQrExdxsK\nWv/g5HuZP/mG8r0wLyYT0nJ0liHKd8e4WRHV1o5zfLeNETJFneSTH+E29Ztk7djKfmB5fNKKBgXc\nVZWlJ86F7vzk4WQyO/kuiALhu+B8xVSc32OTbzM2SateZjm+P/KUUlvLurusCBBvWssNURGbD5Nk\n036jf5uBgeMUcDqYbX4vrY7B+/BdMD+pP1/QpQlRKed1hvKda4iIxFlkKd9FCRZk2fTy+Kz+YVa1\naV1zoaqPliLj8upaMxISGipGXNgt8aoqvi35hjeyaIpKvpW2elc5vos2nyRT0kuhQOtLSwiyYxXX\n7GHQ+hsUcNciLbRracnqN8m33BtfreCbVBmd54kjv8O1gPvYZ7gy0u7D9nl7WgoceCr8vqUrZPpZ\nyi+b5JvqqlQt18e3XjKdFWAtLd5GNTvwbU9lgnI3wskwGtec0n1/Kna+Sb5bb9q+T2dhFd/mOuDg\nW2pHvOt2eWIwpw131MG3Rb7xMXcrnzEjdpnFG22Ub1Nsa5NvWqTu9nggVGfmH6GNubWBTGOZzCJV\nX1UCTvs+sFxukm9y+aVGvmlDzvUSxftoJveycVu3zIDqyK13MAsrldhv57uMpMwEtSokMKFzqzH+\n7rEiUEb1icVabZRvEnX/W/oEh/+eawhWUVvED0zovH0B3ax2s3Tn8r/ZappvbRo51sub+JbakXKu\nlzyL1U3zFIE/bzukCuQkH/CNIZlqMTz0WPSgLYtxo/YgmVp12oPEmnHZg+VGHBg5tu2ssGOwMCQ4\nsbAqc0z8HUvvmuSberaTGv0dsufF5e9Alrh49BcOvFiaDOGt4qPA2EhFYo2O+r0weZrkm8ysbrU/\nfxPf
CXFbXb4HBGb2xd6BAi86gW/ZFYvn7pE5pvGqoqneYLxKjPQ3HfGqCr4LzRrZKkd+qyrIBNGG\nQgwPLUjLbl90tSp+mG4ZLhy2J4j038X3rhAbp1mEY/xdvmc5TcBIbMsZjxUVWQJ8bvnFD9Jh1oMK\nYAdVxW5ZI9/QMV9PYKRh7uI7WptVrsLIUefzDf58R+/GsKWklR0OdYLSaM2CwY/mvBrUsBZrjthG\n5QfsUJVxTkZjeool3Mhs3sd3NG3rlnliZBoL+bQb+I5GCrITTKonI2fs0N6gbe2HIqGiNd0XsH5K\nwiS2uqEchFmnuZ7Jc7UFV+Nc0eSdfKcz8PNS0C3ER64s79LQd/mOuvvBufqbK/a1My44QtHWQCwB\naCZDK6ERW7KXHZaEiqgVa0U57Dr9fqdQd3o33ynW8/6iUMv1dR/fKbaL/nyZ77CrHgLcljJrGY+Q\npT4P+sQlChqP86g4zVfWVO/jwbcNWWTuHr6tcOyyxlZK64gFPEHNLVJlsSCzMWfaosrVEsXKbwsa\n4rubCULdfO8dvg4sTuVGDAp4H2cK+gyljaPCqXLq66nX/B7ftnrNGvh21WvixK84xAULMWCqoGDa\njvHNdbbStxV11CN/i++JrR75fr7nrkwaaIaqdAx2aXsVcIw87gqFDBp4U6V1VFbJT+yru/gueZHN\ndqyLo+rk2/1GImiz0ojBTSZXjYKZ6HbV0gA+T8UcaJXtJ+mSGN5dfK+sS8Sc7Dv7Ht9W6qev7rw8\nGMrVLj/q6ydJSomqi3qg1speS4GQ8iXKoyPrile9/GdUDF6wNmrR/Pge5eMnPdPmPmPvsV8KYtUO\nl19HlxjDsm3HiwTAA61OHLfO+wFHhg27O6p0Fe9myPh+ieDzS7YfEG+w8A3XpkOW3w/YeTVDKinf\n2FJpO5caspRv+NznTD2bnd577QeEkljX3m1cIQ9vONVdx/lAuYT7qSdqsl9c9rt2l6vJdb+rAGS3\n4OdrTxl+tpwYwek1xuXz4tKV6XLzxIv7XUtbGuR+VN95ljQJ76fs7hZfY8/9rmANOovyE9AoG5hI\nfWfSaO73XC5DkVyc906kI/OpvbsR8aVxLgpbtO8DO3f33Gf//dxgkDkP8I7zK8TWWdMCc8ca0fql\ngL1nHvXaK5Nv9xHrkADyLIf+FYBUo8eeE2XsrHGfIoU7jl3HM/4m+Mu3UQAa9T0qEoN8F+Gtv1PE\n2glZe51LmKXiXMe9/iaAfeJlQyTbW8QbfZR/4oylfwRgQ3jsYWOc7NMb+VQ4Z4+nAQPvYQGOo3uL\nD0sMD9Z9EgJmOP+FI63+GUD8xLXFh4lcwMAZCoOUmiN+8ssAMRnHOd9Ed/uqFCgwCcslBR7LUxkg\njHtI9wK1eLVDj/5RjSfS/QSAGFblDvRm+Wj5pkOly4rDRdF47Ae+DWAx26hUXNUXoZi+jn46LiNT\nv273D7x56rGAbuPJTnhMDnK4FK/FtIA6X593BcNM8LzpF089HHQZyMZCHqO1Wp3rEhn39J/WsUXE\nY36A68E4KUAHojqtHHlMTEiuG73KuEWslRUvfOkVn0d18vKXItHiehI6/8ESNaG5KGJxs4QWMuyl\nwLpMFiuSwHLmJH4lyAoYdeefXPEUSrRXhovzZGRZzUKG5XPvctyuUK1nErZ1vnnrl0LmTgpazued\nnDu56+V3oeWq/7rrxXyxNo5COgS6SyAnUTUsS2nSOlR/p+FXBj404kFVZdmubTMzGLHLrY/oj4/i\ngcDUqoy47rMqCYHYzjbOMP0IlmA1koGVvOmXqLDp+MBaANg9lT2iAATj403+2OnF0VHIwrjc51XR\nuvIRBSBSi7u9Wmacd9ebz2I1kgUyNQLnYM7sFl+Db5+t/wvBpBByPJmM5S3VSHGieGs8nAzS/+94\nd8RvBWO+b5a6/1sBAQEBAQEBAQEBAQEBAQEBAQEBAT8X/wMb5cqZ5/G69AAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "execution_count": 1, "metadata": { "image/png": { "width": 400 } }, "output_type": "execute_result" } ], "source": [ "from IPython.display import Image\n", "Image('airbnb.png', width=400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##Introduction" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "In this [competition](https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings) Airbnb challenged Kagglers to predict in which country a new user will make his or her first booking. \n", "The below implementation is the result of the team work with [Gabriele Lanaro](http://gabrielelanaro.github.io/). Both of us actively contributed to code, technical discussions and finding solutions to blockers along the road.\n", "The final pipeline consists of the following steps:\n", "1. Reading and preprocessing the available data (including user sessions).\n", "2. Using a Random Forest classifier to identify negligible features. Specifically we decided to remove all the features having less than 0.1% of relative importance.\n", "3. Running Cross Validation to select best classifier/hyperparameters. This step pointed us in the direction of the Bagging algorithm. \n", "4. Due to technical restrictions in our infrastructure (limitations in RAM and computational power) we ran 5 separate Bagging Classifiers on the final dataset and averaged the results to come up with a definitive outcome.\n", "5. 
Generating the submission file on top of the previous model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Getting Train and Test set and outputting a data set ready to be fed into a Machine Learning pipeline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def make_user_features(train, test):\n", " #encoding country destinations in train set\n", " outcome = train.country_destination\n", " labels = outcome.values\n", " le = LabelEncoder()\n", " y = le.fit_transform(labels)\n", " train = train.drop(['country_destination'], axis=1)\n", "\n", " #storing user ids in test set\n", " id_test = test['id']\n", "\n", " #appending test to train and dropping date first booking which is redundant\n", " data = pd.concat((train, test), axis=0, ignore_index=True)\n", " data = data.drop(['date_first_booking'], axis=1)\n", "\n", " #extracting features from date_account_created\n", " data['dac_year'] = data.date_account_created.apply(lambda x: x.year)\n", " data['dac_month'] = data.date_account_created.apply(lambda x: x.month)\n", " data['dac_weekday'] = data.date_account_created.apply(lambda x: x.weekday())\n", " data = data.drop(['date_account_created'], axis=1)\n", "\n", " #extracting features from timestamp_first_active\n", " data['tfa_year'] = data.timestamp_first_active.apply(lambda x: x.year)\n", " data['tfa_month'] = data.timestamp_first_active.apply(lambda x: x.month)\n", " data['tfa_weekday'] = data.timestamp_first_active.apply(lambda x: x.weekday())\n", " data = data.drop(['timestamp_first_active'], axis=1)\n", "\n", " #filling age nan with age median\n", " data.age = data.age.fillna(data.age.median())\n", "\n", " #binning age column\n", " bins = list(np.arange(15, 85, 5))\n", " bins.insert(0,0)\n", " bins.append(int(max(data.age)))\n", " group_names = ['<15', '15-20', '20-25', '25-30', '30-35', '35-40', '40-45', '45-50',\n", " '50-55', '55-60', '60-65', '65-70', '70-75', '75-80', '>80']\n", " data['age_bucket'] = pd.cut(data['age'], bins, labels=group_names)\n", "\n", " #cleaning gender column and filling nan in all dataframe with 'unknown'\n", " data.gender = data.gender.replace('-unknown-','unknown')\n", " data.ix[:, data.columns != 'age_bucket'] = data.ix[:, data.columns != 'age_bucket'].fillna('unknown')\n", "\n", " #generating dummy variables in top of categorical columns\n", " to_be_dummified = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', \n", " 'affiliate_provider', 'first_affiliate_tracked', 'signup_app',\n", " 'first_device_type', 'first_browser','age_bucket']\n", " for f in to_be_dummified:\n", " dummies = pd.get_dummies(data[f], prefix=f)\n", " data = data.drop([f], axis=1)\n", " data = pd.concat((data, dummies), axis=1)\n", "\n", " return data[:train.shape[0]], data[train.shape[0]:], y, le" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "####Adding Sessions to the previously generated User data set" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def make_sessions_features(data, df_sessions):\n", " # Drop row with nan values from the \"user_id\" column as they're useless\n", " df_sessions = df_sessions.dropna(subset=[\"user_id\"])\n", "\n", " # print df_sessions\n", "\n", " # Frequency of devices - by user\n", " device_freq = df_sessions.groupby('user_id').device_type.value_counts()\n", " \n", " # Frequency of actions taken - by user\n", " action_freq = 
df_sessions.groupby('user_id').action.value_counts()\n", "\n", " # Total list of users\n", " users = data.id.values\n", " def feature_dict(df):\n", " f_dict = dict(list(df.groupby(level='user_id')))\n", " res = {}\n", " for k, v in f_dict.items():\n", " v.index = v.index.droplevel('user_id')\n", " res[k] = v.to_dict()\n", " return res\n", "\n", " # Make a dictionary with the frequencies { 'user_id' : {\"IPhone\": 2, \"Windows\": 1}}\n", " action_dict = feature_dict(action_freq)\n", " device_dict = feature_dict(device_freq)\n", "\n", " # Transform to a list of dictionaries\n", " action_rows = [action_dict.get(k, {}) for k in users]\n", " device_rows = [device_dict.get(k, {}) for k in users]\n", "\n", " device_transf = DictVectorizer()\n", " tf = device_transf.fit_transform(device_rows)\n", "\n", " action_transf = DictVectorizer()\n", " tf2 = action_transf.fit_transform(action_rows)\n", "\n", " # Concatenate the two datasets\n", " # Those are row vectors with the frequencies of both device and actions [0, 0, 0, 2, 0, 1, ...]\n", " features = sp.hstack([tf, tf2])\n", "\n", " # We create a dataframe with the new features and we write it to disk\n", " df_sess_features = pd.DataFrame(features.todense())\n", " \n", " df_sess_features['id'] = users\n", "\n", " #left joining data and sessions on user_id\n", " final = pd.merge(data, df_sess_features, how='left', left_on='id', right_on='id')\n", " final.ix[:, final.columns != 'age_bucket'].fillna(-1, inplace=True)\n", "\n", " final.drop(['id'], axis=1, inplace=True)\n", " return final" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "####Putting together the competition specific evaluation metrics, NDCG (Normalized discounted cumulative gain) @k where k=5. " ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\" Reference from https://gist.github.com/bwhite/3726239\n", "\"\"\"\n", "\n", "import numpy as np\n", "\n", "def mean_ndcg(clf, X, y):\n", " # Predict class probabilities\n", " y_predict = clf.predict_proba(X)\n", " # Get highest 5 predictions\n", " best_5 = np.argsort(-y_predict, axis=1)[:, :5]\n", "\n", " # Transform to relevance scores\n", " relevance = (best_5 == y[:, np.newaxis]).astype('int')\n", "\n", " # Calculate ndcg for each sample and take average (?)\n", " return np.mean([ndcg_at_k(row, 5) for row in relevance])\n", "\n", "def dcg_at_k(r, k, method=0):\n", " \"\"\"Score is discounted cumulative gain (dcg)\n", " Relevance is positive real values. 
Can use binary\n", " as the previous methods.\n", " Example from\n", " http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf\n", " >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]\n", " >>> dcg_at_k(r, 1)\n", " 3.0\n", " >>> dcg_at_k(r, 1, method=1)\n", " 3.0\n", " >>> dcg_at_k(r, 2)\n", " 5.0\n", " >>> dcg_at_k(r, 2, method=1)\n", " 4.2618595071429155\n", " >>> dcg_at_k(r, 10)\n", " 9.6051177391888114\n", " >>> dcg_at_k(r, 11)\n", " 9.6051177391888114\n", " Args:\n", " r: Relevance scores (list or numpy) in rank order\n", " (first element is the first item)\n", " k: Number of results to consider\n", " method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]\n", " If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]\n", " Returns:\n", " Discounted cumulative gain\n", " \"\"\"\n", " r = np.asfarray(r)[:k]\n", " if r.size:\n", " if method == 0:\n", " return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))\n", " elif method == 1:\n", " return np.sum(r / np.log2(np.arange(2, r.size + 2)))\n", " else:\n", " raise ValueError('method must be 0 or 1.')\n", " return 0.\n", "\n", "\n", "def ndcg_at_k(r, k, method=0):\n", " \"\"\"Score is normalized discounted cumulative gain (ndcg)\n", " Relevance is positive real values. Can use binary\n", " as the previous methods.\n", " Example from\n", " http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf\n", " >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]\n", " >>> ndcg_at_k(r, 1)\n", " 1.0\n", " >>> r = [2, 1, 2, 0]\n", " >>> ndcg_at_k(r, 4)\n", " 0.9203032077642922\n", " >>> ndcg_at_k(r, 4, method=1)\n", " 0.96519546960144276\n", " >>> ndcg_at_k([0], 1)\n", " 0.0\n", " >>> ndcg_at_k([1], 2)\n", " 1.0\n", " Args:\n", " r: Relevance scores (list or numpy) in rank order\n", " (first element is the first item)\n", " k: Number of results to consider\n", " method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]\n", " If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]\n", " Returns:\n", " Normalized discounted cumulative gain\n", " \"\"\"\n", " dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)\n", " if not dcg_max:\n", " return 0.\n", " return dcg_at_k(r, k, method) / dcg_max" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "####Function to generate submission files for Kaggle" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def generate_submission(y_pred, id_test, le, filename):\n", " ids = [] #list of ids\n", " cts = [] #list of countries\n", " for i in range(len(id_test)):\n", " idx = id_test[i]\n", " ids += [idx] * 5\n", " cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()\n", "\n", " #Generate submission\n", " sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])\n", " sub.to_csv(os.path.join('data', filename),index=False)\n", " print 'Submission File Successfully Generated'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##Machine Learning Pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#importing packages\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_extraction import DictVectorizer\n", "import scipy.sparse as sp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Reading and preprocessing data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { 
"collapsed": false }, "outputs": [], "source": [ "#reading data\n", "pretest = pd.read_csv(os.path.join('data', 'test_users.csv'), header=0, parse_dates=[1,2,3])\n", "pretrain = pd.read_csv(os.path.join('data', 'train_users_2.csv'), header=0, parse_dates=[1,2,3])\n", "df_sessions = pd.read_csv(\"data/sessions.csv\", encoding='utf8')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " id date_account_created timestamp_first_active date_first_booking \\\n", "0 gxn3p5htnn 2010-06-28 2009-03-19 04:32:55 NaT \n", "1 820tgsjxq7 2011-05-25 2009-05-23 17:48:09 NaT \n", "2 4ft3gnwmtx 2010-09-28 2009-06-09 23:12:47 2010-08-02 \n", "3 bjjt8pjhuk 2011-12-05 2009-10-31 06:01:29 2012-09-08 \n", "4 87mebub9p4 2010-09-14 2009-12-08 06:11:05 2010-02-18 \n", "\n", " gender age signup_method signup_flow language affiliate_channel \\\n", "0 -unknown- NaN facebook 0 en direct \n", "1 MALE 38 facebook 0 en seo \n", "2 FEMALE 56 basic 3 en direct \n", "3 FEMALE 42 facebook 0 en direct \n", "4 -unknown- 41 basic 0 en direct \n", "\n", " affiliate_provider first_affiliate_tracked signup_app first_device_type \\\n", "0 direct untracked Web Mac Desktop \n", "1 google untracked Web Mac Desktop \n", "2 direct untracked Web Windows Desktop \n", "3 direct untracked Web Mac Desktop \n", "4 direct untracked Web Mac Desktop \n", "\n", " first_browser country_destination \n", "0 Chrome NDF \n", "1 Chrome NDF \n", "2 IE US \n", "3 Firefox other \n", "4 Chrome US " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pretrain.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "train, test, y, le = make_user_features(pretrain, pretest)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = pd.concat((train, test), axis=0, ignore_index=True)\n", "final = make_sessions_features(data, df_sessions)\n", "\n", "del data" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X_train = final.ix[:train.shape[0]-1]\n", "X_test = final.ix[train.shape[0]:]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "assert train.shape[0] == y.shape[0]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Set shape: (213451, 549)\n", "Test Set shape: (62096, 549)\n", "Labels shape: (213451L,)\n" ] } ], "source": [ "assert X_train.shape[0] == train.shape[0]\n", "assert X_train.shape[0] == y.shape[0]\n", "print 'Train Set shape:', X_train.shape\n", "print 'Test Set shape:', X_test.shape\n", "print 'Labels shape:', y.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "del final" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Selecting irrelevant features via a Random Forest Classifier" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of features to be discarded: 311\n" ] } ], "source": [ "#fitting a Random Forest Classifier to select negligible features\n", "from sklearn.ensemble import RandomForestClassifier\n", "clf = RandomForestClassifier(n_estimators=160, oob_score=True, n_jobs=-1, criterion='entropy')\n", "clf.fit(X_train, y)\n", "print 'Number of features to be discarded: ', np.count_nonzero(clf.feature_importances_ < 1e-4)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#getting uninportant features\n", "unimportant_features = clf.feature_importances_ < 1e-4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Selecting 
best model/hyperparameters via Cross Validation" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Splitting Train set into Train and Test again to perform CV\n", "#The classification task is very challenging due to the data being extremely skewed.\n", "#Hence, we make sure to stratify our samples as almost 95% of them are covered by 3 classes only: NDF, US, OTHER. \n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.metrics import f1_score\n", "from sklearn.grid_search import GridSearchCV\n", "from pprint import pprint\n", "\n", "sub_X_train, sub_X_test, sub_y_train, sub_y_test = train_test_split(X_train, y, test_size=0.33, random_state=42, stratify=y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Bagging" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Done 17 tasks | elapsed: 1.6min\n", "[Parallel(n_jobs=-1)]: Done 81 out of 81 | elapsed: 62.5min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 27 candidates, totalling 81 fits\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=BaggingClassifier(base_estimator=None, bootstrap=True,\n", " bootstrap_features=False, max_features=1.0, max_samples=1.0,\n", " n_estimators=10, n_jobs=1, oob_score=False, random_state=42,\n", " verbose=0, warm_start=False),\n", " fit_params={}, iid=True, n_jobs=-1,\n", " param_grid={'n_estimators': [10, 50, 100], 'max_samples': [0.1, 0.5, 1.0], 'max_features': [0.1, 0.5, 1.0]},\n", " pre_dispatch='2*n_jobs', refit=True, scoring='f1_weighted',\n", " verbose=4)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import BaggingClassifier\n", "bagg = BaggingClassifier(random_state=42)\n", "\n", "param_grid = {\"n_estimators\": [10, 50, 100],\n", " \"max_samples\": [0.1, 0.5, 1.0],\n", " \"max_features\": [0.1, 0.5, 1.0]}\n", "\n", "baggsearch = GridSearchCV(bagg, param_grid, scoring='f1_weighted', cv=3, verbose=4, n_jobs=-1)\n", "baggsearch.fit(sub_X_train, sub_y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Bagging Classifier: Results of Search Grid CV (evaluation metrics: F1 Weighted Score)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[mean: 0.59403, std: 0.00169, params: {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 100},\n", " mean: 0.58978, std: 0.00201, params: {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 50},\n", " mean: 0.58491, std: 0.00053, params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100},\n", " mean: 0.58245, std: 0.00059, params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 50},\n", " mean: 0.58213, std: 0.00056, params: {'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 100},\n", " mean: 0.58174, std: 0.00056, params: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100},\n", " mean: 0.58031, std: 0.00087, params: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100},\n", " mean: 0.57837, std: 0.00108, params: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 50},\n", " mean: 0.57829, std: 0.00091, params: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 50},\n", " mean: 
0.57760, std: 0.00127, params: {'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 50},\n", " mean: 0.57705, std: 0.00046, params: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100},\n", " mean: 0.57442, std: 0.00063, params: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 50},\n", " mean: 0.56925, std: 0.00149, params: {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 10},\n", " mean: 0.56588, std: 0.00250, params: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 10},\n", " mean: 0.56550, std: 0.00109, params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 10},\n", " mean: 0.56489, std: 0.00079, params: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 10},\n", " mean: 0.56259, std: 0.00147, params: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10},\n", " mean: 0.56244, std: 0.00104, params: {'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 10},\n", " mean: 0.47828, std: 0.00109, params: {'max_features': 0.1, 'max_samples': 1.0, 'n_estimators': 10},\n", " mean: 0.47787, std: 0.00053, params: {'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 10},\n", " mean: 0.47697, std: 0.00067, params: {'max_features': 0.1, 'max_samples': 0.1, 'n_estimators': 10},\n", " mean: 0.47294, std: 0.00097, params: {'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 50},\n", " mean: 0.47263, std: 0.00159, params: {'max_features': 0.1, 'max_samples': 1.0, 'n_estimators': 50},\n", " mean: 0.47212, std: 0.00135, params: {'max_features': 0.1, 'max_samples': 0.1, 'n_estimators': 50},\n", " mean: 0.47004, std: 0.00131, params: {'max_features': 0.1, 'max_samples': 1.0, 'n_estimators': 100},\n", " mean: 0.46982, std: 0.00064, params: {'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 100},\n", " mean: 0.46934, std: 0.00064, params: {'max_features': 0.1, 'max_samples': 0.1, 'n_estimators': 100}]\n" ] } ], "source": [ "pprint(sorted(baggsearch.grid_scores_, key=lambda x: -x.mean_validation_score))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NDCG on Test Set 0.914233699603\n", "F1 Weighted Score on Test Set 0.594105356053\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\pochetti\\AppData\\Local\\Continuum\\Anaconda\\lib\\site-packages\\sklearn\\metrics\\classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "print 'NDCG on Test Set', mean_ndcg(baggsearch.best_estimator_, sub_X_test, sub_y_test)\n", "print 'F1 Weighted Score on Test Set', f1_score(sub_y_test, baggsearch.best_estimator_.predict(sub_X_test), average='weighted') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###KNN" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n", "[CV] n_neighbors=1 ...................................................\n", "[CV] .......................... n_neighbors=1, score=0.495831 -21.6min\n", "[CV] n_neighbors=1 ...................................................\n", "[CV] .......................... n_neighbors=1, score=0.498015 -22.2min\n", "[CV] n_neighbors=1 ...................................................\n", "[CV] .......................... 
n_neighbors=1, score=0.497071 -20.3min\n", "[CV] n_neighbors=2 ...................................................\n", "[CV] .......................... n_neighbors=2, score=0.491657 -23.1min\n", "[CV] n_neighbors=2 ...................................................\n", "[CV] .......................... n_neighbors=2, score=0.490935 -24.1min\n", "[CV] n_neighbors=2 ...................................................\n", "[CV] .......................... n_neighbors=2, score=0.489456 -23.1min\n", "[CV] n_neighbors=4 ...................................................\n", "[CV] .......................... n_neighbors=4, score=0.529255 -24.0min\n", "[CV] n_neighbors=4 ...................................................\n", "[CV] .......................... n_neighbors=4, score=0.526354 -24.9min\n", "[CV] n_neighbors=4 ...................................................\n", "[CV] .......................... n_neighbors=4, score=0.527694 -24.1min\n", "[CV] n_neighbors=8 ...................................................\n", "[CV] .......................... n_neighbors=8, score=0.540271 -27.4min\n", "[CV] n_neighbors=8 ...................................................\n", "[CV] .......................... n_neighbors=8, score=0.536771 -30.2min\n", "[CV] n_neighbors=8 ...................................................\n", "[CV] .......................... n_neighbors=8, score=0.536292 -24.7min\n", "[CV] n_neighbors=16 ..................................................\n", "[CV] ......................... n_neighbors=16, score=0.542694 -25.5min\n", "[CV] n_neighbors=16 ..................................................\n", "[CV] ......................... n_neighbors=16, score=0.542619 -25.0min\n", "[CV] n_neighbors=16 ..................................................\n", "[CV] ......................... 
n_neighbors=16, score=0.540317 -23.5min" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\pochetti\\AppData\\Local\\Continuum\\Anaconda\\lib\\site-packages\\sklearn\\metrics\\classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n", "[Parallel(n_jobs=1)]: Done 15 out of 15 | elapsed: 363.8min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_jobs=1, n_neighbors=2, p=2,\n", " weights='uniform'),\n", " fit_params={}, iid=True, n_jobs=1,\n", " param_grid={'n_neighbors': [1, 2, 4, 8, 16]},\n", " pre_dispatch='2*n_jobs', refit=True, scoring='f1_weighted',\n", " verbose=4)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "neigh = KNeighborsClassifier(n_neighbors=2)\n", "\n", "param_grid = {\"n_neighbors\": [1, 2, 4, 8, 16]}\n", "\n", "knnsearch = GridSearchCV(neigh, param_grid, scoring='f1_weighted', cv=3, verbose=4, n_jobs=1)\n", "knnsearch.fit(sub_X_train, sub_y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####KNN Classifier: Results of Search Grid CV (evaluation metrics: F1 Weighted Score)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[mean: 0.54188, std: 0.00110, params: {'n_neighbors': 16},\n", " mean: 0.53778, std: 0.00177, params: {'n_neighbors': 8},\n", " mean: 0.52777, std: 0.00119, params: {'n_neighbors': 4},\n", " mean: 0.49697, std: 0.00089, params: {'n_neighbors': 1},\n", " mean: 0.49068, std: 0.00092, params: {'n_neighbors': 2}]\n" ] } ], "source": [ "pprint(sorted(knnsearch.grid_scores_, key=lambda x: -x.mean_validation_score))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NDCG on Test Set 0.891440349619\n", "F1 Weighted Score on Test Set 0.54514117663\n" ] } ], "source": [ "print 'NDCG on Test Set', mean_ndcg(knnsearch.best_estimator_, sub_X_test, sub_y_test)\n", "print 'F1 Weighted Score on Test Set', f1_score(sub_y_test, knnsearch.best_estimator_.predict(sub_X_test), average='weighted') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For the sake of simplicity and for the general purpose of a proof of concept, we show just the tests performed with KNN and Bagging. For your reference we explored AdaBoosting and Random Forest from scikit-learn, together with XgbBoost from an independent library (you can have a look [here](https://github.com/FraPochetti/Airbnb/blob/master/AirbnbXgbBoosting.ipynb) at a sample implementation of this algorithm). None of the models was as successful as Bagging, hence we sticked with this algorithm. 
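" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####XGBoost (illustrative sketch)\n", "A minimal, untuned sketch of the XGBoost approach mentioned above, assuming the xgboost package and its scikit-learn wrapper are installed. The hyperparameters below are placeholders chosen for illustration only; the linked notebook contains the implementation we actually ran." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Illustrative sketch only: a rough XGBoost baseline evaluated with the same metrics as above.\n", "#Hyperparameters are placeholders, not tuned values.\n", "import xgboost as xgb\n", "\n", "xgb_clf = xgb.XGBClassifier(objective='multi:softprob', max_depth=6, learning_rate=0.1,\n", "                            n_estimators=25, subsample=0.5, colsample_bytree=0.5)\n", "xgb_clf.fit(sub_X_train, sub_y_train)\n", "\n", "print 'NDCG on Test Set', mean_ndcg(xgb_clf, sub_X_test, sub_y_test)\n", "print 'F1 Weighted Score on Test Set', f1_score(sub_y_test, xgb_clf.predict(sub_X_test), average='weighted')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "As noted above, none of these alternatives beat Bagging on our validation split, so the final model below is a Bagging ensemble.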
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Training the Final Bagging Model" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#training 5 separate Bagging Classifiers\n", "from sklearn.ensemble import BaggingClassifier\n", "def bagging_prediction(X_train, y_train, X_test, \n", " n_estimators=100, \n", " max_samples=0.1, \n", " max_features=1.0, \n", " random_state=None):\n", "\n", " bagg = BaggingClassifier(random_state=random_state, \n", " n_estimators=n_estimators, \n", " max_samples=max_samples, \n", " max_features=max_features)\n", " bagg.fit(X_train.ix[:, ~unimportant_features], y_train)\n", " return bagg.predict_proba(X_test.ix[:, ~unimportant_features])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [], "source": [ "probs = []\n", "for i in range(5):\n", " p = bagging_prediction(X_train, y, \n", " X_test,\n", " n_estimators=100,\n", " random_state=i)\n", " probs.append(p)\n", "\n", "# We take the average of the 5 models\n", "avg_probs = sum(probs)/len(probs)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Submission File Successfully Generated\n" ] } ], "source": [ "y_pred = avg_probs\n", "id_test = pretest.id.values\n", "generate_submission(y_pred, id_test, le, 'finalsubmission.csv')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "##Conclusions\n", "At the time of writing (2016, February 2) this approch is ensuring us a NDCG score in the Public Kaggle Leaderboard of **0.87661** corresponding to a relative position on 223 over 1277 participants. " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }