{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "#Where will a new guest book their first travel experience?" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAACJCAMAAADUiEkNAAAAgVBMVEX/WmD/////V13/SlH/Rk3/\nVVv/dHn/T1b/1NX/Uln/+fn/XmT/wsT/W2H/cHX+Q0r/u73/Zmv+a3D/hor+9PT+lZj+yMn+zc7+\n2Nn+ra/+m57+4eL+trj+7+/+6uv+vb/+qqz+5OX+foL+io3+pKb+PET+j5L+eHz+gYT+n6L+Nj8X\nUVrJAAAOXElEQVR4nO1deX/iLBA2QIx4odazHtVqr/f7f8A3amYYEhKwJrtry/PH7s8mIjwZhrkg\nrVZAQEBAQEBAQEBAQEBAQEBAQEBAwI2IpRBJLLnikv3tvvx8JOpjs+5GKbadU0/Ef7s/PxsyWW0j\ngvW7CjLeGJh6jvI4TPjf7tZPRSwPBbpTrNTf7tjPRDzu2uiOok4gvAHEYzvbKZaB8NrBGGV4t17T\ndXMRCK8bSuvu+StXQig20386JX+7fz8MfIMmYMwzozt+e0KNPg5mYZ1gqLw31OCWcgdmYdAodUKt\nkW7j74yDGn+Wf6lrPxHxESyRt9wV1squdIPbUx8EqI1ihCqZZZdWYcmsCxLc+JlFaaCqCQqlLqjM\nDNnZFkU2yfgeBY1SD5KvjNFPa/RVzLPLLNiEtYB3K912XDLnQcDrQLLK+PwoSS7wl+yGXhDwGoDi\nLcrukH9UwFns+VhZDKhXDrDZRpJbCXjy5S473jJpXsATMTx+cJ8HG388ZTi26+xX74jtNjFcEN5O\nqXjrexYV99SCWF4fbb/lli2+hBhE9FKjb8A+sNmoAQH3EG+i4hsWcJLyaDvNfd5BXkZ18t1ukm8M\ne/crp7CIXDq+FkiSYXJGJB+Sb0/bAwV82KSA87kearR2PdpH5JvFXuKdCni1jV4PeEThsj4fkW8U\n75ZjcBJ80DIjvQbErwbfM8cvPSDfbOAp3tpKXzcn4PG7wffesWI+IN+8nzXsDo1gDLE5AY+fDL6f\nf5x8o3j79FdNGxdwGVG4bM/H4xvtAR93DjX4a2MCLrQHE0Vb13N9OL5Z76buikzAmzNRGC05cuqt\nh+Mbtbdfb+XMl4lvQ2oN/uzs1KPxjeLtm5kEDd5gbYTsXbN3u6G7T4/GN781b/MHBLzFxORrdRoK\njy49GN9onGy8O6uyYpQGTZS0X1L67WN5ML5vMk6uiEHA7YnOP4zH4hu1t794aw3+TxQoPxbflcYJ\nK0lSSfC4v6nBWcKV4Ok/PPlenDFOhEi4UOc5meebybTxJEkvJh69k2lL556I/M0FvuNLkzy9/Y79\neqi9C8YJS1Qy/DweX8dcFH7gLg2eiI/VfHeOw0yXmw91HgyXgOsvsdLPZ4Zj9flySL/e3XUGeb6Z\nGJzmlzKx7XL0LnmecmyJs3NZZPzVP1x6sn6ZXf6kbzT5lupptD5P6+5hvh+q7xY9ofbONSBV+wWK\n26LlaZAzFHQU5eZHzePNNCLYvStGUmKX6GvSx8/9ixz08POaM/W+pbcbfCe9BW08WlyfJ0KzOO0x\n3pubN4+F7c6U71g9G53enpJvqS5M65jaOxYno/l0mK8m4yDgt2pwKc0hXtpuEX/SzvdA3zBpHSLj\ndsr3f5soj90rTViwIV44vRVvXmitQvnm423h1lFh6ngA494G3eJpWmg+6gzoFIi/Z4OL92LDKcjQ\nnHybwXGT740p3BnmRDUTvmdLy73dNjBB+f60NTv9vDmpiCVThnirvq35KDpSi/FbiR5l5cOAk+/C\n7YTvEkzHKCqE7xK8J0W+S7C5dfVKRtk3qRHC12Xt78kDRQ1+Q6q+vGWNJvgmWX4339Gr9OX75v1j\nSfY9areKQ3n7JyLhYINX1auYUB50N8R3NIy9+c7iGj5830g41pwQzUynfLczWm36B9L+Ud8pTzAU\nTwFXPrw0xTfsIPDhe8G9+Y5Wt5T2QR6FZC0TvXN+8Zqa9ylU8oyWIX0yoMEXfj+JGySq0RTfB+HN\n97XkxY/vW0IayT77DgkMYinCdqjP4JAKZJlWsmGix2uPoLmPNsW0s9nvRwUbwY/v9eq9PWy/r9Zj\nC9/b+f60mh9yf/2SVr7ns/ZkeNwY9t7VTS3wvdyc9qNObrP71N9IAW5fiHiDhlmb1mWC3STaQ2V/\n8hJwYZomm3HqRcvUmZYz0/b04XvFhIwZY7E8OwU5vkdjdfZWuYpnptFs4/s54ZeGEkVn31YU+d4d\nubh0WAxN8817P5OEWikin7CAFnaUSKheJLXIWGw1cAu4kSKLllLHTaR6oZfcfB8G5ggNvg9j7ZTH\nelqecbF5Db7XUrfEqWdwfjQm3yftpqZu6SHK3ewD2K1DGMTKj6KK4EAuEWaoRXlxCzinQrExdxsK\nWv/g5HuZP/mG8r0wLyYT0nJ0liHKd8e4WRHV1o5zfLeNETJFneSTH+E29Ztk7djKfmB5fNKKBgXc\nVZWlJ86F7vzk4WQyO/kuiALhu+B8xVSc32OTbzM2SateZjm+P/KUUlvLurusCBBvWssNURGbD5Nk\n036jf5uBgeMUcDqYbX4vrY7B+/BdMD+pP1/QpQlRKed1hvKda4iIxFlkKd9FCRZk2fTy+Kz+YVa1\naV1zoaqPliLj8upaMxISGipGXNgt8aoqvi35hjeyaIpKvpW2elc5vos2nyRT0kuhQOtLSwiyYxXX\n7GHQ+hsUcNciLbRracnqN8m33BtfreCbVBmd54kjv8O1gPvYZ7gy0u7D9nl7WgoceCr8vqUrZPpZ\nyi+b5JvqqlQt18e3XjKdFWAtLd5GNTvwbU9lgnI3wskwGtec0n1/Kna+Sb5bb9q+T2dhFd/mOuDg\nW2pHvOt2eWIwpw131MG3Rb7xMXcrnzEjdpnFG22Ub1Nsa5NvWqTu9nggVGfmH6GNubWBTGOZzCJV\nX1UCTvs+sFxukm9y+aVGvmlDzvUSxftoJveycVu3zIDqyK13MAsrldhv57uMpMwEtSokMKFzqzH+\n7rEiUEb1icVabZRvEnX/W/oEh/+eawhWUVvED0zovH0B3ax2s3Tn8r/ZappvbRo51sub+JbakXKu\nlzyL1U3zFIE/bzukCuQkH/CNIZlqMTz0WPSgLYtxo/YgmVp12oPEmnHZg+VGHBg5tu2ssGOwMCQ4\nsbAqc0z8HUvvmuSberaTGv0dsufF5e9Alrh49BcOvFiaDOGt4qPA2EhFYo2O+r0weZrkm8ysbrU/\nfxPf
CXFbXb4HBGb2xd6BAi86gW/ZFYvn7pE5pvGqoqneYLxKjPQ3HfGqCr4LzRrZKkd+qyrIBNGG\nQgwPLUjLbl90tSp+mG4ZLhy2J4j038X3rhAbp1mEY/xdvmc5TcBIbMsZjxUVWQJ8bvnFD9Jh1oMK\nYAdVxW5ZI9/QMV9PYKRh7uI7WptVrsLIUefzDf58R+/GsKWklR0OdYLSaM2CwY/mvBrUsBZrjthG\n5QfsUJVxTkZjeool3Mhs3sd3NG3rlnliZBoL+bQb+I5GCrITTKonI2fs0N6gbe2HIqGiNd0XsH5K\nwiS2uqEchFmnuZ7Jc7UFV+Nc0eSdfKcz8PNS0C3ER64s79LQd/mOuvvBufqbK/a1My44QtHWQCwB\naCZDK6ERW7KXHZaEiqgVa0U57Dr9fqdQd3o33ynW8/6iUMv1dR/fKbaL/nyZ77CrHgLcljJrGY+Q\npT4P+sQlChqP86g4zVfWVO/jwbcNWWTuHr6tcOyyxlZK64gFPEHNLVJlsSCzMWfaosrVEsXKbwsa\n4rubCULdfO8dvg4sTuVGDAp4H2cK+gyljaPCqXLq66nX/B7ftnrNGvh21WvixK84xAULMWCqoGDa\njvHNdbbStxV11CN/i++JrR75fr7nrkwaaIaqdAx2aXsVcIw87gqFDBp4U6V1VFbJT+yru/gueZHN\ndqyLo+rk2/1GImiz0ojBTSZXjYKZ6HbV0gA+T8UcaJXtJ+mSGN5dfK+sS8Sc7Dv7Ht9W6qev7rw8\nGMrVLj/q6ydJSomqi3qg1speS4GQ8iXKoyPrile9/GdUDF6wNmrR/Pge5eMnPdPmPmPvsV8KYtUO\nl19HlxjDsm3HiwTAA61OHLfO+wFHhg27O6p0Fe9myPh+ieDzS7YfEG+w8A3XpkOW3w/YeTVDKinf\n2FJpO5caspRv+NznTD2bnd577QeEkljX3m1cIQ9vONVdx/lAuYT7qSdqsl9c9rt2l6vJdb+rAGS3\n4OdrTxl+tpwYwek1xuXz4tKV6XLzxIv7XUtbGuR+VN95ljQJ76fs7hZfY8/9rmANOovyE9AoG5hI\nfWfSaO73XC5DkVyc906kI/OpvbsR8aVxLgpbtO8DO3f33Gf//dxgkDkP8I7zK8TWWdMCc8ca0fql\ngL1nHvXaK5Nv9xHrkADyLIf+FYBUo8eeE2XsrHGfIoU7jl3HM/4m+Mu3UQAa9T0qEoN8F+Gtv1PE\n2glZe51LmKXiXMe9/iaAfeJlQyTbW8QbfZR/4oylfwRgQ3jsYWOc7NMb+VQ4Z4+nAQPvYQGOo3uL\nD0sMD9Z9EgJmOP+FI63+GUD8xLXFh4lcwMAZCoOUmiN+8ssAMRnHOd9Ed/uqFCgwCcslBR7LUxkg\njHtI9wK1eLVDj/5RjSfS/QSAGFblDvRm+Wj5pkOly4rDRdF47Ae+DWAx26hUXNUXoZi+jn46LiNT\nv273D7x56rGAbuPJTnhMDnK4FK/FtIA6X593BcNM8LzpF089HHQZyMZCHqO1Wp3rEhn39J/WsUXE\nY36A68E4KUAHojqtHHlMTEiuG73KuEWslRUvfOkVn0d18vKXItHiehI6/8ESNaG5KGJxs4QWMuyl\nwLpMFiuSwHLmJH4lyAoYdeefXPEUSrRXhovzZGRZzUKG5XPvctyuUK1nErZ1vnnrl0LmTgpazued\nnDu56+V3oeWq/7rrxXyxNo5COgS6SyAnUTUsS2nSOlR/p+FXBj404kFVZdmubTMzGLHLrY/oj4/i\ngcDUqoy47rMqCYHYzjbOMP0IlmA1koGVvOmXqLDp+MBaANg9lT2iAATj403+2OnF0VHIwrjc51XR\nuvIRBSBSi7u9Wmacd9ebz2I1kgUyNQLnYM7sFl+Db5+t/wvBpBByPJmM5S3VSHGieGs8nAzS/+94\nd8RvBWO+b5a6/1sBAQEBAQEBAQEBAQEBAQEBAQEBAT8X/wMb5cqZ5/G69AAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "execution_count": 1, "metadata": { "image/png": { "width": 400 } }, "output_type": "execute_result" } ], "source": [ "from IPython.display import Image\n", "Image('airbnb.png', width=400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##Introduction" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "In this [competition](https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings) Airbnb challenged Kagglers to predict in which country a new user will make his or her first booking. \n", "The below implementation is the result of the team work with [Gabriele Lanaro](http://gabrielelanaro.github.io/). Both of us actively contributed to code, technical discussions and finding solutions to blockers along the road.\n", "The final pipeline consists of the following steps:\n", "1. Reading and preprocessing the available data (including user sessions).\n", "2. Using a Random Forest classifier to identify negligible features. Specifically we decided to remove all the features having less than 0.1% of relative importance.\n", "3. Running Cross Validation to select best classifier/hyperparameters. This step pointed us in the direction of the Bagging algorithm. \n", "4. Due to technical restrictions in our infrastructure (limitations in RAM and computational power) we ran 5 separate Bagging Classifiers on the final dataset and averaged the results to come up with a definitive outcome.\n", "5. 
Generating the submission file on top of the previous model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Getting Train and Test set and outputting a data set ready to be fed into a Machine Learning pipeline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def make_user_features(train, test):\n", " #encoding country destinations in train set\n", " outcome = train.country_destination\n", " labels = outcome.values\n", " le = LabelEncoder()\n", " y = le.fit_transform(labels)\n", " train = train.drop(['country_destination'], axis=1)\n", "\n", " #storing user ids in test set\n", " id_test = test['id']\n", "\n", " #appending test to train and dropping date first booking which is redundant\n", " data = pd.concat((train, test), axis=0, ignore_index=True)\n", " data = data.drop(['date_first_booking'], axis=1)\n", "\n", " #extracting features from date_account_created\n", " data['dac_year'] = data.date_account_created.apply(lambda x: x.year)\n", " data['dac_month'] = data.date_account_created.apply(lambda x: x.month)\n", " data['dac_weekday'] = data.date_account_created.apply(lambda x: x.weekday())\n", " data = data.drop(['date_account_created'], axis=1)\n", "\n", " #extracting features from timestamp_first_active\n", " data['tfa_year'] = data.timestamp_first_active.apply(lambda x: x.year)\n", " data['tfa_month'] = data.timestamp_first_active.apply(lambda x: x.month)\n", " data['tfa_weekday'] = data.timestamp_first_active.apply(lambda x: x.weekday())\n", " data = data.drop(['timestamp_first_active'], axis=1)\n", "\n", " #filling age nan with age median\n", " data.age = data.age.fillna(data.age.median())\n", "\n", " #binning age column\n", " bins = list(np.arange(15, 85, 5))\n", " bins.insert(0,0)\n", " bins.append(int(max(data.age)))\n", " group_names = ['<15', '15-20', '20-25', '25-30', '30-35', '35-40', '40-45', '45-50',\n", " '50-55', '55-60', '60-65', '65-70', '70-75', '75-80', '>80']\n", " data['age_bucket'] = pd.cut(data['age'], bins, labels=group_names)\n", "\n", " #cleaning gender column and filling nan in all dataframe with 'unknown'\n", " data.gender = data.gender.replace('-unknown-','unknown')\n", " data.ix[:, data.columns != 'age_bucket'] = data.ix[:, data.columns != 'age_bucket'].fillna('unknown')\n", "\n", " #generating dummy variables in top of categorical columns\n", " to_be_dummified = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', \n", " 'affiliate_provider', 'first_affiliate_tracked', 'signup_app',\n", " 'first_device_type', 'first_browser','age_bucket']\n", " for f in to_be_dummified:\n", " dummies = pd.get_dummies(data[f], prefix=f)\n", " data = data.drop([f], axis=1)\n", " data = pd.concat((data, dummies), axis=1)\n", "\n", " return data[:train.shape[0]], data[train.shape[0]:], y, le" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "####Adding Sessions to the previously generated User data set" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def make_sessions_features(data, df_sessions):\n", " # Drop row with nan values from the \"user_id\" column as they're useless\n", " df_sessions = df_sessions.dropna(subset=[\"user_id\"])\n", "\n", " # print df_sessions\n", "\n", " # Frequency of devices - by user\n", " device_freq = df_sessions.groupby('user_id').device_type.value_counts()\n", " \n", " # Frequency of actions taken - by user\n", " action_freq = 
df_sessions.groupby('user_id').action.value_counts()\n", "\n", " # Total list of users\n", " users = data.id.values\n", " def feature_dict(df):\n", " f_dict = dict(list(df.groupby(level='user_id')))\n", " res = {}\n", " for k, v in f_dict.items():\n", " v.index = v.index.droplevel('user_id')\n", " res[k] = v.to_dict()\n", " return res\n", "\n", " # Make a dictionary with the frequencies { 'user_id' : {\"IPhone\": 2, \"Windows\": 1}}\n", " action_dict = feature_dict(action_freq)\n", " device_dict = feature_dict(device_freq)\n", "\n", " # Transform to a list of dictionaries\n", " action_rows = [action_dict.get(k, {}) for k in users]\n", " device_rows = [device_dict.get(k, {}) for k in users]\n", "\n", " device_transf = DictVectorizer()\n", " tf = device_transf.fit_transform(device_rows)\n", "\n", " action_transf = DictVectorizer()\n", " tf2 = action_transf.fit_transform(action_rows)\n", "\n", " # Concatenate the two datasets\n", " # Those are row vectors with the frequencies of both device and actions [0, 0, 0, 2, 0, 1, ...]\n", " features = sp.hstack([tf, tf2])\n", "\n", " # We create a dataframe with the new features and we write it to disk\n", " df_sess_features = pd.DataFrame(features.todense())\n", " \n", " df_sess_features['id'] = users\n", "\n", " #left joining data and sessions on user_id\n", " final = pd.merge(data, df_sess_features, how='left', left_on='id', right_on='id')\n", " final.ix[:, final.columns != 'age_bucket'].fillna(-1, inplace=True)\n", "\n", " final.drop(['id'], axis=1, inplace=True)\n", " return final" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "####Putting together the competition specific evaluation metrics, NDCG (Normalized discounted cumulative gain) @k where k=5. " ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\" Reference from https://gist.github.com/bwhite/3726239\n", "\"\"\"\n", "\n", "import numpy as np\n", "\n", "def mean_ndcg(clf, X, y):\n", " # Predict class probabilities\n", " y_predict = clf.predict_proba(X)\n", " # Get highest 5 predictions\n", " best_5 = np.argsort(-y_predict, axis=1)[:, :5]\n", "\n", " # Transform to relevance scores\n", " relevance = (best_5 == y[:, np.newaxis]).astype('int')\n", "\n", " # Calculate ndcg for each sample and take average (?)\n", " return np.mean([ndcg_at_k(row, 5) for row in relevance])\n", "\n", "def dcg_at_k(r, k, method=0):\n", " \"\"\"Score is discounted cumulative gain (dcg)\n", " Relevance is positive real values. 
Can use binary\n", " as the previous methods.\n", " Example from\n", " http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf\n", " >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]\n", " >>> dcg_at_k(r, 1)\n", " 3.0\n", " >>> dcg_at_k(r, 1, method=1)\n", " 3.0\n", " >>> dcg_at_k(r, 2)\n", " 5.0\n", " >>> dcg_at_k(r, 2, method=1)\n", " 4.2618595071429155\n", " >>> dcg_at_k(r, 10)\n", " 9.6051177391888114\n", " >>> dcg_at_k(r, 11)\n", " 9.6051177391888114\n", " Args:\n", " r: Relevance scores (list or numpy) in rank order\n", " (first element is the first item)\n", " k: Number of results to consider\n", " method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]\n", " If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]\n", " Returns:\n", " Discounted cumulative gain\n", " \"\"\"\n", " r = np.asfarray(r)[:k]\n", " if r.size:\n", " if method == 0:\n", " return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))\n", " elif method == 1:\n", " return np.sum(r / np.log2(np.arange(2, r.size + 2)))\n", " else:\n", " raise ValueError('method must be 0 or 1.')\n", " return 0.\n", "\n", "\n", "def ndcg_at_k(r, k, method=0):\n", " \"\"\"Score is normalized discounted cumulative gain (ndcg)\n", " Relevance is positive real values. Can use binary\n", " as the previous methods.\n", " Example from\n", " http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf\n", " >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]\n", " >>> ndcg_at_k(r, 1)\n", " 1.0\n", " >>> r = [2, 1, 2, 0]\n", " >>> ndcg_at_k(r, 4)\n", " 0.9203032077642922\n", " >>> ndcg_at_k(r, 4, method=1)\n", " 0.96519546960144276\n", " >>> ndcg_at_k([0], 1)\n", " 0.0\n", " >>> ndcg_at_k([1], 2)\n", " 1.0\n", " Args:\n", " r: Relevance scores (list or numpy) in rank order\n", " (first element is the first item)\n", " k: Number of results to consider\n", " method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]\n", " If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]\n", " Returns:\n", " Normalized discounted cumulative gain\n", " \"\"\"\n", " dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)\n", " if not dcg_max:\n", " return 0.\n", " return dcg_at_k(r, k, method) / dcg_max" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "####Function to generate submission files for Kaggle" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def generate_submission(y_pred, id_test, le, filename):\n", " ids = [] #list of ids\n", " cts = [] #list of countries\n", " for i in range(len(id_test)):\n", " idx = id_test[i]\n", " ids += [idx] * 5\n", " cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()\n", "\n", " #Generate submission\n", " sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])\n", " sub.to_csv(os.path.join('data', filename),index=False)\n", " print 'Submission File Successfully Generated'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##Machine Learning Pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#importing packages\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_extraction import DictVectorizer\n", "import scipy.sparse as sp" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Reading and preprocessing data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { 
"collapsed": false }, "outputs": [], "source": [ "#reading data\n", "pretest = pd.read_csv(os.path.join('data', 'test_users.csv'), header=0, parse_dates=[1,2,3])\n", "pretrain = pd.read_csv(os.path.join('data', 'train_users_2.csv'), header=0, parse_dates=[1,2,3])\n", "df_sessions = pd.read_csv(\"data/sessions.csv\", encoding='utf8')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " id date_account_created timestamp_first_active date_first_booking \\\n", "0 gxn3p5htnn 2010-06-28 2009-03-19 04:32:55 NaT \n", "1 820tgsjxq7 2011-05-25 2009-05-23 17:48:09 NaT \n", "2 4ft3gnwmtx 2010-09-28 2009-06-09 23:12:47 2010-08-02 \n", "3 bjjt8pjhuk 2011-12-05 2009-10-31 06:01:29 2012-09-08 \n", "4 87mebub9p4 2010-09-14 2009-12-08 06:11:05 2010-02-18 \n", "\n", " gender age signup_method signup_flow language affiliate_channel \\\n", "0 -unknown- NaN facebook 0 en direct \n", "1 MALE 38 facebook 0 en seo \n", "2 FEMALE 56 basic 3 en direct \n", "3 FEMALE 42 facebook 0 en direct \n", "4 -unknown- 41 basic 0 en direct \n", "\n", " affiliate_provider first_affiliate_tracked signup_app first_device_type \\\n", "0 direct untracked Web Mac Desktop \n", "1 google untracked Web Mac Desktop \n", "2 direct untracked Web Windows Desktop \n", "3 direct untracked Web Mac Desktop \n", "4 direct untracked Web Mac Desktop \n", "\n", " first_browser country_destination \n", "0 Chrome NDF \n", "1 Chrome NDF \n", "2 IE US \n", "3 Firefox other \n", "4 Chrome US " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pretrain.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "train, test, y, le = make_user_features(pretrain, pretest)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = pd.concat((train, test), axis=0, ignore_index=True)\n", "final = make_sessions_features(data, df_sessions)\n", "\n", "del data" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X_train = final.ix[:train.shape[0]-1]\n", "X_test = final.ix[train.shape[0]:]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "assert train.shape[0] == y.shape[0]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train Set shape: (213451, 549)\n", "Test Set shape: (62096, 549)\n", "Labels shape: (213451L,)\n" ] } ], "source": [ "assert X_train.shape[0] == train.shape[0]\n", "assert X_train.shape[0] == y.shape[0]\n", "print 'Train Set shape:', X_train.shape\n", "print 'Test Set shape:', X_test.shape\n", "print 'Labels shape:', y.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "del final" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Selecting irrelevant features via a Random Forest Classifier" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of features to be discarded: 311\n" ] } ], "source": [ "#fitting a Random Forest Classifier to select negligible features\n", "from sklearn.ensemble import RandomForestClassifier\n", "clf = RandomForestClassifier(n_estimators=160, oob_score=True, n_jobs=-1, criterion='entropy')\n", "clf.fit(X_train, y)\n", "print 'Number of features to be discarded: ', np.count_nonzero(clf.feature_importances_ < 1e-4)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#getting uninportant features\n", "unimportant_features = clf.feature_importances_ < 1e-4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Selecting 
best model/hyperparameters via Cross Validation" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Splitting Train set into Train and Test again to perform CV\n", "#The classification task is very challenging due to the data being extremely skewed.\n", "#Hence, we make sure to stratify our samples as almost 95% of them are covered by 3 classes only: NDF, US, OTHER. \n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.metrics import f1_score\n", "from sklearn.grid_search import GridSearchCV\n", "from pprint import pprint\n", "\n", "sub_X_train, sub_X_test, sub_y_train, sub_y_test = train_test_split(X_train, y, test_size=0.33, random_state=42, stratify=y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Bagging" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Done 17 tasks | elapsed: 1.6min\n", "[Parallel(n_jobs=-1)]: Done 81 out of 81 | elapsed: 62.5min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 27 candidates, totalling 81 fits\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=BaggingClassifier(base_estimator=None, bootstrap=True,\n", " bootstrap_features=False, max_features=1.0, max_samples=1.0,\n", " n_estimators=10, n_jobs=1, oob_score=False, random_state=42,\n", " verbose=0, warm_start=False),\n", " fit_params={}, iid=True, n_jobs=-1,\n", " param_grid={'n_estimators': [10, 50, 100], 'max_samples': [0.1, 0.5, 1.0], 'max_features': [0.1, 0.5, 1.0]},\n", " pre_dispatch='2*n_jobs', refit=True, scoring='f1_weighted',\n", " verbose=4)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import BaggingClassifier\n", "bagg = BaggingClassifier(random_state=42)\n", "\n", "param_grid = {\"n_estimators\": [10, 50, 100],\n", " \"max_samples\": [0.1, 0.5, 1.0],\n", " \"max_features\": [0.1, 0.5, 1.0]}\n", "\n", "baggsearch = GridSearchCV(bagg, param_grid, scoring='f1_weighted', cv=3, verbose=4, n_jobs=-1)\n", "baggsearch.fit(sub_X_train, sub_y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####Bagging Classifier: Results of Search Grid CV (evaluation metrics: F1 Weighted Score)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[mean: 0.59403, std: 0.00169, params: {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 100},\n", " mean: 0.58978, std: 0.00201, params: {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 50},\n", " mean: 0.58491, std: 0.00053, params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100},\n", " mean: 0.58245, std: 0.00059, params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 50},\n", " mean: 0.58213, std: 0.00056, params: {'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 100},\n", " mean: 0.58174, std: 0.00056, params: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100},\n", " mean: 0.58031, std: 0.00087, params: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100},\n", " mean: 0.57837, std: 0.00108, params: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 50},\n", " mean: 0.57829, std: 0.00091, params: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 50},\n", " mean: 
0.57760, std: 0.00127, params: {'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 50},\n", " mean: 0.57705, std: 0.00046, params: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100},\n", " mean: 0.57442, std: 0.00063, params: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 50},\n", " mean: 0.56925, std: 0.00149, params: {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 10},\n", " mean: 0.56588, std: 0.00250, params: {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 10},\n", " mean: 0.56550, std: 0.00109, params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 10},\n", " mean: 0.56489, std: 0.00079, params: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 10},\n", " mean: 0.56259, std: 0.00147, params: {'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10},\n", " mean: 0.56244, std: 0.00104, params: {'max_features': 0.5, 'max_samples': 0.1, 'n_estimators': 10},\n", " mean: 0.47828, std: 0.00109, params: {'max_features': 0.1, 'max_samples': 1.0, 'n_estimators': 10},\n", " mean: 0.47787, std: 0.00053, params: {'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 10},\n", " mean: 0.47697, std: 0.00067, params: {'max_features': 0.1, 'max_samples': 0.1, 'n_estimators': 10},\n", " mean: 0.47294, std: 0.00097, params: {'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 50},\n", " mean: 0.47263, std: 0.00159, params: {'max_features': 0.1, 'max_samples': 1.0, 'n_estimators': 50},\n", " mean: 0.47212, std: 0.00135, params: {'max_features': 0.1, 'max_samples': 0.1, 'n_estimators': 50},\n", " mean: 0.47004, std: 0.00131, params: {'max_features': 0.1, 'max_samples': 1.0, 'n_estimators': 100},\n", " mean: 0.46982, std: 0.00064, params: {'max_features': 0.1, 'max_samples': 0.5, 'n_estimators': 100},\n", " mean: 0.46934, std: 0.00064, params: {'max_features': 0.1, 'max_samples': 0.1, 'n_estimators': 100}]\n" ] } ], "source": [ "pprint(sorted(baggsearch.grid_scores_, key=lambda x: -x.mean_validation_score))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NDCG on Test Set 0.914233699603\n", "F1 Weighted Score on Test Set 0.594105356053\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\pochetti\\AppData\\Local\\Continuum\\Anaconda\\lib\\site-packages\\sklearn\\metrics\\classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "print 'NDCG on Test Set', mean_ndcg(baggsearch.best_estimator_, sub_X_test, sub_y_test)\n", "print 'F1 Weighted Score on Test Set', f1_score(sub_y_test, baggsearch.best_estimator_.predict(sub_X_test), average='weighted') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###KNN" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n", "[CV] n_neighbors=1 ...................................................\n", "[CV] .......................... n_neighbors=1, score=0.495831 -21.6min\n", "[CV] n_neighbors=1 ...................................................\n", "[CV] .......................... n_neighbors=1, score=0.498015 -22.2min\n", "[CV] n_neighbors=1 ...................................................\n", "[CV] .......................... 
n_neighbors=1, score=0.497071 -20.3min\n", "[CV] n_neighbors=2 ...................................................\n", "[CV] .......................... n_neighbors=2, score=0.491657 -23.1min\n", "[CV] n_neighbors=2 ...................................................\n", "[CV] .......................... n_neighbors=2, score=0.490935 -24.1min\n", "[CV] n_neighbors=2 ...................................................\n", "[CV] .......................... n_neighbors=2, score=0.489456 -23.1min\n", "[CV] n_neighbors=4 ...................................................\n", "[CV] .......................... n_neighbors=4, score=0.529255 -24.0min\n", "[CV] n_neighbors=4 ...................................................\n", "[CV] .......................... n_neighbors=4, score=0.526354 -24.9min\n", "[CV] n_neighbors=4 ...................................................\n", "[CV] .......................... n_neighbors=4, score=0.527694 -24.1min\n", "[CV] n_neighbors=8 ...................................................\n", "[CV] .......................... n_neighbors=8, score=0.540271 -27.4min\n", "[CV] n_neighbors=8 ...................................................\n", "[CV] .......................... n_neighbors=8, score=0.536771 -30.2min\n", "[CV] n_neighbors=8 ...................................................\n", "[CV] .......................... n_neighbors=8, score=0.536292 -24.7min\n", "[CV] n_neighbors=16 ..................................................\n", "[CV] ......................... n_neighbors=16, score=0.542694 -25.5min\n", "[CV] n_neighbors=16 ..................................................\n", "[CV] ......................... n_neighbors=16, score=0.542619 -25.0min\n", "[CV] n_neighbors=16 ..................................................\n", "[CV] ......................... 
n_neighbors=16, score=0.540317 -23.5min" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\pochetti\\AppData\\Local\\Continuum\\Anaconda\\lib\\site-packages\\sklearn\\metrics\\classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n", "[Parallel(n_jobs=1)]: Done 15 out of 15 | elapsed: 363.8min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_jobs=1, n_neighbors=2, p=2,\n", " weights='uniform'),\n", " fit_params={}, iid=True, n_jobs=1,\n", " param_grid={'n_neighbors': [1, 2, 4, 8, 16]},\n", " pre_dispatch='2*n_jobs', refit=True, scoring='f1_weighted',\n", " verbose=4)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "neigh = KNeighborsClassifier(n_neighbors=2)\n", "\n", "param_grid = {\"n_neighbors\": [1, 2, 4, 8, 16]}\n", "\n", "knnsearch = GridSearchCV(neigh, param_grid, scoring='f1_weighted', cv=3, verbose=4, n_jobs=1)\n", "knnsearch.fit(sub_X_train, sub_y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####KNN Classifier: Results of Search Grid CV (evaluation metrics: F1 Weighted Score)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[mean: 0.54188, std: 0.00110, params: {'n_neighbors': 16},\n", " mean: 0.53778, std: 0.00177, params: {'n_neighbors': 8},\n", " mean: 0.52777, std: 0.00119, params: {'n_neighbors': 4},\n", " mean: 0.49697, std: 0.00089, params: {'n_neighbors': 1},\n", " mean: 0.49068, std: 0.00092, params: {'n_neighbors': 2}]\n" ] } ], "source": [ "pprint(sorted(knnsearch.grid_scores_, key=lambda x: -x.mean_validation_score))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NDCG on Test Set 0.891440349619\n", "F1 Weighted Score on Test Set 0.54514117663\n" ] } ], "source": [ "print 'NDCG on Test Set', mean_ndcg(knnsearch.best_estimator_, sub_X_test, sub_y_test)\n", "print 'F1 Weighted Score on Test Set', f1_score(sub_y_test, knnsearch.best_estimator_.predict(sub_X_test), average='weighted') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For the sake of simplicity and for the general purpose of a proof of concept, we show just the tests performed with KNN and Bagging. For your reference we explored AdaBoosting and Random Forest from scikit-learn, together with XgbBoost from an independent library (you can have a look [here](https://github.com/FraPochetti/Airbnb/blob/master/AirbnbXgbBoosting.ipynb) at a sample implementation of this algorithm). None of the models was as successful as Bagging, hence we sticked with this algorithm. 
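" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "####XGBoost (illustrative sketch)\n", "A minimal, untuned sketch of the XGBoost approach mentioned above, assuming the xgboost package and its scikit-learn wrapper are installed. The hyperparameters below are placeholders chosen for illustration only; the linked notebook contains the implementation we actually ran." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Illustrative sketch only: a rough XGBoost baseline evaluated with the same metrics as above.\n", "#Hyperparameters are placeholders, not tuned values.\n", "import xgboost as xgb\n", "\n", "xgb_clf = xgb.XGBClassifier(objective='multi:softprob', max_depth=6, learning_rate=0.1,\n", "                            n_estimators=25, subsample=0.5, colsample_bytree=0.5)\n", "xgb_clf.fit(sub_X_train, sub_y_train)\n", "\n", "print 'NDCG on Test Set', mean_ndcg(xgb_clf, sub_X_test, sub_y_test)\n", "print 'F1 Weighted Score on Test Set', f1_score(sub_y_test, xgb_clf.predict(sub_X_test), average='weighted')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "As noted above, none of these alternatives beat Bagging on our validation split, so the final model below is a Bagging ensemble.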
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###Training the Final Bagging Model" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#training 5 separate Bagging Classifiers\n", "from sklearn.ensemble import BaggingClassifier\n", "def bagging_prediction(X_train, y_train, X_test, \n", " n_estimators=100, \n", " max_samples=0.1, \n", " max_features=1.0, \n", " random_state=None):\n", "\n", " bagg = BaggingClassifier(random_state=random_state, \n", " n_estimators=n_estimators, \n", " max_samples=max_samples, \n", " max_features=max_features)\n", " bagg.fit(X_train.ix[:, ~unimportant_features], y_train)\n", " return bagg.predict_proba(X_test.ix[:, ~unimportant_features])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [], "source": [ "probs = []\n", "for i in range(5):\n", " p = bagging_prediction(X_train, y, \n", " X_test,\n", " n_estimators=100,\n", " random_state=i)\n", " probs.append(p)\n", "\n", "# We take the average of the 5 models\n", "avg_probs = sum(probs)/len(probs)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Submission File Successfully Generated\n" ] } ], "source": [ "y_pred = avg_probs\n", "id_test = pretest.id.values\n", "generate_submission(y_pred, id_test, le, 'finalsubmission.csv')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "##Conclusions\n", "At the time of writing (2016, February 2) this approch is ensuring us a NDCG score in the Public Kaggle Leaderboard of **0.87661** corresponding to a relative position on 223 over 1277 participants. " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }