{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Mangaki Data Challenge" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we describe some baseline strategies in order to tackle the Mangaki Data Challenge.\n", "\n", "- See the blog post on [research.mangaki.fr](http://research.mangaki.fr)\n", "- Feel free to send any comments at: `jj@mangaki.fr`!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df_watched = pd.read_csv('../data/mdc/watched.csv')\n", "df_train = pd.read_csv('../data/mdc/train.csv')\n", "df_test = pd.read_csv('../data/mdc/test.csv')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idwork_idrating
07178025dislike
111061027neutral
219703949neutral
316859815like
417033482like
\n", "
" ], "text/plain": [ " user_id work_id rating\n", "0 717 8025 dislike\n", "1 1106 1027 neutral\n", "2 1970 3949 neutral\n", "3 1685 9815 like\n", "4 1703 3482 like" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_watched.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idwork_idrating
0504041wontsee
15081713wontsee
217807053willsee
36588853wontsee
410039401wontsee
\n", "
" ], "text/plain": [ " user_id work_id rating\n", "0 50 4041 wontsee\n", "1 508 1713 wontsee\n", "2 1780 7053 willsee\n", "3 658 8853 wontsee\n", "4 1003 9401 wontsee" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idwork_id
04861086
115093296
26171086
32709648
44593647
\n", "
" ], "text/plain": [ " user_id work_id\n", "0 486 1086\n", "1 1509 3296\n", "2 617 1086\n", "3 270 9648\n", "4 459 3647" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compute basic count features" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import Counter\n", "import numpy as np\n", "\n", "nb = Counter()\n", "for user_id, work_id, choice in np.array(pd.concat([df_watched, df_train])):\n", " nb[('user', user_id, choice)] += 1\n", " nb[('work', work_id, choice)] += 1" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(('user', 488, 'neutral'), 1119),\n", " (('work', 9815, 'like'), 1050),\n", " (('work', 991, 'like'), 927),\n", " (('work', 4487, 'like'), 893),\n", " (('work', 1701, 'like'), 822)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nb.most_common(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For example, this means that user 488 rated 1119 works as neutral, and work 9815 was liked 1050 times.\n", "\n", "This will be useful: [BC's solution](#Classifier-#3:-BC's-solution) (ranked **#5**) was solely based on those basic count features, not even on the watched dataset!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Embed ratings into (ad-hoc) values" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "rating_values = {'love': 2, 'like': 2, 'dislike': -2, 'neutral': -1, 'willsee': 1, 'wontsee': 0}" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_watched['value'] = df_watched['rating'].map(rating_values)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_train['value'] = df_train['rating'].map(rating_values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This will basically add one column to our datasets." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idwork_idratingvalue
07178025dislike-2
111061027neutral-1
219703949neutral-1
316859815like2
417033482like2
\n", "
" ], "text/plain": [ " user_id work_id rating value\n", "0 717 8025 dislike -2\n", "1 1106 1027 neutral -1\n", "2 1970 3949 neutral -1\n", "3 1685 9815 like 2\n", "4 1703 3482 like 2" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_watched.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Watched dataset: (198970, 2) dislike/love/neutral/like\n", "Train dataset: (11112, 2) willsee/wontsee\n", "Test dataset: (100015, 2)\n" ] } ], "source": [ "X_watched = np.array(df_watched[['user_id', 'work_id']])\n", "y_watched = df_watched['value']\n", "y_text = df_watched['rating']\n", "X_train = np.array(df_train[['user_id', 'work_id']])\n", "y_train = df_train['value']\n", "X_test = np.array(df_test)\n", "print('Watched dataset:', X_watched.shape, '/'.join(set(y_text)))\n", "print('Train dataset:', X_train.shape, 'willsee/wontsee')\n", "print('Test dataset:', X_test.shape)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "nb_users = 1 + max(df_watched['user_id'])\n", "nb_works = 1 + max(df_watched['work_id'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train Alternate Least Squares on the watched dataset" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing M: (1983 × 9897)\n", "Chrono: fill and center matrix [0q, 1619ms]\n", "Shapes (1983, 20) (20, 9897)\n", "Chrono: factor matrix [0q, 9049ms]\n" ] } ], "source": [ "from mangaki.algo.als import MangakiALS\n", "\n", "als = MangakiALS(20)\n", "als.set_parameters(nb_users, nb_works)\n", "als.fit(X_watched, y_watched)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifier #0: Dummy classifier with constant prediction (AUC = 50%, ranked 28/33)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# This part is only executable by admin who knows truth.csv, it is used for evaluation\n", "y_test = np.array(pd.read_csv('../data/mdc/truth.csv')['rating'].map({'willsee': 1, 'wontsee': 0}))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Dummy prediction that constantly predicts 0 (wontsee)\n", "y_pred = [0] * len(y_test)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.59558066290056488" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import accuracy_score, roc_auc_score\n", "\n", "accuracy_score(y_test, y_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you always predict `wontsee` (0), you will get 59.6% accuracy…" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "… but only **50% AUC**." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifier #1: Logistic Regression (AUC = 69%, ranked 18/33)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We first have to build features for each user-work pair.\n", "\n", "For this, let's use the features extracted by the ALS algorithm earlier, together with the basic count features:\n", "> *How many love/like/neutral/dislike ratings does this user/work have?*" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "def build_features(user_id, work_id):\n", "    # ALS features (user-work interaction, user embedding, work embedding),\n", "    # then count features; note that the watched dataset uses 'love', not 'favorite'\n", "    return np.concatenate((als.U[user_id] * als.VT.T[work_id],\n", "                           als.U[user_id],\n", "                           als.VT.T[work_id],\n", "                           [nb[('user', user_id, choice)] for choice in ['love', 'like', 'neutral', 'dislike']],\n", "                           [nb[('work', work_id, choice)] for choice in ['love', 'like', 'neutral', 'dislike']]))" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X_train_reg = [build_features(user_id, work_id) for user_id, work_id in X_train]\n", "X_test_reg = [build_features(user_id, work_id) for user_id, work_id in X_test]" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf = LogisticRegression()\n", "clf.fit(X_train_reg, y_train)  # 2 s" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred_binary = clf.predict(X_test_reg)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred = clf.predict_proba(X_test_reg)[:, 1]" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Counter({0: 69450, 1: 30565})" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Counter(y_pred_binary)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So it predicted about 69k wontsee and 31k willsee." ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.653641953706944" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "accuracy_score(y_test, y_pred_binary)" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.69347185479684292" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred)\n", "# Best AUC: 0.70123, so ranked #18 / 33" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Logistic regression achieves 65% accuracy and **69% AUC**, enough to be ranked **#18**."
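] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you do not have access to `truth.csv`, you can still estimate the AUC by cross-validation on the training set alone (a minimal sketch, not part of the original submission; the estimate will differ from the leaderboard score):" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", "\n", "# Mean AUC over 5 folds of the training set, without touching the test labels\n", "cross_val_score(LogisticRegression(), X_train_reg, y_train, scoring='roc_auc', cv=5).mean()"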
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifier #2: Gradient Boosted Trees (AUC = 81%, ranked 8/33)" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", " learning_rate=0.1, loss='deviance', max_depth=3,\n", " max_features=None, max_leaf_nodes=None,\n", " min_impurity_split=1e-07, min_samples_leaf=1,\n", " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", " n_estimators=300, presort='auto', random_state=None,\n", " subsample=1.0, verbose=0, warm_start=False)" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import GradientBoostingClassifier\n", "\n", "gbc = GradientBoostingClassifier(n_estimators=300)\n", "gbc.fit(X_train_reg, y_train)  # 7 s if 100 estimators, 18 s if 200, 20 s if 300" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.74767784832275164" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred_binary = gbc.predict(X_test_reg)\n", "accuracy_score(y_test, y_pred_binary)\n", "# Accuracy is 0.72589 if 100 estimators, 0.73984 if 200, 0.74768 if 300" ] }, { "cell_type": "code", "execution_count": 146, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred = gbc.predict_proba(X_test_reg)[:, 1]" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.81260462171306735" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred)\n", "# Best AUC: 0.78795 if 100 estimators, 0.80553 if 200, 0.81260 if 300" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This nonlinear classifier achieves 75% accuracy and **81% AUC**, enough to be ranked **#8**.\n", "\n", "Remember, [the winning solution by GeniusIke](https://wattlebird.github.io/2017/10/02/Mangaki-Data-challange-1st-place-solution/) had **86% AUC**." ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Tip: Locate the errors" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It can help to track where the mistakes were. What did the classifier get wrong?"
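] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A first, coarse answer is the confusion matrix (a minimal sketch, not in the original notebook); the per-user breakdown below is more telling:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", "# Rows are true classes (wontsee, willsee), columns are predicted classes\n", "confusion_matrix(y_test, y_pred_binary)"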
] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User #425 got 274 mistakes\n", "They rated {'favorite': 0, 'like': 304, 'neutral': 32, 'dislike': 50, 'willsee': 34, 'wontsee': 81}\n", "User #1550 got 274 mistakes\n", "They rated {'favorite': 0, 'like': 115, 'neutral': 162, 'dislike': 106, 'willsee': 31, 'wontsee': 111}\n", "User #130 got 272 mistakes\n", "They rated {'favorite': 0, 'like': 279, 'neutral': 17, 'dislike': 12, 'willsee': 31, 'wontsee': 43}\n", "User #1799 got 261 mistakes\n", "They rated {'favorite': 0, 'like': 133, 'neutral': 59, 'dislike': 38, 'willsee': 25, 'wontsee': 38}\n", "User #459 got 253 mistakes\n", "They rated {'favorite': 0, 'like': 115, 'neutral': 132, 'dislike': 8, 'willsee': 28, 'wontsee': 37}\n" ] } ], "source": [ "# Count the misclassified pairs per user and per work\n", "nb_errors = Counter()\n", "for (user_id, work_id), y_p, y_t in zip(X_test, y_pred_binary, y_test):\n", "    if y_p != y_t:\n", "        nb_errors[('user', user_id)] += 1\n", "        nb_errors[('work', work_id)] += 1\n", "\n", "# 'favorite' never occurs in this dataset (watched.csv uses 'love'), hence the zeros below\n", "for (error_type, error_id), mistakes in nb_errors.most_common(5):\n", "    if error_type == 'user':\n", "        print('User #{} got {} mistakes'.format(error_id, mistakes))\n", "        print('They rated', {choice: nb[('user', error_id, choice)] for choice in ['favorite', 'like', 'neutral', 'dislike', 'willsee', 'wontsee']})\n", "    else:\n", "        print('Work #{} got {} mistakes'.format(error_id, mistakes))\n", "        print('It was rated', {choice: nb[('work', error_id, choice)] for choice in ['favorite', 'like', 'neutral', 'dislike', 'willsee', 'wontsee']})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifier #3: BC's solution (AUC = 82.6%, ranked #5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Quite surprisingly, this solution got ranked **#5** without using the watched dataset.\n", "\n", "We try to reproduce it here.\n", "\n", "The idea: compute an average value in the training set and use it for prediction."
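] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The variants below derive their predictions from the willsee/wontsee counts only. A small helper makes the core quantity, the *willsee rate*, explicit (a minimal sketch, not part of BC's original solution; each variant inlines a version of this computation):" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def willsee_rate(kind, key):\n", "    \"\"\"Fraction of willsee among willsee + wontsee ratings, 0 if there are none.\"\"\"\n", "    yes, no = nb[(kind, key, 'willsee')], nb[(kind, key, 'wontsee')]\n", "    return yes / (yes + no) if yes + no > 0 else 0\n", "\n", "willsee_rate('user', 50)  # e.g. the user from the first training row"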
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Variant 1: Predict 1 if the user rated at least as many willsee as wontsee, 0 otherwise (AUC = 72.8%, ranked 17/33)" ] }, { "cell_type": "code", "execution_count": 177, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred_bc = []\n", "for user_id in X_test[:, 0]:\n", "    y_pred_bc.append(1 if nb[('user', user_id, 'willsee')] >= nb[('user', user_id, 'wontsee')] else 0)" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.72802620226714432" ] }, "execution_count": 178, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred_bc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Variant 2: Predict the willsee rate of the user (AUC = 77.7%, ranked 13/33)" ] }, { "cell_type": "code", "execution_count": 179, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred_bc = []\n", "for user_id in X_test[:, 0]:\n", "    user_yes, user_no = nb[('user', user_id, 'willsee')], nb[('user', user_id, 'wontsee')]\n", "    if user_yes + user_no > 0:\n", "        y_pred_bc.append(user_yes / (user_yes + user_no))\n", "    else:\n", "        y_pred_bc.append(0)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.77708060193706996" ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred_bc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Variant 3: Predict some combination of the willsee rates of the user and the work (AUC = 81%, ranked 8/33)" ] }, { "cell_type": "code", "execution_count": 182, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y_pred_bc = []\n", "for user_id, work_id in X_test:\n", "    user_yes, user_no = nb[('user', user_id, 'willsee')], nb[('user', user_id, 'wontsee')]\n", "    work_yes, work_no = nb[('work', work_id, 'willsee')], nb[('work', work_id, 'wontsee')]\n", "    user_rate = user_yes / (user_yes + user_no) if user_yes + user_no > 0 else 0\n", "    work_rate = work_yes / (work_yes + work_no) if work_yes + work_no > 0 else 0\n", "    if user_yes + user_no > 0:\n", "        y_pred_bc.append(0.73 * user_rate + 0.27 * work_rate)\n", "    elif work_yes + work_no > 0:\n", "        y_pred_bc.append(work_rate)\n", "    else:\n", "        y_pred_bc.append(0)" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.81446118251383193" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "roc_auc_score(y_test, y_pred_bc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hope you had fun competing!\n", "\n", "- Stay in touch on [Twitter @MangakiFR](https://twitter.com/mangakifr) to find out when the next challenge starts!\n", "- Try Mangaki: https://mangaki.fr" ] } ], "metadata": { "kernelspec": { "display_name": "Django Shell-Plus", "language": "python", "name": "django_extensions" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }