{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Important: This notebook will only work with fastai-0.7.x. Do not try to run any fastai-1.x code from this path in the repository because it will load fastai-0.7.x**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "\n",
    "from fastai.nlp import *\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import LinearSVC\n",
    "from torchtext import vocab, data, datasets\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): neither constant is used by the n-gram baseline cells directly below;\n",
    "# presumably a sequence length and a vocabulary cap for a later model - confirm.\n",
    "sl = 1000\n",
    "vocab_size = 200000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>link</th>\n",
       "      <th>time</th>\n",
       "      <th>favorites</th>\n",
       "      <th>rts</th>\n",
       "      <th>authors</th>\n",
       "      <th>category</th>\n",
       "      <th>published</th>\n",
       "      <th>summary</th>\n",
       "      <th>title</th>\n",
       "      <th>tweeted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>arxiv.org/abs/1611.10003</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[Tom A. F. Anderson, C. -H. Ruan]</td>\n",
       "      <td>q-bio.NC</td>\n",
       "      <td>2016-11-30 05:17:11</td>\n",
       "      <td>In summary of the research findings presented ...</td>\n",
       "      <td>Vocabulary and the Brain: Evidence from Neuroi...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>arxiv.org/abs/1611.10007</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[M. Amin Rahimian, Amir G. Aghdam]</td>\n",
       "      <td>cs.SY</td>\n",
       "      <td>2016-11-30 05:37:11</td>\n",
       "      <td>In this paper, structural controllability of a...</td>\n",
       "      <td>Structural Controllability of Multi-Agent Netw...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>arxiv.org/abs/1611.10010</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[Debidatta Dwibedi, Tomasz Malisiewicz, Vijay ...</td>\n",
       "      <td>cs.CV</td>\n",
       "      <td>2016-11-30 06:00:47</td>\n",
       "      <td>We present a Deep Cuboid Detector which takes ...</td>\n",
       "      <td>Deep Cuboid Detection: Beyond 2D Bounding Boxes</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>arxiv.org/abs/1611.10012</td>\n",
       "      <td>2016-12-01 01:46:12</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>[Jonathan Huang, Vivek Rathod, Chen Sun, Mengl...</td>\n",
       "      <td>cs.CV</td>\n",
       "      <td>2016-11-30 06:06:15</td>\n",
       "      <td>In this paper, we study the trade-off between ...</td>\n",
       "      <td>Speed/accuracy trade-offs for modern convoluti...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>arxiv.org/abs/1611.10014</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[Yoones Hashemi, Amir H. Banihashemi]</td>\n",
       "      <td>cs.IT</td>\n",
       "      <td>2016-11-30 06:12:45</td>\n",
       "      <td>In this paper, we propose a characterization o...</td>\n",
       "      <td>Characterization and Efficient Exhaustive Sear...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       link                 time  favorites  rts  \\\n",
       "0  arxiv.org/abs/1611.10003                  NaN        NaN  NaN   \n",
       "1  arxiv.org/abs/1611.10007                  NaN        NaN  NaN   \n",
       "2  arxiv.org/abs/1611.10010                  NaN        NaN  NaN   \n",
       "3  arxiv.org/abs/1611.10012  2016-12-01 01:46:12       11.0  2.0   \n",
       "4  arxiv.org/abs/1611.10014                  NaN        NaN  NaN   \n",
       "\n",
       "                                             authors  category  \\\n",
       "0                  [Tom A. F. Anderson, C. -H. Ruan]  q-bio.NC   \n",
       "1                 [M. Amin Rahimian, Amir G. Aghdam]     cs.SY   \n",
       "2  [Debidatta Dwibedi, Tomasz Malisiewicz, Vijay ...     cs.CV   \n",
       "3  [Jonathan Huang, Vivek Rathod, Chen Sun, Mengl...     cs.CV   \n",
       "4              [Yoones Hashemi, Amir H. Banihashemi]     cs.IT   \n",
       "\n",
       "             published                                            summary  \\\n",
       "0  2016-11-30 05:17:11  In summary of the research findings presented ...   \n",
       "1  2016-11-30 05:37:11  In this paper, structural controllability of a...   \n",
       "2  2016-11-30 06:00:47  We present a Deep Cuboid Detector which takes ...   \n",
       "3  2016-11-30 06:06:15  In this paper, we study the trade-off between ...   \n",
       "4  2016-11-30 06:12:45  In this paper, we propose a characterization o...   \n",
       "\n",
       "                                               title  tweeted  \n",
       "0  Vocabulary and the Brain: Evidence from Neuroi...        0  \n",
       "1  Structural Controllability of Multi-Agent Netw...        0  \n",
       "2    Deep Cuboid Detection: Beyond 2D Bounding Boxes        0  \n",
       "3  Speed/accuracy trade-offs for modern convoluti...        1  \n",
       "4  Characterization and Efficient Exhaustive Sear...        0  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A dataset similar to Jeremy's original arxiv.csv can be downloaded from\n",
    "# https://drive.google.com/file/d/0B34BjUTAgwm6SzdPWDAtVG1vWVU/ (rename the file to arxiv.csv).\n",
    "# It comes from the article https://hackernoon.com/building-brundage-bot-10252facf3d1\n",
    "# and the GitHub repo https://github.com/amauboussin/arxiv-twitterbot.\n",
    "PATH = 'data/arxiv/arxiv.csv'\n",
    "\n",
    "df = pd.read_csv(PATH)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text fed to the models: category tag, title, then the abstract on a new line.\n",
    "df['txt'] = df['category'] + ' ' + df['title'] + '\\n' + df['summary']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "q-bio.NC Vocabulary and the Brain: Evidence from Neuroimaging Studies\n",
      "In summary of the research findings presented in this paper, various brain\n",
      "regions are correlated with vocabulary and vocabulary acquisition. Semantic\n",
      "associations for vocabulary seem to be located near brain areas that vary\n",
      "according to the type of vocabulary, e.g. ventral temporal regions important\n",
      "for words for things that can be seen. Semantic processing is believed to be\n",
      "strongly associated with the ANG. Phonological ability has been closely related\n",
      "to the anterior surfaces of the SMG. Pathways through the posterior SMG are\n",
      "thought to link the anterior SMG and the ANG. In vocabulary tasks,\n",
      "mediotemporal structures may be related to long-term memory processing, with\n",
      "left hippocampal and parahippocampal regions related to long-term and working\n",
      "memory, respectively. Precentral structures are associated with phonological\n",
      "retrieval. Furthermore, many more regions of the brain are of interest in\n",
      "vocabulary tasks, particularly in areas important for visual and auditory\n",
      "processing. Furthermore, differences between brain anatomies can be attributed\n",
      "to vocabulary demands of different languages.\n"
     ]
    }
   ],
   "source": [
    "# print() so the newline between title and abstract is rendered rather than shown escaped.\n",
    "print(df['txt'].iloc[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27188"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of papers (rows) in the dataset.\n",
    "n = df.shape[0]\n",
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick 10% of the row indices for validation.\n",
    "val_idx = get_cv_idxs(n, val_pct=0.1)\n",
    "\n",
    "# split_by_idx hands back one (validation, training) pair per array passed in.\n",
    "(val, trn), (val_y, trn_y) = split_by_idx(\n",
    "    val_idx, df['txt'].values, df['tweeted'].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ngram logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((24470, 3780668), 14938443)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Bag of 1- to 3-grams; the vocabulary is fitted on the training texts only\n",
    "# and then reused to encode the validation texts.\n",
    "veczr = CountVectorizer(tokenizer=tokenize, ngram_range=(1, 3))\n",
    "trn_term_doc = veczr.fit_transform(trn)\n",
    "val_term_doc = veczr.transform(val)\n",
    "\n",
    "# Shape of the training term-document matrix and the sum of all its entries.\n",
    "trn_term_doc.shape, trn_term_doc.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binarize the count matrices: sign() turns every positive count into 1.\n",
    "x = trn_term_doc.sign()\n",
    "val_x = val_term_doc.sign()\n",
    "y = trn_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Naive Bayes log-count ratio.\n",
    "# p / q: add-one smoothed column sums of x over the tweeted (y != 0) and\n",
    "# non-tweeted (y == 0) training rows, i.e. one value per n-gram.\n",
    "p = x[np.argwhere(y!=0)[:,0]].sum(0)+1\n",
    "q = x[np.argwhere(y==0)[:,0]].sum(0)+1\n",
    "r = np.log((p/p.sum())/(q/q.sum()))\n",
    "# Bias = log ratio of the class sizes. p and q are (1, n_features) matrices, so the\n",
    "# previous len(p)/len(q) was always 1/1 and silently produced b == 0.\n",
    "b = np.log((y!=0).sum()/(y==0).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.80316409124356147"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score the binarized validation features: r was estimated from the binarized\n",
    "# training matrix x, so scoring raw counts (val_term_doc) would multiply each\n",
    "# ratio by how often the n-gram repeats within a document.\n",
    "pre_preds = val_x @ r.T + b\n",
    "# Positive score -> predict tweeted; transpose so the row lines up with val_y.\n",
    "preds = pre_preds.T>0\n",
    "# Fraction of validation documents classified correctly.\n",
    "(preds==val_y).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.85062545989698313"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Logistic regression (C=0.1, no intercept) on the binarized n-gram features.\n",
    "m = LogisticRegression(C=0.1, fit_intercept=False).fit(x, y)\n",
    "\n",
    "# Validation accuracy of the hard predictions.\n",
    "preds = m.predict(val_x)\n",
    "(preds == val_y).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second column of predict_proba; presumably P(tweeted == 1) given 0/1 labels - confirm via m.classes_.\n",
    "probs = m.predict_proba(val_x)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEWCAYAAAB42tAoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XucXVV99/HPb26Z3GZyD5DbJBgSwh1CACkQBRGoglUf\nC2LVPlSq1WqrtY992pemqLXVx7Za8VFarNeCaIumCqXKJaByyWAuEEhCLoRMLiSTe2Yy91//+O3j\nOZnMnDmZzJ6Zc/J9v17nNefss8/e6+xJ9nfWWnutbe6OiIhIb8qGugAiIjK8KShERCQvBYWIiOSl\noBARkbwUFCIikpeCQkRE8lJQSEHMbI2ZLe5jnZlmdtjMygepWKkzs5fN7Jrk+RIz++5Ql0lksCko\nilxyIjuSnKBfNbN/NbMxA70fdz/L3R/rY51X3H2Mu3cO9P6Tk3R78j33m9mvzOyygd7PycLMvmlm\nHWZ2Wg/LP9NtWZ2ZuZlV5Cx7p5nVJ7+PHWb2oJn9Vj/K8admttPMDpjZN8xsRJ51R5nZV82sMVn/\n8Zz3cv99ZB5zjrc80jMFRWl4s7uPAS4ELgb+qvsKFor99/395HtOAh4FfjDE5RlwuSfjFPcxGngb\ncAC4tR+f/yjwj8DfAFOBmcBXgZuOcztvBD4BXA3UAXOAv87zkbuACcCZyc8/7fb+95M/VDKPTcdT\nHuldsZ84JIe7bwMeBM4GMLPHzOyzZvZLoBmYY2a1ZnZ38lfgNjP7TG5TkZm9z8xeNLNDZvaCmV2Y\nLM9tglmU/DV5MKnF/H2y/Ki/PM3sNDNbamZ7zWyDmb0vZz9LzOw+M/t2sq81ZrawwO/ZAXwPmGZm\nk3O2+SYzW5lT4zg3570ZZvYfZrbbzPaY2VeS5aeb2SPJskYz+56ZjevP8Tezm5L9HzSzjWZ2Xfdj\nl/Pdv9vtmN1mZq8Aj5jZf5nZh7pte5WZvTV5Pt/MfpYc13Vm9o7jLOrbgP3AHcB7jvM71iaf+6C7\n/4e7N7l7u7v/p7t//DjL8R7gbndf4+77gE8D7+1lv/OAG4Hb3X23u3e6+7PHuT/pJwVFCTGzGcAN\nwIqcxb8H3A6MBbYA3wI6gNcAFwDXAn+QfP5/AUuAdwM1xH/MPT3s6kvAl9y9BjgduK+XIt0DNACn\nAW8H/sbMrs55/0bgXmAcsBT4SoHfsyop4x5gX7LsQuAbwB8CE4GvA0vNbEQShD9Jvn8dMC3ZL4AB\nn0vKeCYwIzkGx8XMFgHfBj6efJ8rgZePYxNXJft/I/BvwC05214AzAJ+mtQGfpasMyVZ76tmdlay\n7jvNbHUf+3oP8bu5F5if+WOgQJcB1cD9va2QlGF/nsfMZNWzgFU5H10FTDWziT1s9hLi9/fXSaA/\nZ2Zv67bOm5PwXGNmHziO7yR9cXc9ivhBnIwOE38hbiGaAEYm7z0G3JGz7lSgNfN+suwW4NHk+UPA\nR/Ls55rk+eNEE8GkbuvUAQ5UECfcTmBszvufA76ZPF8C/DznvQXAkTzfcwnQlnzPTiIkFue8//+B\nT3f7zDriBHwZsBuoKOB4vgVY0cv3XgJ8t5fPfR34h76OXfft5ByzOTnvjwWagFnJ688C30ie/y7w\nRA/7/lSB/15mAl3A+Tm/8y/lvP9N4DN5fq+3AjsH6N/uRuC6nNeVyX7qelj3/ybvLQGqkt/rYeDM\nnH8/pwHlwGuBHcAtaf//O1keqlGUhre4+zh3n+Xuf+TuR3Le25rzfBbxn3FH5q874iQzJXl/BvGf\nty+3AWcAa81suZm9qYd1TgP2uvuhnGVbiL/mM3bmPG8Gqs2swsxuzemQfDBnnfvcfRwReM8DF3X7\nbh/L/cs1+T6nJT+3eDRZHcXMppjZvUkz3EHgu0QfyPEq9Nj15je/p+SY/RS4OVl0M9HUBvE9L+n2\nPW8FTilwP78HvOjuK5PX3wPeaWaVye
sO4t9IrkoiXLqIgJ5kA9OXcpiouWZknh/qYd0jQDsRYm3u\nvozop7oWwN1fcPftHk1SvyJqvW8fgDIKano6GeROD7yVqFFMSoJlnLvXuPtZOe+f3ucG3V9y91uI\ngPk74IdJk0iu7cAEMxubs2wmsK2A7X/Psx2S1/fwfiPRxLTEzE7NKftnc77XOHcf5e73JO/N7OXk\n9jniGJ3r0ZT2LqI56njlO3ZNwKic1z2d1LtP43wPcIvFlV0jiZNiZj/Lun3PMe5eaFPLu4m+qp1m\nthP4eyIYM8f5FaIGkWs2sNXdu4AngRai5tWjbkHf0yPT9LQGOC/no+cBr7p7T82dfTWndef07/co\nPVBQnETcfQfw38AXzazGzMqSztyrklX+BfgzM7vIwmvMbFb37ZjZu8xscnLi2J8sPuqSWHffCvwK\n+JyZVScdy7eR/cv4RL/LWqLZ5M+TRf8MvN/MLknKPtrMfjsJqmeIpoi/TZZXm9nlyefGkjTdmdk0\noo+hP+4Gft/Mrk6O6zQzm5+8txK42cwqLTrsC/lL9wGi9nAHcTVPV7L8J8AZZvZ7yfYqzexiMzuz\nrw0moXM6sAg4P3mcTfR3ZDq1/x34bTO71szKLS6f/SuSPh13PwB8ErjTzN5icclqpZldb2afT9bJ\nDfqeHq8k+/o2cJuZLTCz8cl+vtlL8R8nQuwvklrn5cBi4t9A5kKC8cnvfhHwYeDHfR0TKdBQt33p\ncWIPurV/d3vvMeAPui2rJdrzG4jLI1cAN+e8/36ibf8w0bxzQff9EM0zu5J11hBNX5DTlp28nk6c\n2PYSzTLvz9nPEnLa+7t/tofvctT6ybJLiL/WpySvrwOWE+G1g7h8dmzy3kzgR0TTSSPw5WT5WcCz\nyXdZCXwMaOjp+PZUhm7l+R3iL99DwAbgjcnyOcDTyT5+CnyZY/sojvneRPg4cHG35fOS7exOvs8j\nZPscbgXW9FK+rwH/3sPyRURNc0Ly+s3JMTlANBd+gZx+rZz91CfHf2dSntf249/vR4FXgYPAvwIj\nct5bA9ya8/osokbTBLwA/E7Oe/ckx+IwsBb48FD/3yylhyUHWUREpEdqehIRkbwUFCIikpeCQkRE\n8lJQiIhIXqlPQDbQJk2a5HV1dUNdDBGRovLss882uvvkvtc8VtEFRV1dHfX19UNdDBGRomJmW/r7\nWTU9iYhIXgoKERHJS0EhIiJ5KShERCQvBYWIiOSloBARkbxSCwoz+4aZ7TKz53t538zsyxb3Ul59\nnLdjFBGRQZJmjeKbxLTPvbkemJs8biemvi5IV1ff64iIyMBILSjc/XHiPgS9uQn4toengHE5dyvr\n1eHD8MQT8PLLA1RQERHJayj7KKZx9P2cGzj6fsq/YWa3m1m9mdXv2bOHFStg5cqe1hQRkYE2lEHR\n0/1se7yLkrvf5e4L3X3hhAkTqa1V85OIyGAZyqBoAGbkvJ4ObB+isoiISC+GMiiWAu9Orn66FDjg\n7juGsDwiItKD1GaPNbN7gMXAJDNrAD4FVAK4+9eAB4AbiJvQNwO/n1ZZRESk/1ILCne/pY/3Hfhg\nWvsXEZGBoZHZIiKSl4JCRETyUlCIiEheCgoREclLQSEiInkpKEREJC8FhYiI5KWgEBGRvBQUIiKS\nl4JCRETyUlCIiEheCgoREclLQSEiInkVfVA0N8d9tEVEJB1FHxTPPQePPQZNTUNdEhGR0lT0QdHW\nBhs3qlYhIpKWog6KXbugtXWoSyEiUtqKOihaW6GhQWEhIpKmog6K9vb4WV4+tOUQESllRRsUHR2w\ne7dqEyIiaSvaoOjqgkOH4OBBqKgY6tKIiJSuog2KjAkToKzov4WIyPBVtKfYlpajXzc26hJZEZE0\nFG1QdHXBq69mm502b4YHHojlIiIycIo2KACOHIGpU7PPd+yITm4RERk4RR0UAJWV8VNXP4mIpKPo\ngw
KiuWnvXo2nEBFJQ1EGRWdn9nmmqcldl8mKiKShKIMCYjLAzs640qmpSX0TIiJpKdqg6OqCyZOj\nJtHZGT/b26MJSkREBk7RBgVAVVX2eWVlBMbPfhY3MxIRkYFR1EEBMHJk/MyMzt67V1dAiYgMpFSD\nwsyuM7N1ZrbBzD7Rw/szzexRM1thZqvN7Ibj3ccpp8AVV8DYsQNTZhEROVpqQWFm5cCdwPXAAuAW\nM1vQbbW/Au5z9wuAm4Gv9mdfY8fGFU+HDw9sbaKrKyYdzL3KSkTkZJNmjWIRsMHdN7l7G3AvcFO3\ndRyoSZ7XAttPZIfux04Q6B43N2ps7Hn9hoaYrrwnq1fD8uXw4ovxurUVtmyJWWszMh3pIiKlKs2g\nmAZszXndkCzLtQR4l5k1AA8Af9zThszsdjOrN7P6Awd209kZJ+3czuzufRUZTU2wYQP8+MfHXkK7\nc2fcb3vp0rjcFrJzRW3YEP0dL74Yj7Y2eO45qK+HRx6JdbZuhWeegZUrCzkcIiLFKc0hatbDsu5/\ne98CfNPdv2hmlwHfMbOz3f2oqf3c/S7gLoB58xZ6ZvzEhAnZderqIiw2bjx6B1u3xgn/8OEIisyg\nvPZ22L49AiHTvLRxY8wXNWZMNmBaWuIzzz0X23r55djP2rVRS1m7NkLr7LOz04mIiJSSNIOiAZiR\n83o6xzYt3QZcB+DuT5pZNTAJ2FXIDqxbFHWfwqOtLU7423P22tQUNZHNm6PJaf9+GDEC9u2L2WjX\nrImgaG+HUaNimwcPxja2b4fq6giddeti/QMHohbz0kuwoHsPjIhICUiz6Wk5MNfMZptZFdFZvbTb\nOq8AVwOY2ZlANdBLj8HxO3QIdu2KWkFVVYTEypXRdLR3b/Q3TJkSfQybN0dtYffuWK+rC+bOje20\ntMR7Z50Fs2ZFDWLXLhg3Ds45J8ZtNDUNVKlFRIaX1ILC3TuADwEPAS8SVzetMbM7zOzGZLWPAe8z\ns1XAPcB73Qeua3jz5viLf+LEeL1lSwTB2rXRiV1VlW2+amyMWsXIkVHLmDs3mpw6OqIGMXt2XF2V\ne0e900+HmhqYNOnY2o2ISKlIdRo9d3+A6KTOXfbJnOcvAJcP9H5XrIjxFa2tEQCnnpqdE2rbtlin\nsRHmzYtw6OiIju26umxfxqhRsd4pp2S3AdFM9cY3DnSJRUSGr5Kcb7WxMfoQ6upg/PjoV4DoY2hq\nipN9VVXUELZti/6Iqqq4CdIppxy9renT45Grt9pDpomrP/fw3rcv+kKmTMlewSUiMhyUZFC4R0f2\nzp3ZPoiurqgt1NRE89OCBXHCHzMmah4zZ55Y81FDQ2xn1Kjot9i3L/Y1YkTvn+nqin2uXRvNXRs3\nRh/IFVf0vxwiIgOt5IKiqytOuhBhUVsbfROdnVFzmDMnmpZqa2OdGTOiJpE7JqM/Dh2KYOrqiv00\nNsY+Lk8a1rZujSCZPDlqDM89F7dvzVxVtXkz7NkTn583L9bRtCQiMhyUZFA0N0fzT1lZ/EXvHifp\nU0+Nju3LLjv6MycSEu3t0eSUGQTY3h4D9Pbti31PmBDlaW6O5rCamgim/fvjSqpMjWb27KhdHD4M\nTz4ZIfe2t/WvGUtEZCCVVFDs3x8n6szAt5qa7Im2owNOO23g9zl6dNQKRo+OQXdPPBE1g0w/wy9/\nGeVpaorlBw5EiHR2Ro2irAwuvDD6UXbsiJpIa2uERn09XHzx8TWJdXXBpk2xj7Y2mD8/9jthQlyd\nJSJyvIoyKM46K0643WX+qu/sjGA477xYPndunMgzVzINJLNs8xLAwoVx0t+0KYKruTn2u2BBNHs9\n/3yE2cUXH3vr1nPPjZP5rl3x/errozY0Z07UejLNZb3ZujUGAe7ene0z2bUrQqqlJfpOysoiTMaO\njVqMOs5FpC9FGRTV1dFk05tRo6LdP3MSrKiIzurBMGVK/Jw7N07a
tbVxss/8Nf+61/X+WTOYNi3+\n+n/66RgUuHJl9H/s2QNXXx21kw0borYwZUqETltb/Ny3L/pjWlvjvRdfjOddXRE0K1ZE7ae1NY7h\nnj3qOBeRvhVlUPQm8xf3KafECbX7lB6DafToqAn0x8iRsHhxzF7b3By1kJaWmLxw/vy4OurgwTjZ\nV1bGib+yMvo3Zs6Mvpiysuwo8tZWWLUqArS5OfpOduyIUOouM8K8sTGC67TTsp3sU6eqz0TkZFRS\nQXHKKfFXd1UVvOY1Q12aEzduXNQmLr44mqHa2+OEX1WVvTd4ZWU0s82YcWxzVllZhM7IkXDVVUdv\n+9e/jm3U10cIZEL10KEIi87OqBHV1cX7O3dGrWjixAiQzk645JKjw3j37mj+Ki+PWlFHRzRxqW9E\npLiVVFBA/nELxWbmzGyT2ciR0bR00UUxiPD887P3wejP+I/MJbwtLdF0VVERJ/gDB6IGMmJE1FAy\nlxXv3x/BUlkZAdTZGT8nTIhwcY91Xn01O4akqio67N/0prgsWESKU8kFRam66KJjl53IAMHzzoNf\n/CJO9Pv3R2DMnRtXbnV2xkm+oSHCIHO1WGa23bFjo0aybl0ETOay4Pb26BvJdJ5n7jr4q19Fc9j4\n8VHT07xYIsVFQXGSqq6Ga66J592nR880J3WfumTy5Hh0dEQNZ8+eCI0jR6LvZMKECIFM30xra1we\nvGVLTJVSXh41krq6o7ebqRnt2hV9L2YRLKNHD+hXFpF+UlDIcauoiM72nm49C9kaQ3V1XFV16FD0\nXezYEcGxdWu8l/lsS0t2oOTu3RFEo0fH5budnRFAs2b1f2BkW1s0mx05kr3ia8aMqO1kLh8eOzbe\nO3IkylNeHvusqYnn1dXxfVUbkpORgkL6xaywk2ZVVXSAZ+bY2ro1Tszt7dl+kaamOAlXVmavyBo5\nMmoXR47E8rlzY90rr8zu98iR2M7YsRFGXV2xn8bG2EZbWzzco/azf3/s89ChmCK+rCyW7dkT5Rwx\nIvbZ3Bz7XLs2rqSrqMj+rKqKsTIiJxMFhQyKykp4wxuiGSozCDHT33HOOXHCz4x7ueCCGANSURGh\nsHcvPPtstkYxalR2fMjBg7GNjo54Pn58dnr5AweyHfUQwTJ1agRWZ2csmzgx3t+1K5rE5s+P7a9b\nF/Nv7dgRgTJqVPbqsPb2WNbREfNy9XZVV0tLhNLYsdkZjEWKkQ3gfYIGxbx5C/3b364/ZlSzlK6W\nlgiV1aujj6StLTvNSVNTXEbc0hJBMG5cnMynTInLpd3jJJ1b+8nMzVVoH0imRpIZ9V5bG9uACJB5\n86I20t4eYdLVFaHX1BRh1dkZYXLhhep3kaFjZs+6e7/qwzrdyrBXXR0n/bKy6BTPTEWSOTkfORIn\n7/b2OCn39df78f51bxYn+pqa2Mf06XHCf/rp6OfITELZ3h7rd3ZmJ6PMhFJDQ9Rapk+PEJk6NUJP\nAxilGCgopGhMmZKdIiWjqirbJFVZmZ0QMg1VVTF+JWPx4mz/Su5sxe3tR98lsbkZli2LUNmzJ1vL\nOeWUmLcs89mOjmiqam6Oz51xhqaal+FBQSFyAnpqSsoNL4jAuP766O/o7Iy5unbvzt7DJHNhgFn0\nlxw4EM1Xzz8PZ54Zj0xnu8hQUFCIDJLMfdenT48mqV/8IprN3LPNVbNnR4d6fX12OpV166KTfsSI\nGBBZVRWv1Wwlg0VBITIEcgc89mTx4riKa+XKuOpr27bo29i5M4Kipiamd2ltjaApL4+rtsaNG6xv\nICcTBYXIMFVTE+NGIPo9HnkkQuHQoQiEHTuiiSpz4eKqVdnxJplxKd3l3tgrU4PRlVjSFwWFSBGo\nrIQ3vjGet7ZG/8X48dE01dUVV1S1tGQHM2Y+k7mMPLOsoyO7rLo6wuVtb9M4D8lPQSFSZEaMOHaS\nyLPOiqDI1CTMopkKoi+jtjZ+
VlREv8ihQ9GsVVUFzzyTrblA9JdkpisZynu6yPChoBApEd1rBT3d\nmAriktvMfUIeewzWrInwqaiIgGhqijCBCJ2RIyNkzj9fNY+TlYJC5CRVVRUd4q+8EqPe29uzgwQ7\nOmKdiop4lJXB+vVxRdZFF6mmcbJRUIicxObPj0GMbW1RWxg16ugxIO7xeOqpaMo6fDj6SM48M2b1\n1SW6JwcFhchJbsKE3t/LDAR87WsjTH7+c3jpJdi+Pa7KmjQpxnaUlcWo844OGDMmOto1JXvpUFCI\nSEEy96J/5ZUIipEjoxayfn0EQ2dn1Dja2mJ6kjFj4jLeefPQJJ5FTr8+ESnYGWfEA6LD+8knIxx2\n7IjahXsMDmxsjBpFTQ28/HLc9XDuXAVGsdKvTUT6ZeRIeP3rj11+7rnx88CBuKPhvn0RJOvXx5VY\nZjFz7syZg1te6b+Cg8LMpgGzcj/j7o+nUSgRKX61tXDDDTFm44knIjhefTWuriovh+uuO/b+6TI8\nFRQUZvZ3wO8CLwDJvcFwIG9QmNl1wJeAcuBf3P1ve1jnHcCSZHur3P2dhRZeRIa/sWMjMPbsyd6K\nds0aePjhGJtRWxv3MK+s1FVUw1WhNYq3APPcvbXQDZtZOXAn8AagAVhuZkvd/YWcdeYCfwFc7u77\nzGxKz1sTkWI3cWL8rK2NadYPHoTlyyMgxoyJ90ePjst1J06MgNF4jeGh0KDYBFQCBQcFsAjY4O6b\nAMzsXuAmolaS8T7gTnffB+Duu45j+yJSpC64IEaA79oV90cvL4/7qY8aFQGRmSG3oiJ+1tVlp2mX\nwVdoUDQDK83sYXLCwt0/nOcz04CtOa8bgEu6rXMGgJn9kmieWuLu/1VgmUSkiI0eHbPXzp6dXbZu\nXXR8HzmSHfg3YgQ89xy8730amzFUCg2KpcnjePT0K/Ue9j8XWAxMB54ws7Pdff9RGzK7HbgdYOpU\nXSohUqrmzYtHrlWrol/j0UfjEtza2rhiSqExeAoKCnf/lplVkdQAgHXu3t7HxxqAGTmvpwPbe1jn\nqWRbm81sHREcy7vt/y7gLoB58xZ2DxsRKWHz5sW9OF54IS7JdY8xGYsWxWtJX0HXGJjZYuAlonP6\nq8B6M7sy74fiZD/XzGYnIXMzx9ZKfgS8LtnHJCKINhVcehEpedXVcSntpEnR8b1zJ6xdC/ffH01U\nmZluJT2FNj19EbjW3dcBmNkZwD3ARb19wN07zOxDwENE/8M33H2Nmd0B1Lv70uS9a80sc9ntx919\nT/+/joiUorKyqEVA/HzssRgR/pOfROf3+efHtCGSjkKDojITEgDuvt7MerjR4tHc/QHggW7LPpnz\n3IGPJg8RkT5VVcFVV0Vz1KZNMcdUba2CIk2FBkW9md0NfCd5fSvwbDpFEhHJb8QIuP76aHb69a9j\nepCDByMw5syJZioN3hs4hQbFB4APAh8mrmZ6nOirEBEZMiNHRmg0NMSYjKqquMT20kuPvXpK+q/Q\nq55agb9PHiIiw8bChdnnzz8f06DPnKmgGEh5g8LM7nP3d5jZcxw7BgJ3Pze1komIHKf586M5atOm\nuFrqnHPip6Y3PzF9Hb6PJD/flHZBREROVEVFjLPYsweeeQa2bo2bKl1xRQRGa2sM1Mu93av0LW9Q\nuPuO5GkjcMTdu5JLY+cDD6ZdOBGR47VoUdySddmyuGnSK6/E9OYzZ0ZQdHXFPTOmTs3e6lXyK7RC\n9jhwhZmNBx4G6olpx29Nq2AiIv1VUQFXXx21i2XL4tatu3ZF5/f+/dH5PXVqTEY4ZgzMmhVXSU2e\nHJ3jcrRCg8LcvdnMbgP+yd0/b2Yr0iyYiMiJMoPFi49etmULbN4ctYu2tli2fn3UQmbOhCuvjAkL\nJavgoDCzy4gaxG3H+VkRkWFj1qx4ZDQ2RmC8+GLUNH72Mxg3Di6/PKYMkcJP9n9C3GDo/mQajj
nA\no+kVS0RkcEyaFD9HjIDVq6M/o7Y2pga5qNdJik4uhY6jWAYsy3m9iRh8JyJSEiZOhNe9LqYEWbYs\nJh+U0Nc4in909z8xs/+k53EUN6ZWMhGRIVBWFnfa27YtmqUmTtSVUX3VKDJzO/2/tAsiIjIcmMV8\nUatXw09/CtOmxcC91taobUyZcvJ1dvc1jiIz8V89yTgKADMrB3QRmYiUpClTYpT388/H3fVefRVa\nWuLKqFGj4uqoOXOy4zAmTowBfaWq0M7sh4FrgMPJ65HAfwOvTaNQIiJDbfr0mLp89eoYf1FdHWGx\ndWuM/F6/PsZplJfDhAlx7+/zzx/qUqej0KCodvdMSODuh81sVEplEhEZFioq4MILj142fz40NcG+\nfdEctWVLdHzv3AlnnlmaA/YKnbG9ycx+c7jM7CJANyAUkZPS6NFR4zj9dHj96+My2kOH4o57XV1D\nXbqBdzzjKH5gZtuT16cSU3iIiJz0MlN/bN8eA/Ze//rSGqxX6DiK5WY2H5hH3Lhorbu3p1oyEZEi\ncs458OST0QE+bx7U1Q11iQZOQU1PSX/E/wE+4u7PAXVmpqnHRUQStbXwW78V04EsW1ZaTVCF9lH8\nK9AGXJa8bgA+k0qJRESK1JgxcWntwYMxBmPt2uj0LnaFBsXp7v55oB3A3Y8QTVAiIpLj3HOhvT0m\nGVy2DJYujanNi1mhndltZjaSZBoPMzsdaE2tVCIiRaqsDK66KsLihRdi3MW+fTEjbbEqNCg+BfwX\nMMPMvgdcDrw3rUKJiBS7ykqYMSMum121CnbsgNNOiynOi23uqD6DwswMWAu8FbiUaHL6iLs3plw2\nEZGiVlMTo7k3boyBeVVV8Na3xt31ikmfQeHubmY/cveLgJ8OQplEREpCRQVcc00837UrmqLq6+Py\n2cztV8vLh7aMhSi06ekpM7vY3ZenWhoRkRI1cWJMKvjSS9FvMW1aDNI79VQ444zhPfVHoUHxOuD9\nZvYy0EQ0P7m7n5tWwURESkl5edQu9u6FZ56JK6HMollq167oAK8YpjeYLrRY16daChGRk8SECXDd\ndfG8uRmefho2b46+jOuvj+ao4aavO9xVA+8HXgM8B9zt7h2DUTARkVI3alRM/VFfH4P0zjsvJhsc\nbvrKrm8BC4mQuB74YuolEhE5iUyaBK99bVxO29Y21KXpWV9NTwvc/RwAM7sbeCb9IomInFzKyuLe\nFqtWxZ3zhpu+ahS/mSFWTU4iIukYMyauempoiGk/htuUH30FxXlmdjB5HALOzTw3s4N9bdzMrjOz\ndWa2wcwiExCSAAAL1UlEQVQ+kWe9t5uZm9nC4/0CIiKl4MILo59ixQq4//4Y0T1c5A0Kdy9395rk\nMdbdK3Ke1+T7rJmVA3cSfRsLgFvMbEEP640FPgw83f+vISJS3Kqr42qoWbPictn774/ZZ4eDNC/E\nWgRscPdN7t4G3Avc1MN6nwY+D7SkWBYRkaIwc2aExbZt8NhjcPjwUJco3aCYBmzNed2QLPsNM7sA\nmOHuP8m3ITO73czqzaz+wIHdA19SEZFhZO7cmFCwpQV+9CN44omYK6p9iO4rmmZQ9DQ/ov/mTbMy\n4B+Aj/W1IXe/y90XuvvC2trJA1hEEZHhafbsGLm9ZQusXAkPPgg//zkcOTL4ZUlzwHgDMCPn9XRg\ne87rscDZwGMxQS2nAEvN7EZ3r0+xXCIiw151ddxaFeJ+FvX1Mc7iyJGYgXYwpVmjWA7MNbPZZlYF\n3Awszbzp7gfcfZK717l7HfAUoJAQEelm/Hh4wxvAHXbuhPXrB/ee3KkFRTLu4kPAQ8CLwH3uvsbM\n7jCzG9Par4hIqZo7N2oXjz8et1odLKnOVejuDwAPdFv2yV7WXZxmWUREit3UqXDxxTGR4PLlERxV\nVenvdxjOUygiIr2ZODHuX7F3b1wR9cor6e9TQSEiUmRmz4
7O7o0b4aGH0h9roaAQESky5eVwySUx\nLXljIzzySLr7U1CIiBSpKVPinhY7dsRtVtOioBARKVJmUFcX4yvSHLWtoBARKWKZGx6tWpXePhQU\nIiJFbPJk6OyENWvSa35SUIiIFLGysrgKas+emJ48lX2ks1kRERksU6dGreKpp9LZvoJCRKTIjR4d\nYZHWeAoFhYhICXCPoNiyZeC3raAQESkBmQkDH3544C+VVVCIiJSA0aNh/vwIi/XrB3bbCgoRkRJx\n6qlQURH32x5ICgoRkRJRWRlXP23ZEoPwBoqCQkSkRJSVRV/FwYPw7LMDuN2B25SIiAy1SZOgpQV+\n/esYhDcQFBQiIiWkqgouvTQulX3yyYG5t7aCQkSkxNTUxM+XX46bG50oBYWISIkpK4MrrohLZZ9+\nOgbjndD2BqZYIiIynIwYEZMF7tt34gPwFBQiIiWqtjbGVZwoBYWISAlraYFXXz2xbSgoRERK1IQJ\n0NwML710YttRUIiIlKiqqrgDXtkJnukVFCIikpeCQkRE8lJQiIiUuKamE/u8gkJEpESZQUdHZtpx\ns/5uR0EhIlKizGD6dNi/H06kS1tBISJSwioqYNSoE9uGgkJEpIS1t8dMslBd1d9tpBoUZnadma0z\nsw1m9oke3v+omb1gZqvN7GEzm5VmeURETjbTp8MFF8CJTOaRWlCYWTlwJ3A9sAC4xcwWdFttBbDQ\n3c8Ffgh8Pq3yiIicrGpr4UTmkE2zRrEI2ODum9y9DbgXuCl3BXd/1N2bk5dPAdNTLI+IiPRDmkEx\nDdia87ohWdab24AHe3rDzG43s3ozqz9wYPcAFlFERPqSZlD0dM1uj1UfM3sXsBD4Qk/vu/td7r7Q\n3RfW1k4ewCKKiEhfBmCm8l41ADNyXk8HtndfycyuAf4SuMrdW1Msj4iI9EOaNYrlwFwzm21mVcDN\nwNLcFczsAuDrwI3uvivFsoiISD+lFhTu3gF8CHgIeBG4z93XmNkdZnZjstoXgDHAD8xspZkt7WVz\nIiIyRNJsesLdHwAe6LbskznPr0lz/yIicuI0MltERPJSUIiISF4KChERyUtBISIieSkoREQkLwWF\niIjkpaAQEZG8FBQiIpKXgkJERPJSUIiISF4KChERyUtBISIieSkoREQkLwWFiIjkpaAQEZG8FBQi\nIpKXgkJERPJSUIiISF4KChERyUtBISIieSkoREQkLwWFiIjkpaAQEZG8FBQiIpKXgkJERPJSUIiI\nSF4KChERyUtBISIieSkoREQkLwWFiIjkpaAQEZG8FBQiIpKXgkJERPJKNSjM7DozW2dmG8zsEz28\nP8LMvp+8/7SZ1aVZHhEROX6pBYWZlQN3AtcDC4BbzGxBt9VuA/a5+2uAfwD+Lq3yiIhI/1SkuO1F\nwAZ33wRgZvcCNwEv5KxzE7Akef5D4CtmZu7uvW3UHVpaoCLNkouIlJC2NgDr9+fTPN1OA7bmvG4A\nLultHXfvMLMDwESgMXclM7sduD151bZ4cc1G6DVLTiLt46Fy31CXYnjQscjSscjSsQhmcHhmfz+d\nZlD0FF/dz+6FrIO73wXcBWBm9e4HF5548YpfHIsWHQt0LHLpWGTpWGSZWX1/P5tmZ3YDMCPn9XRg\ne2/rmFkFUAvsTbFMIiJynNIMiuXAXDObbWZVwM3A0m7rLAXekzx/O/BIvv4JEREZfKk1PSV9Dh8C\nHgLKgW+4+xozuwOod/elwN3Ad8xsA1GTuLmATd+VVpmLkI5Flo5Flo5Flo5FVr+PhekPeBERyUcj\ns0VEJC8FhYiI5DVsg0LTf2QVcCw+amYvmNlqM3vYzGYNRTkHQ1/HIme9t5uZm1nJXhpZyLEws3ck\n/zbWmNm/DXYZB0sB/0dmmtmjZrYi+X9yw1CUM21m9g0z22Vmz/fyvpnZl5PjtNrMLixow+4+7B5E\n5/dGYA5QBawCFnRb54
+AryXPbwa+P9TlHsJj8TpgVPL8AyfzsUjWGws8DjwFLBzqcg/hv4u5wApg\nfPJ6ylCXewiPxV3AB5LnC4CXh7rcKR2LK4ELged7ef8G4EFiDNulwNOFbHe41ih+M/2Hu7cBmek/\nct0EfCt5/kPgajPr/xj14avPY+Huj7p7c/LyKWLMSikq5N8FwKeBzwMtg1m4QVbIsXgfcKe77wNw\n912DXMbBUsixcKAmeV7LsWO6SoK7P07+sWg3Ad/28BQwzsxO7Wu7wzUoepr+Y1pv67h7B5CZ/qPU\nFHIsct1G/MVQivo8FmZ2ATDD3X8ymAUbAoX8uzgDOMPMfmlmT5nZdYNWusFVyLFYArzLzBqAB4A/\nHpyiDTvHez4B0p3C40QM2PQfJaDg72lm7wIWAlelWqKhk/dYmFkZMQvxewerQEOokH8XFUTz02Ki\nlvmEmZ3t7vtTLttgK+RY3AJ8092/aGaXEeO3znb3rvSLN6z067w5XGsUmv4jq5BjgZldA/wlcKO7\ntw5S2QZbX8diLHA28JiZvUy0wS4t0Q7tQv+P/Njd2919M7COCI5SU8ixuA24D8DdnwSqgUmDUrrh\npaDzSXfDNSg0/UdWn8ciaW75OhESpdoODX0cC3c/4O6T3L3O3euI/pob3b3fk6ENY4X8H/kRcaED\nZjaJaIraNKilHByFHItXgKsBzOxMIih2D2oph4elwLuTq58uBQ64+46+PjQsm548vek/ik6Bx+IL\nwBjgB0l//ivufuOQFTolBR6Lk0KBx+Ih4FozewHoBD7u7nuGrtTpKPBYfAz4ZzP7U6Kp5b2l+Iel\nmd1DNDVOSvpjPgVUArj714j+mRuADUAz8PsFbbcEj5WIiAyg4dr0JCIiw4SCQkRE8lJQiIhIXgoK\nERHJS0EhIiJ5KShEujGzTjNbaWbPm9l/mtm4Ad7+e83sK8nzJWb2ZwO5fZGBpqAQOdYRdz/f3c8m\nxuh8cKgLJDKUFBQi+T1JzqRpZvZxM1uezOX/1znL350sW2Vm30mWvTm5V8oKM/u5mU0dgvKLnLBh\nOTJbZDgws3Ji2oe7k9fXEnMlLSImV1tqZlcCe4h5ti5390Yzm5Bs4hfApe7uZvYHwJ8TI4RFioqC\nQuRYI81sJVAHPAv8LFl+bfJYkbweQwTHecAP3b0RwN0zk1NOB76fzPdfBWwelNKLDDA1PYkc64i7\nnw/MIk7wmT4KAz6X9F+c7+6vcfe7k+U9zYXzT8BX3P0c4A+JiehEio6CQqQX7n4A+DDwZ2ZWSUw6\n97/NbAyAmU0zsynAw8A7zGxisjzT9FQLbEuevweRIqWmJ5E83H2Fma0Cbnb37yRTVD+ZzNJ7GHhX\nMlPpZ4FlZtZJNE29l7ir2g/MbBsx5fnsofgOIidKs8eKiEheanoSEZG8FBQiIpKXgkJERPJSUIiI\nSF4KChERyUtBISIieSkoREQkr/8Bu74+Bz3qyGIAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fbf37841c88>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import average_precision_score, precision_recall_curve\n",
    "\n",
    "# Precision/recall pairs over the validation probabilities; the thresholds are not needed here.\n",
    "precision, recall, _ = precision_recall_curve(val_y, probs)\n",
    "average_precision = average_precision_score(val_y, probs)\n",
    "\n",
    "# Step outline plus a shaded region underneath it.\n",
    "plt.step(recall, precision, where='post', color='b', alpha=0.2)\n",
    "plt.fill_between(recall, precision, step='post', color='b', alpha=0.2)\n",
    "\n",
    "plt.title('Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))\n",
    "plt.xlabel('Recall')\n",
    "plt.ylabel('Precision')\n",
    "plt.xlim([0.0, 1.0])\n",
    "# Trailing semicolon keeps the return value out of the cell output.\n",
    "plt.ylim([0.0, 1.05]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7583187390542907"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recall at the first returned operating point whose precision is at least 0.6.\n",
    "recall[precision >= 0.6][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Validation rows of df in ascending row order.\n",
    "# NOTE(review): assumes split_by_idx returns val / val_y in that same order - confirm.\n",
    "df_val = df.iloc[np.sort(val_idx)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# False positives: the true label is 0 but the prediction disagrees.\n",
    "incorrect_yes = np.where((val_y == 0) & (preds != val_y))[0]\n",
    "# Rank them by predicted probability, highest (most confident mistake) first.\n",
    "most_incorrect_yes = np.argsort(-probs[incorrect_yes])\n",
    "# Look up the ten worst in the validation frame and show the readable columns.\n",
    "txts = df_val.iloc[incorrect_yes[most_incorrect_yes[:10]]]\n",
    "txts[['link', 'title', 'summary']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'arxiv.org/abs/1710.00814 arxiv.org/abs/1612.02559 arxiv.org/abs/1708.00489 arxiv.org/abs/1703.11000 arxiv.org/abs/1612.06704 arxiv.org/abs/1708.00973 arxiv.org/abs/1707.07341 arxiv.org/abs/1707.06728 arxiv.org/abs/1710.02318 arxiv.org/abs/1612.01064'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Space-separated links of the selected papers, handy for copy/paste.\n",
    "' '.join(txts['link'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# False negatives: the true label is 1 but the prediction disagrees.\n",
    "incorrect_no = np.where((val_y == 1) & (preds != val_y))[0]\n",
    "# Lowest predicted probability first, i.e. the most confident misses.\n",
    "most_incorrect_no = np.argsort(probs[incorrect_no])\n",
    "txts = df_val.iloc[incorrect_no[most_incorrect_no[:10]]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>link</th>\n",
       "      <th>title</th>\n",
       "      <th>summary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18571</th>\n",
       "      <td>arxiv.org/abs/1707.03243</td>\n",
       "      <td>Malware in the Future? Forecasting Analyst Det...</td>\n",
       "      <td>Cyber attacks endanger physical, economic, soc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23392</th>\n",
       "      <td>arxiv.org/abs/1709.00765</td>\n",
       "      <td>Number of hidden states needed to physically i...</td>\n",
       "      <td>We consider the problem of implementing a give...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18419</th>\n",
       "      <td>arxiv.org/abs/1707.02651</td>\n",
       "      <td>Model-Based Speech Enhancement in the Modulati...</td>\n",
       "      <td>This paper presents algorithms for modulation-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10414</th>\n",
       "      <td>arxiv.org/abs/1704.02162</td>\n",
       "      <td>Locally-adapted convolution-based super-resolu...</td>\n",
       "      <td>Super-resolution is a classical problem in ima...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3976</th>\n",
       "      <td>arxiv.org/abs/1701.07381</td>\n",
       "      <td>Design and Implementation of a Semantic Dialog...</td>\n",
       "      <td>This chapter describes a semantic dialogue sys...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13229</th>\n",
       "      <td>arxiv.org/abs/1705.04824</td>\n",
       "      <td>Extracting urban impervious surface from GF-1 ...</td>\n",
       "      <td>Impervious surface area is a direct consequenc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18308</th>\n",
       "      <td>arxiv.org/abs/1707.02279</td>\n",
       "      <td>A Probabilistic Calculus of Cyber-Physical Sys...</td>\n",
       "      <td>We propose a hybrid probabilistic process calc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18368</th>\n",
       "      <td>arxiv.org/abs/1707.02477</td>\n",
       "      <td>Hyperspectral Image Restoration via Total Vari...</td>\n",
       "      <td>Hyperspectral images (HSIs) are often corrupte...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22275</th>\n",
       "      <td>arxiv.org/abs/1708.06248</td>\n",
       "      <td>GraphR: Accelerating Graph Processing Using ReRAM</td>\n",
       "      <td>This paper presents GRAPHR, the first ReRAM-ba...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17840</th>\n",
       "      <td>arxiv.org/abs/1707.00506</td>\n",
       "      <td>Recommender System for News Articles using Sup...</td>\n",
       "      <td>In the last decade we have observed a mass inc...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           link  \\\n",
       "18571  arxiv.org/abs/1707.03243   \n",
       "23392  arxiv.org/abs/1709.00765   \n",
       "18419  arxiv.org/abs/1707.02651   \n",
       "10414  arxiv.org/abs/1704.02162   \n",
       "3976   arxiv.org/abs/1701.07381   \n",
       "13229  arxiv.org/abs/1705.04824   \n",
       "18308  arxiv.org/abs/1707.02279   \n",
       "18368  arxiv.org/abs/1707.02477   \n",
       "22275  arxiv.org/abs/1708.06248   \n",
       "17840  arxiv.org/abs/1707.00506   \n",
       "\n",
       "                                                   title  \\\n",
       "18571  Malware in the Future? Forecasting Analyst Det...   \n",
       "23392  Number of hidden states needed to physically i...   \n",
       "18419  Model-Based Speech Enhancement in the Modulati...   \n",
       "10414  Locally-adapted convolution-based super-resolu...   \n",
       "3976   Design and Implementation of a Semantic Dialog...   \n",
       "13229  Extracting urban impervious surface from GF-1 ...   \n",
       "18308  A Probabilistic Calculus of Cyber-Physical Sys...   \n",
       "18368  Hyperspectral Image Restoration via Total Vari...   \n",
       "22275  GraphR: Accelerating Graph Processing Using ReRAM   \n",
       "17840  Recommender System for News Articles using Sup...   \n",
       "\n",
       "                                                 summary  \n",
       "18571  Cyber attacks endanger physical, economic, soc...  \n",
       "23392  We consider the problem of implementing a give...  \n",
       "18419  This paper presents algorithms for modulation-...  \n",
       "10414  Super-resolution is a classical problem in ima...  \n",
       "3976   This chapter describes a semantic dialogue sys...  \n",
       "13229  Impervious surface area is a direct consequenc...  \n",
       "18308  We propose a hybrid probabilistic process calc...  \n",
       "18368  Hyperspectral images (HSIs) are often corrupte...  \n",
       "22275  This paper presents GRAPHR, the first ReRAM-ba...  \n",
       "17840  In the last decade we have observed a mass inc...  "
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txts[[\"link\", \"title\", \"summary\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'arxiv.org/abs/1707.03243 arxiv.org/abs/1709.00765 arxiv.org/abs/1707.02651 arxiv.org/abs/1704.02162 arxiv.org/abs/1701.07381 arxiv.org/abs/1705.04824 arxiv.org/abs/1707.02279 arxiv.org/abs/1707.02477 arxiv.org/abs/1708.06248 arxiv.org/abs/1707.00506'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join(txts.link.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "to_review = np.where((preds > 0.8) & (val_y == 0))[0]\n",
    "to_review_idx = np.argsort(-probs[to_review])\n",
    "txts = df_val.iloc[to_review[to_review_idx]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "txt_html = ('<li><a href=\"http://' + txts.link + '\">' + txts.title.str.replace('\\n',' ') + '</a>: ' \n",
    "    + txts.summary.str.replace('\\n',' ') + '</li>').values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "full_html = (f\"\"\"<!DOCTYPE html>\n",
    "<html>\n",
    "<head><title>Brundage Bot Backfill</title></head>\n",
    "<body>\n",
    "<ul>\n",
    "{os.linesep.join(txt_html)}\n",
    "</ul>\n",
    "</body>\n",
    "</html>\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Learner"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "veczr = CountVectorizer(ngram_range=(1,3), tokenizer=tokenize, max_features=vocab_size)\n",
    "\n",
    "trn_term_doc = veczr.fit_transform(trn)\n",
    "val_term_doc = veczr.transform(val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((24470, 200000), 10271515)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trn_term_doc.shape, trn_term_doc.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "md = TextClassifierData.from_bow(trn_term_doc, trn_y, val_term_doc, val_y, sl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learner = md.dotprod_nb_learner(r_adj=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e79dafe458294f2885fff57373c1cde4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "A Jupyter Widget"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                             \r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jhoward/anaconda3/lib/python3.6/site-packages/sklearn/metrics/ranking.py:420: RuntimeWarning: invalid value encountered in true_divide\n",
      "  recall = tps / tps[-1]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.       0.13796  0.17713  0.84782  0.68941]\n",
      "[ 1.       0.11556  0.17372  0.85     0.72286]               \n",
      "[ 2.       0.09147  0.17366  0.84714  0.72249]                \n",
      "[ 3.       0.08243  0.17359  0.84646  0.73789]                \n",
      "\n"
     ]
    }
   ],
   "source": [
    "learner.fit(0.02, 4, wds=1e-6, cycle_len=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_recall_curve\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def prec_at_6(preds,targs):\n",
    "    precision, recall, _ = precision_recall_curve(targs[:,1], preds[:,1])\n",
    "    return recall[precision>=0.6][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.71103327495621715"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prec_at_6(*learner.predict_with_targs())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}