{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Experiments\n",
    "TODO: 24-27 June 2019:\n",
    "* Create pipeline\n",
    "    * with initial unigrams baseline\n",
    "    * accuracy measure (e.g. precision-recall with AUROC)\n",
    "## Setup environment:\n",
    "* import libraries\n",
    "* load csv data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import pandas as pd\n",
    "from nltk.corpus import treebank\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# One CSV per category, loaded in a fixed order and unpacked into one frame per category.\n",
    "description_df, installation_df, invocation_df, citation_df = map(pd.read_csv, (\n",
    "    './data/description.csv',\n",
    "    './data/installation.csv',\n",
    "    './data/invocation.csv',\n",
    "    './data/citation.csv',\n",
    "))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preview\n",
    "Make sure that csv data has been successfully imported."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of description entries: 281\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Puppeteer is a Node library which provides a h...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>The major contributors of this repository incl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Integral Regression is initially described in ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>We build a 3D pose estimation system based mai...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>The Integral Regression is also known as soft-...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                URL  \\\n",
       "0         https://github.com/GoogleChrome/puppeteer   \n",
       "1  https://github.com/JimmySuen/integral-human-pose   \n",
       "2  https://github.com/JimmySuen/integral-human-pose   \n",
       "3  https://github.com/JimmySuen/integral-human-pose   \n",
       "4  https://github.com/JimmySuen/integral-human-pose   \n",
       "\n",
       "                                             excerpt  \n",
       "0  Puppeteer is a Node library which provides a h...  \n",
       "1  The major contributors of this repository incl...  \n",
       "2  Integral Regression is initially described in ...  \n",
       "3  We build a 3D pose estimation system based mai...  \n",
       "4  The Integral Regression is also known as soft-...  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Number of description entries: {}\".format(len(description_df)))\n",
    "description_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of installation entries: 800\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Installation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>To use Puppeteer in your project, run:</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>npm i puppeteer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td># or \"yarn add puppeteer\"</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>puppeteer-core</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         URL  \\\n",
       "0  https://github.com/GoogleChrome/puppeteer   \n",
       "1  https://github.com/GoogleChrome/puppeteer   \n",
       "2  https://github.com/GoogleChrome/puppeteer   \n",
       "3  https://github.com/GoogleChrome/puppeteer   \n",
       "4  https://github.com/GoogleChrome/puppeteer   \n",
       "\n",
       "                                  excerpt  \n",
       "0                            Installation  \n",
       "1  To use Puppeteer in your project, run:  \n",
       "2                         npm i puppeteer  \n",
       "3               # or \"yarn add puppeteer\"  \n",
       "4                          puppeteer-core  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Number of installation entries: {}\".format(len(installation_df)))\n",
    "installation_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of invocation entries: 1118\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Usage</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>We have placed some example config files in ex...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>For Integral Human Pose Regression, cd to pyto...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Integral Regression</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                URL  \\\n",
       "0  https://github.com/JimmySuen/integral-human-pose   \n",
       "1  https://github.com/JimmySuen/integral-human-pose   \n",
       "2  https://github.com/JimmySuen/integral-human-pose   \n",
       "3  https://github.com/JimmySuen/integral-human-pose   \n",
       "4  https://github.com/JimmySuen/integral-human-pose   \n",
       "\n",
       "                                             excerpt  \n",
       "0                                              Usage  \n",
       "1  We have placed some example config files in ex...  \n",
       "2                                              Train  \n",
       "3  For Integral Human Pose Regression, cd to pyto...  \n",
       "4                                Integral Regression  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Number of invocation entries: {}\".format(len(invocation_df)))\n",
    "invocation_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of citation entries: 309\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>If you find Integral Regression useful in your...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>@article{sun2017integral,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>title={Integral human pose regression},</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>author={Sun, Xiao and Xiao, Bin and Liang, Shu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>journal={arXiv preprint arXiv:1711.08229},</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                URL  \\\n",
       "0  https://github.com/JimmySuen/integral-human-pose   \n",
       "1  https://github.com/JimmySuen/integral-human-pose   \n",
       "2  https://github.com/JimmySuen/integral-human-pose   \n",
       "3  https://github.com/JimmySuen/integral-human-pose   \n",
       "4  https://github.com/JimmySuen/integral-human-pose   \n",
       "\n",
       "                                             excerpt  \n",
       "0  If you find Integral Regression useful in your...  \n",
       "1                          @article{sun2017integral,  \n",
       "2            title={Integral human pose regression},  \n",
       "3  author={Sun, Xiao and Xiao, Bin and Liang, Shu...  \n",
       "4         journal={arXiv preprint arXiv:1711.08229},  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Number of citation entries: {}\".format(len(citation_df)))\n",
    "citation_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each data set currently contains positive samples of its respective trait. However, negative samples are necessary to distinguish the positive samples from some sort of control. Per category, negative samples include those from the other categories and also text samples completely unrelated to repository information. For example, in the description classifier, positive samples would be those that were labelled as a description, and negative samples would include those labelled as an installation, invocation, or citation in addition to nonpertinent text such as the Treebank corpus.\n",
    "\n",
    "As there are many more negative samples than there are positive samples, randomly selected negative samples will be used. The aim is for about 40% positive and 60% negative. The 60% negative share is split evenly: 15% from each of the three other categories and 15% from unrelated text, e.g. the Treebank corpus.\n",
    "\n",
    "*Question: Treebank sentences are already tokenized / split by word. Does nltk have sentences not already split or is it possible to utilize the already split state of the sentences for later tokenizer usage?*\n",
    "## Description Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>excerpt</th>\n",
       "      <th>description</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Puppeteer is a Node library which provides a h...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The major contributors of this repository incl...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Integral Regression is initially described in ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>We build a 3D pose estimation system based mai...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The Integral Regression is also known as soft-...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>This is an official implementation for Integra...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>The original implementation is based on our in...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>LibGEOS is a LGPL-licensed package for manipul...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Among other things, it allows you to parse Wel...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>This repository contains the experiments in th...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>For the results presented in the paper, we did...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Batch normalization is currently not supported...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Open-source Ground Penetrating Radar processin...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Pytorch implementation for high-resolution (e....</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>The PVGeo Python package contains VTK powered ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>A PyVista (and VTK) interface for the Open Min...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>GeoNotebook is an application that provides cl...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Fiona is OGR's neat and nimble API for Python ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Fiona is designed to be simple and dependable....</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Shapely is a BSD-licensed Python package for m...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Rain streaks can severely degrade the visibili...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>The pytorch branch contains:</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>the pytorch implementation of Peak Response Ma...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>the PASCAL-VOC demo (training, inference, and ...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Lithology and stratigraphic logs for wells and...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>This Python module allows you to:</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>Interactively control an instance of ANSYS v14...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Extract data directly from binary ANSYS v14.5+...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>Rapidly read in binary result (.rst), binary m...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Official implementation of GANimation. In this...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>670</th>\n",
       "      <td>A Department of Health and Human Services rule...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>671</th>\n",
       "      <td>But Mr. Hahn rose swiftly through the ranks , ...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>672</th>\n",
       "      <td>AT&amp;T FAX :</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>673</th>\n",
       "      <td>And many emerging markets have outpaced more m...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>674</th>\n",
       "      <td>`` * Remember Pinocchio ? '' says *T*-1 a fema...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>675</th>\n",
       "      <td>*-1 Currently a $ 300 million-a-year business ...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>676</th>\n",
       "      <td>Koito has refused *-1 to grant Mr. Pickens sea...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>677</th>\n",
       "      <td>The market again showed little interest in fur...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>678</th>\n",
       "      <td>The idea , of course : * to prove to 125 corpo...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>679</th>\n",
       "      <td>Because of deteriorating hearing , she told co...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>680</th>\n",
       "      <td>And construction also was described *-101 as s...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>The restrictions on viewing and dissemination ...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>682</th>\n",
       "      <td>Whereas conventional securities financings are...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>683</th>\n",
       "      <td>What *T*-102 's more , the test and Learning M...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>But Robert R. Murray , a special master appoin...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>685</th>\n",
       "      <td>Sales in stores open more than one year rose 3...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>686</th>\n",
       "      <td>`` You 'd see her correcting homework in the s...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>687</th>\n",
       "      <td>The ban on cross-border movement was imposed *...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>688</th>\n",
       "      <td>Perhaps none of the unconstitutional condition...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>689</th>\n",
       "      <td>A steady deposit base .</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>690</th>\n",
       "      <td>Buick approached American Express about a join...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>691</th>\n",
       "      <td>Kalamazoo , Mich.-based First of America said ...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>692</th>\n",
       "      <td>Michael R. Bromwich , a member since January 1...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>693</th>\n",
       "      <td>Terms were n't disclosed *-1 .</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>694</th>\n",
       "      <td>The ultimate goal of any investor is a profit ...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>695</th>\n",
       "      <td>Mr. Trump withdrew a $ 120-a-share *U* bid las...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>696</th>\n",
       "      <td>On Wall Street men and women walk with great p...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>697</th>\n",
       "      <td>One claims 0 he 's pro-choice .</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>Another was Nancy Yeargin , who *T*-89 came to...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>699</th>\n",
       "      <td>* Think about what *T*-1 causes the difference...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>700 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               excerpt  description\n",
       "0    Puppeteer is a Node library which provides a h...         True\n",
       "1    The major contributors of this repository incl...         True\n",
       "2    Integral Regression is initially described in ...         True\n",
       "3    We build a 3D pose estimation system based mai...         True\n",
       "4    The Integral Regression is also known as soft-...         True\n",
       "5    This is an official implementation for Integra...         True\n",
       "6    The original implementation is based on our in...         True\n",
       "7    LibGEOS is a LGPL-licensed package for manipul...         True\n",
       "8    Among other things, it allows you to parse Wel...         True\n",
       "9    This repository contains the experiments in th...         True\n",
       "10   For the results presented in the paper, we did...         True\n",
       "11   Batch normalization is currently not supported...         True\n",
       "12   Open-source Ground Penetrating Radar processin...         True\n",
       "13   Pytorch implementation for high-resolution (e....         True\n",
       "14   The PVGeo Python package contains VTK powered ...         True\n",
       "15   A PyVista (and VTK) interface for the Open Min...         True\n",
       "16   GeoNotebook is an application that provides cl...         True\n",
       "17   Fiona is OGR's neat and nimble API for Python ...         True\n",
       "18   Fiona is designed to be simple and dependable....         True\n",
       "19   Shapely is a BSD-licensed Python package for m...         True\n",
       "20   Rain streaks can severely degrade the visibili...         True\n",
       "21                        The pytorch branch contains:         True\n",
       "22   the pytorch implementation of Peak Response Ma...         True\n",
       "23   the PASCAL-VOC demo (training, inference, and ...         True\n",
       "24   Lithology and stratigraphic logs for wells and...         True\n",
       "25                   This Python module allows you to:         True\n",
       "26   Interactively control an instance of ANSYS v14...         True\n",
       "27   Extract data directly from binary ANSYS v14.5+...         True\n",
       "28   Rapidly read in binary result (.rst), binary m...         True\n",
       "29   Official implementation of GANimation. In this...         True\n",
       "..                                                 ...          ...\n",
       "670  A Department of Health and Human Services rule...        False\n",
       "671  But Mr. Hahn rose swiftly through the ranks , ...        False\n",
       "672                                         AT&T FAX :        False\n",
       "673  And many emerging markets have outpaced more m...        False\n",
       "674  `` * Remember Pinocchio ? '' says *T*-1 a fema...        False\n",
       "675  *-1 Currently a $ 300 million-a-year business ...        False\n",
       "676  Koito has refused *-1 to grant Mr. Pickens sea...        False\n",
       "677  The market again showed little interest in fur...        False\n",
       "678  The idea , of course : * to prove to 125 corpo...        False\n",
       "679  Because of deteriorating hearing , she told co...        False\n",
       "680  And construction also was described *-101 as s...        False\n",
       "681  The restrictions on viewing and dissemination ...        False\n",
       "682  Whereas conventional securities financings are...        False\n",
       "683  What *T*-102 's more , the test and Learning M...        False\n",
       "684  But Robert R. Murray , a special master appoin...        False\n",
       "685  Sales in stores open more than one year rose 3...        False\n",
       "686  `` You 'd see her correcting homework in the s...        False\n",
       "687  The ban on cross-border movement was imposed *...        False\n",
       "688  Perhaps none of the unconstitutional condition...        False\n",
       "689                            A steady deposit base .        False\n",
       "690  Buick approached American Express about a join...        False\n",
       "691  Kalamazoo , Mich.-based First of America said ...        False\n",
       "692  Michael R. Bromwich , a member since January 1...        False\n",
       "693                     Terms were n't disclosed *-1 .        False\n",
       "694  The ultimate goal of any investor is a profit ...        False\n",
       "695  Mr. Trump withdrew a $ 120-a-share *U* bid las...        False\n",
       "696  On Wall Street men and women walk with great p...        False\n",
       "697                    One claims 0 he 's pro-choice .        False\n",
       "698  Another was Nancy Yeargin , who *T*-89 came to...        False\n",
       "699  * Think about what *T*-1 causes the difference...        False\n",
       "\n",
       "[700 rows x 2 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One fixed seed for every sampler so the corpus is identical on each full re-run.\n",
    "SAMPLE_SEED = 42\n",
    "# Each of the four negative sources contributes 37.5% of the description count.\n",
    "neg_quant = int(len(description_df) * .375)\n",
    "treebank_sents = random.Random(SAMPLE_SEED).sample(list(treebank.sents()), neg_quant)\n",
    "treebank_background = pd.DataFrame([' '.join(sent) for sent in treebank_sents], columns=[\"excerpt\"]).assign(description=False)\n",
    "negative_frames = [\n",
    "    frame.sample(neg_quant, random_state=SAMPLE_SEED).assign(description=False)\n",
    "    for frame in (installation_df, invocation_df, citation_df)\n",
    "]\n",
    "description_corpus = (\n",
    "    pd.concat([description_df.assign(description=True), *negative_frames, treebank_background], sort=False)\n",
    "    # Keyword arguments only: positional axis arguments were removed in pandas 2.0.\n",
    "    .drop(columns='URL')\n",
    "    .dropna(axis=0)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "description_corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Description Classifier pipeline\n",
    "### Train-test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = description_corpus.excerpt, description_corpus.description\n",
    "# Pin the shuffle so the held-out set (and every metric computed on it) is reproducible.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count Vectorizer and Logistic Regression in Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                x_test  y_TF_pred  y_actual\n",
      "488                           tin = _meshfix.PyTMesh()      False     False\n",
      "597  Lord Chilver , 63-year-old chairman of English...      False     False\n",
      "686  `` You 'd see her correcting homework in the s...      False     False\n",
      "417                                             header      False     False\n",
      "529  title = {{PyVista}: 3D plotting and mesh analy...      False     False\n",
      "566             @inproceedings{pumarola2018ganimation,      False     False\n",
      "282                 pip install opencv-python==3.2.0.6      False     False\n",
      "361                                pip install empymod      False     False\n",
      "365  A C++ compiler for the Python extension, and C...       True     False\n",
      "2    Integral Regression is initially described in ...      False      True\n",
      "561  booktitle = {Proceedings of the International ...      False     False\n",
      "101  The writing functionality in segyio is largely...       True      True\n",
      "595  `` You either believe 0 Seymour can do it agai...      False     False\n",
      "456          Semantic Segmentation with Deeplab-Resnet      False     False\n",
      "492  Key Laboratory of Machine Perception, Shenzhen...      False     False\n",
      "193  The goal of Tippecanoe is to enable making a s...       True      True\n",
      "50   Finally e also provide precompiled Docker imag...       True      True\n",
      "65   Calculates the complete (diffusion and wave ph...       True      True\n",
      "430  and as CurveItem objects with associated metad...      False     False\n",
      "596  If the money manager performing this service i...      False     False\n",
      "303                              From source at GitHub      False     False\n",
      "44   New developments in the field of augmented rea...       True      True\n",
      "347                                         matplotlib      False     False\n",
      "331                             Install the usual way:      False     False\n",
      "434                          tensorboard --logdir logs      False     False\n",
      "320  Installing apsg from the conda-forge channel c...      False     False\n",
      "616  `` There 's no question that some of those wor...      False     False\n",
      "496  Yu, (2018). PyGeoPressure: Geopressure Predict...      False     False\n",
      "559            Fast End-to-End Trainable Guided Filter      False     False\n",
      "91   Segyio is a small LGPL licensed C library for ...       True      True\n",
      "..                                                 ...        ...       ...\n",
      "369  Install python3.6 and pytorch 3. I recommend t...      False     False\n",
      "626  Net income surged 31 % to 7.63 billion yen fro...      False     False\n",
      "527                                        year={2018}      False     False\n",
      "387                                                 ~/      False     False\n",
      "78   Complete full-space (electric and magnetic sou...       True      True\n",
      "572                                      Year = {2017}      False     False\n",
      "156  The file read parameters are based on GSSI's D...       True      True\n",
      "258  Tilematrix handles geographic web tiles and ti...       True      True\n",
      "103  Segyio can handle a lot of files that are SEG-...       True      True\n",
      "489                                         plt.show()      False     False\n",
      "552                                                  }      False     False\n",
      "459  The quantitative results of PSNR and SSIM in t...       True     False\n",
      "105  Declarative: React makes it painless to create...       True      True\n",
      "21                        The pytorch branch contains:      False      True\n",
      "557                    title={CU-Net: Coupled U-Nets},      False     False\n",
      "165  mplleaflet is a Python library that converts a...       True      True\n",
      "696  On Wall Street men and women walk with great p...       True     False\n",
      "503  and Andrew Tao and Jan Kautz and Bryan Catanza...      False     False\n",
      "377  Users who need an older stable version of PySA...      False     False\n",
      "178       exports to common formats (Mapnik XML, PNG…)      False      True\n",
      "618  Mr. Driscoll did n't elaborate about who the p...      False     False\n",
      "486       Pore Pressure Prediction using well log data       True     False\n",
      "659  In its construction spending report , the Comm...      False     False\n",
      "83                          Add-ons (empymod.scripts):      False      True\n",
      "58                                        Introduction       True      True\n",
      "407       strikes = strike + 10 * np.random.randn(num)      False     False\n",
      "475     Run python predict_dgf.py -h for more details.      False     False\n",
      "658                     Terms were n't disclosed *-1 .      False     False\n",
      "617  But the growing controversy comes as many prac...      False     False\n",
      "438                     well.params['horizon'][\"T20\"])      False     False\n",
      "\n",
      "[175 rows x 3 columns]\n",
      "[[110  12]\n",
      " [ 18  35]]\n",
      "---------------------------------------------------------------------------\n",
      "Classification Report\n",
      "\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False       0.86      0.90      0.88       122\n",
      "        True       0.74      0.66      0.70        53\n",
      "\n",
      "    accuracy                           0.83       175\n",
      "   macro avg       0.80      0.78      0.79       175\n",
      "weighted avg       0.82      0.83      0.83       175\n",
      "\n",
      "null accuracy: 69.71%\n",
      "accuracy score: 82.86%\n",
      "model is 13.14% more accurate than null accuracy\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/allen/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.6971428571428572, 0.8285714285714286)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n",
    "\n",
    "def display_accuracy_score(y_test, y_pred_class):\n",
    "    \"\"\"Print the accuracy of the predictions as a percentage and return it.\n",
    "\n",
    "    Args:\n",
    "        y_test: true labels.\n",
    "        y_pred_class: predicted labels, aligned with ``y_test``.\n",
    "\n",
    "    Returns:\n",
    "        The value computed by ``accuracy_score(y_test, y_pred_class)``.\n",
    "    \"\"\"\n",
    "    accuracy = accuracy_score(y_test, y_pred_class)\n",
    "    as_percent = '{:.2%}'.format(accuracy)\n",
    "    print('accuracy score: %s' % as_percent)\n",
    "    return accuracy\n",
    "def display_null_accuracy(y_test):\n",
    "    value_counts = pd.value_counts(y_test)\n",
    "    null_accuracy = max(value_counts) / float(len(y_test))\n",
    "    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))\n",
    "    return null_accuracy\n",
    "\n",
    "def display_accuracy_difference(y_test, y_pred_class):\n",
    "    \"\"\"Report how the model's accuracy compares with the null accuracy.\n",
    "\n",
    "    Prints the null accuracy, then the accuracy score, then a one-line\n",
    "    comparison of the two.\n",
    "\n",
    "    Args:\n",
    "        y_test: true labels.\n",
    "        y_pred_class: predicted labels, aligned with ``y_test``.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(null_accuracy, model_accuracy)``.\n",
    "    \"\"\"\n",
    "    baseline = display_null_accuracy(y_test)\n",
    "    model_accuracy = display_accuracy_score(y_test, y_pred_class)\n",
    "    gap = model_accuracy - baseline\n",
    "    # Three explicit comparisons: a NaN gap matches none of them and prints nothing.\n",
    "    if gap == 0:\n",
    "        print('model is exactly as accurate as null accuracy')\n",
    "    elif gap > 0:\n",
    "        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(gap))\n",
    "    elif gap < 0:\n",
    "        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(gap)))\n",
    "    return baseline, model_accuracy\n",
    "\n",
    "# solver='liblinear' is the default in this scikit-learn version, so naming it keeps\n",
    "# the results unchanged while silencing the FutureWarning about the 0.22 switch to 'lbfgs'.\n",
    "pipeline = make_pipeline(CountVectorizer(), LogisticRegression(solver='liblinear'))\n",
    "pipeline.fit(X_train, y_train)\n",
    "y_pred_class = pipeline.predict(X_test)\n",
    "# Per-class probabilities; computed here but not used in the report below.\n",
    "y_pred_vals = pipeline.predict_proba(X_test)\n",
    "results_df = pd.DataFrame({\"x_test\": X_test,  \"y_TF_pred\": y_pred_class, \"y_actual\": y_test})\n",
    "print(results_df)\n",
    "print(confusion_matrix(y_test, y_pred_class))\n",
    "print('-' * 75 + '\\nClassification Report\\n')\n",
    "print(classification_report(y_test, y_pred_class))\n",
    "display_accuracy_difference(y_test, y_pred_class)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "281"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row count of the raw description frame loaded from CSV.\n",
    "description_df.shape[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}