{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import pandas as pd\n",
    "import nltk\n",
    "# nltk.download('treebank')\n",
    "from nltk.corpus import treebank\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Paths of the four per-category CSV files.\n",
    "csv_paths = (\n",
    "    './data/description.csv',\n",
    "    './data/installation.csv',\n",
    "    './data/invocation.csv',\n",
    "    './data/citation.csv',\n",
    ")\n",
    "# Read them in the same order as the names on the left-hand side.\n",
    "description_df, installation_df, invocation_df, citation_df = map(pd.read_csv, csv_paths)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preview\n",
    "Make sure that the CSV data has been imported successfully."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of description entries: 336\n",
      "Number of installation entries: 929\n",
      "Number of invocation entries: 1134\n",
      "Number of citation entries: 316\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>contributor</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>Puppeteer is a Node library which provides a h...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>The major contributors of this repository incl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>Integral Regression is initially described in ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>We build a 3D pose estimation system based mai...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>The Integral Regression is also known as soft-...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                URL contributor  \\\n",
       "0         https://github.com/GoogleChrome/puppeteer   Allen Mao   \n",
       "1  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "2  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "3  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "4  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "\n",
       "                                             excerpt  \n",
       "0  Puppeteer is a Node library which provides a h...  \n",
       "1  The major contributors of this repository incl...  \n",
       "2  Integral Regression is initially described in ...  \n",
       "3  We build a 3D pose estimation system based mai...  \n",
       "4  The Integral Regression is also known as soft-...  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print the row count of every table, then preview the description table.\n",
    "for message, frame in (\n",
    "    (\"Number of description entries: {}\", description_df),\n",
    "    (\"Number of installation entries: {}\", installation_df),\n",
    "    (\"Number of invocation entries: {}\", invocation_df),\n",
    "    (\"Number of citation entries: {}\", citation_df),\n",
    "):\n",
    "    print(message.format(len(frame)))\n",
    "description_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of installation entries: 929\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>contributor</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>Installation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>To use Puppeteer in your project, run:</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>npm i puppeteer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td># or \"yarn add puppeteer\"</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/GoogleChrome/puppeteer</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>puppeteer-core</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         URL contributor  \\\n",
       "0  https://github.com/GoogleChrome/puppeteer   Allen Mao   \n",
       "1  https://github.com/GoogleChrome/puppeteer   Allen Mao   \n",
       "2  https://github.com/GoogleChrome/puppeteer   Allen Mao   \n",
       "3  https://github.com/GoogleChrome/puppeteer   Allen Mao   \n",
       "4  https://github.com/GoogleChrome/puppeteer   Allen Mao   \n",
       "\n",
       "                                  excerpt  \n",
       "0                            Installation  \n",
       "1  To use Puppeteer in your project, run:  \n",
       "2                         npm i puppeteer  \n",
       "3               # or \"yarn add puppeteer\"  \n",
       "4                          puppeteer-core  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row count first, then the first rows of the installation table.\n",
    "installation_count = len(installation_df)\n",
    "print(\"Number of installation entries: {}\".format(installation_count))\n",
    "installation_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of invocation entries: 1134\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>contributor</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>Usage</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>We have placed some example config files in ex...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>Train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>For Integral Human Pose Regression, cd to pyto...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>Integral Regression</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                URL contributor  \\\n",
       "0  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "1  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "2  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "3  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "4  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "\n",
       "                                             excerpt  \n",
       "0                                              Usage  \n",
       "1  We have placed some example config files in ex...  \n",
       "2                                              Train  \n",
       "3  For Integral Human Pose Regression, cd to pyto...  \n",
       "4                                Integral Regression  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row count first, then the first rows of the invocation table.\n",
    "invocation_count = len(invocation_df)\n",
    "print(\"Number of invocation entries: {}\".format(invocation_count))\n",
    "invocation_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of citation entries: 316\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>URL</th>\n",
       "      <th>contributor</th>\n",
       "      <th>excerpt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>If you find Integral Regression useful in your...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>@article{sun2017integral,</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>title={Integral human pose regression},</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>author={Sun, Xiao and Xiao, Bin and Liang, Shu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://github.com/JimmySuen/integral-human-pose</td>\n",
       "      <td>Allen Mao</td>\n",
       "      <td>journal={arXiv preprint arXiv:1711.08229},</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                URL contributor  \\\n",
       "0  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "1  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "2  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "3  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "4  https://github.com/JimmySuen/integral-human-pose   Allen Mao   \n",
       "\n",
       "                                             excerpt  \n",
       "0  If you find Integral Regression useful in your...  \n",
       "1                          @article{sun2017integral,  \n",
       "2            title={Integral human pose regression},  \n",
       "3  author={Sun, Xiao and Xiao, Bin and Liang, Shu...  \n",
       "4         journal={arXiv preprint arXiv:1711.08229},  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row count first, then the first rows of the citation table.\n",
    "citation_count = len(citation_df)\n",
    "print(\"Number of citation entries: {}\".format(citation_count))\n",
    "citation_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classifier Pipelines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Selected Category: description\n",
      "description has 336 samples;\n",
      "installation has 84 samples;\n",
      "invocation has 84 samples;\n",
      "citation has 84 samples;\n",
      "Selected Category: installation\n",
      "description has 232 samples;\n",
      "installation has 929 samples;\n",
      "invocation has 232 samples;\n",
      "citation has 232 samples;\n",
      "Selected Category: invocation\n",
      "description has 283 samples;\n",
      "installation has 283 samples;\n",
      "invocation has 1134 samples;\n",
      "citation has 283 samples;\n",
      "Selected Category: citation\n",
      "description has 79 samples;\n",
      "installation has 79 samples;\n",
      "invocation has 79 samples;\n",
      "citation has 316 samples;\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.model_selection import train_test_split #can add stratified later\n",
    "from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score\n",
    "from setup_corpus import build_corpora\n",
    "# build_corpora is imported from the local setup_corpus module. Its result is\n",
    "# iterated by category name below, and each entry is read through `.excerpt`\n",
    "# and `[category]`, so it is presumably a dict of DataFrames -- TODO confirm.\n",
    "corpora = build_corpora()\n",
    "# print(corpora)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metrics computed on every cross-validation fold; cross_validate reports the\n",
    "# scorer registered under key K as scores['test_K'].\n",
    "scoring = {'accuracy' : make_scorer(accuracy_score), \n",
    "           'precision' : make_scorer(precision_score),\n",
    "           'recall' : make_scorer(recall_score), \n",
    "           'f1_score' : make_scorer(f1_score)}\n",
    "\n",
    "def evaluate(corpora, pipeline, name, random_state=42, decimals=3):\n",
    "    \"\"\"Cross-validate `pipeline` on every category corpus and print mean scores.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    corpora : mapping of category name -> table\n",
    "        Each table is read through `.excerpt` (inputs) and `[category]`\n",
    "        (labels). Presumably the output of `build_corpora()` -- TODO confirm.\n",
    "    pipeline : scikit-learn estimator\n",
    "        Fitted in place on a stratified 80% split of each category. The\n",
    "        printed scores come from the clones that `cross_validate` fits.\n",
    "    name : str\n",
    "        Short pipeline tag, printed as `<first 3 letters of category><name>.p`.\n",
    "    random_state : int or None, default 42\n",
    "        Seed shared by the hold-out split and the fold shuffling so the data\n",
    "        splits are the same on every run. Pass None for unseeded splits.\n",
    "    decimals : int, default 3\n",
    "        Number of decimals the mean scores are rounded to.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None. Results are only printed.\n",
    "    \"\"\"\n",
    "    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)\n",
    "    for category in corpora:\n",
    "        X = corpora[category].excerpt\n",
    "        Y = corpora[category][category]\n",
    "        print(\"\\n\",category,\"X\",len(X),\"Y\",len(Y))\n",
    "        # Only the training part of the split is used; the held-out part is\n",
    "        # discarded. This fit does not influence the scores printed below.\n",
    "        X_train, _X_test, y_train, _y_test = train_test_split(\n",
    "            X, Y, stratify=Y, test_size=0.2, random_state=random_state)\n",
    "        pipeline.fit(X_train, y_train)\n",
    "        title = category[:3]+name+\".p\"\n",
    "        print(title)\n",
    "        scores = cross_validate(pipeline, X, Y, cv=cv, scoring=scoring)\n",
    "        # Average each metric over the folds and round for display.\n",
    "        means = {metric: np.around(scores[\"test_\" + metric].mean(), decimals=decimals)\n",
    "                 for metric in scoring}\n",
    "        print(\"Mean test accuracy:\", means[\"accuracy\"],\n",
    "              \"\\nPrecision\", means[\"precision\"],\n",
    "              \"\\nRecall\", means[\"recall\"],\n",
    "              \"\\nF-measure\", means[\"f1_score\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CountVectorizer + LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "descvlr.p\n",
      "Mean test accuracy: 0.821 \n",
      "Precision 0.871 \n",
      "Recall 0.81 \n",
      "F-measure 0.838\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "inscvlr.p\n",
      "Mean test accuracy: 0.877 \n",
      "Precision 0.87 \n",
      "Recall 0.924 \n",
      "F-measure 0.896\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invcvlr.p\n",
      "Mean test accuracy: 0.852 \n",
      "Precision 0.829 \n",
      "Recall 0.934 \n",
      "F-measure 0.878\n",
      "\n",
      " citation X 553 Y 553\n",
      "citcvlr.p\n",
      "Mean test accuracy: 0.877 \n",
      "Precision 0.84 \n",
      "Recall 0.971 \n",
      "F-measure 0.901\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score\n",
    "\n",
    "\n",
    "# Bag-of-words counts fed to a liblinear logistic regression.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(CountVectorizer(), LogisticRegression(solver='liblinear'))\n",
    "name = \"cvlr\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "Description: 81\n",
    "Installation: 84\n",
    "Invocation: 83\n",
    "Citation: 90\n",
    "[81 86 83 85]\n",
    "[81 86 84 86]\n",
    "[82 89 86 90]\n",
    "[82 86 85 86]\n",
    "[75 90 86 86]\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF + LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destflr.p\n",
      "Mean test accuracy: 0.828 \n",
      "Precision 0.809 \n",
      "Recall 0.92 \n",
      "F-measure 0.86\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instflr.p\n",
      "Mean test accuracy: 0.884 \n",
      "Precision 0.897 \n",
      "Recall 0.901 \n",
      "F-measure 0.899\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtflr.p\n",
      "Mean test accuracy: 0.846 \n",
      "Precision 0.824 \n",
      "Recall 0.93 \n",
      "F-measure 0.874\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittflr.p\n",
      "Mean test accuracy: 0.875 \n",
      "Precision 0.841 \n",
      "Recall 0.975 \n",
      "F-measure 0.901\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# TF-IDF features fed to a liblinear logistic regression.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))\n",
    "name = \"tflr\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF + NaiveBayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfnb.p\n",
      "Mean test accuracy: 0.784 \n",
      "Precision 0.73 \n",
      "Recall 0.991 \n",
      "F-measure 0.841\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfnb.p\n",
      "Mean test accuracy: 0.838 \n",
      "Precision 0.786 \n",
      "Recall 0.984 \n",
      "F-measure 0.874\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfnb.p\n",
      "Mean test accuracy: 0.875 \n",
      "Precision 0.853 \n",
      "Recall 0.944 \n",
      "F-measure 0.896\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfnb.p\n",
      "Mean test accuracy: 0.893 \n",
      "Precision 0.853 \n",
      "Recall 0.984 \n",
      "F-measure 0.914\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "# TF-IDF features fed to multinomial naive Bayes.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
    "name = \"tfnb\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CountVectorizer + NaiveBayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "descvnb.p\n",
      "Mean test accuracy: 0.823 \n",
      "Precision 0.778 \n",
      "Recall 0.973 \n",
      "F-measure 0.864\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "inscvnb.p\n",
      "Mean test accuracy: 0.876 \n",
      "Precision 0.841 \n",
      "Recall 0.967 \n",
      "F-measure 0.899\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invcvnb.p\n",
      "Mean test accuracy: 0.875 \n",
      "Precision 0.884 \n",
      "Recall 0.899 \n",
      "F-measure 0.892\n",
      "\n",
      " citation X 553 Y 553\n",
      "citcvnb.p\n",
      "Mean test accuracy: 0.917 \n",
      "Precision 0.893 \n",
      "Recall 0.972 \n",
      "F-measure 0.93\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "# Bag-of-words counts fed to multinomial naive Bayes.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(CountVectorizer(), MultinomialNB())\n",
    "name = \"cvnb\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CountVectorizer + BernoulliBayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "descvbb.p\n",
      "Mean test accuracy: 0.728 \n",
      "Precision 0.923 \n",
      "Recall 0.571 \n",
      "F-measure 0.703\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "inscvbb.p\n",
      "Mean test accuracy: 0.753 \n",
      "Precision 0.704 \n",
      "Recall 0.982 \n",
      "F-measure 0.82\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invcvbb.p\n",
      "Mean test accuracy: 0.76 \n",
      "Precision 0.722 \n",
      "Recall 0.944 \n",
      "F-measure 0.818\n",
      "\n",
      " citation X 553 Y 553\n",
      "citcvbb.p\n",
      "Mean test accuracy: 0.745 \n",
      "Precision 0.7 \n",
      "Recall 0.975 \n",
      "F-measure 0.814\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "# Bag-of-words counts fed to Bernoulli naive Bayes.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(CountVectorizer(), BernoulliNB())\n",
    "name = \"cvbb\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF + Stochastic Gradient Descent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfsgd.p\n",
      "Mean test accuracy: 0.852 \n",
      "Precision 0.857 \n",
      "Recall 0.89 \n",
      "F-measure 0.873\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfsgd.p\n",
      "Mean test accuracy: 0.895 \n",
      "Precision 0.911 \n",
      "Recall 0.905 \n",
      "F-measure 0.908\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfsgd.p\n",
      "Mean test accuracy: 0.867 \n",
      "Precision 0.863 \n",
      "Recall 0.912 \n",
      "F-measure 0.887\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfsgd.p\n",
      "Mean test accuracy: 0.899 \n",
      "Precision 0.864 \n",
      "Recall 0.978 \n",
      "F-measure 0.917\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# TF-IDF features fed to an SGD-trained logistic-loss classifier.\n",
    "# NOTE: scikit-learn 1.1 renamed this loss to 'log_loss' and later releases\n",
    "# reject 'log'; switch the spelling when upgrading the environment.\n",
    "# random_state is fixed (as in the Perceptron cell) so reruns are comparable.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log', random_state=0))\n",
    "name = \"tfsgd\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF + XGB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfxgb.p\n",
      "Mean test accuracy: 0.786 \n",
      "Precision 0.85 \n",
      "Recall 0.759 \n",
      "F-measure 0.801\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfxgb.p\n",
      "Mean test accuracy: 0.782 \n",
      "Precision 0.893 \n",
      "Recall 0.704 \n",
      "F-measure 0.787\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfxgb.p\n",
      "Mean test accuracy: 0.768 \n",
      "Precision 0.741 \n",
      "Recall 0.913 \n",
      "F-measure 0.818\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfxgb.p\n",
      "Mean test accuracy: 0.799 \n",
      "Precision 0.759 \n",
      "Recall 0.953 \n",
      "F-measure 0.844\n"
     ]
    }
   ],
   "source": [
    "from xgboost.sklearn import XGBClassifier\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# TF-IDF features fed to a gradient-boosted tree classifier (XGBoost defaults).\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), XGBClassifier())\n",
    "name = \"tfxgb\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Perceptron + TFIDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfper.p\n",
      "Mean test accuracy: 0.827 \n",
      "Precision 0.826 \n",
      "Recall 0.884 \n",
      "F-measure 0.853\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfper.p\n",
      "Mean test accuracy: 0.873 \n",
      "Precision 0.879 \n",
      "Recall 0.902 \n",
      "F-measure 0.89\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfper.p\n",
      "Mean test accuracy: 0.833 \n",
      "Precision 0.868 \n",
      "Recall 0.837 \n",
      "F-measure 0.851\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfper.p\n",
      "Mean test accuracy: 0.868 \n",
      "Precision 0.842 \n",
      "Recall 0.949 \n",
      "F-measure 0.892\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import Perceptron\n",
    "# TF-IDF features fed to a seeded Perceptron.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0))\n",
    "name = \"tfper\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest Classifier + TFIDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfrfc.p\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\harip\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean test accuracy: 0.752 \n",
      "Precision 0.842 \n",
      "Recall 0.7 \n",
      "F-measure 0.763\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfrfc.p\n",
      "Mean test accuracy: 0.836 \n",
      "Precision 0.9 \n",
      "Recall 0.803 \n",
      "F-measure 0.848\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfrfc.p\n",
      "Mean test accuracy: 0.794 \n",
      "Precision 0.856 \n",
      "Recall 0.769 \n",
      "F-measure 0.81\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfrfc.p\n",
      "Mean test accuracy: 0.799 \n",
      "Precision 0.767 \n",
      "Recall 0.937 \n",
      "F-measure 0.843\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "# TF-IDF features fed to a random forest.\n",
    "# n_estimators is pinned so the result does not depend on the installed\n",
    "# scikit-learn default (10 before 0.22, 100 afterwards) and the FutureWarning\n",
    "# goes away; random_state makes reruns comparable.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_estimators=100, random_state=0))\n",
    "name = \"tfrfc\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Decision Tree Classifier + TFIDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfdtc.p\n",
      "Mean test accuracy: 0.741 \n",
      "Precision 0.79 \n",
      "Recall 0.744 \n",
      "F-measure 0.765\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfdtc.p\n",
      "Mean test accuracy: 0.821 \n",
      "Precision 0.884 \n",
      "Recall 0.792 \n",
      "F-measure 0.835\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfdtc.p\n",
      "Mean test accuracy: 0.765 \n",
      "Precision 0.854 \n",
      "Recall 0.711 \n",
      "F-measure 0.776\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfdtc.p\n",
      "Mean test accuracy: 0.792 \n",
      "Precision 0.767 \n",
      "Recall 0.921 \n",
      "F-measure 0.835\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "# TF-IDF features fed to a decision tree. The vectorizer now matches the\n",
    "# section heading, the \"tfdtc\" tag and this cell's import; it was\n",
    "# CountVectorizer before. random_state makes reruns comparable.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), DecisionTreeClassifier(random_state=0))\n",
    "name = \"tfdtc\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF + AdaBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " description X 588 Y 588\n",
      "destfada.p\n",
      "Mean test accuracy: 0.78 \n",
      "Precision 0.82 \n",
      "Recall 0.789 \n",
      "F-measure 0.803\n",
      "\n",
      " installation X 1625 Y 1625\n",
      "instfada.p\n",
      "Mean test accuracy: 0.793 \n",
      "Precision 0.905 \n",
      "Recall 0.714 \n",
      "F-measure 0.798\n",
      "\n",
      " invocation X 1983 Y 1983\n",
      "invtfada.p\n",
      "Mean test accuracy: 0.774 \n",
      "Precision 0.755 \n",
      "Recall 0.893 \n",
      "F-measure 0.819\n",
      "\n",
      " citation X 553 Y 553\n",
      "cittfada.p\n",
      "Mean test accuracy: 0.836 \n",
      "Precision 0.92 \n",
      "Recall 0.806 \n",
      "F-measure 0.847\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "# TF-IDF features fed to AdaBoost with its default base estimator.\n",
    "# random_state makes reruns comparable.\n",
    "# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
    "pipeline = make_pipeline(TfidfVectorizer(), AdaBoostClassifier(random_state=0))\n",
    "name = \"tfada\"\n",
    "evaluate(corpora,pipeline,name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}