{ "cells": [ { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "import random\n", "import pandas as pd\n", "import nltk\n", "# nltk.download('treebank')\n", "from nltk.corpus import treebank\n", "from sklearn.model_selection import train_test_split\n", "\n", "description_df = pd.read_csv('./data/description.csv')\n", "installation_df = pd.read_csv('./data/installation.csv')\n", "invocation_df = pd.read_csv('./data/invocation.csv')\n", "citation_df = pd.read_csv('./data/citation.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preview\n", "Make sure that csv data has been successfully imported." ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of description entries: 336\n", "Number of installation entries: 929\n", "Number of invocation entries: 1134\n", "Number of citation entries: 316\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLcontributorexcerpt
0https://github.com/GoogleChrome/puppeteerAllen MaoPuppeteer is a Node library which provides a h...
1https://github.com/JimmySuen/integral-human-poseAllen MaoThe major contributors of this repository incl...
2https://github.com/JimmySuen/integral-human-poseAllen MaoIntegral Regression is initially described in ...
3https://github.com/JimmySuen/integral-human-poseAllen MaoWe build a 3D pose estimation system based mai...
4https://github.com/JimmySuen/integral-human-poseAllen MaoThe Integral Regression is also known as soft-...
\n", "
" ], "text/plain": [ " URL contributor \\\n", "0 https://github.com/GoogleChrome/puppeteer Allen Mao \n", "1 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "2 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "3 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "4 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "\n", " excerpt \n", "0 Puppeteer is a Node library which provides a h... \n", "1 The major contributors of this repository incl... \n", "2 Integral Regression is initially described in ... \n", "3 We build a 3D pose estimation system based mai... \n", "4 The Integral Regression is also known as soft-... " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Size of each annotated category, then the first five description excerpts.\n",
"print(\"Number of description entries: {}\".format(description_df.shape[0]))\n",
"print(\"Number of installation entries: {}\".format(installation_df.shape[0]))\n",
"print(\"Number of invocation entries: {}\".format(invocation_df.shape[0]))\n",
"print(\"Number of citation entries: {}\".format(citation_df.shape[0]))\n",
"description_df.head(n=5)"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of installation entries: 929\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLcontributorexcerpt
0https://github.com/GoogleChrome/puppeteerAllen MaoInstallation
1https://github.com/GoogleChrome/puppeteerAllen MaoTo use Puppeteer in your project, run:
2https://github.com/GoogleChrome/puppeteerAllen Maonpm i puppeteer
3https://github.com/GoogleChrome/puppeteerAllen Mao# or \"yarn add puppeteer\"
4https://github.com/GoogleChrome/puppeteerAllen Maopuppeteer-core
\n", "
" ], "text/plain": [ " URL contributor \\\n", "0 https://github.com/GoogleChrome/puppeteer Allen Mao \n", "1 https://github.com/GoogleChrome/puppeteer Allen Mao \n", "2 https://github.com/GoogleChrome/puppeteer Allen Mao \n", "3 https://github.com/GoogleChrome/puppeteer Allen Mao \n", "4 https://github.com/GoogleChrome/puppeteer Allen Mao \n", "\n", " excerpt \n", "0 Installation \n", "1 To use Puppeteer in your project, run: \n", "2 npm i puppeteer \n", "3 # or \"yarn add puppeteer\" \n", "4 puppeteer-core " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Size of the installation corpus, then its first five rows.\n",
"print(\"Number of installation entries: {}\".format(installation_df.shape[0]))\n",
"installation_df.head(n=5)"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of invocation entries: 1134\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLcontributorexcerpt
0https://github.com/JimmySuen/integral-human-poseAllen MaoUsage
1https://github.com/JimmySuen/integral-human-poseAllen MaoWe have placed some example config files in ex...
2https://github.com/JimmySuen/integral-human-poseAllen MaoTrain
3https://github.com/JimmySuen/integral-human-poseAllen MaoFor Integral Human Pose Regression, cd to pyto...
4https://github.com/JimmySuen/integral-human-poseAllen MaoIntegral Regression
\n", "
" ], "text/plain": [ " URL contributor \\\n", "0 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "1 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "2 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "3 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "4 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "\n", " excerpt \n", "0 Usage \n", "1 We have placed some example config files in ex... \n", "2 Train \n", "3 For Integral Human Pose Regression, cd to pyto... \n", "4 Integral Regression " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Size of the invocation corpus, then its first five rows.\n",
"print(\"Number of invocation entries: {}\".format(invocation_df.shape[0]))\n",
"invocation_df.head(n=5)"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of citation entries: 316\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLcontributorexcerpt
0https://github.com/JimmySuen/integral-human-poseAllen MaoIf you find Integral Regression useful in your...
1https://github.com/JimmySuen/integral-human-poseAllen Mao@article{sun2017integral,
2https://github.com/JimmySuen/integral-human-poseAllen Maotitle={Integral human pose regression},
3https://github.com/JimmySuen/integral-human-poseAllen Maoauthor={Sun, Xiao and Xiao, Bin and Liang, Shu...
4https://github.com/JimmySuen/integral-human-poseAllen Maojournal={arXiv preprint arXiv:1711.08229},
\n", "
" ], "text/plain": [ " URL contributor \\\n", "0 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "1 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "2 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "3 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "4 https://github.com/JimmySuen/integral-human-pose Allen Mao \n", "\n", " excerpt \n", "0 If you find Integral Regression useful in your... \n", "1 @article{sun2017integral, \n", "2 title={Integral human pose regression}, \n", "3 author={Sun, Xiao and Xiao, Bin and Liang, Shu... \n", "4 journal={arXiv preprint arXiv:1711.08229}, " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Size of the citation corpus, then its first five rows.\n",
"print(\"Number of citation entries: {}\".format(citation_df.shape[0]))\n",
"citation_df.head(n=5)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Classifier Pipelines" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Selected Category: description\n", "description has 336 samples;\n", "installation has 84 samples;\n", "invocation has 84 samples;\n", "citation has 84 samples;\n", "Selected Category: installation\n", "description has 232 samples;\n", "installation has 929 samples;\n", "invocation has 232 samples;\n", "citation has 232 samples;\n", "Selected Category: invocation\n", "description has 283 samples;\n", "installation has 283 samples;\n", "invocation has 1134 samples;\n", "citation has 283 samples;\n", "Selected Category: citation\n", "description has 79 samples;\n", "installation has 79 samples;\n", "invocation has 79 samples;\n", "citation has 316 samples;\n" ] } ], "source": [ "import numpy as np\n", "import pickle\n", "from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.model_selection import train_test_split #can add stratified 
later\n", "from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score\n", "from setup_corpus import build_corpora\n", "corpora = build_corpora()\n", "# print(corpora)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [
"# Scorers handed to cross_validate; the keys become the 'test_<key>' entries of its result.\n",
"scoring = {'accuracy' : make_scorer(accuracy_score), \n",
"           'precision' : make_scorer(precision_score),\n",
"           'recall' : make_scorer(recall_score), \n",
"           'f1_score' : make_scorer(f1_score)}\n",
"\n",
"def evaluate(corpora, pipeline, name, n_splits=5, decimals=3, random_state=None):\n",
"    \"\"\"Print cross-validated accuracy/precision/recall/F1 of `pipeline` per category.\n",
"\n",
"    For every key `category` of `corpora`, `corpora[category].excerpt` is the\n",
"    input and `corpora[category][category]` the target. The scorers of the\n",
"    module-level `scoring` dict are averaged over the folds and printed.\n",
"\n",
"    No separate hold-out fit is performed: cross_validate clones and fits the\n",
"    pipeline for each fold itself, so the object passed in stays unfitted.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    corpora : mapping of category name -> frame with an `excerpt` column and\n",
"        a column named like the category\n",
"    pipeline : scikit-learn estimator to evaluate\n",
"    name : str, short pipeline tag; only used in the printed title\n",
"    n_splits : int, number of stratified folds (default 5)\n",
"    decimals : int, rounding applied to the printed means (default 3)\n",
"    random_state : int or None, seed for the fold shuffling. The default None\n",
"        keeps the previous unseeded behaviour; pass an int for repeatable numbers.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None. Results are printed only.\n",
"    \"\"\"\n",
"    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)\n",
"    for category in corpora:\n",
"        X = corpora[category].excerpt\n",
"        Y = corpora[category][category]\n",
"        print(\"\\n\",category,\"X\",len(X),\"Y\",len(Y))\n",
"        # Title keeps the '<cat><name>.p' form printed by earlier runs.\n",
"        print(category[:3]+name+\".p\")\n",
"        scores = cross_validate(pipeline, X, Y, cv=folds, scoring=scoring)\n",
"        # Mean of each scorer over the folds, rounded for display.\n",
"        means = {metric: np.around(scores[\"test_\"+metric].mean(), decimals=decimals)\n",
"                 for metric in scoring}\n",
"        print(\"Mean test accuracy:\",means[\"accuracy\"],\"\\nPrecision\",means[\"precision\"],\"\\nRecall\",means[\"recall\"],\"\\nF-measure\",means[\"f1_score\"])\n"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CountVectorizer + LogisticRegression" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "descvlr.p\n", "Mean test accuracy: 0.821 \n", "Precision 0.871 \n", "Recall 0.81 \n", "F-measure 0.838\n", "\n", " installation X 1625 Y 1625\n", "inscvlr.p\n", "Mean test accuracy: 0.877 \n", "Precision 0.87 \n", "Recall 0.924 \n", "F-measure 0.896\n", "\n", " invocation X 1983 Y 1983\n", "invcvlr.p\n", "Mean test accuracy: 0.852 \n", "Precision 0.829 \n", "Recall 0.934 \n", 
"F-measure 0.878\n", "\n", " citation X 553 Y 553\n", "citcvlr.p\n", "Mean test accuracy: 0.877 \n", "Precision 0.84 \n", "Recall 0.971 \n", "F-measure 0.901\n" ] } ], "source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score\n",
"\n",
"# Bag-of-words counts fed to a liblinear logistic regression.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(CountVectorizer(), LogisticRegression(solver='liblinear'))\n",
"name = \"cvlr\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Description: 81\n", "Installation: 84\n", "Invocation: 83\n", "Citation: 90\n", "[81 86 83 85]\n", "[81 86 84 86]\n", "[82 89 86 90]\n", "[82 86 85 86]\n", "[75 90 86 86]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TFIDF + LogisticRegression" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destflr.p\n", "Mean test accuracy: 0.828 \n", "Precision 0.809 \n", "Recall 0.92 \n", "F-measure 0.86\n", "\n", " installation X 1625 Y 1625\n", "instflr.p\n", "Mean test accuracy: 0.884 \n", "Precision 0.897 \n", "Recall 0.901 \n", "F-measure 0.899\n", "\n", " invocation X 1983 Y 1983\n", "invtflr.p\n", "Mean test accuracy: 0.846 \n", "Precision 0.824 \n", "Recall 0.93 \n", "F-measure 0.874\n", "\n", " citation X 553 Y 553\n", "cittflr.p\n", "Mean test accuracy: 0.875 \n", "Precision 0.841 \n", "Recall 0.975 \n", "F-measure 0.901\n" ] } ], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# TF-IDF weights fed to a liblinear logistic regression.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))\n",
"name = \"tflr\"\n",
"evaluate(corpora,pipeline,name)"
] 
}, { "cell_type": "markdown", "metadata": {}, "source": [ "## TFIDF + NaiveBayes" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destfnb.p\n", "Mean test accuracy: 0.784 \n", "Precision 0.73 \n", "Recall 0.991 \n", "F-measure 0.841\n", "\n", " installation X 1625 Y 1625\n", "instfnb.p\n", "Mean test accuracy: 0.838 \n", "Precision 0.786 \n", "Recall 0.984 \n", "F-measure 0.874\n", "\n", " invocation X 1983 Y 1983\n", "invtfnb.p\n", "Mean test accuracy: 0.875 \n", "Precision 0.853 \n", "Recall 0.944 \n", "F-measure 0.896\n", "\n", " citation X 553 Y 553\n", "cittfnb.p\n", "Mean test accuracy: 0.893 \n", "Precision 0.853 \n", "Recall 0.984 \n", "F-measure 0.914\n" ] } ], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"# TF-IDF weights fed to multinomial naive Bayes.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
"name = \"tfnb\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CountVectorizer + NaiveBayes" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "descvnb.p\n", "Mean test accuracy: 0.823 \n", "Precision 0.778 \n", "Recall 0.973 \n", "F-measure 0.864\n", "\n", " installation X 1625 Y 1625\n", "inscvnb.p\n", "Mean test accuracy: 0.876 \n", "Precision 0.841 \n", "Recall 0.967 \n", "F-measure 0.899\n", "\n", " invocation X 1983 Y 1983\n", "invcvnb.p\n", "Mean test accuracy: 0.875 \n", "Precision 0.884 \n", "Recall 0.899 \n", "F-measure 0.892\n", "\n", " citation X 553 Y 553\n", "citcvnb.p\n", "Mean test accuracy: 0.917 \n", "Precision 0.893 \n", "Recall 0.972 \n", "F-measure 0.93\n" ] } ], "source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"# Bag-of-words counts fed to multinomial naive Bayes.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(CountVectorizer(), MultinomialNB())\n",
"name = \"cvnb\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CountVectorizer + BernoulliBayes" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "descvbb.p\n", "Mean test accuracy: 0.728 \n", "Precision 0.923 \n", "Recall 0.571 \n", "F-measure 0.703\n", "\n", " installation X 1625 Y 1625\n", "inscvbb.p\n", "Mean test accuracy: 0.753 \n", "Precision 0.704 \n", "Recall 0.982 \n", "F-measure 0.82\n", "\n", " invocation X 1983 Y 1983\n", "invcvbb.p\n", "Mean test accuracy: 0.76 \n", "Precision 0.722 \n", "Recall 0.944 \n", "F-measure 0.818\n", "\n", " citation X 553 Y 553\n", "citcvbb.p\n", "Mean test accuracy: 0.745 \n", "Precision 0.7 \n", "Recall 0.975 \n", "F-measure 0.814\n" ] } ], "source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.naive_bayes import BernoulliNB\n",
"\n",
"# Bag-of-words counts fed to Bernoulli naive Bayes.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(CountVectorizer(), BernoulliNB())\n",
"name = \"cvbb\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TFIDF + Stochastic Gradient Descent" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destfsgd.p\n", "Mean test accuracy: 0.852 \n", "Precision 0.857 \n", "Recall 0.89 \n", "F-measure 0.873\n", "\n", " installation X 1625 Y 1625\n", "instfsgd.p\n", "Mean test accuracy: 0.895 \n", "Precision 0.911 \n", "Recall 0.905 
\n", "F-measure 0.908\n", "\n", " invocation X 1983 Y 1983\n", "invtfsgd.p\n", "Mean test accuracy: 0.867 \n", "Precision 0.863 \n", "Recall 0.912 \n", "F-measure 0.887\n", "\n", " citation X 553 Y 553\n", "cittfsgd.p\n", "Mean test accuracy: 0.899 \n", "Precision 0.864 \n", "Recall 0.978 \n", "F-measure 0.917\n" ] } ], "source": [
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# TF-IDF weights fed to a logistic-loss linear model trained with SGD.\n",
"# NOTE(review): newer scikit-learn releases spell this loss 'log_loss'; 'log' matches\n",
"# the older install this notebook was run with - confirm before upgrading.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log'))\n",
"name = \"tfsgd\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TFIDF + XGB" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destfxgb.p\n", "Mean test accuracy: 0.786 \n", "Precision 0.85 \n", "Recall 0.759 \n", "F-measure 0.801\n", "\n", " installation X 1625 Y 1625\n", "instfxgb.p\n", "Mean test accuracy: 0.782 \n", "Precision 0.893 \n", "Recall 0.704 \n", "F-measure 0.787\n", "\n", " invocation X 1983 Y 1983\n", "invtfxgb.p\n", "Mean test accuracy: 0.768 \n", "Precision 0.741 \n", "Recall 0.913 \n", "F-measure 0.818\n", "\n", " citation X 553 Y 553\n", "cittfxgb.p\n", "Mean test accuracy: 0.799 \n", "Precision 0.759 \n", "Recall 0.953 \n", "F-measure 0.844\n" ] } ], "source": [
"from xgboost.sklearn import XGBClassifier\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# TF-IDF weights fed to a default XGBoost classifier.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), XGBClassifier())\n",
"name = \"tfxgb\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Perceptron + TFIDF" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 
588\n", "destfper.p\n", "Mean test accuracy: 0.827 \n", "Precision 0.826 \n", "Recall 0.884 \n", "F-measure 0.853\n", "\n", " installation X 1625 Y 1625\n", "instfper.p\n", "Mean test accuracy: 0.873 \n", "Precision 0.879 \n", "Recall 0.902 \n", "F-measure 0.89\n", "\n", " invocation X 1983 Y 1983\n", "invtfper.p\n", "Mean test accuracy: 0.833 \n", "Precision 0.868 \n", "Recall 0.837 \n", "F-measure 0.851\n", "\n", " citation X 553 Y 553\n", "cittfper.p\n", "Mean test accuracy: 0.868 \n", "Precision 0.842 \n", "Recall 0.949 \n", "F-measure 0.892\n" ] } ], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import Perceptron\n",
"\n",
"# TF-IDF weights fed to a perceptron with a fixed random_state.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0))\n",
"name = \"tfper\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random Forest Classifier +TFIDF" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destfrfc.p\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\harip\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Mean test accuracy: 0.752 \n", "Precision 0.842 \n", "Recall 0.7 \n", "F-measure 0.763\n", "\n", " installation X 1625 Y 1625\n", "instfrfc.p\n", "Mean test accuracy: 0.836 \n", "Precision 0.9 \n", "Recall 0.803 \n", "F-measure 0.848\n", "\n", " invocation X 1983 Y 1983\n", "invtfrfc.p\n", "Mean test accuracy: 0.794 \n", "Precision 0.856 \n", "Recall 0.769 \n", "F-measure 0.81\n", "\n", " citation X 553 Y 553\n", "cittfrfc.p\n", "Mean test accuracy: 0.799 
\n", "Precision 0.767 \n", "Recall 0.937 \n", "F-measure 0.843\n" ] } ], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# TF-IDF weights fed to a default random forest.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())\n",
"name = \"tfrfc\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Decision Tree Classifier +TFIDF" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destfdtc.p\n", "Mean test accuracy: 0.741 \n", "Precision 0.79 \n", "Recall 0.744 \n", "F-measure 0.765\n", "\n", " installation X 1625 Y 1625\n", "instfdtc.p\n", "Mean test accuracy: 0.821 \n", "Precision 0.884 \n", "Recall 0.792 \n", "F-measure 0.835\n", "\n", " invocation X 1983 Y 1983\n", "invtfdtc.p\n", "Mean test accuracy: 0.765 \n", "Precision 0.854 \n", "Recall 0.711 \n", "F-measure 0.776\n", "\n", " citation X 553 Y 553\n", "cittfdtc.p\n", "Mean test accuracy: 0.792 \n", "Precision 0.767 \n", "Recall 0.921 \n", "F-measure 0.835\n" ] } ], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"# Fixed: this cell is titled and tagged TF-IDF (\"tfdtc\") but built its pipeline\n",
"# with CountVectorizer. The stored output of this cell predates the fix and was\n",
"# produced with CountVectorizer - re-run the cell to refresh the numbers.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), DecisionTreeClassifier())\n",
"name = \"tfdtc\"\n",
"evaluate(corpora,pipeline,name)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TFIDF + AdaBoostClassifier" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " description X 588 Y 588\n", "destfada.p\n", "Mean test accuracy: 0.78 \n", "Precision 0.82 \n", "Recall 0.789 \n", "F-measure 0.803\n", "\n", " installation X 1625 Y 1625\n", "instfada.p\n", "Mean test 
accuracy: 0.793 \n", "Precision 0.905 \n", "Recall 0.714 \n", "F-measure 0.798\n", "\n", " invocation X 1983 Y 1983\n", "invtfada.p\n", "Mean test accuracy: 0.774 \n", "Precision 0.755 \n", "Recall 0.893 \n", "F-measure 0.819\n", "\n", " citation X 553 Y 553\n", "cittfada.p\n", "Mean test accuracy: 0.836 \n", "Precision 0.92 \n", "Recall 0.806 \n", "F-measure 0.847\n" ] } ], "source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"\n",
"# TF-IDF weights fed to a default AdaBoost ensemble.\n",
"# evaluate() builds its own StratifiedKFold, so no splitter is created here.\n",
"pipeline = make_pipeline(TfidfVectorizer(), AdaBoostClassifier())\n",
"name = \"tfada\"\n",
"evaluate(corpora,pipeline,name)"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }