{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Experiments\n", "TODO: 24-27 June 2019:\n", "* Create pipeline\n", " * with initial unigrams baseline\n", " * accuracy measure (e.g. precision-recall with AUROC)\n", "## Setup environment:\n", "* import libraries\n", "* load csv data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import random\n", "import pandas as pd\n", "from nltk.corpus import treebank\n", "from sklearn.model_selection import train_test_split\n", "\n", "description_df = pd.read_csv('./data/description.csv')\n", "installation_df = pd.read_csv('./data/installation.csv')\n", "invocation_df = pd.read_csv('./data/invocation.csv')\n", "citation_df = pd.read_csv('./data/citation.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preview\n", "Make sure that csv data has been successfully imported." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of description entries: 281\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLexcerpt
0https://github.com/GoogleChrome/puppeteerPuppeteer is a Node library which provides a h...
1https://github.com/JimmySuen/integral-human-poseThe major contributors of this repository incl...
2https://github.com/JimmySuen/integral-human-poseIntegral Regression is initially described in ...
3https://github.com/JimmySuen/integral-human-poseWe build a 3D pose estimation system based mai...
4https://github.com/JimmySuen/integral-human-poseThe Integral Regression is also known as soft-...
\n", "
" ], "text/plain": [ " URL \\\n", "0 https://github.com/GoogleChrome/puppeteer \n", "1 https://github.com/JimmySuen/integral-human-pose \n", "2 https://github.com/JimmySuen/integral-human-pose \n", "3 https://github.com/JimmySuen/integral-human-pose \n", "4 https://github.com/JimmySuen/integral-human-pose \n", "\n", " excerpt \n", "0 Puppeteer is a Node library which provides a h... \n", "1 The major contributors of this repository incl... \n", "2 Integral Regression is initially described in ... \n", "3 We build a 3D pose estimation system based mai... \n", "4 The Integral Regression is also known as soft-... " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Number of description entries: {}\".format(len(description_df)))\n", "description_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of installation entries: 800\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLexcerpt
0https://github.com/GoogleChrome/puppeteerInstallation
1https://github.com/GoogleChrome/puppeteerTo use Puppeteer in your project, run:
2https://github.com/GoogleChrome/puppeteernpm i puppeteer
3https://github.com/GoogleChrome/puppeteer# or \"yarn add puppeteer\"
4https://github.com/GoogleChrome/puppeteerpuppeteer-core
\n", "
" ], "text/plain": [ " URL \\\n", "0 https://github.com/GoogleChrome/puppeteer \n", "1 https://github.com/GoogleChrome/puppeteer \n", "2 https://github.com/GoogleChrome/puppeteer \n", "3 https://github.com/GoogleChrome/puppeteer \n", "4 https://github.com/GoogleChrome/puppeteer \n", "\n", " excerpt \n", "0 Installation \n", "1 To use Puppeteer in your project, run: \n", "2 npm i puppeteer \n", "3 # or \"yarn add puppeteer\" \n", "4 puppeteer-core " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Number of installation entries: {}\".format(len(installation_df)))\n", "installation_df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of invocation entries: 1118\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLexcerpt
0https://github.com/JimmySuen/integral-human-poseUsage
1https://github.com/JimmySuen/integral-human-poseWe have placed some example config files in ex...
2https://github.com/JimmySuen/integral-human-poseTrain
3https://github.com/JimmySuen/integral-human-poseFor Integral Human Pose Regression, cd to pyto...
4https://github.com/JimmySuen/integral-human-poseIntegral Regression
\n", "
" ], "text/plain": [ " URL \\\n", "0 https://github.com/JimmySuen/integral-human-pose \n", "1 https://github.com/JimmySuen/integral-human-pose \n", "2 https://github.com/JimmySuen/integral-human-pose \n", "3 https://github.com/JimmySuen/integral-human-pose \n", "4 https://github.com/JimmySuen/integral-human-pose \n", "\n", " excerpt \n", "0 Usage \n", "1 We have placed some example config files in ex... \n", "2 Train \n", "3 For Integral Human Pose Regression, cd to pyto... \n", "4 Integral Regression " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Number of invocation entries: {}\".format(len(invocation_df)))\n", "invocation_df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of citation entries: 309\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLexcerpt
0https://github.com/JimmySuen/integral-human-poseIf you find Integral Regression useful in your...
1https://github.com/JimmySuen/integral-human-pose@article{sun2017integral,
2https://github.com/JimmySuen/integral-human-posetitle={Integral human pose regression},
3https://github.com/JimmySuen/integral-human-poseauthor={Sun, Xiao and Xiao, Bin and Liang, Shu...
4https://github.com/JimmySuen/integral-human-posejournal={arXiv preprint arXiv:1711.08229},
\n", "
" ], "text/plain": [ " URL \\\n", "0 https://github.com/JimmySuen/integral-human-pose \n", "1 https://github.com/JimmySuen/integral-human-pose \n", "2 https://github.com/JimmySuen/integral-human-pose \n", "3 https://github.com/JimmySuen/integral-human-pose \n", "4 https://github.com/JimmySuen/integral-human-pose \n", "\n", " excerpt \n", "0 If you find Integral Regression useful in your... \n", "1 @article{sun2017integral, \n", "2 title={Integral human pose regression}, \n", "3 author={Sun, Xiao and Xiao, Bin and Liang, Shu... \n", "4 journal={arXiv preprint arXiv:1711.08229}, " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Number of citation entries: {}\".format(len(citation_df)))\n", "citation_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Each data set currently contains positive samples of its respective trait. However, negative samples are necessary so that the classifier can distinguish the positives from a control. Per category, negative samples include those from the other categories and also text samples completely unrelated to repository information. For example, in the description classifier, positive samples are those labelled as a description, and negative samples include those labelled as an installation, invocation, or citation, in addition to nonpertinent text such as the Treebank corpus.\n", "\n", "As there are many more negative samples than there are positive samples, randomly selected negative samples will be used. The aim is for about 40% positive and 60% negative. The 60% negative is split evenly: 15% from each of the three other categories and 15% from random text (e.g. Treebank).\n", "\n", "*Question: Treebank sentences are already tokenized / split by word. 
Does nltk have sentences not already split or is it possible to utilize the already split state of the sentences for later tokenizer usage?*\n", "## Description Classifier" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
excerptdescription
0Puppeteer is a Node library which provides a h...True
1The major contributors of this repository incl...True
2Integral Regression is initially described in ...True
3We build a 3D pose estimation system based mai...True
4The Integral Regression is also known as soft-...True
5This is an official implementation for Integra...True
6The original implementation is based on our in...True
7LibGEOS is a LGPL-licensed package for manipul...True
8Among other things, it allows you to parse Wel...True
9This repository contains the experiments in th...True
10For the results presented in the paper, we did...True
11Batch normalization is currently not supported...True
12Open-source Ground Penetrating Radar processin...True
13Pytorch implementation for high-resolution (e....True
14The PVGeo Python package contains VTK powered ...True
15A PyVista (and VTK) interface for the Open Min...True
16GeoNotebook is an application that provides cl...True
17Fiona is OGR's neat and nimble API for Python ...True
18Fiona is designed to be simple and dependable....True
19Shapely is a BSD-licensed Python package for m...True
20Rain streaks can severely degrade the visibili...True
21The pytorch branch contains:True
22the pytorch implementation of Peak Response Ma...True
23the PASCAL-VOC demo (training, inference, and ...True
24Lithology and stratigraphic logs for wells and...True
25This Python module allows you to:True
26Interactively control an instance of ANSYS v14...True
27Extract data directly from binary ANSYS v14.5+...True
28Rapidly read in binary result (.rst), binary m...True
29Official implementation of GANimation. In this...True
.........
670A Department of Health and Human Services rule...False
671But Mr. Hahn rose swiftly through the ranks , ...False
672AT&T FAX :False
673And many emerging markets have outpaced more m...False
674`` * Remember Pinocchio ? '' says *T*-1 a fema...False
675*-1 Currently a $ 300 million-a-year business ...False
676Koito has refused *-1 to grant Mr. Pickens sea...False
677The market again showed little interest in fur...False
678The idea , of course : * to prove to 125 corpo...False
679Because of deteriorating hearing , she told co...False
680And construction also was described *-101 as s...False
681The restrictions on viewing and dissemination ...False
682Whereas conventional securities financings are...False
683What *T*-102 's more , the test and Learning M...False
684But Robert R. Murray , a special master appoin...False
685Sales in stores open more than one year rose 3...False
686`` You 'd see her correcting homework in the s...False
687The ban on cross-border movement was imposed *...False
688Perhaps none of the unconstitutional condition...False
689A steady deposit base .False
690Buick approached American Express about a join...False
691Kalamazoo , Mich.-based First of America said ...False
692Michael R. Bromwich , a member since January 1...False
693Terms were n't disclosed *-1 .False
694The ultimate goal of any investor is a profit ...False
695Mr. Trump withdrew a $ 120-a-share *U* bid las...False
696On Wall Street men and women walk with great p...False
697One claims 0 he 's pro-choice .False
698Another was Nancy Yeargin , who *T*-89 came to...False
699* Think about what *T*-1 causes the difference...False
\n", "

700 rows × 2 columns

\n", "
" ], "text/plain": [ " excerpt description\n", "0 Puppeteer is a Node library which provides a h... True\n", "1 The major contributors of this repository incl... True\n", "2 Integral Regression is initially described in ... True\n", "3 We build a 3D pose estimation system based mai... True\n", "4 The Integral Regression is also known as soft-... True\n", "5 This is an official implementation for Integra... True\n", "6 The original implementation is based on our in... True\n", "7 LibGEOS is a LGPL-licensed package for manipul... True\n", "8 Among other things, it allows you to parse Wel... True\n", "9 This repository contains the experiments in th... True\n", "10 For the results presented in the paper, we did... True\n", "11 Batch normalization is currently not supported... True\n", "12 Open-source Ground Penetrating Radar processin... True\n", "13 Pytorch implementation for high-resolution (e.... True\n", "14 The PVGeo Python package contains VTK powered ... True\n", "15 A PyVista (and VTK) interface for the Open Min... True\n", "16 GeoNotebook is an application that provides cl... True\n", "17 Fiona is OGR's neat and nimble API for Python ... True\n", "18 Fiona is designed to be simple and dependable.... True\n", "19 Shapely is a BSD-licensed Python package for m... True\n", "20 Rain streaks can severely degrade the visibili... True\n", "21 The pytorch branch contains: True\n", "22 the pytorch implementation of Peak Response Ma... True\n", "23 the PASCAL-VOC demo (training, inference, and ... True\n", "24 Lithology and stratigraphic logs for wells and... True\n", "25 This Python module allows you to: True\n", "26 Interactively control an instance of ANSYS v14... True\n", "27 Extract data directly from binary ANSYS v14.5+... True\n", "28 Rapidly read in binary result (.rst), binary m... True\n", "29 Official implementation of GANimation. In this... True\n", ".. ... ...\n", "670 A Department of Health and Human Services rule... False\n", "671 But Mr. 
Hahn rose swiftly through the ranks , ... False\n", "672 AT&T FAX : False\n", "673 And many emerging markets have outpaced more m... False\n", "674 `` * Remember Pinocchio ? '' says *T*-1 a fema... False\n", "675 *-1 Currently a $ 300 million-a-year business ... False\n", "676 Koito has refused *-1 to grant Mr. Pickens sea... False\n", "677 The market again showed little interest in fur... False\n", "678 The idea , of course : * to prove to 125 corpo... False\n", "679 Because of deteriorating hearing , she told co... False\n", "680 And construction also was described *-101 as s... False\n", "681 The restrictions on viewing and dissemination ... False\n", "682 Whereas conventional securities financings are... False\n", "683 What *T*-102 's more , the test and Learning M... False\n", "684 But Robert R. Murray , a special master appoin... False\n", "685 Sales in stores open more than one year rose 3... False\n", "686 `` You 'd see her correcting homework in the s... False\n", "687 The ban on cross-border movement was imposed *... False\n", "688 Perhaps none of the unconstitutional condition... False\n", "689 A steady deposit base . False\n", "690 Buick approached American Express about a join... False\n", "691 Kalamazoo , Mich.-based First of America said ... False\n", "692 Michael R. Bromwich , a member since January 1... False\n", "693 Terms were n't disclosed *-1 . False\n", "694 The ultimate goal of any investor is a profit ... False\n", "695 Mr. Trump withdrew a $ 120-a-share *U* bid las... False\n", "696 On Wall Street men and women walk with great p... False\n", "697 One claims 0 he 's pro-choice . False\n", "698 Another was Nancy Yeargin , who *T*-89 came to... False\n", "699 * Think about what *T*-1 causes the difference... 
False\n", "\n", "[700 rows x 2 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fixed seed so the sampled negatives and the train/test split are reproducible on Restart and Run All.\n", "SEED = 42\n", "random.seed(SEED)\n", "\n", "# 0.375 x the positive count per negative source; four sources give roughly 40% positive / 60% negative.\n", "neg_quant = int(len(description_df) * .375)\n", "\n", "# Treebank sentences are lists of tokens, so join each one back into a single string.\n", "treebank_sentences = [' '.join(sent) for sent in random.sample(list(treebank.sents()), neg_quant)]\n", "treebank_background = pd.DataFrame(treebank_sentences, columns=[\"excerpt\"]).assign(description=False)\n", "\n", "# Positives are the description excerpts; negatives are equal-sized samples of the other three\n", "# categories plus the Treebank background text.\n", "description_corpus = (\n", "    pd.concat(\n", "        [\n", "            description_df.assign(description=True),\n", "            installation_df.sample(neg_quant, random_state=SEED).assign(description=False),\n", "            invocation_df.sample(neg_quant, random_state=SEED).assign(description=False),\n", "            citation_df.sample(neg_quant, random_state=SEED).assign(description=False),\n", "            treebank_background,\n", "        ],\n", "        sort=False,\n", "    )\n", "    # Keyword arguments: passing axis positionally to drop/dropna was removed in pandas 2.0.\n", "    .drop(columns='URL')\n", "    .dropna(axis=0)\n", "    .reset_index(drop=True)\n", ")\n", "description_corpus" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Description Classifier pipeline\n", "### Train-test split" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "X, y = description_corpus.excerpt, description_corpus.description\n", "# random_state reuses the SEED defined with the corpus so the split is the same on every run.\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Count Vectorizer and Logistic Regression in Pipeline" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " x_test y_TF_pred y_actual\n", "488 tin = _meshfix.PyTMesh() False False\n", "597 Lord Chilver , 63-year-old chairman of English... False False\n", "686 `` You 'd see her correcting homework in the s... False False\n", "417 header False False\n", "529 title = {{PyVista}: 3D plotting and mesh analy... False False\n", "566 @inproceedings{pumarola2018ganimation, False False\n", "282 pip install opencv-python==3.2.0.6 False False\n", "361 pip install empymod False False\n", "365 A C++ compiler for the Python extension, and C... 
True False\n", "2 Integral Regression is initially described in ... False True\n", "561 booktitle = {Proceedings of the International ... False False\n", "101 The writing functionality in segyio is largely... True True\n", "595 `` You either believe 0 Seymour can do it agai... False False\n", "456 Semantic Segmentation with Deeplab-Resnet False False\n", "492 Key Laboratory of Machine Perception, Shenzhen... False False\n", "193 The goal of Tippecanoe is to enable making a s... True True\n", "50 Finally e also provide precompiled Docker imag... True True\n", "65 Calculates the complete (diffusion and wave ph... True True\n", "430 and as CurveItem objects with associated metad... False False\n", "596 If the money manager performing this service i... False False\n", "303 From source at GitHub False False\n", "44 New developments in the field of augmented rea... True True\n", "347 matplotlib False False\n", "331 Install the usual way: False False\n", "434 tensorboard --logdir logs False False\n", "320 Installing apsg from the conda-forge channel c... False False\n", "616 `` There 's no question that some of those wor... False False\n", "496 Yu, (2018). PyGeoPressure: Geopressure Predict... False False\n", "559 Fast End-to-End Trainable Guided Filter False False\n", "91 Segyio is a small LGPL licensed C library for ... True True\n", ".. ... ... ...\n", "369 Install python3.6 and pytorch 3. I recommend t... False False\n", "626 Net income surged 31 % to 7.63 billion yen fro... False False\n", "527 year={2018} False False\n", "387 ~/ False False\n", "78 Complete full-space (electric and magnetic sou... True True\n", "572 Year = {2017} False False\n", "156 The file read parameters are based on GSSI's D... True True\n", "258 Tilematrix handles geographic web tiles and ti... True True\n", "103 Segyio can handle a lot of files that are SEG-... True True\n", "489 plt.show() False False\n", "552 } False False\n", "459 The quantitative results of PSNR and SSIM in t... 
True False\n", "105 Declarative: React makes it painless to create... True True\n", "21 The pytorch branch contains: False True\n", "557 title={CU-Net: Coupled U-Nets}, False False\n", "165 mplleaflet is a Python library that converts a... True True\n", "696 On Wall Street men and women walk with great p... True False\n", "503 and Andrew Tao and Jan Kautz and Bryan Catanza... False False\n", "377 Users who need an older stable version of PySA... False False\n", "178 exports to common formats (Mapnik XML, PNG…) False True\n", "618 Mr. Driscoll did n't elaborate about who the p... False False\n", "486 Pore Pressure Prediction using well log data True False\n", "659 In its construction spending report , the Comm... False False\n", "83 Add-ons (empymod.scripts): False True\n", "58 Introduction True True\n", "407 strikes = strike + 10 * np.random.randn(num) False False\n", "475 Run python predict_dgf.py -h for more details. False False\n", "658 Terms were n't disclosed *-1 . False False\n", "617 But the growing controversy comes as many prac... False False\n", "438 well.params['horizon'][\"T20\"]) False False\n", "\n", "[175 rows x 3 columns]\n", "[[110 12]\n", " [ 18 35]]\n", "---------------------------------------------------------------------------\n", "Classification Report\n", "\n", " precision recall f1-score support\n", "\n", " False 0.86 0.90 0.88 122\n", " True 0.74 0.66 0.70 53\n", "\n", " accuracy 0.83 175\n", " macro avg 0.80 0.78 0.79 175\n", "weighted avg 0.82 0.83 0.83 175\n", "\n", "null accuracy: 69.71%\n", "accuracy score: 82.86%\n", "model is 13.14% more accurate than null accuracy\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/allen/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", " FutureWarning)\n" ] }, { "data": { "text/plain": [ "(0.6971428571428572, 0.8285714285714286)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report\n", "\n", "\n", "def display_accuracy_score(y_test, y_pred_class):\n", "    \"\"\"Print the accuracy of y_pred_class against y_test as a percentage and return it.\"\"\"\n", "    score = accuracy_score(y_test, y_pred_class)\n", "    print('accuracy score: %s' % '{:.2%}'.format(score))\n", "    return score\n", "\n", "\n", "def display_null_accuracy(y_test):\n", "    \"\"\"Print and return the null accuracy: the share of y_test held by its most frequent label.\"\"\"\n", "    # Series.value_counts replaces the top-level pd.value_counts, which newer pandas deprecates.\n", "    value_counts = pd.Series(y_test).value_counts()\n", "    null_accuracy = value_counts.max() / float(len(y_test))\n", "    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))\n", "    return null_accuracy\n", "\n", "\n", "def display_accuracy_difference(y_test, y_pred_class):\n", "    \"\"\"Print the null accuracy, the model accuracy and the gap between them.\n", "\n", "    Returns a (null_accuracy, model_accuracy) tuple.\n", "    \"\"\"\n", "    null_accuracy = display_null_accuracy(y_test)\n", "    # Named model_accuracy so the imported accuracy_score function is not shadowed.\n", "    model_accuracy = display_accuracy_score(y_test, y_pred_class)\n", "    difference = model_accuracy - null_accuracy\n", "    if difference > 0:\n", "        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(difference))\n", "    elif difference < 0:\n", "        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(difference)))\n", "    else:\n", "        print('model is exactly as accurate as null accuracy')\n", "    return null_accuracy, model_accuracy\n", "\n", "\n", "# Unigram bag-of-words baseline. liblinear was the default solver before scikit-learn 0.22;\n", "# pinning it silences the FutureWarning and keeps the baseline on one solver across versions.\n", "pipeline = make_pipeline(CountVectorizer(), LogisticRegression(solver='liblinear'))\n", "pipeline.fit(X_train, y_train)\n", "y_pred_class = pipeline.predict(X_test)\n", "# Class probabilities, kept for the planned AUROC measure; columns follow pipeline.classes_,\n", "# so y_pred_vals[:, 1] is the probability of description=True.\n", "y_pred_vals = pipeline.predict_proba(X_test)\n", "results_df = pd.DataFrame({\"x_test\": X_test, \"y_TF_pred\": y_pred_class, \"y_actual\": y_test})\n", "print(results_df)\n", "print(confusion_matrix(y_test, y_pred_class))\n", "print('-' * 75 + '\\nClassification Report\\n')\n", "print(classification_report(y_test, y_pred_class))\n", "display_accuracy_difference(y_test, y_pred_class)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "281" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(description_df)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }