{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# OSP Syllabus Classification [work in progress]\n", "\n", "The [Open Syllabus Project](http://opensyllabusproject.org/) has a collection of 1M+ documents to sift through for syllabi.\n", "This is a classifier that predicts whether a document is a syllabus or not. It turns out that roughly half of the documents are syllabi." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from osp.corpus.syllabus import Syllabus\n", "import pandas as pd\n", "import numpy as np\n", "import scipy\n", "import pickle\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "from collections import defaultdict\n", "\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.cross_validation import KFold, cross_val_score\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc\n", "\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.linear_model import LogisticRegression, RandomizedLogisticRegression" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with open('./training_data.p', 'rb') as pf:\n", " training_3 = pickle.load(pf)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [], "source": [ "training_df_3 = pd.DataFrame(training_3).rename(columns={'labels': 'syllabus'})" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [], "source": [ "training_df_1 = pd.read_csv('/home/ubuntu/data/syllabus_tags.csv')\n", "# A second labeled set of 500 documents\n", "training_df_2 = 
pd.read_csv('/home/ubuntu/data/refinement.csv')\n", "\n", "training_df = pd.concat([training_df_1, training_df_2, training_df_3])" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | syllabus | \n", "text | \n", "title | \n", "
---|---|---|---|
0 | \n", "True | \n", "COURSE SYLLABUS\\n\\n\\n\\n\\n\\n \\n \\n \\n \\... | \n", "000/00fca9975d3718169608b3bc642ac | \n", "
1 | \n", "True | \n", "C. Kaminski\\n\\nProphets In-Depth\\n\\nPage 1\\n\\n... | \n", "000/01d6d57c127c431ecc80499e32a5a | \n", "
2 | \n", "False | \n", "Social Welfare Continuing Education Program--R... | \n", "000/035e701b02548d15ed7d041e794c9 | \n", "
3 | \n", "True | \n", "Physics 110A Electricity, Magnetism, and Optic... | \n", "000/03aafca817d8870961a8b6b2fa79d | \n", "
4 | \n", "False | \n", "Help Me Name My Major | Ask Metafilter\\n\\n\\n\\n... | \n", "000/064ad57e4fb95d02e14e12a361531 | \n", "