{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline\n", "\n", "import pandas as pd\n", "import string\n", "import copy\n", "import matplotlib.pyplot as plt\n", "import os\n", "import numpy as np\n", "import scipy\n", "from scipy.stats import pearsonr\n", "from datetime import datetime\n", "import nltk\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn import svm, tree\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import HashingVectorizer\n", "from sklearn.feature_selection import SelectKBest, chi2\n", "from sklearn.linear_model import RidgeClassifier\n", "from sklearn.svm import LinearSVC\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.linear_model import Perceptron\n", "from sklearn.linear_model import PassiveAggressiveClassifier\n", "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.neighbors import NearestCentroid\n", "from sklearn.utils.extmath import density\n", "from sklearn import metrics\n", "\n", "pd.set_option('display.width', 500)\n", "pd.set_option('display.max_columns', 30)\n", "\n", "# set some nicer defaults for matplotlib\n", "from matplotlib import rcParams\n", "\n", "#these colors come from colorbrewer2.org. 
Each is an RGB triplet\n", "dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),\n", " (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),\n", " (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),\n", " (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),\n", " (0.4, 0.6509803921568628, 0.11764705882352941),\n", " (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),\n", " (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),\n", " (0.4, 0.4, 0.4)]\n", "\n", "rcParams['figure.figsize'] = (10, 6)\n", "rcParams['figure.dpi'] = 150\n", "rcParams['axes.color_cycle'] = dark2_colors\n", "rcParams['lines.linewidth'] = 2\n", "rcParams['axes.grid'] = False\n", "rcParams['axes.facecolor'] = 'white'\n", "rcParams['font.size'] = 14\n", "rcParams['patch.edgecolor'] = 'none'\n", "\n", "# Not our code here - credit to the CS109 psets.\n", "def remove_border(axes=None, top=False, right=False, left=True, bottom=True):\n", " \"\"\"\n", " Minimize chartjunk by stripping out unnecessary plot borders and axis ticks\n", " \n", " The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn\n", " \"\"\"\n", " ax = axes or plt.gca()\n", " ax.spines['top'].set_visible(top)\n", " ax.spines['right'].set_visible(right)\n", " ax.spines['left'].set_visible(left)\n", " ax.spines['bottom'].set_visible(bottom)\n", " \n", " #turn off all ticks\n", " ax.yaxis.set_ticks_position('none')\n", " ax.xaxis.set_ticks_position('none')\n", " \n", " #now re-enable visibles\n", " if top:\n", " ax.xaxis.tick_top()\n", " if bottom:\n", " ax.xaxis.tick_bottom()\n", " if left:\n", " ax.yaxis.tick_left()\n", " if right:\n", " ax.yaxis.tick_right()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('Data/full.csv', encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "subs = list(df['subreddit'].unique()) \n", "types = list(df['type'].unique())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "print \"Original size of data set is\", len(df)\n", "df = df.drop_duplicates('id')\n", "print \"Size of data set with only unique posts is\", len(df)\n", "dfmean = np.mean(df['score'])\n", "\n", "df = df.sort('score')\n", "df = df.reset_index(level=0, drop=True)\n", "median = len(df)/2\n", "md = df['score'][median]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Original size of data set is 44261\n", "Size of data set with only unique posts is" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 25992\n" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Stemming the titles**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We want to define a function with which we can stem the titles" ] }, { "cell_type": "code", "collapsed": false, "input": [ "st = nltk.stem.lancaster.LancasterStemmer()\n", "def stem_title(title):\n", " tokens = nltk.word_tokenize(title)\n", " stemmed_tokens = [st.stem(word) for word in tokens]\n", " stemmed_title = \" \".join(stemmed_tokens)\n", " return stemmed_title\n", "stem_title(\"Thinking historically is, first, an attitude acknowledging that every event can be meaningfully \\\n", "understood only in relation to previous 
events, and, second, the methodical application of this attitude, \\\n", "which entails both analyzing events contextually--as having occurred in the midst of pre-existing circumstances--and \\\n", "comprehending them from historical actors.\")" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "'think hist is , first , an attitud acknowledg that every ev can be mean understood on in rel to prevy ev , and , second , the method apply of thi attitud , which entail both analys ev context -- as hav occur in the midst of pre-existing circumst -- and comprehend them from hist act .'" ] } ], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**R squared**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We were trying to find the $r^2$ for the data. This code isn't used anymore." ] }, { "cell_type": "code", "collapsed": false, "input": [ "print len(df)\n", "df = df.drop('type',1)\n", "df = df.drop_duplicates()\n", "print len(df)\n", "df['stems'] = df['title'].map(lambda x: stem_title(x))\n", "print len(df)#dfavgs = [485.13011] *len(df)\n", "\n", "# sum of squared deviations from the median score\n", "sse = 0\n", "dfidlist = list(df.index)\n", "for i in dfidlist:\n", " sse += (df['score'][i]-md)**2\n", "\n", "# sum of squared deviations from the mean score\n", "sst = 0\n", "for i in dfidlist:\n", " sst += (df['score'][i]-dfmean)**2\n", "\n", "# note: the textbook R^2 would be 1 - SSE/SST; with sse and sst defined\n", "# as above, the expression below is not that formula\n", "rsq = 1 - (sst/sse)\n", "print sse\n", "print sst\n", "print rsq\n", "#print pearsonr(dfavgs, df['score'])\n", "\n", "#df['score']" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "25992\n", "25992" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "25992" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "27016697811" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "21238960370.3\n", "0.213858017776\n" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Define Make XY function and test it**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This function uses a `CountVectorizer` to create a bag of words from the titles; we then fit a Multinomial Naive Bayes classifier on it."
] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.naive_bayes import MultinomialNB\n", "\n", "def make_xy(titles, scores, vectorizer=None):\n", " #Set default vectorizer\n", " if not vectorizer:\n", " vectorizer = CountVectorizer(min_df=0.001)\n", " \n", " #Build the vocabulary by fitting the vectorizer to the list of titles\n", " vectorizer.fit(titles) \n", " \n", " #Convert into a bag-of-words and use a sparse array to save memory\n", " x = vectorizer.transform(titles)\n", " x = x.tocsc()\n", " \n", " #save into numpy array, and return everything\n", " y = np.array(scores)\n", "\n", " return x, y, vectorizer\n", "\n", "X,Y,vectorizer = make_xy(list(df['title']), df['score'])\n", "x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", "\n", "# note: the raw scores serve as class labels here, so accuracy is predictably low\n", "clf = MultinomialNB(alpha=50)\n", "clf.fit(x_train, y_train)\n", "print \"Training accuracy is\", clf.score(x_train, y_train)\n", "print \"Test accuracy is\", clf.score(x_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Training accuracy is " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.0800246229609\n", "Test accuracy is " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.0826408125577\n" ] } ], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Test for the best number of bins (we get 2 as the best, btw)**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We try binning the data based on the score of the post and then running the classifier again in order to find the optimal number of bins." ] }, { "cell_type": "code", "collapsed": false, "input": [ "sorteddf = df.sort('score')\n", "sorteddf['category'] = df['score']\n", "size = len(df)\n", "best_test = 0\n", "best_vect = None\n", "best_Ysort = None\n", "best_clf = None\n", "for num in range(2, 11):\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " # assign bin labels 1..num to equal-sized blocks of the sorted scores\n", " for i in range(num):\n", " sorteddf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']), sorteddf['category'])\n", " \n", " x_train3, x_test3, y_train3, y_test3 = train_test_split(Xsort, Ysort, train_size=0.5)\n", " clf3 = MultinomialNB(alpha=50)\n", " clf3.fit(x_train3, y_train3)\n", " train_acc = clf3.score(x_train3, y_train3)\n", " test_acc = clf3.score(x_test3, y_test3)\n", " if best_test < test_acc:\n", " best_test = test_acc\n", " best_vect = copy.deepcopy(vectorizer2)\n", " best_Ysort = copy.deepcopy(Ysort)\n", " best_clf = copy.deepcopy(clf3)\n", " print \"For\", num, \"bins:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For 2 bins:\n", "Training accuracy is 0.617343798092\n", "Test accuracy is 0.595183133272\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 3 bins:\n", "Training accuracy is 0.4905355494\n", "Test accuracy is 0.466143428747\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 4 bins:\n", "Training accuracy is 0.39242843952\n", "Test accuracy is 0.366035703293\n", 
"---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 5 bins:\n", "Training accuracy is 0.32971683595\n", "Test accuracy is 0.293551862111\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 6 bins:\n", "Training accuracy is 0.292859341336\n", "Test accuracy is 0.260772545399\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 7 bins:\n", "Training accuracy is 0.265927977839\n", "Test accuracy is 0.233379501385\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 8 bins:\n", "Training accuracy is 0.243459526008\n", "Test accuracy is 0.212296091105\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 9 bins:\n", "Training accuracy is 0.216528162512\n", "Test accuracy is 0.188904278239\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 10 bins:\n", "Training accuracy is 0.20729455217\n", "Test accuracy is 0.18297937827\n", "---------------------------------\n" ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "** Test with stemming (though we get the same answer)**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We run the same tests using the stemming from earlier." ] }, { "cell_type": "code", "collapsed": false, "input": [ "best_test2 = 0\n", "best_vect2 = None\n", "best_Ysort2 = None\n", "best_clf2 = None\n", "for num in range(2, 11):\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sorteddf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " Xstem, Ystem, vectorizer3 = make_xy(list(sorteddf['stems']), sorteddf['category'])\n", " \n", " x_train4, x_test4, y_train4, y_test4 = train_test_split(Xstem, Ystem, train_size=0.5)\n", " clf4 = MultinomialNB(alpha=1)\n", " clf4.fit(x_train4, y_train4)\n", " train_acc = clf4.score(x_train4, y_train4)\n", " test_acc = clf4.score(x_test4, y_test4)\n", " if best_test < test_acc:\n", " best_test2 = test_acc\n", " best_vect2 = copy.deepcopy(vectorizer3)\n", " best_category2 = copy.deepcopy(sorteddf['category'])\n", " best_clf2 = copy.deepcopy(clf4)\n", " print \"For\", num, \"bins:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For 2 bins:\n", "Training accuracy is 0.659664512158\n", "Test accuracy is 0.601261926747\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 3 bins:\n", "Training accuracy is 0.557863958141\n", "Test accuracy is 0.482686980609\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 4 bins:\n", "Training accuracy is 0.479301323484\n", "Test accuracy is 0.388888888889\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 5 bins:\n", "Training accuracy is 0.441982148353\n", "Test accuracy is 0.326100338566\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 6 bins:\n", "Training accuracy is 
0.411280393967\n", "Test accuracy is 0.287934749154\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 7 bins:\n", "Training accuracy is 0.39296706679\n", "Test accuracy is 0.257463835026\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 8 bins:\n", "Training accuracy is 0.376808248692\n", "Test accuracy is 0.240150815636\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 9 bins:\n", "Training accuracy is 0.359572176054\n", "Test accuracy is 0.210834102801\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 10 bins:\n", "Training accuracy is 0.350954139735\n", "Test accuracy is 0.199753770391\n", "---------------------------------\n" ] } ], "prompt_number": 10 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**N-grams**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now try to run the regression using n_grams and the optimal number of bins." ] }, { "cell_type": "code", "collapsed": false, "input": [ "n_grams = CountVectorizer(ngram_range=[1, 5], analyzer='word')\n", "n_grams.fit(list(sorteddf['title']))\n", "Xngram = n_grams.transform(list(sorteddf['title']))\n", "x_train4, x_test4, y_train4, y_test4 = train_test_split(Xngram, best_Ysort, train_size=0.5)\n", "clf4 = MultinomialNB(alpha=1)\n", "clf4.fit(x_train4, y_train4)\n", "print \"Training accuracy is\", clf4.score(x_train4, y_train4)\n", "print \"Test accuracy is\", clf4.score(x_test4, y_test4)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Training accuracy is 0.876192674669\n", "Test accuracy is 0.614112034472\n" ] } ], "prompt_number": 11 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**TDIDF**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Uses TFIDF instead of the normal vectorizer" ] }, { "cell_type": "code", "collapsed": false, "input": [ "tdidf = TfidfVectorizer(ngram_range=[1, 5], sublinear_tf=True)\n", "tdidf.fit(list(sorteddf['title']))\n", "Xtdidf = tdidf.transform(list(sorteddf['title']))\n", "x_train5, x_test5, y_train5, y_test5 = train_test_split(Xtdidf, best_Ysort, train_size=0.5)\n", "clf5 = MultinomialNB(alpha=1)\n", "clf5.fit(x_train5, y_train5)\n", "print \"Training accuracy is\", clf5.score(x_train5, y_train5)\n", "print \"Test accuracy is\", clf5.score(x_test5, y_test5)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Training accuracy is 0.896968297938\n", "Test accuracy is 0.614112034472\n" ] } ], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Splitting by subreddits and examining title n-grams**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Maybe looking at distinct subreddits improves the model" ] }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_ngrams = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", 
" n_grams.fit(list(sortedsmalldf['title']))\n", " X = n_grams.transform(list(sortedsmalldf['title']))\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = MultinomialNB(alpha=50)\n", " clf.fit(x_train, y_train)\n", " subreddit_ngrams[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.768292682927\n", "Test accuracy is 0.587242026266\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.637393767705\n", "Test accuracy is 0.605288007554\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.714851485149\n", "Test accuracy is 0.546983184965\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.703743315508\n", "Test accuracy is 0.519230769231\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.709800190295\n", "Test accuracy is 0.55946717412\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 0.612244897959\n", "Test accuracy is 0.551020408163\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.755037115589\n", "Test accuracy is 0.522799575822\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.692571428571\n", "Test accuracy is 0.558285714286\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.684656084656\n", "Test accuracy is 0.554497354497\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 0.6119257087\n", "Test accuracy is 0.5390625\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.717659137577\n", "Test accuracy is 0.550308008214\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.673290937997\n", "Test accuracy is 0.544876886418\n", "---------------------------------\n" ] } ], "prompt_number": 13 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Getting the probability of each title being successful (both specific and generic)**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We are calculating the probability of a post being successful for the whole data set 
and for each subreddit, adding it to our dataframe for later use." ] }, { "cell_type": "code", "collapsed": false, "input": [ "gen_probs = []\n", "spec_probs = []\n", "for i in df.index:\n", " title = df.title[i]\n", " subreddit = df.subreddit[i]\n", " clf = subreddit_ngrams[subreddit][0]\n", " n_grams_spec = subreddit_ngrams[subreddit][1]\n", " #prob_gen = clf4.predict_proba(n_grams.transform([title]))[0][1]\n", " prob_spec = clf.predict_proba(n_grams_spec.transform([title]))[0][1]\n", " #gen_probs.append(prob_gen)\n", " spec_probs.append(prob_spec)\n", " \n", "#df['gen_probs'] = gen_probs\n", "df['spec_probs'] = spec_probs\n", "df.to_csv(\"Data/new_full.csv\", index=False, encoding='utf-8')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Prediction Function**\n", "\n", "We fit a simple linear regression of score on the subreddit-specific success probability; predict() then maps a new title to an expected score." ] }, { "cell_type": "code", "collapsed": false, "input": [ "m, b, r, p, stderr = scipy.stats.linregress(np.array(df['spec_probs']), np.array(df['score']))\n", "print m\n", "print b\n", "print r**2\n", "print p\n", "print stderr\n", "\n", "\n", "def predict(title):\n", " # clf and n_grams_spec are leftovers from the probability loop above,\n", " # i.e. the model for the last row's subreddit\n", " x = clf.predict_proba(n_grams_spec.transform([title]))[0][1]\n", " y = m*x + b\n", " return y" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1961.81582836\n", "-517.566973807\n", "0.137196851471\n", "0.0\n", "30.5167923553\n" ] } ], "prompt_number": 31 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Testing the prediction function**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pickle\n", "#for the website " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "tup = (clf, n_grams_spec)\n", "with open('clf.pickle', 'wb') as handle:\n", " pickle.dump(tup, handle)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "print predict(\"If the Big Bang happened 13.7 Billion years ago, how is the edge of the observable universe 16 Billion light years away? 
Did the universe expand faster than the speed of light?\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1131.34965136\n" ] } ], "prompt_number": 32 }, { "cell_type": "markdown", "metadata": {}, "source": [ "**sklearn classification models**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Ridge Classifier on the regular dataset**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# despite the name, subreddit_svm just maps subreddit -> [classifier, vectorizer]\n", "subreddit_svm = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(sortedsmalldf['title']))\n", " X = n_grams.transform(list(sortedsmalldf['title']))\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = RidgeClassifier(tol=1e-2, solver=\"lsqr\")\n", " clf.fit(x_train, y_train)\n", " subreddit_svm[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.992495309568\n", "Test accuracy is 0.602251407129\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.67044381492\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.925742574257\n", "Test accuracy is 0.627101879327\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.974331550802\n", "Test accuracy is 0.544871794872\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.966698382493\n", "Test accuracy is 0.549000951475\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.586734693878\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.985153764581\n", "Test accuracy is 0.519618239661\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.998857142857\n", "Test accuracy is 0.556571428571\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.98835978836\n", "Test accuracy is 0.577777777778\n", "---------------------------------\n", 
"For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 0.99706744868\n", "Test accuracy is 0.595703125\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.992813141684\n", "Test accuracy is 0.517453798768\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.994435612083\n", "Test accuracy is 0.560762509929\n", "---------------------------------\n" ] } ], "prompt_number": 16 }, { "cell_type": "markdown", "metadata": {}, "source": [ "** Now we apply the ridge classifier to the newly constructed alchemy stuff**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_alchemy = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " alch_titles = []\n", " for title in list(sortedsmalldf['title']):\n", " titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]\n", " titles = [lst.replace(')', '') for lst in titles]\n", " titles = [lst.replace('[', '') for lst in titles]\n", " titles = [lst.replace(']', '') for lst in titles]\n", " titles = \"\".join(titles)\n", " titles = \"\".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')\n", " titles = titles.replace(' ', ' ')\n", " titles = titles.split(' ')\n", " alch_titles.append(\" \".join(titles[1:]))\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(alch_titles))\n", " X = n_grams.transform(alch_titles)\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = RidgeClassifier(tol=1e-2, solver=\"lsqr\")\n", " clf.fit(x_train, y_train)\n", " subreddit_alchemy[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.997185741088\n", "Test accuracy is 0.71200750469\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.998111425873\n", "Test accuracy is 0.745986779981\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.99504950495\n", "Test accuracy is 0.824925816024\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.998930481283\n", "Test accuracy is 0.653846153846\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", 
"Training accuracy is 0.997145575642\n", "Test accuracy is 0.649857278782\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 0.998979591837\n", "Test accuracy is 0.630612244898\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.996818663839\n", "Test accuracy is 0.688229056204\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.998857142857\n", "Test accuracy is 0.717142857143\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.996825396825\n", "Test accuracy is 0.668783068783\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.6484375\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.99794661191\n", "Test accuracy is 0.686858316222\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.99920508744\n", "Test accuracy is 0.626687847498\n", "---------------------------------\n" ] } ], "prompt_number": 36 }, { "cell_type": "markdown", "metadata": {}, "source": [ "** Now same as above but with the Perceptron algorithm**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_svm = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(sortedsmalldf['title']))\n", " X = n_grams.transform(list(sortedsmalldf['title']))\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = Perceptron(n_iter=50)\n", " clf.fit(x_train, y_train)\n", " subreddit_svm[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.575984990619\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.996222851747\n", "Test accuracy is 0.690273843248\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.603363006924\n", 
"---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.523504273504\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.989533777355\n", "Test accuracy is 0.562321598478\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.592857142857\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.998939554613\n", "Test accuracy is 0.566277836691\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.581714285714\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.550264550265\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.595703125\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.998973305955\n", "Test accuracy is 0.550308008214\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.550436854647\n", "---------------------------------\n" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_alchemy = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " alch_titles = []\n", " for title in list(sortedsmalldf['title']):\n", " titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]\n", " titles = [lst.replace(')', '') for lst in titles]\n", " titles = [lst.replace('[', '') for lst in titles]\n", " titles = [lst.replace(']', '') for lst in titles]\n", " titles = \"\".join(titles)\n", " titles = \"\".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')\n", " titles = titles.replace(' ', ' ')\n", " titles = titles.split(' ')\n", " alch_titles.append(\" \".join(titles[1:]))\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(alch_titles))\n", " X = n_grams.transform(alch_titles)\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = Perceptron(n_iter=50)\n", " clf.fit(x_train, y_train)\n", " subreddit_alchemy[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training 
accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.996247654784\n", "Test accuracy is 0.711069418386\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.996222851747\n", "Test accuracy is 0.780925401322\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.99504950495\n", "Test accuracy is 0.854599406528\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.995721925134\n", "Test accuracy is 0.662393162393\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.985727878211\n", "Test accuracy is 0.693625118934\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 0.997959183673\n", "Test accuracy is 0.638775510204\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.994697773065\n", "Test accuracy is 0.677624602333\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.999428571429\n", "Test accuracy is 0.717142857143\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.998941798942\n", "Test accuracy is 0.674074074074\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.654296875\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.990759753593\n", "Test accuracy is 0.724845995893\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.998410174881\n", "Test accuracy is 0.636219221604\n", "---------------------------------\n" ] } ], "prompt_number": 37 }, { "cell_type": "markdown", "metadata": {}, "source": [ "** Now the Passive Aggressive Classifier**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_svm = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(sortedsmalldf['title']))\n", " X = n_grams.transform(list(sortedsmalldf['title']))\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " 
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = PassiveAggressiveClassifier(n_iter=50)\n", " clf.fit(x_train, y_train)\n", " subreddit_svm[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.999061913696\n", "Test accuracy is 0.613508442777\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.712936732767\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.99702970297\n", "Test accuracy is 0.612265084075\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.530982905983\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.998097050428\n", "Test accuracy is 0.577545195052\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.571428571429\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.522799575822\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.583428571429\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.57671957672\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.6005859375\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.998973305955\n", "Test accuracy is 0.532854209446\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.561556791104\n", "---------------------------------\n" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_alchemy = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " alch_titles = []\n", " for 
title in list(sortedsmalldf['title']):\n", " titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]\n", " titles = [lst.replace(')', '') for lst in titles]\n", " titles = [lst.replace('[', '') for lst in titles]\n", " titles = [lst.replace(']', '') for lst in titles]\n", " titles = \"\".join(titles)\n", " titles = \"\".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')\n", " titles = titles.replace(' ', ' ')\n", " titles = titles.split(' ')\n", " alch_titles.append(\" \".join(titles[1:]))\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(alch_titles))\n", " X = n_grams.transform(alch_titles)\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = PassiveAggressiveClassifier(n_iter=50)\n", " clf.fit(x_train, y_train)\n", " subreddit_alchemy[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.996247654784\n", "Test accuracy is 0.757973733583\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.99716713881\n", "Test accuracy is 0.807365439093\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.99504950495\n", "Test accuracy is 0.848664688427\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.99679144385\n", "Test accuracy is 0.692307692308\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.996194100856\n", "Test accuracy is 0.742150333016\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 0.997959183673\n", "Test accuracy is 0.65612244898\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.998939554613\n", "Test accuracy is 0.673382820785\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.999428571429\n", "Test accuracy is 0.724\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.998941798942\n", "Test accuracy is 0.67619047619\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 1.0\n", "Test accuracy is 0.6728515625\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.994866529774\n", "Test accuracy is 
0.715605749487\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.99920508744\n", "Test accuracy is 0.669579030977\n", "---------------------------------\n" ] } ], "prompt_number": 38 }, { "cell_type": "markdown", "metadata": {}, "source": [ "** And finally.... K-neighbors... **" ] }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_svm = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(sortedsmalldf['title']))\n", " X = n_grams.transform(list(sortedsmalldf['title']))\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = KNeighborsClassifier(n_neighbors=10)\n", " clf.fit(x_train, y_train)\n", " subreddit_svm[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.68574108818\n", "Test accuracy is 0.540337711069\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.478753541076\n", "Test accuracy is 0.522190745987\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.714851485149\n", "Test accuracy is 0.495548961424\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.626737967914\n", "Test accuracy is 0.520299145299\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.603235014272\n", "Test accuracy is 0.50808753568\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 0.521428571429\n", "Test accuracy is 0.488775510204\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.71474019088\n", "Test accuracy is 0.520678685048\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.553142857143\n", "Test accuracy is 0.504571428571\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.627513227513\n", "Test accuracy is 0.51746031746\n", "---------------------------------\n", 
"For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 0.492668621701\n", "Test accuracy is 0.5107421875\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.519507186858\n", "Test accuracy is 0.525667351129\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.523847376789\n", "Test accuracy is 0.513105639396\n", "---------------------------------\n" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "subreddit_alchemy = {}\n", "for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " alch_titles = []\n", " for title in list(sortedsmalldf['title']):\n", " titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]\n", " titles = [lst.replace(')', '') for lst in titles]\n", " titles = [lst.replace('[', '') for lst in titles]\n", " titles = [lst.replace(']', '') for lst in titles]\n", " titles = \"\".join(titles)\n", " titles = \"\".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')\n", " titles = titles.replace(' ', ' ')\n", " titles = titles.split(' ')\n", " alch_titles.append(\" \".join(titles[1:]))\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " n_grams.fit(list(alch_titles))\n", " X = n_grams.transform(alch_titles)\n", " Y = np.array(sortedsmalldf['category'])\n", " \n", " x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", " clf = KNeighborsClassifier(n_neighbors=10)\n", " clf.fit(x_train, y_train)\n", " subreddit_alchemy[subreddit] = [clf, n_grams]\n", " train_acc = clf.score(x_train, y_train)\n", " test_acc = clf.score(x_test, y_test)\n", " print \"For\", subreddit, \"subreddit:\"\n", " print \"Training accuracy is\", train_acc\n", " print \"Test accuracy is\", test_acc\n", " print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For atheism subreddit:\n", "Training accuracy is 0.506566604128\n", "Test accuracy is 0.496247654784\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " politics subreddit:\n", "Training accuracy is 0.503305004721\n", "Test accuracy is 0.496694995279\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " nosleep subreddit:\n", "Training accuracy is 0.619801980198\n", "Test accuracy is 0.525222551929\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " pettyrevenge subreddit:\n", "Training accuracy is 0.500534759358\n", "Test accuracy is 0.498931623932\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " jokes subreddit:\n", "Training accuracy is 0.492863939106\n", "Test accuracy is 0.507136060894\n", "---------------------------------\n", "For" ] }, { "output_type": 
"stream", "stream": "stdout", "text": [ " askhistorians subreddit:\n", "Training accuracy is 0.504081632653\n", "Test accuracy is 0.495918367347\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " TalesFromTechsupport subreddit:\n", "Training accuracy is 0.496288441145\n", "Test accuracy is 0.503711558855\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " AskReddit subreddit:\n", "Training accuracy is 0.500571428571\n", "Test accuracy is 0.499428571429\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " talesFromRetail subreddit:\n", "Training accuracy is 0.502645502646\n", "Test accuracy is 0.497354497354\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " askscience subreddit:\n", "Training accuracy is 0.502443792766\n", "Test accuracy is 0.498046875\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " tifu subreddit:\n", "Training accuracy is 0.496919917864\n", "Test accuracy is 0.504106776181\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " explainlikeimfive subreddit:\n", "Training accuracy is 0.515103338633\n", "Test accuracy is 0.486894360604\n", "---------------------------------\n" ] } ], "prompt_number": 39 }, { "cell_type": "markdown", "metadata": {}, "source": [ "** NOw putting it all together so we don't have to keep r-running the same code all the time...**" ] }, { "cell_type": "code", "collapsed": false, "input": [ "for i, d in enumerate(['Not alchemy', 'Alchemy']):\n", " for clf, name in (\n", " (RidgeClassifier(tol=1e-2, solver=\"lsqr\"), \"Ridge Classifier\"),\n", " (Perceptron(n_iter=50), \"Perceptron\"),\n", " (PassiveAggressiveClassifier(n_iter=50), \"Passive-Aggressive\"),\n", " (KNeighborsClassifier(n_neighbors=10), \"kNN\")):\n", " subreddit_svm = {}\n", " for subreddit in subs: \n", " smalldf = df[df['subreddit'] == subreddit]\n", " sortedsmalldf = smalldf.sort('score')\n", " sortedsmalldf['category'] = smalldf['score']\n", " size = len(smalldf)\n", " num = 2\n", " blocksize = size/num\n", " blocks = [blocksize * i for i in range(num)]\n", " blocks.append(size)\n", " for i in range(num):\n", " sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1\n", " \n", " titles = list(sortedsmalldf['title'])\n", " bins = list(sortedsmalldf['category'])\n", " if (i==1):\n", " alch_titles = []\n", " for title in list(sortedsmalldf['title']):\n", " titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]\n", " titles = [lst.replace(')', '') for lst in titles]\n", " titles = [lst.replace('[', '') for lst in titles]\n", " titles = [lst.replace(']', '') for lst in titles]\n", " titles = \"\".join(titles)\n", " titles = \"\".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')\n", " titles = titles.replace(' ', ' ')\n", " titles = titles.split(' ')[1:]\n", " alch_titles.append(titles) \n", " alch_bins = []\n", " categories = np.array(sortedsmalldf['category'])\n", " for i, lst in enumerate(alch_titles):\n", " b = categories[i]\n", " for j in range(len(lst)):\n", " alch_bins.append(b) \n", " alch_titles = [word for words in alch_titles for word in words]\n", " titles = alch_titles\n", " bins = alch_bins\n", " \n", " n_grams = CountVectorizer(ngram_range=[1, 3])\n", " 
"            n_grams.fit(titles)\n", "            X = n_grams.transform(titles)\n", "            Y = np.array(bins)\n", "\n", "            x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)\n", "            # clone() gives each subreddit its own fitted copy; plain assignment\n", "            # would leave every subreddit_svm entry pointing at the same\n", "            # (last-fitted) classifier object\n", "            clf2 = clone(clf)\n", "            clf2.fit(x_train, y_train)\n", "            subreddit_svm[subreddit] = [clf2, n_grams]\n", "            train_acc = clf2.score(x_train, y_train)\n", "            test_acc = clf2.score(x_test, y_test)\n", "            print \"For\", d, \"and\", subreddit, \"subreddit and\", name, \"classifier:\"\n", "            print \"Training accuracy is\", train_acc\n", "            print \"Test accuracy is\", test_acc\n", "            print \"---------------------------------\"" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "For Not alchemy and atheism subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.781820835532\n", "Test accuracy is 0.709438749733\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and politics subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.829282780411\n", "Test accuracy is 0.811248025276\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and nosleep subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.734472447672\n", "Test accuracy is 0.65041733945\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and pettyrevenge subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.714860702307\n", "Test accuracy is 0.613887139611\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and jokes subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.859851018196\n", "Test accuracy is 0.801388344365\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and askhistorians subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.761180962572\n", "Test accuracy is 0.667481365289\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and TalesFromTechsupport subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.687174322372\n", "Test accuracy is 0.578304943732\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and AskReddit subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.773876452466\n", "Test accuracy is 0.705906864587\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and talesFromRetail subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.670503987414\n", "Test accuracy is 0.565479677972\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and askscience subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.751232014912\n", "Test accuracy is 0.668620026796\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and tifu subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.735181174345\n", "Test accuracy is 0.648525112942\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": 
"stdout", "text": [ " Not alchemy and explainlikeimfive subreddit and Ridge Classifier classifier:\n", "Training accuracy is 0.772828252276\n", "Test accuracy is 0.701780369007\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and atheism subreddit and Perceptron classifier:\n", "Training accuracy is 0.73519406303\n", "Test accuracy is 0.671659050848\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and politics subreddit and Perceptron classifier:\n", "Training accuracy is 0.813952606635\n", "Test accuracy is 0.773219589258\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and nosleep subreddit and Perceptron classifier:\n", "Training accuracy is 0.669542930372\n", "Test accuracy is 0.607008910646\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and pettyrevenge subreddit and Perceptron classifier:\n", "Training accuracy is 0.52497775344\n", "Test accuracy is 0.449702926923\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and jokes subreddit and Perceptron classifier:\n", "Training accuracy is 0.822167385439\n", "Test accuracy is 0.757547104541\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and askhistorians subreddit and Perceptron classifier:\n", "Training accuracy is 0.528490478854\n", "Test accuracy is 0.437286445604\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and TalesFromTechsupport subreddit and Perceptron classifier:\n", "Training accuracy is 0.63752900994\n", "Test accuracy is 0.569218811578\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and AskReddit subreddit and Perceptron classifier:\n", "Training accuracy is 0.723346699547\n", "Test accuracy is 0.664537636025\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and talesFromRetail subreddit and Perceptron classifier:\n", "Training accuracy is 0.616372809635\n", "Test accuracy is 0.551993142808\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and askscience subreddit and Perceptron classifier:\n", "Training accuracy is 0.707566843362\n", "Test accuracy is 0.642290440962\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and tifu subreddit and Perceptron classifier:\n", "Training accuracy is 0.693020103908\n", "Test accuracy is 0.622800956683\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and explainlikeimfive subreddit and Perceptron classifier:\n", "Training accuracy is 0.528207428113\n", "Test accuracy is 0.445582452883\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and atheism subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.744467772177\n", "Test accuracy is 0.666550651742\n", "---------------------------------\n", "For" ] }, 
{ "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and politics subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.828044233807\n", "Test accuracy is 0.781838862559\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and nosleep subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.703494233234\n", "Test accuracy is 0.626171497894\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and pettyrevenge subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.672160996646\n", "Test accuracy is 0.574035539249\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and jokes subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.820875902309\n", "Test accuracy is 0.745370263601\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and askhistorians subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.71066903378\n", "Test accuracy is 0.61296951568\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and TalesFromTechsupport subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.641864080221\n", "Test accuracy is 0.546142225336\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and AskReddit subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.748901898367\n", "Test accuracy is 0.686820731066\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and talesFromRetail subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.638821678511\n", "Test accuracy is 0.553305991363\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and askscience subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.725822799557\n", "Test accuracy is 0.650620376303\n", "---------------------------------\n", "For" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and tifu subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.712233752774\n", "Test accuracy is 0.630826468243\n", "---------------------------------\n", "For" ] }, { "ename": "MemoryError", "evalue": "", "output_type": "pyerr", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mMemoryError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 51\u001b[0m \u001b[0mclf2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 52\u001b[0m \u001b[0msubreddit_svm\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0msubreddit\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mclf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_grams\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 53\u001b[1;33m \u001b[0mtrain_acc\u001b[0m 
\u001b[1;33m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 54\u001b[0m \u001b[0mtest_acc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[1;34m\"For\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0md\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"and\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msubreddit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"subreddit and\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"classifier:\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\base.pyc\u001b[0m in \u001b[0;36mscore\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 292\u001b[0m \"\"\"\n\u001b[0;32m 293\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 294\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 295\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 296\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\neighbors\\classification.pyc\u001b[0m in \u001b[0;36mpredict\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 144\u001b[0m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0matleast2d_or_csr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 145\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 146\u001b[1;33m \u001b[0mneigh_dist\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mneigh_ind\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkneighbors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 147\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 148\u001b[0m \u001b[0mclasses_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclasses_\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\neighbors\\base.pyc\u001b[0m in \u001b[0;36mkneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m 292\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meffective_metric_\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'euclidean'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 293\u001b[0m dist = pairwise_distances(X, self._fit_X, 'euclidean',\n\u001b[1;32m--> 294\u001b[1;33m squared=True)\n\u001b[0m\u001b[0;32m 295\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 296\u001b[0m dist = pairwise_distances(X, self._fit_X,\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\metrics\\pairwise.pyc\u001b[0m in 
\u001b[0;36mpairwise_distances\u001b[1;34m(X, Y, metric, n_jobs, **kwds)\u001b[0m\n\u001b[0;32m 655\u001b[0m \u001b[0mfunc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mPAIRWISE_DISTANCE_FUNCTIONS\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmetric\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 656\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mn_jobs\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 657\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 658\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 659\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_parallel_pairwise\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\metrics\\pairwise.pyc\u001b[0m in \u001b[0;36meuclidean_distances\u001b[1;34m(X, Y, Y_norm_squared, squared)\u001b[0m\n\u001b[0;32m 174\u001b[0m \"Incompatible dimensions for Y and Y_norm_squared\")\n\u001b[0;32m 175\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 176\u001b[1;33m \u001b[0mdistances\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msafe_sparse_dot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdense_output\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 177\u001b[0m \u001b[0mdistances\u001b[0m \u001b[1;33m*=\u001b[0m \u001b[1;33m-\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 178\u001b[0m \u001b[0mdistances\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mXX\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\sklearn\\utils\\extmath.pyc\u001b[0m in \u001b[0;36msafe_sparse_dot\u001b[1;34m(a, b, dense_output)\u001b[0m\n\u001b[0;32m 78\u001b[0m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0ma\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 79\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdense_output\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mret\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"toarray\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 80\u001b[1;33m \u001b[0mret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mret\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 81\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mret\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 82\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\scipy\\sparse\\compressed.pyc\u001b[0m in \u001b[0;36mtoarray\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m 559\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 560\u001b[0m \u001b[1;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 561\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtocoo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 562\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 563\u001b[0m \u001b[1;31m##############################################################\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\scipy\\sparse\\coo.pyc\u001b[0m in \u001b[0;36mtoarray\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m 236\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 237\u001b[0m \u001b[1;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 238\u001b[1;33m \u001b[0mB\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_process_toarray_args\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 239\u001b[0m \u001b[0mfortran\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf_contiguous\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 240\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfortran\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mc_contiguous\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\scipy\\sparse\\base.pyc\u001b[0m in \u001b[0;36m_process_toarray_args\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m 633\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 634\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 635\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 636\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 637\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mMemoryError\u001b[0m: " ] }, { "output_type": "stream", "stream": "stdout", "text": [ " Not alchemy and explainlikeimfive subreddit and Passive-Aggressive classifier:\n", "Training accuracy is 0.69543904296\n", "Test accuracy is 0.618268859709\n", "---------------------------------\n" ] } ], "prompt_number": 42 },
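{ "cell_type": "markdown", "metadata": {}, "source": [ "The kNN run above dies with a MemoryError because scoring a KNeighborsClassifier materializes a dense test-by-train distance matrix (the np.zeros call at the bottom of the traceback), and the Alchemy pass makes this worse by turning every keyword into its own sample. One possible workaround, sketched below but not re-run here, is to score in batches so only a slice of that matrix is allocated at a time; batched_accuracy is our own helper name and batch=1000 an arbitrary choice." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Sketch: compute accuracy in batches so kNN never has to build the full\n", "# test-by-train distance matrix in a single allocation. batch=1000 is arbitrary.\n", "def batched_accuracy(clf, X, y, batch=1000):\n", "    correct = 0\n", "    for start in range(0, X.shape[0], batch):\n", "        # predicting a row slice keeps the distance matrix at batch x n_train\n", "        preds = clf.predict(X[start:start + batch])\n", "        correct += np.sum(preds == y[start:start + batch])\n", "    return correct / float(X.shape[0])\n", "\n", "# Would drop in for the score() calls in the loop above, e.g.:\n", "# train_acc = batched_accuracy(clf2, x_train, y_train)\n", "# test_acc = batched_accuracy(clf2, x_test, y_test)" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }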