{ "metadata": { "name": "", "signature": "sha256:00b7eeda973657bc1c79aa31766b85a7c7ace04f307af735172709a31b554418" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "NLP and Sentiment Analysis" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import os\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn import cross_validation\n", "import pandas as pd\n", "import numpy as np\n", "import re\n", "import nltk\n", "\n", "from bs4 import BeautifulSoup\n", "from nltk.corpus import stopwords\n", "\n", "#Class defined for processing raw HTML Text\n", "class KaggleWord2VecUtility(object):\n", " \"\"\"KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning\"\"\"\n", "\n", " @staticmethod\n", " def review_to_wordlist( review, remove_stopwords=False ):\n", " # Function to convert a document to a sequence of words,\n", " # optionally removing stop words. Returns a list of words.\n", " #\n", " # 1. Remove HTML\n", " review_text = BeautifulSoup(review).get_text()\n", " #\n", " # 2. Remove non-letters\n", " review_text = re.sub(\"[^a-zA-Z]\",\" \", review_text)\n", " #\n", " # 3. Convert words to lower case and split them\n", " words = review_text.lower().split()\n", " #\n", " # 4. Optionally remove stop words (false by default)\n", " if remove_stopwords:\n", " stops = set(stopwords.words(\"english\"))\n", " words = [w for w in words if not w in stops]\n", " #\n", " # 5. Return a list of words\n", " return(words)\n", "\n", " # Define a function to split a review into parsed sentences\n", " @staticmethod\n", " def review_to_sentences( review, tokenizer, remove_stopwords=False ):\n", " # Function to split a review into parsed sentences. Returns a\n", " # list of sentences, where each sentence is a list of words\n", " #\n", " # 1. 
Use the NLTK tokenizer to split the paragraph into sentences\n", " raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())\n", " #\n", " # 2. Loop over each sentence\n", " sentences = []\n", " for raw_sentence in raw_sentences:\n", " # If a sentence is empty, skip it\n", " if len(raw_sentence) > 0:\n", " # Otherwise, call review_to_wordlist to get a list of words\n", " sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \\\n", " remove_stopwords ))\n", " #\n", " # Return the list of sentences (each sentence is a list of words,\n", " # so this returns a list of lists\n", " return sentences\n", "\n", "def process():\n", " \n", " train = pd.read_csv(\"/Users/taposh/workspace/kaggle/bow/labeledTrainData.tsv\", header=0, \\\n", " delimiter=\"\\t\", quoting=3)\n", " test = pd.read_csv(\"/Users/taposh/workspace/kaggle/sam/test.tsv\", header=0, delimiter=\"\\t\", \\\n", " quoting=3 ) \n", "\n", " y = train[\"Sentiment\"] \n", " print(\"Cleaning and parsing movie reviews...\\n\") \n", " traindata = []\n", " for i in range( 0, len(train[\"Phrase\"])):\n", " traindata.append(\" \".join(KaggleWord2VecUtility.review_to_wordlist(train[\"Phrase\"][i], False)))\n", " testdata = []\n", " for i in range(0,len(test[\"Phrase\"])):\n", " testdata.append(\" \".join(KaggleWord2VecUtility.review_to_wordlist(test[\"Phrase\"][i], False)))\n", " print ('vectorizing... ',) 
# Build the TF-IDF vectorizer: word-level unigrams and bigrams, English stop
# words removed, min_df=3 as the document-frequency cut-off.
# The idf/tf switches are passed as real booleans: scikit-learn documents
# them as bool, and recent releases reject integer stand-ins such as 1.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english'
)

# X_all concatenates the training phrases followed by the test phrases;
# lentrain records where the training rows end inside X_all.
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
tfv.fit(X_all) from sklearn.tree import DecisionTreeRegressor

# Depth-2 regression tree trained on X / y. fit() returns the estimator
# itself, so construction and training are chained into one statement.
# NOTE(review): X.toarray() presumably densifies a sparse matrix - confirm it fits in memory.
clf_1 = DecisionTreeRegressor(max_depth=2).fit(X.toarray(), y)
#clf_2 = DecisionTreeRegressor(max_depth=5)
#clf_2.fit(X.toarray(), y) plt.title("Decision Tree Regression")
plt.legend()
plt.show() 1 samples, but y has 156060.", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtrain_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_index\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_index\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"TRAIN:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"TEST:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/taposh/anaconda/lib/python3.4/site-packages/sklearn/svm/base.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 685\u001b[0m raise ValueError(\"X and y have incompatible shapes.\\n\"\n\u001b[1;32m 686\u001b[0m \u001b[0;34m\"X has %s samples, but y has %s.\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m (X.shape[0], y_ind.shape[0]))\n\u001b[0m\u001b[1;32m print("Retrain on all training data, predicting 
test labels...\n")
# Refit the already-constructed `model` on X / y, then predict on X_test and
# keep the predictions in `result` for the export and summary cells below.
# NOTE(review): model, X, y and X_test are defined in cells not visible here -
# confirm they are the full training matrix, its labels and the test matrix.
model.fit(X,y)
# Alternative left by the author: take column 1 of predict_proba() instead of predict().
#result = model.predict_proba(X_test)[:,1]
result = model.predict(X_test)
print(max(result)) output = pd.DataFrame( data={"PhraseId":test["PhraseId"], "Sentiment":result} )

import csv

# Write the `output` frame as a comma-separated submission file.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
submission_path = '/Users/taposh/workspace/kaggle/sam/submission.csv'

# QUOTE_NONE keeps every field unquoted. The escape character must not be the
# delimiter itself: with escapechar="," a field containing a comma would be
# written as ",," and read back as an extra empty column.
output.to_csv(
    submission_path,
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
    index=False,
    encoding='utf-8'
)

# Report the path that was actually written (the previous message named a
# different file, submission_sam.csv).
print("Wrote results to " + submission_path)

# Summary statistics of the predictions, one value per line: max, min, mean.
for summarize in (max, min, np.mean):
    print(summarize(result))

print(np.median(result)) "print(min(result))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.00641866008008\n" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "print(np.mean(result))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.151920802397\n" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "print(np.median(result))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.112866573887\n" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }