# NLP and Sentiment Analysis

```python
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


class KaggleWord2VecUtility(object):
    """Utility class for processing raw HTML text into segments for further learning."""

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        """Convert a document to a sequence of words, optionally removing
        stop words. Returns a list of words."""
        # 1. Remove HTML
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        # 4. Optionally remove stop words (off by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        # 5. Return a list of words
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into parsed sentences. Returns a list of
        sentences, where each sentence is a list of words."""
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        #    (under Python 3 the review is already a str, so no decode() is needed)
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. Loop over each sentence, skipping empty ones
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(
                    KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
        # 3. Return the list of sentences (each sentence is a list of words,
        #    so this returns a list of lists)
        return sentences


# Load the labeled training phrases and the unlabeled test phrases
train = pd.read_csv("/Users/taposh/workspace/kaggle/bow/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/Users/taposh/workspace/kaggle/sam/test.tsv",
                   header=0, delimiter="\t", quoting=3)

y = train["Sentiment"]
print("Cleaning and parsing movie reviews...\n")
traindata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(phrase, False))
             for phrase in train["Phrase"]]
testdata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(phrase, False))
            for phrase in test["Phrase"]]

print("vectorizing...")
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents="unicode", analyzer="word",
                      token_pattern=r"\w{1,}", ngram_range=(1, 2),
                      use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words="english")
# Fit the vectorizer on train + test together so both share one vocabulary
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline...")
tfv.fit(X_all)
```
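As a quick sanity check before processing the full corpus, `review_to_wordlist` can be exercised on a toy snippet; a minimal sketch, where the sample string and the outputs shown in comments are illustrative, not from the dataset:

```python
sample = "<b>This movie was great!</b> I'd watch it again."
print(KaggleWord2VecUtility.review_to_wordlist(sample))
# expected: ['this', 'movie', 'was', 'great', 'i', 'd', 'watch', 'it', 'again']
print(KaggleWord2VecUtility.review_to_wordlist(sample, remove_stopwords=True))
# expected (NLTK's English stop list includes 'd', 'it', 'again', ...):
# ['movie', 'great', 'watch']
```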
```python
X_all = tfv.transform(X_all)
X = X_all[:lentrain]       # tf-idf features for the training phrases
X_test = X_all[lentrain:]  # tf-idf features for the test phrases
model = LogisticRegression(penalty="l2", dual=True, tol=1e-7,
                           C=1, fit_intercept=True, intercept_scaling=1.0,
                           class_weight=None, random_state=None)
```

```python
from sklearn.tree import DecisionTreeRegressor

# Fit a shallow tree on the 0-4 sentiment labels treated as a regression target.
# Recent scikit-learn trees accept sparse input directly; calling X.toarray()
# on the full tf-idf matrix would exhaust memory.
clf_1 = DecisionTreeRegressor(max_depth=2)
clf_1.fit(X, y)
```

```python
# Predict sentiment for the held-out phrases with the shallow tree
y_1 = clf_1.predict(X_test)

# Plot the results
import matplotlib.pyplot as plt

# A 1-D scatter of X against y is not meaningful for a sparse,
# high-dimensional tf-idf matrix; plot the distribution of the
# predicted sentiment values instead.
plt.figure()
plt.hist(y_1, bins=20, color="g", label="max_depth=2")
plt.xlabel("predicted sentiment")
plt.ylabel("count")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
```
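The original also sketched a second, deeper tree (`clf_2`, commented out). A minimal sketch of bringing it back and comparing the two fits; the training-set MSE report is an addition for illustration, not part of the recorded run:

```python
from sklearn.metrics import mean_squared_error

clf_2 = DecisionTreeRegressor(max_depth=5)
clf_2.fit(X, y)

# In-sample comparison only: no labels exist for X_test
for name, clf in [("max_depth=2", clf_1), ("max_depth=5", clf_2)]:
    print(name, "train MSE: %.4f" % mean_squared_error(y, clf.predict(X)))
```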
```python
# 2-fold cross-validation over the rows of the training matrix.
# KFold's first argument is the number of samples, and each fold yields
# index arrays that select rows of X and y for fitting and prediction.
kf = cross_validation.KFold(X.shape[0], n_folds=2)
print(len(kf))
print(kf)
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    model.fit(X[train_index], y[train_index])
    result = model.predict(X[test_index])
```
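The commented-out 20-fold scoring idea from the original can be realized with `cross_val_score`. Accuracy is used here because plain ROC AUC is not defined for the five-class `Sentiment` target; a minimal sketch against the old `cross_validation` module interface:

```python
# 20-fold CV accuracy for the logistic regression model
scores = cross_validation.cross_val_score(model, X, y, cv=20, scoring="accuracy")
print("20 Fold CV Score: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
```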
```python
print("Retrain on all training data, predicting test labels...\n")
model.fit(X, y)
#result = model.predict_proba(X_test)[:,1]
result = model.predict(X_test)
print(max(result))
```

```
Retrain on all training data, predicting test labels...

4
```

```python
# Use pandas to write the comma-separated output file
output = pd.DataFrame(data={"PhraseId": test["PhraseId"], "Sentiment": result})
output.to_csv("/Users/taposh/workspace/kaggle/sam/submission.csv",
              index=False, quoting=3, escapechar="\\", encoding="utf-8")
print("Wrote results to submission.csv")
```

The recorded values below come from a run that used the commented-out `predict_proba` line, which returns class probabilities rather than 0-4 labels; that is why they are fractional.

```python
print(max(result))
```

```
0.793510629359
```

```python
print(min(result))
```

```
0.00641866008008
```

```python
print(np.mean(result))
```

```
0.151920802397
```

```python
print(np.median(result))
```

```
0.112866573887
```
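As a last sanity check, the submission file can be read back to confirm its shape and the balance of predicted classes; a minimal sketch (the counts it prints are not recorded in the original run):

```python
check = pd.read_csv("/Users/taposh/workspace/kaggle/sam/submission.csv")
print(check.shape)                        # expect (number of test phrases, 2)
print(check["Sentiment"].value_counts())  # distribution of predicted labels
```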