# NLP and Sentiment Analysis

```python
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


class KaggleWord2VecUtility(object):
    """Utility class for processing raw HTML text into segments for further learning."""

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        """Convert a document to a sequence of words, optionally removing
        stop words. Returns a list of words."""
        # 1. Remove HTML
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        # 4. Optionally remove stop words (off by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        # 5. Return a list of words
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        """Split a review into parsed sentences. Returns a list of
        sentences, where each sentence is a list of words."""
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        #    (under Python 3 the review is already a str, so no decode() is needed)
        raw_sentences = tokenizer.tokenize(review.strip())
        # 2. Loop over each sentence, skipping empty ones
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(
                    KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
        # 3. Return the list of sentences (each sentence is a list of words,
        #    so this returns a list of lists)
        return sentences


# Load the labeled training phrases and the unlabeled test phrases
train = pd.read_csv("/Users/taposh/workspace/kaggle/bow/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/Users/taposh/workspace/kaggle/sam/test.tsv",
                   header=0, delimiter="\t", quoting=3)

y = train["Sentiment"]
print("Cleaning and parsing movie reviews...\n")
traindata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(phrase, False))
             for phrase in train["Phrase"]]
testdata = [" ".join(KaggleWord2VecUtility.review_to_wordlist(phrase, False))
            for phrase in test["Phrase"]]

print("vectorizing...")
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents="unicode", analyzer="word",
                      token_pattern=r"\w{1,}", ngram_range=(1, 2),
                      use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words="english")
# Fit the vectorizer on train + test together so both share one vocabulary
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline...")
tfv.fit(X_all)
```
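As a quick sanity check before processing the full corpus, `review_to_wordlist` can be exercised on a toy snippet; a minimal sketch, where the sample string and the outputs shown in comments are illustrative, not from the dataset:

```python
sample = "<b>This movie was great!</b> I'd watch it again."
print(KaggleWord2VecUtility.review_to_wordlist(sample))
# expected: ['this', 'movie', 'was', 'great', 'i', 'd', 'watch', 'it', 'again']
print(KaggleWord2VecUtility.review_to_wordlist(sample, remove_stopwords=True))
# expected (NLTK's English stop list includes 'd', 'it', 'again', ...):
# ['movie', 'great', 'watch']
```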
```python
X_all = tfv.transform(X_all)
X = X_all[:lentrain]       # tf-idf features for the training phrases
X_test = X_all[lentrain:]  # tf-idf features for the test phrases
model = LogisticRegression(penalty="l2", dual=True, tol=1e-7,
                           C=1, fit_intercept=True, intercept_scaling=1.0,
                           class_weight=None, random_state=None)
```

```python
from sklearn.tree import DecisionTreeRegressor

# Fit a shallow tree on the 0-4 sentiment labels treated as a regression target.
# Recent scikit-learn trees accept sparse input directly; calling X.toarray()
# on the full tf-idf matrix would exhaust memory.
clf_1 = DecisionTreeRegressor(max_depth=2)
clf_1.fit(X, y)
```

```python
# Predict sentiment for the held-out phrases with the shallow tree
y_1 = clf_1.predict(X_test)

# Plot the results
import matplotlib.pyplot as plt

# A 1-D scatter of X against y is not meaningful for a sparse,
# high-dimensional tf-idf matrix; plot the distribution of the
# predicted sentiment values instead.
plt.figure()
plt.hist(y_1, bins=20, color="g", label="max_depth=2")
plt.xlabel("predicted sentiment")
plt.ylabel("count")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
```
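The original also sketched a second, deeper tree (`clf_2`, commented out). A minimal sketch of bringing it back and comparing the two fits; the training-set MSE report is an addition for illustration, not part of the recorded run:

```python
from sklearn.metrics import mean_squared_error

clf_2 = DecisionTreeRegressor(max_depth=5)
clf_2.fit(X, y)

# In-sample comparison only: no labels exist for X_test
for name, clf in [("max_depth=2", clf_1), ("max_depth=5", clf_2)]:
    print(name, "train MSE: %.4f" % mean_squared_error(y, clf.predict(X)))
```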
```python
# 2-fold cross-validation over the rows of the training matrix.
# KFold's first argument is the number of samples, and each fold yields
# index arrays that select rows of X and y for fitting and prediction.
kf = cross_validation.KFold(X.shape[0], n_folds=2)
print(len(kf))
print(kf)
for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    model.fit(X[train_index], y[train_index])
    result = model.predict(X[test_index])
```
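The commented-out 20-fold scoring idea from the original can be realized with `cross_val_score`. Accuracy is used here because plain ROC AUC is not defined for the five-class `Sentiment` target; a minimal sketch against the old `cross_validation` module interface:

```python
# 20-fold CV accuracy for the logistic regression model
scores = cross_validation.cross_val_score(model, X, y, cv=20, scoring="accuracy")
print("20 Fold CV Score: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
```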
```python
print("Retrain on all training data, predicting test labels...\n")
model.fit(X, y)
#result = model.predict_proba(X_test)[:,1]
result = model.predict(X_test)
print(max(result))
```

```
Retrain on all training data, predicting test labels...

4
```

```python
# Use pandas to write the comma-separated output file
output = pd.DataFrame(data={"PhraseId": test["PhraseId"], "Sentiment": result})
output.to_csv("/Users/taposh/workspace/kaggle/sam/submission.csv",
              index=False, quoting=3, escapechar="\\", encoding="utf-8")
print("Wrote results to submission.csv")
```

The recorded values below come from a run that used the commented-out `predict_proba` line, which returns class probabilities rather than 0-4 labels; that is why they are fractional.

```python
print(max(result))
```

```
0.793510629359
```

```python
print(min(result))
```

```
0.00641866008008
```

```python
print(np.mean(result))
```

```
0.151920802397
```

```python
print(np.median(result))
```

```
0.112866573887
```
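As a last sanity check, the submission file can be read back to confirm its shape and the balance of predicted classes; a minimal sketch (the counts it prints are not recorded in the original run):

```python
check = pd.read_csv("/Users/taposh/workspace/kaggle/sam/submission.csv")
print(check.shape)                        # expect (number of test phrases, 2)
print(check["Sentiment"].value_counts())  # distribution of predicted labels
```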