{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Things to try:\n", "1. Hasher + Idf transform v.s. Tfidf transform v.s. Hasher\n", "2. TruncatedSVD" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.decomposition import TruncatedSVD\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import HashingVectorizer\n", "from sklearn.feature_extraction.text import TfidfTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import Normalizer\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn import metrics\n", "\n", "from sklearn.cluster import MiniBatchKMeans\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "news_data = fetch_20newsgroups(shuffle=True, random_state = 0)\n", "texts = news_data.data\n", "labels = news_data.target\n", "print len(texts), np.unique(labels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "11314 [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "## different vectorizer\n", "\n", "## hasher alone - better perform l2 normalization for kmeans\n", "hasher = HashingVectorizer(stop_words='english', n_features = 10000,\n", " non_negative=False,\n", " norm = 'l2', binary = False)\n", "\n", "## since tfidf will do the normalization, so \n", "## the previous hasher step only need to extract the counts of words\n", "idf_hasher = Pipeline(steps = [\n", " ('hasher', HashingVectorizer(stop_words='english', \n", " n_features = 10000,\n", " non_negative=True,\n", " norm = None,\n", " binary = False)),\n", " ('tf_idf', TfidfTransformer())\n", "])\n", "\n", "## use tfidf vectorization directly\n", "tfidf = TfidfVectorizer(max_df=0.5, stop_words='english', \n", " use_idf = True, norm = 'l2')\n", "\n", "vectorizers = {'hasher': hasher, \n", " 'idf_hasher': idf_hasher, \n", " 'tfidf': tfidf}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "## LSA dimensionality reduction - after vectorization\n", "\n", "## vectorizer results are normalized, which makes kmeans behave\n", "## as expected. 
{ "cell_type": "code", "collapsed": false, "input": [
 "## LSA dimensionality reduction - applied after vectorization\n",
 "\n",
 "## the vectorizer outputs are l2-normalized, which makes k-means\n",
 "## behave as expected; since LSA/SVD outputs are NOT normalized,\n",
 "## we have to re-apply the normalization afterwards\n",
 "lsa = TruncatedSVD(n_components=500)\n",
 "normalizer = Normalizer(copy=True, norm='l2')\n",
 "lsa_normalizer = Pipeline(steps=[\n",
 "    ('lsa', lsa),\n",
 "    ('normalizer', normalizer)\n",
 "])"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 },
{ "cell_type": "code", "collapsed": false, "input": [
 "## clustering algorithm: mini-batch k-means with 20 clusters,\n",
 "## matching the 20 newsgroup labels\n",
 "\n",
 "km = MiniBatchKMeans(n_clusters=20, init_size=1000,\n",
 "                     batch_size=1000, verbose=0)"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 },
{ "cell_type": "code", "collapsed": false, "input": [
 "from sklearn.base import BaseEstimator, TransformerMixin\n",
 "\n",
 "## OptionalStep wraps a transformer so it can be switched on/off\n",
 "## inside a pipeline: when on=False, fit is a no-op and transform\n",
 "## passes X through unchanged\n",
 "class OptionalStep(BaseEstimator, TransformerMixin):\n",
 "    def __init__(self, estimator=None, on=True):\n",
 "        self.estimator = estimator\n",
 "        self.on = on\n",
 "    def fit(self, X, y=None):\n",
 "        if self.on:\n",
 "            self.estimator.fit(X, y)\n",
 "        return self\n",
 "    def transform(self, X):\n",
 "        if self.on:\n",
 "            return self.estimator.transform(X)\n",
 "        else:\n",
 "            return X"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 },
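{ "cell_type": "markdown", "metadata": {}, "source": [
 "A usage sketch for `OptionalStep` (illustrative only; the 500-document slice and `n_components=50` are arbitrary assumptions, not values from the experiments): wrapping the LSA pipeline lets a single code path cover both the with-LSA and without-LSA variants by flipping the `on` flag." ] },
{ "cell_type": "code", "collapsed": false, "input": [
 "## illustrative usage of OptionalStep, not part of the recorded runs\n",
 "## (the 500-document slice and n_components=50 are arbitrary choices)\n",
 "X_demo = tfidf.fit_transform(texts[:500])\n",
 "maybe_lsa = OptionalStep(estimator=Pipeline(steps=[\n",
 "    ('lsa', TruncatedSVD(n_components=50)),\n",
 "    ('normalizer', Normalizer(copy=True, norm='l2'))]), on=True)\n",
 "print maybe_lsa.fit_transform(X_demo).shape   ## (500, 50) - reduced\n",
 "maybe_lsa.on = False\n",
 "print maybe_lsa.transform(X_demo).shape       ## original shape - passthrough"
], "language": "python", "metadata": {}, "outputs": [] },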
"stream", "stream": "stdout", "text": [ "\n", "Wall time: 2.48 s\n", "clustering result with lsa: 0.30267315394\n", "=============================================\n", "hasher\n", "CPU times: user 4.59 s, sys: 24 ms, total: 4.61 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 4.6 s\n", "CPU times: user 20.5 s, sys: 3.37 s, total: 23.8 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 18.2 s\n", "CPU times: user 1.76 s, sys: 4 ms, total: 1.77 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 1.77 s\n", "clustering result without lsa: 0.176192306413\n", "CPU times: user 2.42 s, sys: 404 ms, total: 2.82 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 2.42 s\n", "clustering result with lsa: 0.171964867561\n" ] } ], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LSA is significantly better for dimensionality reduction for TEXT data than not using it\n", "\n", "## TFIDF transform could be slow on hahserizer if its default hash_size is too big\n", "\n", "## TFIDF is significantly better if it is used" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Results with high n_features in hasher\n", "\n", "=============================================\n", "tfidf\n", "CPU times: user 5.45 s, sys: 56 ms, total: 5.51 s\n", "Wall time: 5.48 s\n", "CPU times: user 47.7 s, sys: 12.5 s, total: 1min\n", "Wall time: 42.1 s\n", "CPU times: user 2.98 s, sys: 32 ms, total: 3.01 s\n", "Wall time: 3.01 s\n", "clustering result without lsa: 0.289170149848\n", "CPU times: user 1.92 s, sys: 436 ms, total: 2.35 s\n", "Wall time: 1.95 s\n", "clustering result with lsa: 0.348944495858\n", "\n", "=============================================\n", "idf_hasher\n", "CPU times: user 5.1 s, sys: 64 ms, total: 5.17 s\n", "Wall time: 5.14 s\n", "CPU times: user 5min 59s, sys: 44 s, total: 6min 43s\n", "Wall time: 5min 9s\n", "CPU times: user 12.4 s, sys: 216 ms, total: 12.6 s\n", "Wall time: 12.7 s\n", "clustering result without lsa: 0.303035797432\n", "CPU times: user 2.31 s, sys: 448 ms, total: 2.76 s\n", "Wall time: 2.35 s\n", "clustering result with lsa: 0.312858769047\n", "\n", "=============================================\n", "hasher\n", "CPU times: user 4.65 s, sys: 32 ms, total: 4.68 s\n", "Wall time: 4.65 s\n", "CPU times: user 5min 35s, sys: 46.8 s, total: 6min 21s\n", "Wall time: 4min 48s\n", "CPU times: user 13.5 s, sys: 192 ms, total: 13.7 s\n", "Wall time: 13.7 s\n", "clustering result without lsa: 0.165880219367\n", "CPU times: user 2.1 s, sys: 424 ms, total: 2.52 s\n", "Wall time: 2.12 s\n", "clustering result with lsa: 0.175990937104" ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }