{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Things to try:\n", "1. Hasher + Idf transform v.s. Tfidf transform v.s. Hasher\n", "2. TruncatedSVD" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.decomposition import TruncatedSVD\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import HashingVectorizer\n", "from sklearn.feature_extraction.text import TfidfTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import Normalizer\n", "from sklearn.grid_search import GridSearchCV\n", "from sklearn import metrics\n", "\n", "from sklearn.cluster import MiniBatchKMeans\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "news_data = fetch_20newsgroups(shuffle=True, random_state = 0)\n", "texts = news_data.data\n", "labels = news_data.target\n", "print len(texts), np.unique(labels)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "11314 [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "## different vectorizer\n", "\n", "## hasher alone - better perform l2 normalization for kmeans\n", "hasher = HashingVectorizer(stop_words='english', n_features = 10000,\n", " non_negative=False,\n", " norm = 'l2', binary = False)\n", "\n", "## since tfidf will do the normalization, so \n", "## the previous hasher step only need to extract the counts of words\n", "idf_hasher = Pipeline(steps = [\n", " ('hasher', HashingVectorizer(stop_words='english', \n", " n_features = 10000,\n", " non_negative=True,\n", " norm = None,\n", " binary = False)),\n", " ('tf_idf', TfidfTransformer())\n", "])\n", "\n", "## use tfidf vectorization directly\n", "tfidf = TfidfVectorizer(max_df=0.5, stop_words='english', \n", " use_idf = True, norm = 'l2')\n", "\n", "vectorizers = {'hasher': hasher, \n", " 'idf_hasher': idf_hasher, \n", " 'tfidf': tfidf}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "## LSA dimensionality reduction - after vectorization\n", "\n", "## vectorizer results are normalized, which makes kmeans behave\n", "## as expected. 
{ "cell_type": "code", "collapsed": false, "input": [
 "## LSA dimensionality reduction - applied after vectorization\n",
 "\n",
 "## the vectorizer outputs are l2-normalized, which makes k-means\n",
 "## behave as expected; since LSA/SVD outputs are NOT normalized,\n",
 "## we have to re-apply the normalization afterwards\n",
 "lsa = TruncatedSVD(n_components=500)\n",
 "normalizer = Normalizer(copy=True, norm='l2')\n",
 "lsa_normalizer = Pipeline(steps=[\n",
 "    ('lsa', lsa),\n",
 "    ('normalizer', normalizer)\n",
 "])"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 },
{ "cell_type": "code", "collapsed": false, "input": [
 "## clustering algorithm: mini-batch k-means with 20 clusters,\n",
 "## matching the 20 newsgroup labels\n",
 "\n",
 "km = MiniBatchKMeans(n_clusters=20, init_size=1000,\n",
 "                     batch_size=1000, verbose=0)"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 },
{ "cell_type": "code", "collapsed": false, "input": [
 "from sklearn.base import BaseEstimator, TransformerMixin\n",
 "\n",
 "## OptionalStep wraps a transformer so it can be switched on/off\n",
 "## inside a pipeline: when on=False, fit is a no-op and transform\n",
 "## passes X through unchanged\n",
 "class OptionalStep(BaseEstimator, TransformerMixin):\n",
 "    def __init__(self, estimator=None, on=True):\n",
 "        self.estimator = estimator\n",
 "        self.on = on\n",
 "    def fit(self, X, y=None):\n",
 "        if self.on:\n",
 "            self.estimator.fit(X, y)\n",
 "        return self\n",
 "    def transform(self, X):\n",
 "        if self.on:\n",
 "            return self.estimator.transform(X)\n",
 "        else:\n",
 "            return X"
], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 },
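{ "cell_type": "markdown", "metadata": {}, "source": [
 "A usage sketch for `OptionalStep` (illustrative only; the 500-document slice and `n_components=50` are arbitrary assumptions, not values from the experiments): wrapping the LSA pipeline lets a single code path cover both the with-LSA and without-LSA variants by flipping the `on` flag." ] },
{ "cell_type": "code", "collapsed": false, "input": [
 "## illustrative usage of OptionalStep, not part of the recorded runs\n",
 "## (the 500-document slice and n_components=50 are arbitrary choices)\n",
 "X_demo = tfidf.fit_transform(texts[:500])\n",
 "maybe_lsa = OptionalStep(estimator=Pipeline(steps=[\n",
 "    ('lsa', TruncatedSVD(n_components=50)),\n",
 "    ('normalizer', Normalizer(copy=True, norm='l2'))]), on=True)\n",
 "print maybe_lsa.fit_transform(X_demo).shape   ## (500, 50) - reduced\n",
 "maybe_lsa.on = False\n",
 "print maybe_lsa.transform(X_demo).shape       ## original shape - passthrough"
], "language": "python", "metadata": {}, "outputs": [] },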
"stream", "stream": "stdout", "text": [ "\n", "Wall time: 2.48 s\n", "clustering result with lsa: 0.30267315394\n", "=============================================\n", "hasher\n", "CPU times: user 4.59 s, sys: 24 ms, total: 4.61 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 4.6 s\n", "CPU times: user 20.5 s, sys: 3.37 s, total: 23.8 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 18.2 s\n", "CPU times: user 1.76 s, sys: 4 ms, total: 1.77 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 1.77 s\n", "clustering result without lsa: 0.176192306413\n", "CPU times: user 2.42 s, sys: 404 ms, total: 2.82 s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Wall time: 2.42 s\n", "clustering result with lsa: 0.171964867561\n" ] } ], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LSA is significantly better for dimensionality reduction for TEXT data than not using it\n", "\n", "## TFIDF transform could be slow on hahserizer if its default hash_size is too big\n", "\n", "## TFIDF is significantly better if it is used" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Results with high n_features in hasher\n", "\n", "=============================================\n", "tfidf\n", "CPU times: user 5.45 s, sys: 56 ms, total: 5.51 s\n", "Wall time: 5.48 s\n", "CPU times: user 47.7 s, sys: 12.5 s, total: 1min\n", "Wall time: 42.1 s\n", "CPU times: user 2.98 s, sys: 32 ms, total: 3.01 s\n", "Wall time: 3.01 s\n", "clustering result without lsa: 0.289170149848\n", "CPU times: user 1.92 s, sys: 436 ms, total: 2.35 s\n", "Wall time: 1.95 s\n", "clustering result with lsa: 0.348944495858\n", "\n", "=============================================\n", "idf_hasher\n", "CPU times: user 5.1 s, sys: 64 ms, total: 5.17 s\n", "Wall time: 5.14 s\n", "CPU times: user 5min 59s, sys: 44 s, total: 6min 43s\n", "Wall time: 5min 9s\n", "CPU times: user 12.4 s, sys: 216 ms, total: 12.6 s\n", "Wall time: 12.7 s\n", "clustering result without lsa: 0.303035797432\n", "CPU times: user 2.31 s, sys: 448 ms, total: 2.76 s\n", "Wall time: 2.35 s\n", "clustering result with lsa: 0.312858769047\n", "\n", "=============================================\n", "hasher\n", "CPU times: user 4.65 s, sys: 32 ms, total: 4.68 s\n", "Wall time: 4.65 s\n", "CPU times: user 5min 35s, sys: 46.8 s, total: 6min 21s\n", "Wall time: 4min 48s\n", "CPU times: user 13.5 s, sys: 192 ms, total: 13.7 s\n", "Wall time: 13.7 s\n", "clustering result without lsa: 0.165880219367\n", "CPU times: user 2.1 s, sys: 424 ms, total: 2.52 s\n", "Wall time: 2.12 s\n", "clustering result with lsa: 0.175990937104" ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }