{ "metadata": { "name": "Vectorizing" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "vectorizing-a-large-text-corpus-with-the-hashing-trick" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://scikit-learn.org/dev/modules/feature_extraction.html" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Loading features from dicts" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction import DictVectorizer\n", "\n", "measurements = [\n", " {'city': 'Dubai', 'temperature': 33.},\n", " {'city': 'London', 'temperature': 12.},\n", " {'city': 'San Fransisco', 'temperature': 18.},\n", "]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "vec = DictVectorizer()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "vec.fit_transform(measurements).toarray()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 24, "text": [ "array([[ 1., 0., 0., 33.],\n", " [ 0., 1., 0., 12.],\n", " [ 0., 0., 1., 18.]])" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "vec.get_feature_names()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 25, "text": [ "['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "pos_window = [\n", " {\n", " 'word-2': 'the',\n", " 'pos-2': 'DT',\n", " 'word-1': 'cat',\n", " 'pos-1': 'NN',\n", " 'word+1': 'on',\n", " 'pos+1': 'PP',\n", " },\n", "]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "pos_window" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 27, "text": [ "[{'pos+1': 'PP',\n", " 'pos-1': 'NN',\n", " 'pos-2': 'DT',\n", " 'word+1': 'on',\n", " 'word-1': 'cat',\n", " 'word-2': 'the'}]" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "vec = DictVectorizer()\n", "pos_vectorized = vec.fit_transform(pos_window)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "pos_vectorized" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 29, "text": [ "<1x6 sparse matrix of type ''\n", "\twith 6 stored elements in Compressed Sparse Row format>" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "pos_vectorized.toarray()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 30, "text": [ "array([[ 1., 1., 1., 1., 1., 1.]])" ] } ], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "vec.get_feature_names()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 31, "text": [ "['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']" ] } ], "prompt_number": 31 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Feature hashing" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def token_features(token, part_of_speech):\n", " if token.isdigit():\n", " yield \"numeric\"\n", " else:\n", " yield \"token={}\".format(token.lower())\n", " yield \"token,pos={},{}\".format(token, part_of_speech)\n", " if token[0].isupper():\n", " yield \"uppercase_initial\"\n", " if token.isupper():\n", " yield \"all_uppercase\"\n", " yield \"pos={}\".format(part_of_speech)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "hasher = FeatureHasher(input_type=string)\n", "X = hasher.transform(raw_X)" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'FeatureHasher' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhasher\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mFeatureHasher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhasher\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_X\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'FeatureHasher' is not defined" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import CountVectorizer" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer = CountVectorizer(min_df=1)\n", "vectorizer" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 36, "text": [ "CountVectorizer(analyzer='word', binary=False, charset='utf-8',\n", " charset_error='strict', dtype=, input='content',\n", " lowercase=True, max_df=1.0, max_features=None, max_n=None,\n", " min_df=1, min_n=None, ngram_range=(1, 1), preprocessor=None,\n", " stop_words=None, strip_accents=None,\n", " token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None,\n", " vocabulary=None)" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "corpus = [\n", " 'This is the first document.',\n", " 'This is the second second document.',\n", " 'And the third one.',\n", " 'Is this the first document?',\n", "]\n", "X = vectorizer.fit_transform(corpus)\n", "X" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 37, "text": [ "<4x9 sparse matrix of type ''\n", "\twith 19 stored elements in COOrdinate format>" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "analyze = vectorizer.build_analyzer()\n", "analyze(\"This is a text document to analyze.\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 38, "text": [ "[u'this', u'is', u'text', u'document', u'to', u'analyze']" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer.get_feature_names()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 39, "text": [ "[u'and',\n", " u'document',\n", " u'first',\n", " u'is',\n", " u'one',\n", " u'second',\n", " u'the',\n", " u'third',\n", " u'this']" ] } ], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [ "X.toarray()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 40, "text": [ "array([[0, 1, 1, 1, 0, 0, 1, 0, 1],\n", " [0, 1, 0, 1, 0, 2, 1, 0, 1],\n", " [1, 0, 0, 0, 1, 0, 1, 1, 0],\n", " [0, 1, 1, 1, 0, 0, 1, 0, 1]])" ] } ], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer.vocabulary_.get('document')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 41, "text": [ "1" ] } ], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer.transform(['Something completely new.']).toarray()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 42, "text": [ "array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])" ] } ], "prompt_number": 42 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Bigram vectorizer" ] }, { "cell_type": "code", "collapsed": false, "input": [ "bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=ur'\\b\\w+\\b', min_df=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 43 }, { "cell_type": "code", "collapsed": false, "input": [ "analyze = bigram_vectorizer.build_analyzer()\n", "analyze('Bi-grams are cool!')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 44, "text": [ "[u'bi', u'grams', u'are', u'cool', u'bi grams', u'grams are', u'are cool']" ] } ], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, "input": [ "X_2 = bigram_vectorizer.fit_transform(corpus).toarray()\n", "X_2" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 45, "text": [ "array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],\n", " [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],\n", " [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],\n", " [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])" ] } ], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "feature_index = bigram_vectorizer.vocabulary_.get(u'is this')\n", "X_2[:, feature_index] " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 46, "text": [ "array([0, 0, 0, 1])" ] } ], "prompt_number": 46 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Tf\u2013idf term weighting" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import TfidfTransformer\n", "transformer = TfidfTransformer()\n", "transformer " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 47, "text": [ "TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)" ] } ], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "counts = [[3, 0, 1],\n", " [2, 0, 0],\n", " [3, 0, 0],\n", " [4, 0, 0],\n", " [3, 2, 0],\n", " [3, 0, 2]]\n", "tfidf = transformer.fit_transform(counts)\n", "tfidf\n", "tfidf.toarray() " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 48, "text": [ "array([[ 0.85151335, 0. , 0.52433293],\n", " [ 1. , 0. , 0. ],\n", " [ 1. , 0. , 0. ],\n", " [ 1. , 0. , 0. ],\n", " [ 0.55422893, 0.83236428, 0. ],\n", " [ 0.63035731, 0. , 0.77630514]])" ] } ], "prompt_number": 48 }, { "cell_type": "code", "collapsed": false, "input": [ "transformer.idf_" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 49, "text": [ "array([ 1. , 2.25276297, 1.84729786])" ] } ], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "vectorizer = TfidfVectorizer(min_df=1)\n", "vectorizer.fit_transform(corpus)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 50, "text": [ "<4x9 sparse matrix of type ''\n", "\twith 19 stored elements in Compressed Sparse Row format>" ] } ], "prompt_number": 50 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Bag of Words representation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)\n", "counts = ngram_vectorizer.fit_transform(['words', 'wprds'])\n", "ngram_vectorizer.get_feature_names()\n", "counts.toarray().astype(int)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 51, "text": [ "array([[1, 1, 1, 0, 1, 1, 1, 0],\n", " [1, 1, 0, 1, 1, 1, 0, 1]])" ] } ], "prompt_number": 51 }, { "cell_type": "code", "collapsed": false, "input": [ "ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5), min_df=1)\n", "ngram_vectorizer.fit_transform(['jumpy fox'])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 52, "text": [ "<1x4 sparse matrix of type ''\n", "\twith 4 stored elements in COOrdinate format>" ] } ], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "ngram_vectorizer.get_feature_names()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 53, "text": [ "[u' fox ', u' jump', u'jumpy', u'umpy ']" ] } ], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(5, 5), min_df=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ "ngram_vectorizer.fit_transform(['jumpy fox'])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 55, "text": [ "<1x5 sparse matrix of type ''\n", "\twith 5 stored elements in COOrdinate format>" ] } ], "prompt_number": 55 }, { "cell_type": "code", "collapsed": false, "input": [ "ngram_vectorizer.get_feature_names()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 56, "text": [ "[u'jumpy', u'mpy f', u'py fo', u'umpy ', u'y fox']" ] } ], "prompt_number": 56 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Vectorizing a large text corpus with the hashing trick" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import HashingVectorizer\n", "hv = HashingVectorizer(n_features=10)\n", "hv.transform(corpus)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 57, "text": [ "<4x10 sparse matrix of type ''\n", "\twith 16 stored elements in Compressed Sparse Row format>" ] } ], "prompt_number": 57 }, { "cell_type": "code", "collapsed": false, "input": [ "hv = HashingVectorizer()\n", "hv.transform(corpus)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 58, "text": [ "<4x1048576 sparse matrix of type ''\n", "\twith 19 stored elements in Compressed Sparse Row format>" ] } ], "prompt_number": 58 }, { "cell_type": "code", "collapsed": false, "input": [ "def my_tokenizer(s):\n", " return s.split()\n", "\n", "vectorizer = CountVectorizer(tokenizer=my_tokenizer)\n", "vectorizer.build_analyzer()(u\"Some... punctuation!\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 59, "text": [ "[u'some...', u'punctuation!']" ] } ], "prompt_number": 59 }, { "cell_type": "code", "collapsed": false, "input": [ "from nltk import word_tokenize \n", "from nltk.stem import WordNetLemmatizer \n", "class LemmaTokenizer(object):\n", " def __init__(self):\n", " self.wnl = WordNetLemmatizer()\n", " def __call__(self, doc):\n", " return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]\n", "\n", "vect = CountVectorizer(tokenizer=LemmaTokenizer())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 60 }, { "cell_type": "code", "collapsed": false, "input": [ "vect" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 61, "text": [ "CountVectorizer(analyzer='word', binary=False, charset='utf-8',\n", " charset_error='strict', dtype=, input='content',\n", " lowercase=True, max_df=1.0, max_features=None, max_n=None,\n", " min_df=2, min_n=None, ngram_range=(1, 1), preprocessor=None,\n", " stop_words=None, strip_accents=None,\n", " token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n", " tokenizer=<__main__.LemmaTokenizer object at 0x1111e5290>,\n", " vocabulary=None)" ] } ], "prompt_number": 61 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Image Feature extraction" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "from sklearn.feature_extraction import image\n", "\n", "one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))\n", "one_image[:, :, 0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 62, "text": [ "array([[ 0, 3, 6, 9],\n", " [12, 15, 18, 21],\n", " [24, 27, 30, 33],\n", " [36, 39, 42, 45]])" ] } ], "prompt_number": 62 }, { "cell_type": "code", "collapsed": false, "input": [ "patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2, random_state=0)\n", "patches.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 63, "text": [ "(2, 2, 2, 3)" ] } ], "prompt_number": 63 }, { "cell_type": "code", "collapsed": false, "input": [ "patches[:, :, :, 0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 64, "text": [ "array([[[ 0, 3],\n", " [12, 15]],\n", "\n", " [[15, 18],\n", " [27, 30]]])" ] } ], "prompt_number": 64 }, { "cell_type": "code", "collapsed": false, "input": [ "patches = image.extract_patches_2d(one_image, (2, 2))\n", "patches.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 65, "text": [ "(9, 2, 2, 3)" ] } ], "prompt_number": 65 }, { "cell_type": "code", "collapsed": false, "input": [ "patches[4, :, :, 0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 66, "text": [ "array([[15, 18],\n", " [27, 30]])" ] } ], "prompt_number": 66 }, { "cell_type": "code", "collapsed": false, "input": [ "reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))\n", "np.testing.assert_array_equal(one_image, reconstructed)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 67 }, { "cell_type": "code", "collapsed": false, "input": [ "five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3)\n", "patches = image.PatchExtractor((2, 2)).transform(five_images)\n", "patches.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 68, "text": [ "(45, 2, 2, 3)" ] } ], "prompt_number": 68 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }