{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "TextBlob: Text Analytics for Humans\n", "========\n", "\n", "# _Ken Hu_\n", "\n", "# _@whosbacon_" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "## Who Am I?" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "notes" } }, "source": [ "+ 7+ years DevOps\n", "+ 5+ years Python\n", "+ 3+ years text analytics\n", "+ Digital nomad" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# I keep this as a cell in my title slide so I can rerun \n", "# it easily if I make changes, but it's low enough it won't\n", "# be visible in presentation mode.\n", "%run talktools" ], "language": "python", "metadata": { "slideshow": { "slide_type": "notes" } }, "outputs": [ { "html": [ "\n" ], "metadata": {}, "output_type": "display_data", "text": [ "" ] } ], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Itinerary\n", "\n", "+ A blob of text\n", "+ Basics\n", "+ Tokenization\n", "+ Parsing & tagging\n", "+ Spell correction\n", "+ Sentiment analysis\n", "+ Classification\n", "+ Blobber" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## A Blob of Text\n", "\n", "+ Popularity of text analytics / NLP\n", "+ nltk complexity\n", "+ TextBlob : nltk :: requests : Httplib2\n", "+ Object-oriented\n", "+ https://textblob.readthedocs.org/en/latest/\n", "+ https://github.com/sloria/textblob" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!sudo pip install -U textblob\n", "!sudo pip install -U nltk textblob-aptagger\n", "!sudo python -m textblob.download_corpora\n", "\n", "import nltk\n", "nltk.download('wordnet')" ], "language": "python", "metadata": { "slideshow": { "slide_type": 
"fragment" } }, "outputs": [] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Basics" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import textblob\n", "\n", "blob = textblob.TextBlob(u' Welcome to textblob: text analytics for humans ')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "print type(blob)\n", "print blob[5:10]\n", "print blob.title()\n", "print blob.find('text')\n", "print blob.endswith(' ')\n", "print blob.stripped" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "lcome\n", " Welcome To Textblob: Text Analytics For Humans \n", "14\n", "True\n", "welcome to textblob text analytics for humans\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "print textblob.Word\n", "print textblob.Word(u'sweet').pluralize()\n", "print textblob.Word(u'sweets').singularize()\n", "print textblob.Word(u'running').lemmatize('v')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "sweets\n", "sweet\n", "run" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "print type(blob.words)\n", "print blob.words.singularize()\n", "print blob.words.count('text')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[u'Welcome', u'to', u'textblob', u'text', u'analytic', u'for', u'human']\n", "1\n" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Tokenization" 
] }, { "cell_type": "code", "collapsed": false, "input": [ "blob = textblob.TextBlob(u'why is it so hot in here?!\\nthis is not hot at all')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "print blob.words\n", "print type(blob.words), type(blob.words[0])\n", "print blob.sentences\n", "print blob.ngrams()\n", "print blob.word_counts['hot']" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[u'why', u'is', u'it', u'so', u'hot', u'in', u'here', u'this', u'is', u'not', u'hot', u'at', u'all']\n", " \n", "[Sentence(\"why is it so hot in here?!\"), Sentence(\"this is not hot at all\")]\n", "[WordList([u'why', u'is', u'it']), WordList([u'is', u'it', u'so']), WordList([u'it', u'so', u'hot']), WordList([u'so', u'hot', u'in']), WordList([u'hot', u'in', u'here']), WordList([u'in', u'here', u'this']), WordList([u'here', u'this', u'is']), WordList([u'this', u'is', u'not']), WordList([u'is', u'not', u'hot']), WordList([u'not', u'hot', u'at']), WordList([u'hot', u'at', u'all'])]\n", "2\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "print blob.tokenizer\n", "\n", "import nltk.tokenize\n", "\n", "blob.tokenizer = nltk.tokenize.LineTokenizer()\n", "print blob.tokenize()\n", "print blob.tokenize(nltk.tokenize.PunktWordTokenizer())" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[u'why is it so hot in here?!', u'this is not hot at all']\n", "[u'why', u'is', u'it', u'so', u'hot', u'in', u'here', u'?', u'!', u'this', u'is', u'not', u'hot', u'at', u'all']\n" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Parsing & Tagging" ] }, 
{ "cell_type": "code", "collapsed": false, "input": [ "blob = textblob.TextBlob(u'Ray Charles has got Georgia on his mind')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "print blob.noun_phrases\n", "print blob.pos_tags\n", "print blob.parse()" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[u'ray charles', u'georgia']\n", "[(u'Ray', u'NNP'), (u'Charles', u'NNP'), (u'has', u'VBZ'), (u'got', u'VBD'), (u'Georgia', u'NNP'), (u'on', u'IN'), (u'his', u'PRP$'), (u'mind', u'NN')]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Ray/NNP/B-NP/O Charles/NNP/I-NP/O has/VBZ/B-VP/O got/VBD/I-VP/O Georgia/NNP/B-NP/O on/IN/B-PP/B-PNP his/PRP$/B-NP/I-PNP mind/NN/I-NP/I-PNP\n" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "import textblob.np_extractors\n", "\n", "print blob.np_extractor\n", "blob = textblob.TextBlob(u'Ray Charles has got Georgia on his mind',\n", " np_extractor=textblob.np_extractors.ConllExtractor())\n", "print blob.noun_phrases" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[u'ray charles', u'georgia']" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "import textblob.taggers\n", "\n", "print blob.pos_tagger\n", "blob = textblob.TextBlob(u'Ray Charles has got Georgia on his mind',\n", " pos_tagger=textblob.taggers.NLTKTagger())\n", "print blob.pos_tags" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[(u'Ray', u'NNP'), (u'Charles', u'NNP'), (u'has', 
u'VBZ'), (u'got', u'VBN'), (u'Georgia', u'NNP'), (u'on', u'IN'), (u'his', u'PRP$'), (u'mind', u'NN')]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 11 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Spell Correction" ] }, { "cell_type": "code", "collapsed": false, "input": [ "blob = textblob.TextBlob(u'I am runing across the boarder')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "print blob.correct() # based on Pattern\n", "print blob.words[2].spellcheck()" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "I am running across the border\n", "[(u'running', 0.8974358974358975), (u'ruling', 0.07692307692307693), (u'ruining', 0.019230769230769232), (u'tuning', 0.00641025641025641)]\n" ] } ], "prompt_number": 13 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Sentiment Analysis" ] }, { "cell_type": "code", "collapsed": false, "input": [ "blob = textblob.TextBlob(u'the weather is fantastic!')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "print blob.sentiment\n", "\n", "import textblob.sentiments\n", "\n", "print blob.analyzer\n", "blob.analyzer = textblob.sentiments.NaiveBayesAnalyzer()\n", "print blob.sentiment" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Sentiment(polarity=0.5, subjectivity=0.9)\n", "\n", "Sentiment(polarity=0.5, subjectivity=0.9)\n" ] } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": { "slideshow": { 
"slide_type": "slide" } }, "source": [ "## Classification\n", "\n", "### Opposite Day Classifier" ] }, { "cell_type": "code", "collapsed": false, "input": [ "training = [\n", " (u'tobey maguire is a terrible spiderman.','pos'),\n", " (u'a terrible Javert (Russell Crowe) ruined Les Miserables for me...','pos'),\n", " (u'The Dark Knight is the greatest superhero movie ever!','neg'),\n", " (u'Fantastic Four should have never been made.','pos'),\n", " (u'Wes Anderson is my favorite director!','neg'),\n", " (u'Captain America 2 is pretty awesome.','neg'),\n", " (u'Let\\s pretend \"Batman and Robin\" never happened..','pos'),\n", " ]\n", "testing = [\n", " (u'Superman was never an interesting character.','pos'),\n", " (u'Fantastic Mr Fox is an awesome film!','neg'),\n", " (u'Dragonball Evolution is simply terrible!!','pos')\n", " ]" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "import textblob.classifiers\n", "\n", "classifier = textblob.classifiers.NaiveBayesClassifier(training)\n", "print classifier.accuracy(testing)\n", "classifier.show_informative_features(3)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "1.0\n", "Most Informative Features\n", " contains(is) = True neg : pos = 2.9 : 1.0\n", " contains(terrible) = False neg : pos = 1.7 : 1.0\n", " contains(never) = False neg : pos = 1.7 : 1.0\n" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "blob = textblob.TextBlob(u'the weather is terrible!', classifier=classifier)\n", "print blob.classify()" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "pos\n" ] } ], "prompt_number": 18 }, { "cell_type": "markdown", "metadata": { "slideshow": { 
"slide_type": "slide" } }, "source": [ "## Blobber" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Factory Method" ] }, { "cell_type": "code", "collapsed": false, "input": [ "np_extractor = textblob.np_extractors.ConllExtractor()\n", "pos_tagger = textblob.taggers.NLTKTagger()\n", "tokenizer = nltk.tokenize.PunktWordTokenizer()\n", "analyzer = textblob.sentiments.NaiveBayesAnalyzer()" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "blob = textblob.TextBlob(u'Dog goes woof. Cat goes meow. Bird goes tweet. And mouse goes squeek.',\n", " np_extractor=np_extractor,\n", " pos_tagger=pos_tagger,\n", " tokenizer=tokenizer,\n", " analyzer=analyzer)\n", "# do something with blob\n", "\n", "blob2 = textblob.TextBlob(u'Cow goes moo. Frog goes croak. And the elephant goes toot.',\n", " np_extractor=np_extractor,\n", " pos_tagger=pos_tagger,\n", " tokenizer=tokenizer,\n", " analyzer=analyzer)\n", "# do something with blob2\n", "\n", "blob3 = textblob.TextBlob(u'Ducks say quack. And fish go blub. 
And the seal goes ow ow ow ow ow.',\n", " np_extractor=np_extractor,\n", " pos_tagger=pos_tagger,\n", " tokenizer=tokenizer,\n", " analyzer=analyzer)\n", "# do something with blob3" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "blobber = textblob.Blobber(\n", " np_extractor=np_extractor,\n", " pos_tagger=pos_tagger,\n", " tokenizer=tokenizer,\n", " analyzer=analyzer)\n", "\n", "blob = blobber(u'But there\\'s one sound that no one knows: What does the fox say?')\n", "\n", "print blob\n", "print blob.np_extractor\n", "print blob.pos_tagger\n", "print blob.tokenizer\n", "print blob.analyzer" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "But there's one sound that no one knows: What does the fox say?\n", "\n", "\n", "\n", "\n" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "print map(blobber, ['Ring-ding-ding-ding-dingeringeding!',\n", " 'Wa-pa-pa-pa-pa-pa-pow!',\n", " 'Hatee-hatee-hatee-ho!',\n", " 'Joff-tchoff-tchoffo-tchoffo-tchoff!'])" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[TextBlob(\"Ring-ding-ding-ding-dingeringeding!\"), TextBlob(\"Wa-pa-pa-pa-pa-pa-pow!\"), TextBlob(\"Hatee-hatee-hatee-ho!\"), TextBlob(\"Joff-tchoff-tchoffo-tchoffo-tchoff!\")]\n" ] } ], "prompt_number": 22 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Classifier Comparison" ] }, { "cell_type": "code", "collapsed": false, "input": [ "training = [\n", " (u'tobey maguire is a terrible spiderman.','pos'),\n", " (u'a terrible Javert (Russell Crowe) ruined Les Miserables for me...','pos'),\n", " (u'The Dark Knight is the greatest superhero movie 
ever!','neg'),\n", " (u'Fantastic Four should have never been made.','pos'),\n", " (u'Wes Anderson is my favorite director!','neg'),\n", " (u'Captain America 2 is pretty awesome.','neg'),\n", " (u'Let\\'s pretend \"Batman and Robin\" never happened..','pos'),\n", " ]\n", "texts = [\n", " u'Superman was never an interesting character.',\n", " u'Fantastic Mr Fox is an awesome film!',\n", " u'Dragonball Evolution is simply terrible!!'\n", " ]" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "import textblob.classifiers\n", "\n", "nb_classifier = textblob.classifiers.NaiveBayesClassifier(training)\n", "dt_classifier = textblob.classifiers.DecisionTreeClassifier(training)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "for text in texts:\n", " nb_class = textblob.TextBlob(text, classifier=nb_classifier).classify()\n", " dt_class = textblob.TextBlob(text, classifier=dt_classifier).classify()\n", " print nb_class == dt_class" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "True\n", "True\n", "False\n" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "nb_blobber = textblob.Blobber(classifier=nb_classifier)\n", "dt_blobber = textblob.Blobber(classifier=dt_classifier)\n", "\n", "for text in texts:\n", " print nb_blobber(text).classify() == dt_blobber(text).classify()" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "True\n", "True\n", "False\n" ] } ], "prompt_number": 26 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Future Blob\n", "\n", "+ Looking for contributors and maintainers\n", "+ http://textblob.readthedocs.org/en/dev/contributing.html#extension-development" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Fin\n", "\n", "+ @whosbacon\n", "+ http://whosbacon.com\n", "+ http://nbviewer.ipython.org/github/whosken/textblob-intro/blob/master/TextBlob.ipynb" ] } ], "metadata": {} } ] }