{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
\n", "\n", "" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import json\n", "import codecs\n", "import operator\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import time\n", "import glob\n", "from datetime import datetime\n", "from dateutil import tz\n", "import nltk\n", "from nltk.collocations import *\n", "from nltk.corpus import stopwords\n", "from nltk.collocations import *\n", "from time import strftime, localtime\n", "import gc\n", "import sys\n", "\n", "#show = sys.argv[1]\n", "out = codecs.open(show.strip('#') + '.trigrams.txt', 'w', encoding='utf-8')\n", "\n", "print \"start reading file at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "with codecs.open('../twarc/' + show.strip('#') + '.txt', 'r', encoding='utf-8') as content_file:\n", " content = content_file.read()\n", "print \"finished reading file at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "\n", "'''\n", "jsons = glob.glob('../twarc/' + show + '*')\n", "f = codecs.open('../twarc/scandal.txt', 'r', encoding='utf-8')\n", "data = []\n", "words = \"\"\n", "count = 0\n", "for line in f:\n", " count += 1\n", " print str(count)\n", " words += line\n", "# All attempts to process these massive JSON collections crap out.\n", "# The Python objects that result from parsing the json just don't scale\n", "# 16G Memory locks down machine after a few hundred thousand.\n", "An alternate approach inolves pulling necessary data out via JQ beforehand:\n", "charper@charper-ThinkPad-T530:~/Dropbox/stern/pds/twarc$ touch scandal.txt\n", "charper@charper-ThinkPad-T530:~/Dropbox/stern/pds/twarc$ for j in \\#Scandal-201311*\n", "> do\n", "> cat $j | jq '.text' >> scandal.txt\n", "> done\n", "\n", "'''\n", "\n", "print \"starting tokenization at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "tokens = content.split()\n", "print \"starting filtering at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "# Stopwords requires download of stopwords corpora -- import nltk, nltk.download()\n", "stops = stopwords.words('english')\n", "#specifics = ['#DoctorWho', '#DoctorWho50th', '#SaveTheDay', '#DayoftheDoctor', 'RT']\n", "specifics = ['#sleepyhollow', '#Sleepyhollow', 'RT']\n", "stops.extend(specifics)\n", "#filtered_words = [w for w in tokens if not w in stops]\n", "filtered_words = [w for w in tokens if not w in stopwords.words('english')]\n", "\n", "print \"starting finder at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "bigram_measures = nltk.collocations.BigramAssocMeasures()\n", "finder = BigramCollocationFinder.from_words(filtered_words)\n", "finder.ngram_fd.viewitems()\n", "finder.apply_freq_filter(3)\n", "print finder.nbest(bigram_measures.pmi, 100)\n", "out.write(str(finder.nbest(bigram_measures.pmi, 100)) + \"\\n\")\n", "\n", "print \"trigrams & frequency at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "tgs = nltk.trigrams(filtered_words)\n", "\n", "fdist = nltk.FreqDist(tgs)\n", "for k,v in fdist.items():\n", " if int(v) > 10: \n", " out.write(str(k) + \"|\" + str(v) + \"\\n\")\n", "'''\n", "Part of speech tagging code, that mostly works, but didn't make the final iteration of things\n", "print \"part of speech tagging at : \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "pos = nltk.pos_tag(filtered_words)\n", "pos_df = pd.DataFrame.from_records(pos)\n", 
"pos_df.to_csv('sleepyhollow.pos.csv', encoding='utf8', quoting=1, index=False)\n", "'''\n", "\n", "print \"finsihed at: \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "start reading file at: Wed, 18 Dec 2013 18:55:30 +0000\n", "finished reading file at: Wed, 18 Dec 2013 18:55:30 +0000\n", "starting tokenization at: Wed, 18 Dec 2013 18:55:30 +0000\n", "starting filtering at: Wed, 18 Dec 2013 18:55:30 +0000\n", "starting finder at: Wed, 18 Dec 2013 18:55:49 +0000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[(u'\"#Lmfao', u'#Lmao'), (u'\"watched', u'yesterday:'), (u'#Apps:', u'#DeadTrigger'), (u'#ArrestedDevelopment', u'#RETWEET\"'), (u'#Blog', u'#Geek\"'), (u'#Bud', u'http://t.co/fU3lXpg25T\"'), (u'#Conan\\\\n2)', u'#DoratheExplorer\\\\n3)'), (u'#DoratheExplorer\\\\n3)', u'#TOMANDJERRY\\\\n4)'), (u'#EndersGame', u'Review\\\\\"'), (u'#FREE', u'#podcast!'), (u'#FollowAndFollow', u'siga'), (u'#GameFly', u'http://t.co/AagvYH9IrD\"'), (u'#HarryStyles', u'before!!'), (u'#HomerSimpson\\\\n#Nerdvana', u'http://t.co/U9EMeQ7I39\"'), (u'#JensenAckles', u'#JaredPadalecki\"'), (u'#Jetpack', u'#GameFly'), (u'#MrBergstrom', u'http://t.co/wEQ2gJIhAy\"'), (u'#Nonfiction', u'#Books\"'), (u'#OldEpisodes', u'#BestFriend\"'), (u'#StarCars:', u'http://t.co/KdOJ4uIkuB'), (u'#TOMANDJERRY\\\\n4)', u'#TheSimpsons\\\\n5)...'), (u'#TheHobbit\\\\n#TheSimpsons', u'#HomerSimpson\\\\n#Nerdvana'), (u'#TheSimpsons)', u'Toronto.'), (u'#YouAreLisaSimpsons', u'w/@louisaevecohen'), (u'#codeine', u'#hustle'), (u'#detention', u'#donthaveacowman'), (u'#donthaveacowman', u'#mattgroening\\u2026'), (u'#eatmyshorts', u'#detention'), (u'#ethreal', u'#trippy'), (u'#justgonnareplacetheskinnerpartaccordingly', u'#simpsonslogic'), (u'#kush', u'#trapmusic'), (u'#mattgroening\\u2026', u'http://t.co/xSH6mFN1Rq\"'), (u'#newlifemotto', u'#justgonnareplacetheskinnerpartaccordingly'), (u'#nintendo', u'#gamersunite'), (u'#nouraakekhonger', u'http://t.co/uvHqurEbYP\"'), (u'#ociogay', u'#pikoftheday'), (u'#trippy', u'#wave'), (u'#wave', u'#kush'), (u'#\\u0627\\u0644\\u0623\\u0643\\u062b\\u0631', u'#\\u0634\\u0639\\u0628\\u064a\\u0629'), (u'#\\u0627\\u0644\\u0639\\u0627\\u0644\\u0645', u'\\u061f\\\\n1)'), (u'#\\u0634\\u0639\\u0628\\u064a\\u0629', u'\\u0641\\u064a'), (u'(From', u'#TheSimpsons)'), (u'(cc', u'@camiiilleem'), (u'11x17', u'Signed'), (u'526-529', u'#nintendo'), (u'@2amsnaps:', u'[JOKWON'), (u'@Andreita_Villas', u'@luciasolerapine'), (u'@AzuGadoMart:', u'Haz'), (u'@BeverlyMacca1:', u'OK'), (u'@Brad_Cibane', u'@Julius_S_Malema'), (u'@DACrosse', u'@dliebma'), (u'@FaythAnderton:', u'Omg'), (u'@Fercharmed:', u'.@Alyssa_Milano'), (u'@Joshstrangehill', u'@bbcradiomanc'), (u'@Josi_RF:', u'\\\\\"Hoy'), (u'@LITTLEREGGAEMAN:', u'-Jaa'), (u'@LaurenTom9000', u'#IKnowThatVoice,'), (u'@LetsGoSeeDo', u'@DACrosse'), (u'@LottaBitt:', u'Wtf!'), (u'@Lurdesferizq', u'#osquiero'), (u'@Nathaliajshj:', u'\\u2665\\\\\"'), (u'@TELUS', u'Optik'), (u'@VictorGionatan:', u'Ma'), (u'@_ItsEli:', u'Nel'), (u'@andreagandia14', u'http://t.co/bkXfsGIDRG\"'), (u'@bbcradiomanc', u'4.50pm'), (u'@camiiilleem', u'@MargotDaval'), (u'@debikayo', u'@kaywyoming'), (u'@kaywyoming', u'@LetsGoSeeDo'), (u'@luciasolerapine', u'@Lurdesferizq'), (u'@malmarri', u'@Salem_Belyouha'), (u'@mariamas05', u'@andreagandia14'), (u'@mark_jubb:', u'OMG,'), (u'@paula200084', u'@Andreita_Villas'), (u'@veronii29082249', u'@paula200084'), 
(u'Astilla\\\\\"', u'#budumtss'), (u'Benvingut,', u'nano!'), (u'Bradbury,', u'Matheson.'), (u'C\\u2019mon!', u'You\\u2019ve'), (u'DANG', u'ROOF!!\\\\n#thesimpsons'), (u'DNS?\\\\n\\xa1NOSOTROS!', u'\\xa1NOSOTROS!\\\\n#sindromeDeDAW'), (u'Database.', u'Please.'), (u'Dias', u':)))'), (u'Doh!\\\\n\\\\nhttp://t.co/KmbKePxpK5\\\\n#LOTR', u'#TheHobbit\\\\n#TheSimpsons'), (u'Email', u'jen@tvfilmnewsllc.com'), (u'Emmy', u'nom.'), (u'Fansite?', u'Email'), (u'GET', u'OFF'), (u'GeoGantArt', u'http://t.co/m77SOEb82p'), (u'HOLLYWOOD', u\"STAR'S\"), (u'Hershel', u'Krustovsky'), (u'Hierarchy', u'Quoting'), (u'ICONIC', u'DRESSES'), (u'IP?', u'\\\\n\\xbfQuien'), (u'Interactive', u'Map'), (u'Jamaicans??', u'Lol...wow...'), (u'Jump', u'see:'), (u'Lunar', u'Module'), (u'MA!', u'GET'), (u'MOST', u'ICONIC')]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "trigrams & frequency at: Wed, 18 Dec 2013 18:55:52 +0000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "finsihed at: Wed, 18 Dec 2013 18:55:54 +0000" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "print \"part of speech tagging at : \" + time.strftime(\"%a, %d %b %Y %H:%M:%S +0000\", time.localtime())\n", "pos = nltk.pos_tag(filtered_words)\n", "pos_df = pd.DataFrame.from_records(pos)\n", "pos_df.to_csv(show.strip('#') + '.pos.csv', encoding='utf8', quoting=1, index=False)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "part of speech tagging at : Wed, 18 Dec 2013 18:57:08 +0000\n" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "pos_df.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
0 \"#TheSimpsons: NN
1 FXX NNP
2 sichert NN
3 sich NN
4 Wiederholungsrechte NNP
5 Mega-Deal NNP
6 http://t.co/UQxUm3pylV\" NNP
7 \"#thesimpsons NNS
8 @ProSieben\" -NONE-
9 \"Yes VBZ
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ " 0 1\n", "0 \"#TheSimpsons: NN\n", "1 FXX NNP\n", "2 sichert NN\n", "3 sich NN\n", "4 Wiederholungsrechte NNP\n", "5 Mega-Deal NNP\n", "6 http://t.co/UQxUm3pylV\" NNP\n", "7 \"#thesimpsons NNS\n", "8 @ProSieben\" -NONE-\n", "9 \"Yes VBZ" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "counts = pos_df.groupby(1).size()\n", "counts.sort(ascending=False)\n", "counts.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ "1\n", "NNP 59609\n", "NN 55267\n", "JJ 26434\n", "NNS 23771\n", "-NONE- 17535\n", "CD 4508\n", "RB 3765\n", "VBP 3564\n", ": 3331\n", "VBZ 3124\n", "dtype: int64" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "!awk ' BEGIN { FS = \"\\t\" } { print $1, $5 } ' POSMappings.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Category PTB\r\n", "Adjective JJ\r\n", "Adjective, ordinal number JJ\r\n", "Adjective, comparative JJR\r\n", "Adjective, superlative JJS\r\n", "Adjective, superlative, semantically JJ\r\n", "Adjective, cardinal number CD\r\n", "Adjective, cardinal number, one CD\r\n", "Adjective, past-part of verb VBN, JJ\r\n", "Adjective, pres-part of verb VBG, JJ\r\n", "Adverb RB\r\n", "Adverb, negative RB\r\n", "Adverb, comparative RBR\r\n", "Adverb, superlative RBS\r\n", "Adverb, particle RP\r\n", "Adverb, question WRB\r\n", "Adverb, degree & question WRB\r\n", "Adverb, degree RB\r\n", "Adverb, degree, postposed RB\r\n", "Adverb, nominal RB\r\n", "Adverb, conjunctive RB\r\n", "Conjunction, coordination CC\r\n", "Conjunction, subordinating IN\r\n", "Conjunction, complementizer 'that' IN\r\n", "Determiner DT\r\n", "Determiner, pronoun DT\r\n", "Determiner, pronoun, plural DT\r\n", "Determiner, prequalifier PDT\r\n", "Determiner, prequalifier PDT\r\n", "Determiner, pronoun or double conj. DT (CC)\r\n", "Determiner, pronoun or double conj. DT (CC)\r\n", "Determiner, article DT\r\n", "Determiner, article DT\r\n", "Determiner, postdeterminer JJ\r\n", "Determiner, possessive PRP$\r\n", "Determiner, possessive, second PRP\r\n", "Determiner, question WDT\r\n", "Determiner, possessive & question WP$\r\n", "Noun NN\r\n", "Noun, singular NN\r\n", "Noun, plural NNS\r\n", "Noun, proper, singular NNP\r\n", "Noun, proper, plural NNPS\r\n", "Noun, adverbial NN, NNP, RB\r\n", "Noun, plural from post-determiner NNS\r\n", "Pronoun, nominal (indefinite) NN\r\n", "Pronoun, personal, subject PRP\r\n", "Pronoun, personal, subject, 3SG PRP\r\n", "Pronoun, personal, object PRP\r\n", "Pronoun, reflexive PRP\r\n", "Pronoun, reflexive, plural PRP\r\n", "Pronoun, question, subject WP\r\n", "Pronoun, question, object WP\r\n", "Pronoun, question, existential there EX\r\n", "Verb. base present form (not infinitive) VBP\r\n", "Verb, infinitive VB\r\n", "Verb, past tense VBD\r\n", "Verb, present participle VBG\r\n", "Verb, past/passive participle VBN\r\n", "Verb, present 3SG -s form VBZ\r\n", "Verb, auxilliary do, base VBP\r\n", "Verb, auxilliary do, infinitive VB\r\n", "Verb, auxilliary do, past VBD\r\n", "Verb, auxilliary do, present part. VBG\r\n", "Verb, auxilliary do, past part. VBN\r\n", "Verb, auxilliary do, present 3SG VBZ\r\n", "Verb, auxilliary have, base VBP\r\n", "Verb, auxilliary have, infinitive VB\r\n", "Verb, auxilliary have, past VBD\r\n", "Verb, auxilliary have, present part. 
VBG\r\n", "Verb, auxilliary have, past part. VBN\r\n", "Verb, auxilliary have, present 3SG VBZ\r\n", "Verb, auxilliary be, infinitive VB\r\n", "Verb, auxilliary be, past VBD\r\n", "Verb, auxilliary be, past, 3SG VBD\r\n", "Verb, auxilliary be, present part. VBG\r\n", "Verb, auxilliary be, past part. VBN\r\n", "Verb, auxilliary be, present, 3SG VBZ\r\n", "Verb, auxilliary be, present, 1SG VBP\r\n", "Verb, auxilliary be, present VBP\r\n", "Verb, modal MD\r\n", "Infinitive marker TO\r\n", "Preposition, to TO\r\n", "Preposition, IN\r\n", "Preposition, of IN\r\n", "Possessive POS\r\n", "Interjection (or other isolate) UH\r\n", "Punctuation, sentence ender .\r\n", "Punctuation, semicolon :\r\n", "Punctuation, colon or ellipsis :\r\n", "Punctuation, comma ,\r\n", "Punctuation, dash -\r\n", "Punctuation, dollar sign $\r\n", "Punctuation, left bracket (\r\n", "Punctuation, right bracket )\r\n", "Punctuation, left quotation ``\r\n", "Punctuation, right quotation ''\r\n", "Foreign words (not in English lexicon) FW\r\n", "Symbol SYM\r\n", "Symbol, alphabetical \r\n", "Symbol, list item LS\r\n", "URL or email address ??\r\n", "Emoticon ??\r\n", "Online discourse marker ??\r\n", "Possessive nominal ??\r\n", "Possessive proper noun ??\r\n", "Nominal combined with verbal ??\r\n", "Proper noun combined with verbal ??\r\n", "Miscellaneous function word combined with verbal ??\r\n" ] } ], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }