{
 "metadata": {
  "name": "",
  "signature": "sha256:0be15cdce8312d1fa62b21a37ddb93a6faf86c90296486bea2a495d17e1e9ee6"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Word count with PySpark\n",
      "\n",
      "Counts the whitespace-delimited tokens of a plain-text book stored on HDFS, writes the full `(token, count)` table back to HDFS, and displays the most frequent tokens.\n",
      "\n",
      "**Prerequisites**\n",
      "\n",
      "- The kernel must already provide a SparkContext named `sc` (for example IPython started through `pyspark`); this notebook does not create one.\n",
      "- `BOOK_PATH` must exist on HDFS. The text appears to be Project Gutenberg file 30760 (<http://www.gutenberg.org/files/30760/30760-0.txt>) - TODO confirm.\n",
      "- `OUTPUT_PATH` must **not** exist yet: `saveAsTextFile` will not overwrite an existing directory, so remove it (e.g. `hdfs dfs -rm -r <path>`) before re-running the notebook.\n",
      "\n",
      "**Caveat:** tokens are split on whitespace only, so punctuation and letter case are kept (`dog.`, `dog,` and `Dog` are counted separately)."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from operator import add"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Configuration: edit these to point at another cluster, book or output directory.\n",
      "HDFS_ROOT = \"hdfs://localhost:9000/Python\"\n",
      "BOOK_PATH = HDFS_ROOT + \"/book.txt\"\n",
      "OUTPUT_PATH = HDFS_ROOT + \"/spark_output1\"\n",
      "TOP_N = 20  # how many of the most frequent tokens to display"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Load the text\n",
      "\n",
      "`sc.textFile` yields one record per line of the file, so the RDD below holds lines, not words."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "lines = sc.textFile(BOOK_PATH)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Quick look at the data: the first five lines that begin with a space.\n",
      "lines.filter(lambda line: line.startswith(\" \")).take(5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Count tokens"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def tokenize(line):\n",
      "    \"\"\"Return the whitespace-delimited tokens of one line of text.\n",
      "\n",
      "    Splitting on any run of whitespace (rather than on a single space)\n",
      "    means blank lines, indentation and repeated spaces never produce\n",
      "    empty-string tokens. The byte-order mark that starts some UTF-8\n",
      "    files is removed so it does not stick to the first word.\n",
      "    Punctuation and letter case are left untouched.\n",
      "    \"\"\"\n",
      "    return line.replace(u\"\\ufeff\", u\"\").split()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Pair every token with 1, then sum the ones per token.\n",
      "# cache() keeps the result because both cells below run an action on it.\n",
      "counts = (\n",
      "    lines.flatMap(tokenize)\n",
      "    .map(lambda token: (token, 1))\n",
      "    .reduceByKey(add)\n",
      "    .cache()\n",
      ")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Save and inspect the result"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Writes one text part-file per partition under OUTPUT_PATH (must not exist yet).\n",
      "counts.saveAsTextFile(OUTPUT_PATH)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Show only the TOP_N most frequent tokens instead of collecting the whole RDD.\n",
      "# Negating the count makes takeOrdered return the largest counts first.\n",
      "counts.takeOrdered(TOP_N, key=lambda pair: -pair[1])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}