{ "cells": [
 { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "# MapReduce" ] },
 { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "## Intuition" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "hidden": true }, "outputs": [], "source": [
   "# Toy input: three sub-lists of integers.\n",
   "a = [[1,2,1], [3,2], [4,9,1,0,2]]" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "hidden": true }, "outputs": [], "source": [
   "# Map step: apply sum to every sub-list.\n",
   "sums = map(sum, a)" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "hidden": true }, "outputs": [], "source": [
   "# The same map step written as an explicit loop; sums is rebuilt here as a list.\n",
   "sums = []\n",
   "for sublist in a:\n",
   "    results = sum(sublist)\n",
   "    sums.append(results)" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true, "hidden": true }, "outputs": [], "source": [
   "def add(a, b):\n",
   "    \"\"\"Return a + b (the two-argument function folded over sums by reduce).\"\"\"\n",
   "    return a + b" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25\n" ] } ], "source": [
   "from functools import reduce\n",
   "# Reduce step: fold the per-list sums into a single total, starting from 0.\n",
   "print(reduce(add, sums, 0))" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true, "hidden": true }, "outputs": [], "source": [
   "# What reduce(add, sums, 0) does, spelled out as a loop.\n",
   "initial = 0\n",
   "current_result = initial\n",
   "for element in sums:\n",
   "    current_result = add(current_result, element)" ] },
 { "cell_type": "markdown", "metadata": { "heading_collapsed": true, "hidden": true }, "source": [ "## Basic Example" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from collections import defaultdict\n",
   "\n",
   "def map_word_count(document_id, document):\n",
   "    \"\"\"Map step: yield one (word, count) pair per distinct word of `document`.\n",
   "\n",
   "    Words are the whitespace-separated tokens of `document`.\n",
   "    `document_id` is accepted but not used by the body.\n",
   "    \"\"\"\n",
   "    tally = defaultdict(int)\n",
   "    for token in document.split():\n",
   "        tally[token] += 1\n",
   "    yield from tally.items()" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": { "hidden": true }, "outputs": [], "source": [
   "def shuffle_words(results_generators):\n",
   "    \"\"\"Shuffle step: group the counts emitted by every mapper by word.\n",
   "\n",
   "    `results_generators` is an iterable of iterables of (word, count) pairs.\n",
   "    Yields (word, [count, count, ...]) once all mapper outputs are consumed.\n",
   "    \"\"\"\n",
   "    grouped = defaultdict(list)\n",
   "    for mapper_output in results_generators:\n",
   "        for word, count in mapper_output:\n",
   "            grouped[word].append(count)\n",
   "    yield from grouped.items()" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": { "hidden": true }, "outputs": [], "source": [
   "def reduce_counts(word, list_of_counts):\n",
   "    \"\"\"Reduce step: return (word, total of its per-document counts).\"\"\"\n",
   "    total = sum(list_of_counts)\n",
   "    return (word, total)" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from sklearn.datasets import fetch_20newsgroups\n",
   "dataset = fetch_20newsgroups(subset='train')\n",
   "# Keep only the first 50 documents for this example.\n",
   "documents = dataset.data[:50]" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# One map_word_count call per (index, document) pair.\n",
   "map_results = map(map_word_count, range(len(documents)), documents)" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "hidden": true }, "outputs": [], "source": [
   "shuffle_results = shuffle_words(map_results)" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": { "hidden": true }, "outputs": [], "source": [
   "# Each shuffled record is a (word, list_of_counts) pair, i.e. exactly reduce_counts' arguments.\n",
   "reduce_results = [reduce_counts(*record) for record in shuffle_results]" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": { "hidden": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
   "[('coming', 1), (\"couldn't\", 4), ('Jose,', 1), ('{As', 1), ('185c', 1)]\n",
   "5036\n" ] } ], "source": [
   "print(reduce_results[:5])\n",
   "print(len(reduce_results))" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": { "hidden": true }, "outputs": [], "source": [
   "from joblib import Parallel, delayed" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": { "hidden": true }, "outputs": [], "source": [
   "def map_word_count(document_id, document):\n",
   "    \"\"\"Map step, redefined to return a list of (word, count) pairs instead of yielding them.\n",
   "\n",
   "    `document_id` is accepted but not used by the body.\n",
   "    \"\"\"\n",
   "    tally = defaultdict(int)\n",
   "    for token in document.split():\n",
   "        tally[token] += 1\n",
   "    return list(tally.items())" ] },
 { "cell_type":
"code", "execution_count": 18, "metadata": { "hidden": true }, "outputs": [], "source": [ "map_results = Parallel(n_jobs=2)(delayed(map_word_count)(i, document)\n", " for i, document in enumerate(documents))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "hidden": true }, "outputs": [], "source": [ "shuffle_results = shuffle_words(map_results)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "hidden": true, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[('coming', [1]),\n", " (\"couldn't\", [1, 1, 1, 1]),\n", " ('Jose,', [1]),\n", " ('{As', [1]),\n", " ('185c', [1]),\n", " ('burst', [5]),\n", " ('context.', [1]),\n", " ('copy,', [1]),\n", " ('**********************************************************************',\n", " [1]),\n", " ('Modular', [1]),\n", " ('Yeah,', [1]),\n", " ('parking', [1]),\n", " ('Prices!', [1]),\n", " ('em', [1]),\n", " ('record,', [1]),\n", " ('program', [1]),\n", " ('>philosophically<', [1]),\n", " ('kind', [1, 1]),\n", " ('opinions', [2, 1, 1]),\n", " ('cubic', [1]),\n", " ('vision', [1]),\n", " ('later', [1, 1, 1]),\n", " ('$3495,', [1]),\n", " ('she', [2, 1]),\n", " ('xray@is.rice.edu', [1]),\n", " ('up', [2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 3]),\n", " ('Callison', [1]),\n", " ('v8', [1]),\n", " ('No', [6, 1]),\n", " ('disobeys', [1]),\n", " ('term?', [1]),\n", " ('login', [1]),\n", " ('Most', [1, 1, 1, 3, 1]),\n", " ('kept', [1]),\n", " ('(Repost)', [1]),\n", " ('mean', [1, 1, 1]),\n", " ('luck,', [1]),\n", " ('punisher.caltech.edu', [1]),\n", " ('nCUBE', [1]),\n", " ('result', [1]),\n", " ('Problems???', [1]),\n", " ('(I', [2, 1]),\n", " ('Grow', [1]),\n", " ('Goalie', [1]),\n", " ('Binoculars', [1]),\n", " ('boots),', [1]),\n", " ('multiple', [3]),\n", " ('At', [1, 1, 1]),\n", " ('Nearby', [1]),\n", " (\"won't-\", [1]),\n", " ('however', [1]),\n", " ('one', [1, 3, 2, 8, 1, 1, 1, 2, 5, 1, 2, 3, 1, 1]),\n", " ('Vijay', [2]),\n", " ('great.', [1, 1]),\n", " ('stuff', [1, 1, 
1]),\n", " ('problem.', [1]),\n", " ('movies', [4]),\n", " ('associated', [1]),\n", " ('continues', [1]),\n", " ('Call', [1, 1]),\n", " ('(David', [2, 1]),\n", " ('hand-cocked', [1]),\n", " ('Brewers', [1]),\n", " ('btw.', [1]),\n", " ('game,', [1]),\n", " (\">(there's\", [1]),\n", " ('boy,', [1]),\n", " ('safest', [1]),\n", " ('add', [2, 1, 1, 2, 1]),\n", " ('mos.', [2]),\n", " ('references', [1, 1]),\n", " ('Negev', [1]),\n", " ('nuclear', [1, 5]),\n", " ('stack@translab.its.uci.edu', [1]),\n", " ('thought.', [1]),\n", " ('this;', [1]),\n", " ('racers,', [1]),\n", " ('things\"', [2]),\n", " ('said,', [1, 1, 1]),\n", " ('it,', [1, 1, 1]),\n", " ('best?', [1]),\n", " (\"How's\", [1]),\n", " ('Silex', [1]),\n", " ('0-5MB/s', [1]),\n", " ('necessary)', [1]),\n", " ('\"If', [2]),\n", " ('semi-autos', [3]),\n", " ('destruction', [3]),\n", " ('saying', [1, 1]),\n", " ('it:', [1]),\n", " ('29', [1]),\n", " ('dealers', [1]),\n", " ('agrees', [1]),\n", " ('low', [1]),\n", " ('round', [1, 1, 1]),\n", " ('fulfilled', [1]),\n", " ('Clause;', [1]),\n", " ('Diskdoubler,', [1]),\n", " ('$60', [1]),\n", " ('exotic', [1]),\n", " ('Such', [1]),\n", " ('conditions,', [1]),\n", " ('tellme', [1]),\n", " ('up??', [1]),\n", " ('abarden@afseo.eglin.af.mil', [1]),\n", " ('wanted', [1]),\n", " ('Does', [1, 1]),\n", " ('annul', [1]),\n", " ('mouth', [1, 1]),\n", " ('appreciate', [1]),\n", " ('gave', [1, 1, 1]),\n", " ('Krueger)', [1]),\n", " ('less.', [2]),\n", " ('Statistics', [1]),\n", " ('anything', [1, 1, 2, 1, 1]),\n", " ('space', [2, 1]),\n", " ('Launch', [1, 1]),\n", " ('station', [1]),\n", " ('COrrado', [1]),\n", " ('troubled', [1]),\n", " ('establishes', [1]),\n", " ('MY', [1]),\n", " ('51.6', [1]),\n", " ('8.3', [1]),\n", " ('pharmacists', [1]),\n", " ('workstation.', [1]),\n", " ('160', [2]),\n", " ('Hewlett', [2]),\n", " ('Gun', [2]),\n", " ('lawyers', [1]),\n", " ('10', [1, 2, 1, 1]),\n", " ('Dwayne', [1]),\n", " ('Zoom', [1]),\n", " ('font', [4]),\n", " ('(Theodore', [1]),\n", 
" ('keith', [1]),\n", " ('(as', [1, 1, 1, 1]),\n", " ('IBM', [3, 1]),\n", " (\"Colt's\", [1]),\n", " ('corner', [1]),\n", " ('Atheists?', [1]),\n", " ('LX', [1]),\n", " ('MSFC,', [1]),\n", " ('>Excerpts', [1]),\n", " ('town', [2]),\n", " ('F-150', [1]),\n", " ('signing.', [2]),\n", " ('series', [1]),\n", " ('stalled', [1]),\n", " ('Lustig', [1]),\n", " ('bogus.', [1]),\n", " ('holes', [1, 1]),\n", " ('after', [1, 1, 4, 1, 3, 1]),\n", " ('wingless', [1]),\n", " ('checked', [1]),\n", " ('excellent', [1]),\n", " ('because', [1, 1, 1, 2, 2, 1, 2, 4, 1, 1, 1, 3]),\n", " ('(the', [1, 1, 1, 1, 1]),\n", " ('Bonilla,', [1]),\n", " ('sez;', [1]),\n", " ('WHAT', [1]),\n", " ('deterring', [1]),\n", " ('Germany,', [2]),\n", " ('old', [1, 1, 2]),\n", " ('do', [1, 2, 1, 1, 1, 6, 1, 1, 1, 2, 1, 1, 2]),\n", " ('applicable', [1]),\n", " ('1300', [1]),\n", " ('important?', [1]),\n", " ('least,', [1]),\n", " ('initially),', [1]),\n", " (\"doens't\", [1]),\n", " ('tents,', [1]),\n", " ('adam@endor.uucp', [1]),\n", " ('two', [1, 1, 1, 1, 1, 1, 1, 1, 1]),\n", " ('lists', [1]),\n", " ('(which', [1, 1]),\n", " ('T,', [2]),\n", " ('Oklahoma;', [1]),\n", " ('five', [1]),\n", " ('sound', [1]),\n", " ('optimize', [1]),\n", " ('rated', [1, 1]),\n", " ('computed', [1]),\n", " ('post', [1, 1, 1, 1, 1, 1]),\n", " ('immediately', [1, 1]),\n", " ('In', [1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1]),\n", " ('recieved', [1]),\n", " ('Maryland,', [1]),\n", " ('disregardful', [1]),\n", " ('him)', [1]),\n", " ('somewhere', [1]),\n", " ('Data', [1]),\n", " ('SSF', [9]),\n", " ('Soviet', [1]),\n", " ('Car:', [1]),\n", " ('air;', [1]),\n", " ('sky', [1]),\n", " ('tombaker@world.std.com', [1]),\n", " ('posting', [1]),\n", " ('were', [1, 1, 1, 3, 3, 3, 1, 5, 1]),\n", " ('configuration', [2]),\n", " ('>--', [1, 1, 1]),\n", " ('reduced', [1]),\n", " ('Mountain', [1]),\n", " ('Vanbiesbrouck.', [1]),\n", " ('purified', [1]),\n", " ('doubt', [1, 1, 1, 1]),\n", " ('Leafs,', [1]),\n", " 
('Funny', [1]),\n", " ('DD', [1]),\n", " ('{16-bit/wide', [1]),\n", " ('Kuo)', [1]),\n", " ('son', [3]),\n", " ('Sunday', [1]),\n", " ('means).', [1]),\n", " ('.481,', [1]),\n", " (\"you're\", [1, 1, 1, 1, 1, 1, 1]),\n", " ('>Lawrence', [1]),\n", " ('freeware', [1]),\n", " ('Loney', [1]),\n", " ('bad', [1]),\n", " ('links', [1]),\n", " ('Bonilla', [2]),\n", " ('beachball!\"', [1]),\n", " ('scares', [1]),\n", " ('care.', [1]),\n", " ('hot', [3]),\n", " ('[ssa@unity.ncsu.edu]', [1]),\n", " ('integer.', [1]),\n", " ('use.', [1]),\n", " ('Gosh..I', [1]),\n", " ('mode}:', [2]),\n", " ('motto', [1]),\n", " ('(Lemieux)', [1]),\n", " ('HoloNet', [1]),\n", " ('much', [1, 1, 1, 1, 3, 1, 1, 1, 1, 1]),\n", " ('>Folks,', [1]),\n", " ('philosophical', [1]),\n", " ('cases', [1, 1, 1]),\n", " ('who/what', [1]),\n", " ('controller', [4]),\n", " ('Insurance', [5]),\n", " ('>then', [1]),\n", " ('None', [1]),\n", " ('shaky', [1]),\n", " ('(Operator)', [1]),\n", " ('1:', [1]),\n", " ('re-claimed', [1]),\n", " ('keith@cco.caltech.edu', [1]),\n", " ('trial\",', [1]),\n", " ('calling', [1]),\n", " ('solicit', [1]),\n", " ('works', [1, 1, 1]),\n", " ('Division', [2, 1]),\n", " ('--salty', [1]),\n", " ('--', [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),\n", " ('cars.', [2]),\n", " ('Nazis),', [1]),\n", " ('leads', [1]),\n", " ('FTP', [1]),\n", " ('Foligno', [1]),\n", " ('solar', [2]),\n", " (\"You'll\", [1]),\n", " ('CIRCUIT', [1]),\n", " ('up,', [1]),\n", " ('yesterday.', [2]),\n", " ('924,', [3]),\n", " ('same,', [1]),\n", " ('launch,', [1]),\n", " ('################################################################################',\n", " [2]),\n", " ('received.', [1, 1]),\n", " ('jcm@head-cfa.harvard.edu', [1]),\n", " ('.NOT.', [1]),\n", " ('centerline', [1]),\n", " ('$80', [1]),\n", " ('Etzion,', [1]),\n", " ('increased', [1]),\n", " ('12-2', [1]),\n", " ('airbag,', [1]),\n", " ('empty', [1]),\n", " ('trial', [1]),\n", " ('children).', [1]),\n", " ('decided', [2, 1]),\n", " ('\"nigger\"', 
[1]),\n", " ('well.', [1]),\n", " ('No.', [1, 1]),\n", " ('strictly', [1]),\n", " ('technology.', [1]),\n", " ('Y', [2]),\n", " ('smith', [1]),\n", " ('helps.', [3, 1]),\n", " ('thing)', [1]),\n", " ('1000', [1, 1]),\n", " ('Army', [1]),\n", " ('respect.', [1]),\n", " (\"insurance's\", [1]),\n", " ('spec,', [1]),\n", " ('>fossil', [2]),\n", " ('56', [1]),\n", " ('stop', [2, 1]),\n", " ('Airplane', [1]),\n", " ('into', [1, 1, 1, 1, 1, 1, 1, 1, 1]),\n", " ('fancy', [1]),\n", " ('support)', [1]),\n", " ('(docking', [1]),\n", " ('citizens', [1, 1]),\n", " ('\"inappropriate\"', [1]),\n", " ('VAX/VMS', [1]),\n", " ('55;', [1]),\n", " ('debris)', [1]),\n", " ('code', [1, 1]),\n", " ('murmurs', [1]),\n", " ('own.', [1, 1]),\n", " ('>every<', [1]),\n", " ('finish', [1]),\n", " ('\"subrogation.\"', [1]),\n", " (\"haven't\", [1, 1, 1, 1]),\n", " ('hurts', [1]),\n", " ('most', [1, 1, 2, 1, 2, 1, 1]),\n", " ('unnecessarily', [1]),\n", " ('1948:', [1]),\n", " ('43', [1]),\n", " ('Focus', [1]),\n", " ('played', [1]),\n", " ('Syphers)', [1]),\n", " ('[version', [1, 1, 1]),\n", " ('specific', [1, 1]),\n", " ('death.', [1]),\n", " ('on.', [1, 1, 1]),\n", " ('cryptology;', [1]),\n", " ('R_Tim_Coslet@cup.portal.', [1]),\n", " ('(Portable', [1]),\n", " ('bgrubb@dante.nmsu.edu', [2]),\n", " ('brings', [1]),\n", " ('Nagle', [1]),\n", " ('way.', [1, 1, 1]),\n", " ('YEARS.', [1]),\n", " ('there?', [1]),\n", " ('fault,', [1]),\n", " ('Lockheed', [2]),\n", " ('SO', [1]),\n", " ('Lloyd', [1]),\n", " ('Fox)', [1]),\n", " ('intention', [1]),\n", " ('expressing', [1]),\n", " ('abarden@tybse1.uucp', [1]),\n", " ('magnification', [1]),\n", " ('managers', [1]),\n", " ('preferably', [1]),\n", " ('crook,', [1]),\n", " ('knowledge', [1]),\n", " ('Stac', [2]),\n", " ('too)', [1]),\n", " ('via', [1]),\n", " ('Redesign', [1]),\n", " ('policy', [3]),\n", " ('', [1]),\n", " ('missions', [1]),\n", " ('\"little', [2]),\n", " ('story', [1]),\n", " ('practice.', [1]),\n", " ('gripe', [1]),\n", " ('important.', 
[1]),\n", " ('why', [1, 1, 1, 1, 1, 2]),\n", " ('EVA', [1]),\n", " ('Jonathan_Hayward@wheaton.edu', [1]),\n", " ('law', [1]),\n", " ('Division,', [1]),\n", " ('spacify', [1]),\n", " ('SS10', [1]),\n", " ('younger,', [1]),\n", " ('bugs,', [1]),\n", " ('frost', [1]),\n", " ('turn', [1]),\n", " ('shooting', [1]),\n", " ('subjectiveness.', [1]),\n", " ('Yassin', [1]),\n", " ('grossly', [1]),\n", " ('Israel', [2]),\n", " ('(Hope', [1]),\n", " ('corner...', [1]),\n", " ('foot', [1]),\n", " ('theft', [2]),\n", " ('Allstate.', [1]),\n", " ('Disclaimer:', [1]),\n", " ('P.S.', [1]),\n", " ('big', [1, 1, 1]),\n", " ('News-Software:', [1]),\n", " ('are',\n", " [1, 2, 2, 1, 2, 4, 8, 1, 2, 4, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 8, 2, 3]),\n", " ('jammed!\"', [1]),\n", " ('society.', [1]),\n", " ('course),', [1]),\n", " ('concrete', [1]),\n", " ('BeHanna)', [1]),\n", " ('1200', [1]),\n", " ('distinguish', [1]),\n", " ('THERMOCOUPLE', [1]),\n", " ('cage).', [1]),\n", " ('voice', [2, 1]),\n", " ('no.', [1]),\n", " ('autumn', [1]),\n", " ('(brian', [1]),\n", " ('undoubtably', [1]),\n", " ('Ken,', [1]),\n", " ('>>>', [3]),\n", " (\"weren't\", [1]),\n", " ('carson.u.washington.edu', [1]),\n", " ('FASTER', [1]),\n", " ('off', [1, 1, 4, 1, 2, 1]),\n", " ('inquiry', [1]),\n", " ('Article-I.D.:', [1, 1]),\n", " ('difference', [1]),\n", " ('ambient.', [1]),\n", " ('head', [1]),\n", " ('<1r466c$an3@news.intercon.com>', [1]),\n", " ('do.', [1]),\n", " ('Marks)', [1]),\n", " ('pay', [4, 2]),\n", " ('controler', [1]),\n", " ('Flyers.', [1]),\n", " ('our', [1]),\n", " ('Plesac', [1]),\n", " ('turning,', [1]),\n", " ('boundaries', [1]),\n", " ('>removing', [1]),\n", " ('existing', [3]),\n", " ('malnutrition', [1]),\n", " ('8-bit', [1]),\n", " ('require', [1]),\n", " ('>contamination.', [1]),\n", " ('play', [1, 1]),\n", " ('Shea.', [1]),\n", " ('{With', [1]),\n", " ('Card)', [1]),\n", " ('lefties', [2]),\n", " (\"'stay'\", [1]),\n", " ('mask', [3]),\n", " ('Cerkoney)', [1]),\n", " ('flow', [1]),\n", 
" ('unless', [1, 1, 1, 1]),\n", " ('Kyanko', [1]),\n", " ('>capacity', [1]),\n", " ('Station.', [1]),\n", " ('\"Behind', [1]),\n", " ('econoboxes', [1]),\n", " ('fighting', [3]),\n", " ('Wisconsin', [1]),\n", " ('successful.', [1]),\n", " ('bill.', [1]),\n", " ('entry', [2]),\n", " ('torn', [1]),\n", " ('Great!', [1]),\n", " ('stands', [1]),\n", " ('said', [1, 1, 1]),\n", " ('Yale', [1]),\n", " ('new', [1, 1, 1, 1, 1]),\n", " ('Studies,', [1]),\n", " ('4', [2, 1]),\n", " ('Rather', [1]),\n", " ('generation,', [1]),\n", " ('Tape.Tape', [1]),\n", " ('faster', [2, 2, 1]),\n", " ('Treatment', [1]),\n", " ('kmr4@po.CWRU.edu', [1]),\n", " (\">That's\", [1, 1]),\n", " ('$1,000,000', [1]),\n", " ('yrs', [1]),\n", " ('rotate....', [1]),\n", " ('Subscriber', [1]),\n", " ('boilers.', [1]),\n", " ('HELP', [1]),\n", " ('@', [1, 1]),\n", " ('joplin.biosci.arizona.edu', [1]),\n", " ('left', [1, 1, 1, 2]),\n", " ('---', [1, 1]),\n", " ('top', [1, 1]),\n", " ('chalk', [1]),\n", " ('TIFF,', [1]),\n", " ('roughly', [1]),\n", " ('18', [1]),\n", " ('dubing,', [1]),\n", " ('who', [1, 2, 1, 1, 1, 1, 2, 2, 1, 5, 1, 1, 2, 1]),\n", " ('bronze/brown/orange', [1]),\n", " ('$2,500.', [1]),\n", " ('MOVIES', [1]),\n", " ('Even', [1, 1, 1]),\n", " ('$500', [3]),\n", " ('how', [2, 1, 1, 1, 1, 1]),\n", " ('versus', [1]),\n", " ('developed', [2]),\n", " ('(full-cover,', [1]),\n", " ('out.', [1]),\n", " ('Presence', [2]),\n", " ('put', [1, 1, 1, 1]),\n", " ('questions', [1]),\n", " ('three-on-three', [1]),\n", " ('environments', [1]),\n", " ('[via', [1]),\n", " ('Naval', [1]),\n", " ('Vincint', [1]),\n", " ('shelley.1qvfo9INNc3s', [1]),\n", " ('6th', [1]),\n", " ('tended', [1]),\n", " ('Zealand', [1]),\n", " ('smithsonian', [1]),\n", " ('here', [1, 1]),\n", " ('>deductible,', [1]),\n", " ('mind', [1, 1]),\n", " (')>>', [7]),\n", " ('job', [1, 1]),\n", " ('Mudd', [1]),\n", " ('NYT', [1]),\n", " ('utilize', [1]),\n", " ('unlikely', [1]),\n", " ('stpl.ists.ca', [1]),\n", " ('now),', [1]),\n", " 
('nose.', [1]),\n", " ('Sabres', [1]),\n", " ('protest', [1]),\n", " ('this?', [1]),\n", " ('reach', [1]),\n", " ('xandor@unixg.ubc.ca', [1]),\n", " ('money', [1, 1, 1, 1]),\n", " ('\"B\"', [2]),\n", " ('raised', [1]),\n", " ('Piaget)', [1]),\n", " ('Player,', [1]),\n", " (\"there's\", [1]),\n", " ('AT&T', [1]),\n", " ('Albert', [1]),\n", " ('rifles.', [1]),\n", " ('chin', [1]),\n", " ('Jaha', [1]),\n", " (\"I've\", [3, 1, 1, 3, 1, 1, 4]),\n", " ('standardized,', [1]),\n", " ('Email:', [1, 1]),\n", " ('Cipale)', [1]),\n", " ('message', [1, 1]),\n", " ('guest)', [1]),\n", " ('woke', [1]),\n", " ('memory.', [1]),\n", " ('$24', [1]),\n", " ('(Eli', [1]),\n", " ('child', [6]),\n", " ('under/into.', [1]),\n", " ('humor.\"', [1]),\n", " ('enough).', [1]),\n", " ('Kerr)', [1]),\n", " ('(Amanda', [1]),\n", " ('Illinois/Urbana', [1]),\n", " ('appearance.', [1]),\n", " ('stable', [1]),\n", " ('Nietzsche', [1]),\n", " ('World', [1]),\n", " ('attacking.', [1]),\n", " ('tapped.', [1]),\n", " ('enough,', [1, 1]),\n", " ('berthing', [1]),\n", " ('chamber', [3]),\n", " ('scuffling?', [1]),\n", " ('Appendix', [1]),\n", " ('captains', [1]),\n", " ('>originally', [1]),\n", " ('missing', [1, 1]),\n", " ('is,', [1]),\n", " ('ST', [1]),\n", " ('heard', [3, 1, 1]),\n", " ('thought,', [1]),\n", " ('1900', [1]),\n", " ('Tucson', [1]),\n", " ('Corp.,', [1]),\n", " ('<26', [1]),\n", " ('flags', [1]),\n", " ('interested,', [1]),\n", " ('downright', [1]),\n", " ('sucked.', [1]),\n", " ('semi-autos.', [1]),\n", " ('unknowable,', [1]),\n", " ('$520/6', [1]),\n", " ('advantage).', [1]),\n", " ('around', [2, 1, 1, 1, 1]),\n", " ('on-board.', [1]),\n", " ('info:', [1]),\n", " ('PA', [2, 1]),\n", " ('wierd', [1]),\n", " ('ssa@unity.ncsu.edu', [1]),\n", " (\"(he'd\", [1]),\n", " ('1970', [2]),\n", " ('high', [1, 2, 2]),\n", " ('98%', [1]),\n", " ('<1993Apr20.151818.4319@samba.oit.unc.edu>', [1]),\n", " ('someone', [1, 1]),\n", " ('better).', [1]),\n", " (\"wanna-be's),\", [1]),\n", " ('other', [1, 1, 
1, 1, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1]),\n", " ('final', [1]),\n", " ('modular', [1]),\n", " ('modern', [2, 1]),\n", " ('cradle', [1]),\n", " ('distance!).', [1]),\n", " ('>swashbuckling', [1]),\n", " ('11', [1, 1, 1]),\n", " ('equal', [1]),\n", " ('stolen', [1]),\n", " ('image,', [1, 2]),\n", " ('group', [1]),\n", " ('occur', [1]),\n", " ('>SCSI-I', [1]),\n", " ('propulsion', [1]),\n", " ('Runs', [1]),\n", " ('salesman', [1]),\n", " ('simply', [1]),\n", " ('doubts', [1]),\n", " ('battle', [1]),\n", " ('Front', [1]),\n", " ('failures', [1]),\n", " ('aside,', [1, 1]),\n", " ('thing\"', [1]),\n", " ('shuttle', [2]),\n", " ('granite', [1]),\n", " ('admin', [1]),\n", " ('time.', [1, 1]),\n", " ('attack', [1, 1]),\n", " ('service', [1, 1]),\n", " ('glad', [1]),\n", " ('learns', [1]),\n", " ('accidentally', [1]),\n", " ('336-9591', [1]),\n", " ('summer\"', [1]),\n", " ('LX.)', [1]),\n", " ('intended', [1, 1]),\n", " ('came', [1, 1]),\n", " ('dirtbike', [1]),\n", " ('first', [2, 1, 3]),\n", " ('there)}', [1]),\n", " ('own', [1, 1, 1, 1, 1, 1]),\n", " ('Sundheim)', [1]),\n", " ('And,', [1]),\n", " ('monitor.', [1]),\n", " ('middle', [1, 2]),\n", " ('jap', [1]),\n", " ('Remote', [1]),\n", " ('fault', [1]),\n", " ('summary', [1, 2, 1, 1]),\n", " ('helpful', [1]),\n", " ('man', [1, 1]),\n", " ('rarely', [1]),\n", " ('floppy..BURN', [1]),\n", " (\"ERA's\", [1]),\n", " ('shell', [1]),\n", " ('>Oddly,', [1]),\n", " ('>machines\".', [1]),\n", " ('(Abraham', [1]),\n", " ('do\"', [1]),\n", " ('key', [1, 1]),\n", " ('range', [1]),\n", " ('also),', [1]),\n", " ('with:', [1]),\n", " ('prepared', [1]),\n", " ('(919)467-7909', [1]),\n", " ('expelled?', [1]),\n", " ('cool', [1]),\n", " ('towing,', [1]),\n", " ('signal', [1]),\n", " ('liked.', [1]),\n", " ('knowledge.', [1]),\n", " ('user', [1]),\n", " ('game.', [1]),\n", " ('improves', [1]),\n", " ('necessarily', [1]),\n", " ('Then', [1]),\n", " ('happy', [1]),\n", " ('use', [1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1]),\n", " ('L', [1]),\n", " 
('2:', [1]),\n", " ('too', [2]),\n", " ('rac3.wam.umd.edu', [1]),\n", " ('$80/year', [1]),\n", " ('tech', [1]),\n", " ('time', [1, 1, 1, 1, 2, 1]),\n", " ('bangkok', [1]),\n", " ('3)Monolux', [1]),\n", " ('ADC', [1]),\n", " ('known', [2, 2]),\n", " ('am\"', [1]),\n", " ('lesson', [1]),\n", " ('ranks', [1]),\n", " ('129.89.7.4', [1]),\n", " ('>If', [1]),\n", " ('However,', [1, 1, 1, 1, 1, 2]),\n", " ('whom', [1]),\n", " ('before.', [1]),\n", " ('lacked', [1]),\n", " ('image\"', [1]),\n", " ('box', [1]),\n", " ('weapons', [7]),\n", " ('tolerance', [1]),\n", " ('effectively', [1]),\n", " (\">'Cooling\", [1]),\n", " ('constitute', [1]),\n", " ('want', [1, 1, 1, 2, 1, 1, 1, 2, 1]),\n", " ('relationship', [1]),\n", " ('fixable.', [1]),\n", " ('(this', [1]),\n", " ('on)', [1]),\n", " ('considers', [1]),\n", " ('back', [1, 1, 1, 1, 1]),\n", " ('worn', [1]),\n", " ('year', [1, 6, 1]),\n", " ('accidents...', [1]),\n", " ('(Foxvog', [2]),\n", " ('shop', [1]),\n", " ('$3K.', [1]),\n", " ('Treat', [1]),\n", " ('being', [2, 1, 2, 1, 1, 2, 1]),\n", " ('defensive', [2]),\n", " ('>BZZZT!', [1]),\n", " ('future', [2, 1]),\n", " ('YOURS.', [1]),\n", " ('driving', [5]),\n", " ('dogs.', [1]),\n", " ('mode', [1, 2]),\n", " ('Voic', [1]),\n", " ('dirt', [1]),\n", " (\"let's\", [1]),\n", " ('phone,', [1]),\n", " ('competition.)', [1]),\n", " ('serious.', [1]),\n", " ('Original-Sender:', [1]),\n", " ('>>long', [1]),\n", " ('(James', [1, 1]),\n", " ('starting', [1, 2]),\n", " ('turbo', [1]),\n", " ('exception', [1]),\n", " ('restraint', [1]),\n", " ('foxvog', [1]),\n", " ('Man-Tended', [1]),\n", " ('(Assured', [1]),\n", " ('40MB/s', [1]),\n", " ('Sci,', [1]),\n", " ('humanity.', [1]),\n", " ('her,', [1]),\n", " ('Eli', [1]),\n", " ('21', [2, 1, 1]),\n", " ('pack', [2, 1]),\n", " ('subtly', [1]),\n", " ('Vaive', [1]),\n", " ('USL', [1]),\n", " ('swear,', [1]),\n", " ('qualified', [1]),\n", " ('nothing', [1]),\n", " ('mass', [7]),\n", " ('chip}', [1]),\n", " ('saw,', [1]),\n", " ('info', [1, 
1, 1, 1, 1, 1]),\n", " ('krueger@helium.gas.uug.arizona.edu', [1]),\n", " ('Management.', [1]),\n", " ('P9000', [2]),\n", " ('Communications,', [1]),\n", " ('citizenship.', [1]),\n", " ('straight,', [1]),\n", " ('Well', [1]),\n", " ('Attacks', [1]),\n", " ('Hebron,', [1]),\n", " ('Still,', [1]),\n", " ('\"those', [1]),\n", " ('WFAN', [1]),\n", " ('$2000', [2]),\n", " (\"Vanbiesbrouck's\", [1]),\n", " ('propulsion,', [1]),\n", " ('(harleys,', [1]),\n", " ('factor', [1, 1]),\n", " ('jaskew@spam.maths.adelaide.edu.au', [1]),\n", " ('PCs,', [1]),\n", " ('Stolen?', [1]),\n", " ('\"Convictions', [1]),\n", " ('two.', [1]),\n", " ('\"It\\'s', [1, 1]),\n", " ('pub', [1]),\n", " ('lictor.acsu.buffalo.edu', [1]),\n", " ('mention', [1, 1]),\n", " ('be...', [1]),\n", " ('Celica', [4]),\n", " ('weak-encryption', [1]),\n", " ('Mellon,', [1]),\n", " ('feel', [1, 1]),\n", " ('they', [2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1]),\n", " ('IT...I', [1]),\n", " ('neat', [1]),\n", " ('till', [1, 1]),\n", " ('Air', [1]),\n", " (\"team's\", [1]),\n", " ('Script', [1]),\n", " ('Brewer', [1]),\n", " ('Hair', [1]),\n", " ('postseason,', [1]),\n", " ('tickets', [1]),\n", " ('VOICE', [1]),\n", " ('ordering', [1]),\n", " ('nodes', [1]),\n", " ('especially', [1]),\n", " ('X', [1]),\n", " ('industry', [1]),\n", " ('R', [1, 1]),\n", " ('/~~\\\\', [1]),\n", " ('Thanks,', [1, 1, 1]),\n", " ('email,', [1, 1]),\n", " ('Canopies', [1]),\n", " ('blow', [1]),\n", " ('Auto', [1]),\n", " ('\"A\"', [2]),\n", " ('GREAT!),', [1]),\n", " ('>argue', [1]),\n", " ('suspending', [1]),\n", " ('>sold', [1]),\n", " ('writing', [1]),\n", " ('120,', [1]),\n", " ('|/', [1]),\n", " ('1000yds.', [1]),\n", " ('61', [1]),\n", " ('bag!\"', [1]),\n", " ('1.1', [1, 1, 1]),\n", " ('escapes', [1]),\n", " ('lies.\"', [1]),\n", " ('Common', [1]),\n", " ('good?).', [1]),\n", " ('_|/_', [1]),\n", " ('>City,', [1]),\n", " ('Controlled', [1]),\n", " ('probably', [1, 1, 1, 1, 1, 1, 1, 1, 1]),\n", " ('tm0006.lerc.nasa.gov', 
[1]),\n", " ('interpretation', [2]),\n", " ('that.', [1]),\n", " ('parent.', [1]),\n", " ('war?', [1]),\n", " ('steam', [3]),\n", " ('16-Apr-93', [1]),\n", " ('Sharon).', [1]),\n", " ('implementation', [1]),\n", " ('thought', [1, 1]),\n", " ('System:', [1]),\n", " (\"we've\", [1]),\n", " ('tend', [1]),\n", " ('Talon,', [1]),\n", " ('lucky.', [1]),\n", " ('deduction)', [1]),\n", " ('Looking', [1, 1]),\n", " ('directly', [1]),\n", " ('shows', [1]),\n", " ('what', [2, 2, 4, 2, 3, 1, 2, 1, 1, 2, 2]),\n", " ('right,', [1]),\n", " ('Jesus', [1]),\n", " ('semi', [6]),\n", " ('destruction?', [1]),\n", " ('NL', [2, 1]),\n", " ('his', [3, 1, 1, 3]),\n", " ('Sad,', [1]),\n", " ('system', [1, 1, 1, 1, 1, 1]),\n", " ('hammer', [4]),\n", " ('{120%', [1]),\n", " ('sea.', [1]),\n", " ('r4938585@joplin.biosci.arizona.edu', [1]),\n", " ('/2', [1]),\n", " ('error', [1]),\n", " ('mail-bouncing', [1]),\n", " ('FL', [1]),\n", " ('comes.', [1]),\n", " ('8th', [2]),\n", " ('restrict', [1]),\n", " ('beyond', [1]),\n", " ('Enhancements', [1]),\n", " ('9', [1]),\n", " ('forsale', [1]),\n", " ('Main', [1]),\n", " ('see?', [1]),\n", " ('conclusion.', [1]),\n", " ('yesterday', [1]),\n", " ('runs', [1, 2]),\n", " ('cold', [2]),\n", " ('problem,', [1]),\n", " ('acquisition/control,', [1]),\n", " (\">Site's\", [1]),\n", " ('accessories', [1]),\n", " ('paying', [3]),\n", " ('MIT', [1]),\n", " ('write', [1, 1]),\n", " ('>You', [1]),\n", " ('(Charles', [1]),\n", " ('keeping', [1, 1, 1]),\n", " ('1991', [1, 1]),\n", " ('Nodine)', [1]),\n", " ('wrong,', [1]),\n", " ('male,', [1]),\n", " ('side', [1, 1]),\n", " ('po4.andrew.cmu.edu', [1]),\n", " ('adresses', [1]),\n", " ('Matic', [1]),\n", " ('chip', [4]),\n", " ('year,', [1, 3, 1]),\n", " ('seige,', [1]),\n", " ('fixed', [1]),\n", " ('uprising', [1]),\n", " ('924.', [1]),\n", " ('possible.', [1, 1, 1]),\n", " ('accepted,', [1]),\n", " ('count', [1]),\n", " ('products', [1]),\n", " ('cancelled.', [1]),\n", " ('is.', [1]),\n", " ('edit,', [1]),\n", " 
('pick', [1, 1]),\n", " ('Home', [1]),\n", " ('external', [3]),\n", " ('0', [2]),\n", " ('obviously,', [1]),\n", " ('thinking', [1, 1, 1, 1]),\n", " ('unknowable.', [1]),\n", " ('Einstein=======', [1]),\n", " ('self', [2]),\n", " ('import', [1]),\n", " ('holmes7000@iscsvax.uni.edu', [1]),\n", " ('certainly', [1]),\n", " ('Times', [1]),\n", " ('fed', [1]),\n", " (\"'new\", [1]),\n", " ('Brady', [1]),\n", " ('(Adam', [1]),\n", " ('somwhere,', [1]),\n", " ('vs.', [1]),\n", " ('=======>', [1]),\n", " ('60', [1, 2]),\n", " ('DISCLAIMER:', [1]),\n", " ('facets', [1]),\n", " ('slc10.ins.cwru.edu', [1]),\n", " ('Kerr', [1]),\n", " ('1990', [1]),\n", " (\"Investors'\", [1]),\n", " ('containing', [1]),\n", " ('drugs,', [1, 1]),\n", " ('>federal', [1]),\n", " ('made', [2, 1]),\n", " ('>cover', [1]),\n", " ('Stack', [1]),\n", " ('tific', [1]),\n", " ('80Mb', [1]),\n", " ('Student', [1]),\n", " ('priorities.', [1]),\n", " ('10MB/s', [4]),\n", " ('Urbana', [1]),\n", " ('anti', [1]),\n", " ('24.', [1]),\n", " ('\"Although', [2]),\n", " ('>>20%', [1]),\n", " ('Viola', [3]),\n", " ('fell', [1]),\n", " ('tg@cs.toronto.edu', [1]),\n", " ('now,', [1]),\n", " ('36', [1, 1]),\n", " ('ask', [1, 1, 1]),\n", " (\"A's\", [1]),\n", " ('A,', [1]),\n", " ('DSO,', [1]),\n", " ('here,', [1]),\n", " ('criticizing', [1]),\n", " ('open', [1]),\n", " ('showing', [1]),\n", " ('too!)', [1]),\n", " ('time)', [1]),\n", " ('gets', [1, 1, 1, 1]),\n", " ('before,', [1]),\n", " ('William', [1]),\n", " ('honk', [1]),\n", " ('VTT', [1]),\n", " ('Permanent', [2]),\n", " ('power.', [2, 1]),\n", " ('\"God,', [1]),\n", " ('entity,', [1]),\n", " ('writers', [1, 1]),\n", " ('legitimate', [1]),\n", " ('motion,', [1]),\n", " ('that.\"', [1]),\n", " ('>...what', [1]),\n", " ('MORE', [1]),\n", " ('dual', [1]),\n", " ('owned', [1]),\n", " ('respective', [1]),\n", " (\"driver's\", [1]),\n", " ('(4/23)', [1]),\n", " ('DoD#', [1]),\n", " ('here?', [1]),\n", " ('SCSi-2', [1]),\n", " ('used', [3, 1, 1, 1, 1, 1]),\n", " 
('outside', [1, 1]),\n", " ('(then', [1]),\n", " ('turning', [1]),\n", " ('0-5MB/s.', [1]),\n", " ('>water.', [1]),\n", " ('interest.', [1]),\n", " ('Askew)', [1]),\n", " ('maybe', [1, 1, 1, 1, 1]),\n", " ...]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(shuffle_results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "hidden": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "heading_collapsed": true }, "source": [ "# NB Predict" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "hidden": true }, "outputs": [], "source": [ "import os\n", "import re\n", "import numpy as np\n", "from collections import defaultdict\n", "from operator import itemgetter" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "hidden": true }, "outputs": [], "source": [ "word_search_re = re.compile(r\"[\\w']+\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "hidden": true }, "outputs": [], "source": [ "def load_model(model_filename):\n", " model = defaultdict(lambda: defaultdict(float))\n", " with open(model_filename) as inf:\n", " for line in inf:\n", " word, values = line.split(maxsplit=1)\n", " word = eval(word)\n", " values = eval(values)\n", " model[word] = values\n", " return model" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "hidden": true }, "outputs": [], "source": [ "model_filename = os.path.join(os.path.expanduser(\"~\"), \"models\", \"part-00000\")\n", "model = load_model(model_filename)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "hidden": true }, "outputs": [ { "data": { "text/plain": [ "(409.7987003114851, 513.3231594734408)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model[\"i\"][\"male\"], model[\"i\"][\"female\"]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "hidden": true }, "outputs": [], 
"source": [ "def nb_predict(model, document):\n", " words = word_search_re.findall(document)\n", " probabilities = defaultdict(lambda : 0)\n", " for word in set(words):\n", " probabilities[\"male\"] += np.log(model[word].get(\"male\", 1e-5))\n", " probabilities[\"female\"] += np.log(model[word].get(\"female\", 1e-5))\n", " # Now find the most likely gender\n", " most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)\n", " return most_likely_genders[0][0]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "hidden": true }, "outputs": [], "source": [ "new_post = \"\"\" Every day should be a half day. Took the afternoon off to hit the dentist, and while I was out I managed to get my oil changed, too. Remember that business with my car dealership this winter? Well, consider this the epilogue. The friendly fellas at the Valvoline Instant Oil Change on Snelling were nice enough to notice that my dipstick was broken, and the metal piece was too far down in its little dipstick tube to pull out. Looks like I'm going to need a magnet. Damn you, Kline Nissan, daaaaaaammmnnn yooouuuu.... Today I let my boss know that I've submitted my Corps application. The news has been greeted by everyone in the company with a level of enthusiasm that really floors me. The back deck has finally been cleared off by the construction company working on the place. This company, for anyone who's interested, consists mainly of one guy who spends his days cursing at his crew of Spanish-speaking laborers. 
def nb_predict_many(model, input_filename):
    """Yield ``(actual_gender, prediction)`` for each post in a labelled file.

    Every non-blank line must start with a Python literal for the true gender
    label, followed by a string literal holding the blog post.  The remaining
    whitespace-separated tokens are re-joined with single spaces before being
    parsed, so runs of whitespace inside the line are collapsed.

    Parameters
    ----------
    model : mapping
        Model passed through unchanged to ``nb_predict``.
    input_filename : str
        Path of the labelled file to read.

    Yields
    ------
    tuple
        ``(actual_gender, nb_predict(model, blog_post))``.

    Raises
    ------
    ValueError, SyntaxError
        If a field on a non-blank line is not a valid Python literal.
    """
    # Local import: ``ast`` is not part of the notebook's import cell.
    from ast import literal_eval

    with open(input_filename) as inf:
        for line in inf:
            # split() with no separator also drops leading/trailing whitespace
            tokens = line.split()
            if not tokens:
                # blank line: nothing to classify
                continue
            # literal_eval only accepts literals, so the data file cannot
            # execute code the way eval() would let it.
            actual_gender = literal_eval(tokens[0])
            blog_post = literal_eval(" ".join(tokens[1:]))
            yield actual_gender, nb_predict(model, blog_post)


def nb_predict(model, document, smoothing=1e-15):
    """Rank the gender labels for a text by summed log-value, best first.

    NOTE: an earlier cell defines ``nb_predict`` returning only the winning
    label; this later definition shadows it and returns the whole ranking.

    The document is tokenised with the module-level ``word_search_re``.  Each
    distinct token contributes ``log(value)`` once per gender, where the value
    is read from ``model[token][gender]`` and falls back to ``smoothing`` when
    the token or the gender is missing from the model.

    Parameters
    ----------
    model : mapping
        Maps a word to a mapping of gender label -> positive number.
    document : str
        Text to classify.
    smoothing : float, optional
        Value used for a word/gender pair the model has no entry for.

    Returns
    -------
    list of (str, float)
        ``(gender, score)`` pairs sorted by descending score.  The list is
        empty when the document contains no tokens.
    """
    words = word_search_re.findall(document)
    # Log scores accumulate from 0; starting at 1 added a meaningless offset
    # to both genders (the ranking itself is unaffected).
    probabilities = defaultdict(float)
    # model.get() avoids inserting an empty entry into a defaultdict model for
    # every unseen word, and also makes plain-dict models usable.
    unseen = {}
    for word in set(words):
        word_values = model.get(word, unseen)
        probabilities["male"] += np.log(word_values.get("male", smoothing))
        probabilities["female"] += np.log(word_values.get("female", smoothing))
    # Now find the most likely gender
    most_likely_genders = sorted(probabilities.items(), key=itemgetter(1), reverse=True)
    return most_likely_genders
# Evaluate the AWS-trained model on a capped sample of the testing posts.
# Collection stops once more than this many posts have been scored.
MAX_EVAL_SAMPLES = 500

y_true = []
y_pred = []
for testing_filename in testing_filenames:
    # nb_predict_many yields (actual label, ranked predictions).  Unpack the
    # ranking as `ratios` here: previously it was unpacked into
    # `predicted_gender` and then overwritten with a stale `ratios` left over
    # from the previous cell, so every post received the same prediction.
    for actual_gender, ratios in nb_predict_many(aws_model, testing_filename):
        predicted_gender = ratios[0][0]
        y_true.append(actual_gender == "female")
        y_pred.append(predicted_gender == "female")
        if len(y_true) > MAX_EVAL_SAMPLES:
            break
    else:
        # Inner loop finished without hitting the cap: go on to the next file.
        continue
    # Cap reached: leave the outer loop too.  A bare inner `break` kept
    # scanning every remaining file and added one more sample from each.
    break
y_true = np.array(y_true, dtype='int')
y_pred = np.array(y_pred, dtype='int')
# Collect the text of every <post>...</post> element in the blog file.
# The opening and closing tags are each expected on a line of their own.
all_posts = []
with open(filename) as inf:
    post_start = False  # True while we are between <post> and </post>
    post = []           # lines of the post currently being read
    for line in inf:
        # remove leading and trailing whitespace
        line = line.strip()
        # Both comparisons used to test against "" (the tag text had been
        # lost), which made the closing branch unreachable and stored no posts.
        if line == "<post>":
            post_start = True
        elif line == "</post>":
            post_start = False
            all_posts.append("\n".join(post))
            post = []
        elif post_start:
            post.append(line)
{}, "output_type": "execute_result" } ], "source": [ "len(all_posts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "hidden": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "colors": { "hover_highlight": "#DAA520", "navigate_num": "#000000", "navigate_text": "#333333", "running_highlight": "#FF0000", "selected_highlight": "#FFD700", "sidebar_border": "#EEEEEE", "wrapper_background": "#FFFFFF" }, "moveMenuLeft": true, "nav_menu": { "height": "81px", "width": "253px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 2 }