{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Document retrieval from Wikipedia data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fire up GraphLab Create\n", "(See [Getting Started with SFrames](../Week%201/Getting%20Started%20with%20SFrames.ipynb) for setup instructions)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Bind both names: later cells use the full `graphlab.` prefix as well as the `gl.` alias.\n", "import graphlab\n", "import graphlab as gl" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.\n", "graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load some text data - from Wikipedia, pages on people" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [], "source": [ "people = gl.SFrame('people_wiki.gl/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Data contains: link to Wikipedia article, name of person, text of article." ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametext
<http://dbpedia.org/resou
rce/Digby_Morrell> ...
Digby Morrelldigby morrell born 10
october 1979 is a former ...
<http://dbpedia.org/resou
rce/Alfred_J._Lewy> ...
Alfred J. Lewyalfred j lewy aka sandy
lewy graduated from ...
<http://dbpedia.org/resou
rce/Harpdog_Brown> ...
Harpdog Brownharpdog brown is a singer
and harmonica player who ...
<http://dbpedia.org/resou
rce/Franz_Rottensteiner> ...
Franz Rottensteinerfranz rottensteiner born
in waidmannsfeld lower ...
<http://dbpedia.org/resou
rce/G-Enka> ...
G-Enkahenry krvits born 30
december 1974 in tallinn ...
<http://dbpedia.org/resou
rce/Sam_Henderson> ...
Sam Hendersonsam henderson born
october 18 1969 is an ...
<http://dbpedia.org/resou
rce/Aaron_LaCrate> ...
Aaron LaCrateaaron lacrate is an
american music producer ...
<http://dbpedia.org/resou
rce/Trevor_Ferguson> ...
Trevor Fergusontrevor ferguson aka john
farrow born 11 november ...
<http://dbpedia.org/resou
rce/Grant_Nelson> ...
Grant Nelsongrant nelson born 27
april 1971 in london ...
<http://dbpedia.org/resou
rce/Cathy_Caruth> ...
Cathy Caruthcathy caruth born 1955 is
frank h t rhodes ...
\n", "[10 rows x 3 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------------------------+---------------------+\n", "| URI | name |\n", "+-------------------------------+---------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametext
<http://dbpedia.org/resou
rce/Barack_Obama> ...
Barack Obamabarack hussein obama ii
brk husen bm born august ...
\n", "[? rows x 3 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.\n", "" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\n", "Rows: Unknown\n", "\n", "Data:\n", "+-------------------------------+--------------+-------------------------------+\n", "| URI | name | text |\n", "+-------------------------------+--------------+-------------------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Barack_Obama> ...
Barack Obamabarack hussein obama ii
brk husen bm born august ...
{'operations': 1,
'represent': 1, 'offi ...
\n", "[1 rows x 4 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\n", "Rows: 1\n", "\n", "Data:\n", "+-------------------------------+--------------+-------------------------------+\n", "| URI | name | text |\n", "+-------------------------------+--------------+-------------------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
the40
in30
and21
of18
to14
his11
obama9
act8
he7
a7
\n", "[10 rows x 2 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\tcount\tint\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------+-------+\n", "| word | count |\n", "+-------+-------+\n", "| the | 40 |\n", "| in | 30 |\n", "| and | 21 |\n", "| of | 18 |\n", "| to | 14 |\n", "| his | 11 |\n", "| obama | 9 |\n", "| act | 8 |\n", "| he | 7 |\n", "| a | 7 |\n", "+-------+-------+\n", "[10 rows x 2 columns]" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama_word_count_table.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
the40
in30
and21
of18
to14
his11
obama9
act8
he7
a7
\n", "[273 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", "
" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\tcount\tint\n", "\n", "Rows: 273\n", "\n", "Data:\n", "+-------+-------+\n", "| word | count |\n", "+-------+-------+\n", "| the | 40 |\n", "| in | 30 |\n", "| and | 21 |\n", "| of | 18 |\n", "| to | 14 |\n", "| his | 11 |\n", "| obama | 9 |\n", "| act | 8 |\n", "| he | 7 |\n", "| a | 7 |\n", "+-------+-------+\n", "[273 rows x 2 columns]\n", "Note: Only the head of the SFrame is printed.\n", "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama_word_count_table.sort('count',ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most common words include uninformative words like \"the\", \"in\", \"and\",..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compute TF-IDF for the corpus \n", "\n", "To give more weight to informative words, we weigh them by their TF-IDF scores." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Have to first count all words in the Corpus using `count_words`" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Digby_Morrell> ...
Digby Morrelldigby morrell born 10
october 1979 is a former ...
{'selection': 1,
'carltons': 1, 'being': ...
<http://dbpedia.org/resou
rce/Alfred_J._Lewy> ...
Alfred J. Lewyalfred j lewy aka sandy
lewy graduated from ...
{'precise': 1, 'thomas':
1, 'closely': 1, ...
<http://dbpedia.org/resou
rce/Harpdog_Brown> ...
Harpdog Brownharpdog brown is a singer
and harmonica player who ...
{'just': 1, 'issued': 1,
'mainly': 1, 'nominat ...
<http://dbpedia.org/resou
rce/Franz_Rottensteiner> ...
Franz Rottensteinerfranz rottensteiner born
in waidmannsfeld lower ...
{'all': 1,
'bauforschung': 1, ...
<http://dbpedia.org/resou
rce/G-Enka> ...
G-Enkahenry krvits born 30
december 1974 in tallinn ...
{'they': 1,
'gangstergenka': 1, ...
<http://dbpedia.org/resou
rce/Sam_Henderson> ...
Sam Hendersonsam henderson born
october 18 1969 is an ...
{'currently': 1, 'less':
1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Aaron_LaCrate> ...
Aaron LaCrateaaron lacrate is an
american music producer ...
{'exclusive': 2,
'producer': 1, 'show' ...
<http://dbpedia.org/resou
rce/Trevor_Ferguson> ...
Trevor Fergusontrevor ferguson aka john
farrow born 11 november ...
{'taxi': 1, 'salon': 1,
'gangs': 1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Grant_Nelson> ...
Grant Nelsongrant nelson born 27
april 1971 in london ...
{'houston': 1, 'frankie':
1, 'labels': 1, ...
<http://dbpedia.org/resou
rce/Cathy_Caruth> ...
Cathy Caruthcathy caruth born 1955 is
frank h t rhodes ...
{'phenomenon': 1,
'deborash': 1, 'both' ...
\n", "[10 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------------------------+---------------------+\n", "| URI | name |\n", "+-------------------------------+---------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Digby_Morrell> ...
Digby Morrelldigby morrell born 10
october 1979 is a former ...
{'selection': 1,
'carltons': 1, 'being': ...
<http://dbpedia.org/resou
rce/Alfred_J._Lewy> ...
Alfred J. Lewyalfred j lewy aka sandy
lewy graduated from ...
{'precise': 1, 'thomas':
1, 'closely': 1, ...
<http://dbpedia.org/resou
rce/Harpdog_Brown> ...
Harpdog Brownharpdog brown is a singer
and harmonica player who ...
{'just': 1, 'issued': 1,
'mainly': 1, 'nominat ...
<http://dbpedia.org/resou
rce/Franz_Rottensteiner> ...
Franz Rottensteinerfranz rottensteiner born
in waidmannsfeld lower ...
{'all': 1,
'bauforschung': 1, ...
<http://dbpedia.org/resou
rce/G-Enka> ...
G-Enkahenry krvits born 30
december 1974 in tallinn ...
{'they': 1,
'gangstergenka': 1, ...
<http://dbpedia.org/resou
rce/Sam_Henderson> ...
Sam Hendersonsam henderson born
october 18 1969 is an ...
{'currently': 1, 'less':
1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Aaron_LaCrate> ...
Aaron LaCrateaaron lacrate is an
american music producer ...
{'exclusive': 2,
'producer': 1, 'show' ...
<http://dbpedia.org/resou
rce/Trevor_Ferguson> ...
Trevor Fergusontrevor ferguson aka john
farrow born 11 november ...
{'taxi': 1, 'salon': 1,
'gangs': 1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Grant_Nelson> ...
Grant Nelsongrant nelson born 27
april 1971 in london ...
{'houston': 1, 'frankie':
1, 'labels': 1, ...
<http://dbpedia.org/resou
rce/Cathy_Caruth> ...
Cathy Caruthcathy caruth born 1955 is
frank h t rhodes ...
{'phenomenon': 1,
'deborash': 1, 'both' ...
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tfidf
{'selection':
3.836578553093086, ...
{'precise':
6.44320060695519, ...
{'just':
2.7007299687108643, ...
{'all':
1.6431112434912472, ...
{'they':
1.8993401178193898, ...
{'currently':
1.637088969126014, ...
{'exclusive':
10.455187230695827, ...
{'taxi':
6.0520214560945025, ...
{'houston':
3.935505942157149, ...
{'phenomenon':
5.750053426395245, ...
\n", "[10 rows x 5 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\ttfidf\tdict\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------------------------+---------------------+\n", "| URI | name |\n", "+-------------------------------+---------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordtfidf
obama43.2956530721
act27.678222623
iraq17.747378588
control14.8870608452
law14.7229357618
ordered14.5333739509
military13.1159327785
involvement12.7843852412
response12.7843852412
democratic12.4106886973
\n", "[273 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", "" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\ttfidf\tfloat\n", "\n", "Rows: 273\n", "\n", "Data:\n", "+-------------+---------------+\n", "| word | tfidf |\n", "+-------------+---------------+\n", "| obama | 43.2956530721 |\n", "| act | 27.678222623 |\n", "| iraq | 17.747378588 |\n", "| control | 14.8870608452 |\n", "| law | 14.7229357618 |\n", "| ordered | 14.5333739509 |\n", "| military | 13.1159327785 |\n", "| involvement | 12.7843852412 |\n", "| response | 12.7843852412 |\n", "| democratic | 12.4106886973 |\n", "+-------------+---------------+\n", "[273 rows x 2 columns]\n", "Note: Only the head of the SFrame is printed.\n", "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort('tfidf',ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Words with highest TF-IDF are much more informative." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Manually compute distances between a few people with `gl.distances.cosine`\n", "\n", "Let's manually compare the distances between the articles for a few famous people. 
" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "clinton = people[people['name'] == 'Bill Clinton']" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "beckham = people[people['name'] == 'David Beckham']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Is Obama closer to Clinton than to Beckham?\n", "\n", "We will use cosine distance, which is given by\n", "\n", "(1-cosine_similarity) \n", "\n", "and find that the article about president Obama is closer to the one about former president Clinton than to the one about footballer David Beckham.\n", "\n", "Note: the lower the cosine distance, the more closely related the articles are." ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'13th': 4.9534091674263925,\n", " '1961': 3.3207419573634955,\n", " '1992': 2.278351314316948,\n", " '1996': 2.135691193468776,\n", " '1997': 2.1298344522079455,\n", " '20': 4.88376320446593,\n", " '2000in': 6.250296940830698,\n", " '2004': 5.071033082507702,\n", " '2007': 1.4879730697555795,\n", " '2008': 1.5093391374786154,\n", " '2009': 4.693309450812809,\n", " '2010': 3.185667920243947,\n", " '2011': 5.107041270312876,\n", " '2012': 1.7938099524877322,\n", " '2012obama': 10.986495389225194,\n", " '2013': 1.9545642372230505,\n", " '4': 2.437803530749586,\n", " '44th': 7.0744723837970485,\n", " '63': 5.22130428644035,\n", " '8': 2.7572509724892824,\n", " 'a': 0.039334291308082026,\n", " 'act': 27.67822262297991,\n", " 'address': 4.8023464982877115,\n", " 'administration': 3.2952952917023315,\n", " 'affordable': 6.134465125305577,\n", " 'afghanistan': 9.4197037997671,\n", " 'african': 3.582216271187926,\n", " 'after': 3.7773337680052257,\n", " 'against': 2.0079609791418744,\n", " 'american': 3.3821333532750204,\n", " 'americans': 4.761936959949835,\n", " 'and': 
0.01564802185902329,\n", " 'arms': 5.030658019760364,\n", " 'as': 0.7630171320744707,\n", " 'ask': 5.510031837293684,\n", " 'at': 0.43063857330825733,\n", " 'attention': 3.4269358932174945,\n", " 'attorney': 3.6593720969659014,\n", " 'august': 2.000173951599175,\n", " 'barack': 5.067601534952048,\n", " 'before': 1.4967823726683713,\n", " 'began': 1.727460095710253,\n", " 'bin': 5.6158573610975315,\n", " 'bm': 7.202305755306933,\n", " 'born': 0.53639254752953,\n", " 'briefs': 7.460134864609033,\n", " 'brk': 10.293348208665249,\n", " 'budget': 4.523465932304524,\n", " 'by': 0.37455341206197373,\n", " 'californias': 5.923900356198227,\n", " 'called': 2.0784770664403074,\n", " 'campaign': 9.077468342511231,\n", " 'care': 3.916621260766622,\n", " 'chicago': 5.858235801334405,\n", " 'civil': 3.3244978303233013,\n", " 'clinton': 4.542364132524754,\n", " 'close': 3.5416621153330006,\n", " 'columbia': 3.232442750189437,\n", " 'combat': 4.932056042955824,\n", " 'community': 2.693196030513871,\n", " 'constitutional': 4.529725733653031,\n", " 'consumer': 4.7540473726746715,\n", " 'continued': 2.720588055069447,\n", " 'control': 14.887060845181308,\n", " 'convention': 4.417013968810898,\n", " 'court': 2.896399606044235,\n", " 'creation': 4.03568062078261,\n", " 'cuba': 5.005081177970713,\n", " 'current': 2.8308461188591933,\n", " 'death': 3.1568650000750016,\n", " 'debate': 4.783959872037272,\n", " 'debt': 5.6158573610975315,\n", " 'defeated': 3.4068165661347387,\n", " 'defeating': 4.285764279677384,\n", " 'defense': 3.794313330511949,\n", " 'degree': 2.0206498944742566,\n", " 'delegates': 6.142308302766603,\n", " 'democratic': 12.410688697332166,\n", " 'district': 2.774469584601757,\n", " 'doddfrank': 9.887883100557085,\n", " 'domestic': 8.967410686619141,\n", " 'dont': 8.28123102792229,\n", " 'down': 2.9476606314374463,\n", " 'during': 2.634930295807099,\n", " 'earning': 3.857197840295821,\n", " 'economic': 3.2704801225826072,\n", " 'election': 7.712676160711769,\n", " 
'elementary': 4.96062941539988,\n", " 'ended': 3.3430124821479934,\n", " 'ending': 4.547145018125096,\n", " 'equality': 5.341048491581956,\n", " 'federal': 3.0888272384982884,\n", " 'filed': 4.70797396505935,\n", " 'first': 2.0868146141979307,\n", " 'for': 0.29145011737314763,\n", " 'foreign': 6.3816977057812005,\n", " 'form': 3.233730580373866,\n", " 'from': 0.8812660139569034,\n", " 'full': 3.003055326218652,\n", " 'gains': 6.7238155121838785,\n", " 'general': 2.220724845237937,\n", " 'graduate': 2.6750971107885535,\n", " 'great': 2.7682474625394486,\n", " 'gun': 5.269467687818973,\n", " 'harvard': 6.634000276435001,\n", " 'has': 1.713990158976156,\n", " 'hawaii': 4.806478735572622,\n", " 'he': 1.493579903611068,\n", " 'hillary': 6.002888767516858,\n", " 'his': 2.8887260073502303,\n", " 'hold': 4.144879912747602,\n", " 'honolulu': 5.880549915324614,\n", " 'hook': 6.2951475069960505,\n", " 'house': 4.651096468328522,\n", " 'husen': 10.986495389225194,\n", " 'hussein': 6.126682984863522,\n", " 'ii': 3.3077063910260405,\n", " 'illinois': 7.480254618245467,\n", " 'in': 0.028962190503643476,\n", " 'inaugurated': 6.27696518791286,\n", " 'included': 2.4446095852185854,\n", " 'increased': 4.75012579902149,\n", " 'initiatives': 4.895185507147496,\n", " 'insurance': 4.594578275832593,\n", " 'into': 1.6050629424066056,\n", " 'involvement': 12.784385241175055,\n", " 'iraq': 17.747378587965535,\n", " 'is': 0.05523250095103998,\n", " 'islamic': 4.785986215182504,\n", " 'january': 5.656236009557883,\n", " 'job': 3.446936559924164,\n", " 'john': 1.9913303989140443,\n", " 'july': 1.9428000946579587,\n", " 'keynote': 5.433535804303577,\n", " 'laden': 6.709829270209139,\n", " 'laureateduring': 10.986495389225194,\n", " 'law': 14.722935761763422,\n", " 'legislation': 4.687546142369252,\n", " 'lengthy': 5.862531409821935,\n", " 'levels': 4.647901311022012,\n", " 'lgbt': 5.5883326877074415,\n", " 'libya': 6.332535039067671,\n", " 'limit': 5.355283607403829,\n", " 'lost': 
2.731706463076469,\n", " 'made': 1.519886215502974,\n", " 'major': 2.0581201293715634,\n", " 'march': 1.9573172463231197,\n", " 'marriage': 4.115404094614648,\n", " 'mccain': 6.766987684049088,\n", " 'military': 13.115932778499415,\n", " 'mitt': 6.812108119329557,\n", " 'months': 3.0427126967665687,\n", " 'named': 2.0300155412252816,\n", " 'national': 2.3721863295447827,\n", " 'nations': 3.361876403065796,\n", " 'new': 0.8871532656125274,\n", " 'nine': 3.2624907325491286,\n", " 'nobel': 5.075698745184667,\n", " 'nomination': 3.7697859025157365,\n", " 'nominee': 9.43101391473379,\n", " 'normalize': 10.293348208665249,\n", " 'not': 1.5880170751336171,\n", " 'november': 3.9453132752336004,\n", " 'obama': 43.2956530720749,\n", " 'obamacare': 9.04058524016988,\n", " 'of': 0.07481117158400744,\n", " 'office': 5.2481728232196465,\n", " 'often': 2.862641126119281,\n", " 'on': 0.36882550670120073,\n", " 'operation': 4.3584540130456615,\n", " 'operations': 3.811771079388818,\n", " 'or': 1.9128915408224825,\n", " 'ordered': 14.533373950913514,\n", " 'organizer': 5.501698455734539,\n", " 'osama': 6.604468754551313,\n", " 'other': 1.4424007566948476,\n", " 'over': 1.4878231559557336,\n", " 'party': 7.0620334604226676,\n", " 'patient': 5.6393878585077255,\n", " 'peace': 3.7928095708300824,\n", " 'policies': 4.660345916070095,\n", " 'policy': 6.095386282141427,\n", " 'president': 7.226869291326606,\n", " 'presidential': 7.386955418904825,\n", " 'primaries': 6.669007275688884,\n", " 'primary': 6.710127449296579,\n", " 'prize': 2.7453192387302345,\n", " 'process': 3.7451291059028766,\n", " 'promoted': 3.5191243223076345,\n", " 'proposition': 6.926052378678775,\n", " 'protection': 8.698474715881474,\n", " 'raise': 4.67476058007228,\n", " 'reauthorization': 8.907053847545358,\n", " 'receive': 4.448355565457525,\n", " 'received': 1.5396609815666638,\n", " 'recession': 6.642689967371511,\n", " 'recovery': 5.103173000736915,\n", " 'reelected': 3.68265216394749,\n", " 'referred': 
4.255477288743111,\n", " 'reform': 4.092839034622559,\n", " 'regained': 5.583818007352915,\n", " 'reinvestment': 8.421546031763658,\n", " 'related': 3.723866788250953,\n", " 'relations': 3.445873860568042,\n", " 'relief': 8.842460838379667,\n", " 'repeal': 7.297615935111258,\n", " 'represent': 4.184100625900883,\n", " 'representatives': 7.484535747243688,\n", " 'representing': 3.535253704237518,\n", " 'republican': 7.207497878972565,\n", " 'republicans': 5.592867842872833,\n", " 'response': 12.784385241175055,\n", " 'resulted': 4.015765311081669,\n", " 'review': 3.227734845067532,\n", " 'rights': 3.06677562830062,\n", " 'rodham': 7.690658523220865,\n", " 'romney': 6.843360662833661,\n", " 'running': 3.3516747114796512,\n", " 'russia': 4.025199343315028,\n", " 'sandy': 5.493433945884646,\n", " 'school': 3.6683618376520553,\n", " 'seats': 5.000043383940756,\n", " 'second': 3.344851662973069,\n", " 'senate': 10.164288179703693,\n", " 'served': 3.0725446998610506,\n", " 'shooting': 4.682046586803213,\n", " 'signed': 7.552957376250012,\n", " 'sought': 4.327201469541557,\n", " 'spending': 4.176352939110058,\n", " 'start': 3.281232914358869,\n", " 'state': 1.7090572737165175,\n", " 'states': 5.473200989631017,\n", " 'stimulus': 7.654290879049991,\n", " 'street': 3.250188292676909,\n", " 'strike': 5.025490049601921,\n", " 'sufficient': 6.432618497624653,\n", " 'supreme': 3.9140734886878232,\n", " 'sworn': 4.96062941539988,\n", " 'taught': 2.8485149347730556,\n", " 'tax': 4.545548848592274,\n", " 'taxpayer': 7.431147327735781,\n", " 'tell': 4.804410482508563,\n", " 'term': 9.319341564760851,\n", " 'terms': 3.8240978918694766,\n", " 'that': 0.6614069466714981,\n", " 'the': 0.004063113702956533,\n", " 'then': 1.4309354361561304,\n", " 'three': 1.4915025293575952,\n", " 'to': 0.6572291275451891,\n", " 'total': 3.2767385247710297,\n", " 'treaty': 5.89889905399281,\n", " 'troop': 7.248825770941826,\n", " 'two': 1.0988831858473562,\n", " 'unconstitutional': 
6.8276123058655225,\n", " 'unemployment': 6.642689967371511,\n", " 'united': 4.703766236011668,\n", " 'university': 1.6946860096423695,\n", " 'unsuccessfully': 4.831637295208776,\n", " 'urged': 6.199003646443148,\n", " 'us': 11.591942692842837,\n", " 'victory': 3.384593429350028,\n", " 'wall': 4.021415043623787,\n", " 'war': 2.809822617276739,\n", " 'was': 0.3968289280609173,\n", " 'where': 1.089076212090673,\n", " 'whether': 4.744272123770029,\n", " 'which': 0.7674309670437692,\n", " 'while': 1.8364359481339414,\n", " 'with': 0.6074059275661821,\n", " 'withdrawal': 6.27696518791286,\n", " 'won': 1.3836400683164753,\n", " 'worked': 1.553891853362109,\n", " 'years': 1.0752380994247055}" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama['tfidf'][0]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.8339854936884276" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graphlab.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.9791305844747478" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graphlab.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Build a nearest neighbor model for document retrieval with `gl.nearest_neighbors`\n", "\n", "We now create a nearest-neighbors model and apply it to document retrieval. " ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Starting brute force nearest neighbors model training.
" ], "text/plain": [ "Starting brute force nearest neighbors model training." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "knn_model = graphlab.nearest_neighbors.create(people, features=['tfidf'], label='name')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Applying the nearest-neighbors model for retrieval" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who is closest to Obama?" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 13.291ms     |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 13.291ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 360.03ms     |
" ], "text/plain": [ "| Done | | 100 | 360.03ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Barack Obama0.01
0Joe Biden0.7941176470592
0Joe Lieberman0.7946859903383
0Kelly Ayotte0.8119891008174
0Bill Clinton0.8138528138535
0Artur Davis0.8172323759796
0George W. Bush0.8189473684217
0John Kerry0.8194774346798
0Sam Brownback0.8211382113829
0Richard Cordray0.82180851063810
\n", "[10 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------+-----------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+-----------------+----------------+------+\n", "| 0 | Barack Obama | 0.0 | 1 |\n", "| 0 | Joe Biden | 0.794117647059 | 2 |\n", "| 0 | Joe Lieberman | 0.794685990338 | 3 |\n", "| 0 | Kelly Ayotte | 0.811989100817 | 4 |\n", "| 0 | Bill Clinton | 0.813852813853 | 5 |\n", "| 0 | Artur Davis | 0.817232375979 | 6 |\n", "| 0 | George W. Bush | 0.818947368421 | 7 |\n", "| 0 | John Kerry | 0.819477434679 | 8 |\n", "| 0 | Sam Brownback | 0.821138211382 | 9 |\n", "| 0 | Richard Cordray | 0.821808510638 | 10 |\n", "+-------------+-----------------+----------------+------+\n", "[10 rows x 4 columns]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model.query(obama, radius=0.84, k=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Other examples of document retrieval" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": true }, "outputs": [], "source": [ "swift = people[people['name'] == 'Taylor Swift']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Uses the default number of neighbours. To change it, pass k by keyword (k=...):\n", "# the second positional parameter of query() is `label`, not k.\n", "knn_model.query(swift)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": true }, "outputs": [], "source": [ "jolie = people[people['name'] == 'Angelina Jolie']" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 8.379ms      |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 8.379ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 330.986ms    |
" ], "text/plain": [ "| Done | | 100 | 330.986ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Angelina Jolie0.01
0Brad Pitt0.7840236686392
0Julianne Moore0.7958579881663
0Billy Bob Thornton0.8030690537084
0George Clooney0.80468755
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+--------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+--------------------+----------------+------+\n", "| 0 | Angelina Jolie | 0.0 | 1 |\n", "| 0 | Brad Pitt | 0.784023668639 | 2 |\n", "| 0 | Julianne Moore | 0.795857988166 | 3 |\n", "| 0 | Billy Bob Thornton | 0.803069053708 | 4 |\n", "| 0 | George Clooney | 0.8046875 | 5 |\n", "+-------------+--------------------+----------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model.query(jolie)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "arnold = people[people['name'] == 'Arnold Schwarzenegger']" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 13.007ms     |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 13.007ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 336.714ms    |
" ], "text/plain": [ "| Done | | 100 | 336.714ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+------------------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+------------------------------+----------------+------+\n", "| 0 | Arnold Schwarzenegger | 0.0 | 1 |\n", "| 0 | Jesse Ventura | 0.818918918919 | 2 |\n", "| 0 | John Kitzhaber | 0.824615384615 | 3 |\n", "| 0 | Lincoln Chafee | 0.833876221498 | 4 |\n", "| 0 | Anthony Foxx | 0.833910034602 | 5 |\n", "| 0 | Abel Maldonado | 0.834482758621 | 6 |\n", "| 0 | Pat Quinn (politician) | 0.837209302326 | 7 |\n", "| 0 | Scott Walker (politician) | 0.838905775076 | 8 |\n", "| 0 | Mike Johanns | 0.839009287926 | 9 |\n", "| 0 | John Garamendi | 0.839762611276 | 10 |\n", "| 0 | Sean Parnell | 0.840531561462 | 11 |\n", "| 0 | Alec Baldwin | 0.843260188088 | 12 |\n", "| 0 | Gary Herbert | 0.844594594595 | 13 |\n", "| 0 | Lonnie Napier | 0.844776119403 | 14 |\n", "| 0 | David Steelman | 0.845238095238 | 15 |\n", "| 0 | Tom Corbett | 0.845930232558 | 16 |\n", "| 0 | March Fong Eu | 0.846354166667 | 17 |\n", "| 0 | Nat Robertson | 0.846405228758 | 18 |\n", "| 0 | Bob Corker | 0.846405228758 | 19 |\n", "| 0 | David Paterson | 0.847619047619 | 20 |\n", "| 0 | Antonio Villaraigosa | 0.84776119403 | 21 |\n", "| 0 | Mary Fallin | 0.84776119403 | 22 |\n", "| 0 | Jack Markell | 0.848297213622 | 23 |\n", "| 0 | Phil Mitman | 0.848874598071 | 24 |\n", "| 0 | Mark Mahon | 0.849230769231 | 25 |\n", "| 0 | Michael Steele | 0.849673202614 | 26 |\n", "| 0 | Donald E. 
Hines | 0.85 | 27 |\n", "| 0 | Neil Abercrombie | 0.850152905199 | 28 |\n", "| 0 | Jay Nixon | 0.852112676056 | 29 |\n", "| 0 | Bob Miller (Nevada governor) | 0.852150537634 | 30 |\n", "| 0 | Tom Sieckmann | 0.852564102564 | 31 |\n", "| 0 | Denny Altes | 0.853260869565 | 32 |\n", "| 0 | BettyLou DeCroce | 0.853293413174 | 33 |\n", "| 0 | Javier S%C3%A1nchez | 0.853741496599 | 34 |\n", "| 0 | Patsy Kinsey | 0.854037267081 | 35 |\n", "| 0 | Rodney Alexander | 0.854103343465 | 36 |\n", "| 0 | Ed Case | 0.854838709677 | 37 |\n", "| 0 | Andrew R. Ciesla | 0.854889589905 | 38 |\n", "| 0 | Rick Perry | 0.854961832061 | 39 |\n", "| 0 | Maggie Hassan | 0.85534591195 | 40 |\n", "+-------------+------------------------------+----------------+------+\n", "[40 rows x 4 columns]\n", "\n" ] } ], "source": [ "# Print all 40 neighbours; the default display would truncate the table.\n", "knn_model.query(arnold, k=40).print_rows(num_rows=40)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Assignment" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametext
<http://dbpedia.org/resou
rce/Elton_John> ...
Elton Johnsir elton hercules john
cbe born reginald ken ...
\n", "[? rows x 3 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\n", "Rows: Unknown\n", "\n", "Data:\n", "+-------------------------------+------------+-------------------------------+\n", "| URI | name | text |\n", "+-------------------------------+------------+-------------------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
the27
in18
and15
of13
a10
has9
john7
he7
on6
award5
\n", "[10 rows x 2 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\tcount\tint\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------+-------+\n", "| word | count |\n", "+-------+-------+\n", "| the | 27 |\n", "| in | 18 |\n", "| and | 15 |\n", "| of | 13 |\n", "| a | 10 |\n", "| has | 9 |\n", "| john | 7 |\n", "| he | 7 |\n", "| on | 6 |\n", "| award | 5 |\n", "+-------+-------+\n", "[10 rows x 2 columns]" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "elton['word_count'] = graphlab.text_analytics.count_words(elton['text'])\n", "\n", "elton_word_count_table = elton[['word_count']].stack('word_count', new_column_name = ['word','count'])\n", "\n", "elton_word_count_table.sort('count', ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [], "source": [ "people['word_count'] = graphlab.text_analytics.count_words(people['text'])\n", "\n", "people['tfidf'] = graphlab.text_analytics.tf_idf(people['word_count'])\n", "\n", "elton = people[people['name'] == 'Elton John']\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordtfidf
furnish18.38947184
elton17.48232027
billboard17.3036809575
john13.9393127924
songwriters11.250406447
tonightcandle10.9864953892
overallelton10.9864953892
1970200010.2933482087
fivedecade10.2933482087
aids10.262846934
\n", "[255 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", "
" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\ttfidf\tfloat\n", "\n", "Rows: 255\n", "\n", "Data:\n", "+---------------+---------------+\n", "| word | tfidf |\n", "+---------------+---------------+\n", "| furnish | 18.38947184 |\n", "| elton | 17.48232027 |\n", "| billboard | 17.3036809575 |\n", "| john | 13.9393127924 |\n", "| songwriters | 11.250406447 |\n", "| tonightcandle | 10.9864953892 |\n", "| overallelton | 10.9864953892 |\n", "| 19702000 | 10.2933482087 |\n", "| fivedecade | 10.2933482087 |\n", "| aids | 10.262846934 |\n", "+---------------+---------------+\n", "[255 rows x 2 columns]\n", "Note: Only the head of the SFrame is printed.\n", "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "elton[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort('tfidf',ascending=False)" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.956700637666\n", "0.825031002922\n" ] } ], "source": [ "victoria = people[people['name'] == 'Victoria Beckham']\n", "paul = people[people['name'] == 'Paul McCartney']\n", "\n", "print graphlab.distances.cosine(elton['tfidf'][0], victoria['tfidf'][0])\n", "print graphlab.distances.cosine(elton['tfidf'][0], paul['tfidf'][0])\n" ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Digby_Morrell> ...
Digby Morrelldigby morrell born 10
october 1979 is a former ...
{'selection': 1,
'carltons': 1, 'being': ...
<http://dbpedia.org/resou
rce/Alfred_J._Lewy> ...
Alfred J. Lewyalfred j lewy aka sandy
lewy graduated from ...
{'precise': 1, 'thomas':
1, 'closely': 1, ...
<http://dbpedia.org/resou
rce/Harpdog_Brown> ...
Harpdog Brownharpdog brown is a singer
and harmonica player who ...
{'just': 1, 'issued': 1,
'mainly': 1, 'nominat ...
<http://dbpedia.org/resou
rce/Franz_Rottensteiner> ...
Franz Rottensteinerfranz rottensteiner born
in waidmannsfeld lower ...
{'all': 1,
'bauforschung': 1, ...
<http://dbpedia.org/resou
rce/G-Enka> ...
G-Enkahenry krvits born 30
december 1974 in tallinn ...
{'they': 1,
'gangstergenka': 1, ...
<http://dbpedia.org/resou
rce/Sam_Henderson> ...
Sam Hendersonsam henderson born
october 18 1969 is an ...
{'currently': 1, 'less':
1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Aaron_LaCrate> ...
Aaron LaCrateaaron lacrate is an
american music producer ...
{'exclusive': 2,
'producer': 1, 'show' ...
<http://dbpedia.org/resou
rce/Trevor_Ferguson> ...
Trevor Fergusontrevor ferguson aka john
farrow born 11 november ...
{'taxi': 1, 'salon': 1,
'gangs': 1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Grant_Nelson> ...
Grant Nelsongrant nelson born 27
april 1971 in london ...
{'houston': 1, 'frankie':
1, 'labels': 1, ...
<http://dbpedia.org/resou
rce/Cathy_Caruth> ...
Cathy Caruthcathy caruth born 1955 is
frank h t rhodes ...
{'phenomenon': 1,
'deborash': 1, 'both' ...
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tfidf
{'selection':
3.836578553093086, ...
{'precise':
6.44320060695519, ...
{'just':
2.7007299687108643, ...
{'all':
1.6431112434912472, ...
{'they':
1.8993401178193898, ...
{'currently':
1.637088969126014, ...
{'exclusive':
10.455187230695827, ...
{'taxi':
6.0520214560945025, ...
{'houston':
3.935505942157149, ...
{'phenomenon':
5.750053426395245, ...
\n", "[10 rows x 5 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\ttfidf\tdict\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------------------------+---------------------+\n", "| URI | name |\n", "+-------------------------------+---------------------+\n", "| Starting brute force nearest neighbors model training." ], "text/plain": [ "Starting brute force nearest neighbors model training." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Starting brute force nearest neighbors model training.
" ], "text/plain": [ "Starting brute force nearest neighbors model training." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "knn_words = graphlab.nearest_neighbors.create(people, features=['word_count'], label='name', distance='cosine')\n", "knn_tfidf = graphlab.nearest_neighbors.create(people, features=['tfidf'], label='name')\n" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 16.166ms     |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 16.166ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 231.817ms    |
" ], "text/plain": [ "| Done | | 100 | 231.817ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+-----------------------+-------------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+-----------------------+-------------------+------+\n", "| 0 | Elton John | 2.22044604925e-16 | 1 |\n", "| 0 | Cliff Richard | 0.16142415259 | 2 |\n", "| 0 | Sandro Petrone | 0.16822542751 | 3 |\n", "| 0 | Rod Stewart | 0.168327165587 | 4 |\n", "| 0 | Malachi O'Doherty | 0.177315545979 | 5 |\n", "| 0 | Roger Daltrey | 0.177554184666 | 6 |\n", "| 0 | Peter Paret | 0.180734837403 | 7 |\n", "| 0 | Mervyn Burtch | 0.181990140263 | 8 |\n", "| 0 | Chris Chivers | 0.1830733129 | 9 |\n", "| 0 | Dejan Bogdanovi%C4%87 | 0.184989473454 | 10 |\n", "+-------------+-----------------------+-------------------+------+\n", "[10 rows x 4 columns]\n", "\n" ] }, { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 12.86ms      |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 12.86ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 319.98ms     |
" ], "text/plain": [ "| Done | | 100 | 319.98ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+------------------+----------------+------+\n", "| 0 | Elton John | 0.0 | 1 |\n", "| 0 | Phil Collins | 0.76399026764 | 2 |\n", "| 0 | Rod Stewart | 0.773333333333 | 3 |\n", "| 0 | Annie Lennox | 0.776623376623 | 4 |\n", "| 0 | Barry Gibb | 0.780952380952 | 5 |\n", "| 0 | Sting (musician) | 0.787172011662 | 6 |\n", "| 0 | Adele | 0.78813559322 | 7 |\n", "| 0 | Roger Daltrey | 0.788461538462 | 8 |\n", "| 0 | Billy Joel | 0.790769230769 | 9 |\n", "| 0 | Carrie Underwood | 0.79177377892 | 10 |\n", "+-------------+------------------+----------------+------+\n", "[10 rows x 4 columns]\n", "\n" ] } ], "source": [ "knn_words.query(elton, radius=0.84, k=10).print_rows()\n", "knn_tfidf.query(elton, radius=0.84, k=10).print_rows()" ] }, { "cell_type": "code", "execution_count": 124, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 8.141ms      |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 8.141ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 267.05ms     |
" ], "text/plain": [ "| Done | | 100 | 267.05ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "+-------------+--------------------------+--------------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+--------------------------+--------------------+------+\n", "| 0 | Victoria Beckham | -2.22044604925e-16 | 1 |\n", "| 0 | Mary Fitzgerald (artist) | 0.207307036115 | 2 |\n", "| 0 | Adrienne Corri | 0.214509782788 | 3 |\n", "| 0 | Beverly Jane Fry | 0.217466468741 | 4 |\n", "| 0 | Raman Mundair | 0.217695474992 | 5 |\n", "+-------------+--------------------------+--------------------+------+\n", "[5 rows x 4 columns]\n", "\n" ] }, { "data": { "text/html": [ "
Starting pairwise querying.
" ], "text/plain": [ "Starting pairwise querying." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Query points | # Pairs | % Complete. | Elapsed Time |
" ], "text/plain": [ "| Query points | # Pairs | % Complete. | Elapsed Time |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| 0            | 1       | 0.00169288  | 14.129ms     |
" ], "text/plain": [ "| 0 | 1 | 0.00169288 | 14.129ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
| Done         |         | 100         | 305.451ms    |
" ], "text/plain": [ "| Done | | 100 | 305.451ms |" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
+--------------+---------+-------------+--------------+
" ], "text/plain": [ "+--------------+---------+-------------+--------------+" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Victoria Beckham0.01
0Cheryl Cole0.8005865102642
0Heidi Klum0.8103448275863
0Simon Fuller0.8227424749164
0Adele0.8249158249165
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+------------------+----------------+------+\n", "| 0 | Victoria Beckham | 0.0 | 1 |\n", "| 0 | Cheryl Cole | 0.800586510264 | 2 |\n", "| 0 | Heidi Klum | 0.810344827586 | 3 |\n", "| 0 | Simon Fuller | 0.822742474916 | 4 |\n", "| 0 | Adele | 0.824915824916 | 5 |\n", "+-------------+------------------+----------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_words.query(victoria).print_rows()\n", "knn_tfidf.query(victoria)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 0 }