{ "metadata": { "name": "", "signature": "sha256:98cdaefc4251ee58e13965ec094914c0751c2da48ae5dffd58eda52baef59397" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Metafiction\n", "## Data Analysis" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "import pandas as pd\n", "from pandas import DataFrame, Series\n", "import matplotlib.pyplot as plt\n", "# %matplotlib inline" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Similarity" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I need to calculate a similarity measure between two authors/users. \n", "\n", "Since I care about story preference, I'll use the [Jaccard Index](http://en.wikipedia.org/wiki/Jaccard_index) on favourite story sets." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "def jaccard_index(set1, set2):\n", "    \"\"\"Return the Jaccard index of two collections: |intersection| / |union|.\n", "\n", "    Both arguments are passed through set(), so any iterable of hashable\n", "    items is accepted. Two empty collections score 0.0 instead of dividing\n", "    by zero.\n", "    \"\"\"\n", "    set1, set2 = set(set1), set(set2)\n", "    u_count = len(set1 | set2)\n", "    if u_count == 0:\n", "        return 0.0\n", "    # float() keeps this a true division under Python 2 as well.\n", "    return len(set1 & set2) / float(u_count)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scoring Authors\n", "\n", "Comparing each author's written and favourite stories with my favourites" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def author_similarity(author, my_stories):\n", "    \"\"\"Score one row of `authors` against a list of story ids.\n", "\n", "    The author's taste is the union of the stories they wrote and the stories\n", "    they favourited; the score is the Jaccard index of that union with\n", "    `my_stories`. `author.name` is the row label, i.e. the author id.\n", "    \"\"\"\n", "    author_id = author.name\n", "    written = set(stories[stories[\"author\"] == author_id].index)\n", "    if author_id in favourite_stories:\n", "        # .loc replaces the .ix indexer, which pandas 1.0 removed.\n", "        # NOTE(review): if .loc can return a bare string for an author with a\n", "        # single favourite, set() would split it into characters - confirm the\n", "        # shape of favourite_stories.\n", "        favourited = set(favourite_stories.loc[author_id])\n", "    else:\n", "        favourited = set()\n", "    return jaccard_index(written | favourited, set(my_stories))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "my_fav_stories = [\n", "    \"8096183\",  # Harry Potter and the Natural 20\n", "    \"9794740\",  # Pokemon: The Origin of Species\n", "    \"9311012\",  # Lighting up the Dark\n", "    \"5782108\",  # Harry Potter and the Methods of Rationality\n", "    \"7354757\",  # The Game of Champions\n", "    \"5193644\",  # Time Braid\n", "    \"3695087\",  # Larceny, Lechery and Luna Lovegood\n", "    \"9669819\",  # The Two Year Emperor\n", "]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "authors[\"similarity\"] = authors.apply(author_similarity, axis=1, args=(my_fav_stories,))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "# The five authors whose stories overlap most with my favourites.\n", "# sort_values replaces DataFrame.sort, which pandas 0.20 removed.\n", "authors.sort_values(\"similarity\", ascending=False).head(5)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
4976703 alexanderwales 0.384615
5118664 daystar721 0.333333
3989854 Sir Poley 0.222222
4767519 Scientist's Thesis 0.187500
3344060 Velorien 0.166667
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " name similarity\n", "author \n", "4976703 alexanderwales 0.384615\n", "5118664 daystar721 0.333333\n", "3989854 Sir Poley 0.222222\n", "4767519 Scientist's Thesis 0.187500\n", "3344060 Velorien 0.166667" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scoring Stories\n", "\n", "Scoring each story by the average similarity of the authors who wrote or favourited it" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Seed each story's running total with the similarity of the author who wrote it.\n", "# Series.map is a label lookup; it stands in for the .ix indexer removed in pandas 1.0.\n", "stories[\"sim_total\"] = stories[\"author\"].map(authors[\"similarity\"])\n", "\n", "# Every story starts with one contribution: its writer.\n", "stories[\"sim_count\"] = 1\n", "\n", "# Add the similarity of every author who has favourited the story.\n", "for author_id, author in authors.iterrows():\n", "    author_favs = favourite_stories.get(author_id)\n", "    if author_favs is None:  # no favourites recorded for this author\n", "        continue\n", "    stories.loc[author_favs, \"sim_total\"] += author[\"similarity\"]\n", "    stories.loc[author_favs, \"sim_count\"] += 1\n", "\n", "# Mean similarity over the writer and everyone who favourited the story.\n", "stories[\"sim_score\"] = stories[\"sim_total\"] / stories[\"sim_count\"]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stories by average score" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# sort_values replaces DataFrame.sort, which pandas 0.20 removed.\n", "stories.sort_values(\"sim_score\", ascending=False).head(5)[[\"title\", \"sim_total\", \"sim_count\", \"sim_score\"]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
10327510 A Bluer Shade of White 0.384615 1 0.384615
10023949 Harry Potter and the Philosopher\\'s Zombie 0.717949 2 0.358974
9676374 Daystar\\'s Remix of Rationality 0.333333 1 0.333333
9794740 Pokemon: The Origin of Species 0.967949 4 0.241987
9658524 Branches on the Tree of Time 0.469361 2 0.234681
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ " title sim_total sim_count \\\n", "story \n", "10327510 A Bluer Shade of White 0.384615 1 \n", "10023949 Harry Potter and the Philosopher\\'s Zombie 0.717949 2 \n", "9676374 Daystar\\'s Remix of Rationality 0.333333 1 \n", "9794740 Pokemon: The Origin of Species 0.967949 4 \n", "9658524 Branches on the Tree of Time 0.469361 2 \n", "\n", " sim_score \n", "story \n", "10327510 0.384615 \n", "10023949 0.358974 \n", "9676374 0.333333 \n", "9794740 0.241987 \n", "9658524 0.234681 " ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stories by total similarity" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Top five stories by summed similarity (sim_total).\n", "# sort_values replaces DataFrame.sort, which pandas 0.20 removed.\n", "stories.sort_values(\"sim_total\", ascending=False).head(5)[[\"title\", \"sim_total\", \"sim_count\", \"sim_score\"]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
8096183 Harry Potter and the Natural 20 1.573355 18 0.087409
5782108 Harry Potter and the Methods of Rationality 1.486540 18 0.082586
9794740 Pokemon: The Origin of Species 0.967949 4 0.241987
10023949 Harry Potter and the Philosopher\\'s Zombie 0.717949 2 0.358974
10360716 The Metropolitan Man 0.705458 5 0.141092
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ " title sim_total sim_count \\\n", "story \n", "8096183 Harry Potter and the Natural 20 1.573355 18 \n", "5782108 Harry Potter and the Methods of Rationality 1.486540 18 \n", "9794740 Pokemon: The Origin of Species 0.967949 4 \n", "10023949 Harry Potter and the Philosopher\\'s Zombie 0.717949 2 \n", "10360716 The Metropolitan Man 0.705458 5 \n", "\n", " sim_score \n", "story \n", "8096183 0.087409 \n", "5782108 0.082586 \n", "9794740 0.241987 \n", "10023949 0.358974 \n", "10360716 0.141092 " ] } ], "prompt_number": 10 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stories by times favourited" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Top five stories by sim_count (times favourited, plus one for the writer).\n", "# sort_values replaces DataFrame.sort, which pandas 0.20 removed.\n", "stories.sort_values(\"sim_count\", ascending=False).head(5)[[\"title\", \"sim_total\", \"sim_count\", \"sim_score\"]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
8096183 Harry Potter and the Natural 20 1.573355 18 0.087409
5782108 Harry Potter and the Methods of Rationality 1.486540 18 0.082586
2731239 Team 8 0.193825 14 0.013845
5193644 Time Braid 0.683962 13 0.052612
5409165 It\\'s For a Good Cause, I Swear! 0.058134 11 0.005285
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ " title sim_total sim_count \\\n", "story \n", "8096183 Harry Potter and the Natural 20 1.573355 18 \n", "5782108 Harry Potter and the Methods of Rationality 1.486540 18 \n", "2731239 Team 8 0.193825 14 \n", "5193644 Time Braid 0.683962 13 \n", "5409165 It\\'s For a Good Cause, I Swear! 0.058134 11 \n", "\n", " sim_score \n", "story \n", "8096183 0.087409 \n", "5782108 0.082586 \n", "2731239 0.013845 \n", "5193644 0.052612 \n", "5409165 0.005285 " ] } ], "prompt_number": 11 } ], "metadata": {} } ] }