{
"metadata": {
"name": "",
"signature": "sha256:98cdaefc4251ee58e13965ec094914c0751c2da48ae5dffd58eda52baef59397"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metafiction\n",
"## Data Analysis"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"import matplotlib.pyplot as plt\n",
"# %matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similarity"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I need to calculate a similarity measure between two authors/users. \n",
"\n",
"Since I care about story preference, I'll use the [Jaccard Index](http://en.wikipedia.org/wiki/Jaccard_index) on favourite story sets."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def jaccard_index(set1, set2):\n",
"    \"\"\"Return the Jaccard index of two collections as a float.\n",
"\n",
"    The index is the size of the intersection divided by the size of the\n",
"    union. Both arguments may be any iterable of hashable items; they are\n",
"    converted to sets first. Two empty collections score 0.0.\n",
"    \"\"\"\n",
"    set1, set2 = set(set1), set(set2)\n",
"    u_count = len(set1.union(set2))\n",
"    if u_count == 0:\n",
"        # Avoid 0/0 and keep the return type a float in every case\n",
"        return 0.0\n",
"    return len(set1.intersection(set2)) / float(u_count)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scoring Authors\n",
"\n",
"Comparing each author's own and favourite stories with my favourites"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def author_similarity(author, my_stories):\n",
"    \"\"\"Score one author against my favourites with the Jaccard index.\n",
"\n",
"    `author` is a row of `authors`; its `.name` is the author id. The author's\n",
"    set is the stories they wrote plus the stories they favourited.\n",
"    \"\"\"\n",
"    author_stories = set(stories[stories[\"author\"] == author.name].index)\n",
"    if author.name in favourite_stories:\n",
"        # .loc replaces the removed .ix indexer; the lookup is label-based\n",
"        favs = favourite_stories.loc[author.name]\n",
"        # NOTE(review): assumes a lone favourite may come back as a scalar id;\n",
"        # wrap it so set() does not split the id into characters - confirm\n",
"        author_favourites = set([favs]) if np.isscalar(favs) else set(favs)\n",
"    else:\n",
"        author_favourites = set()\n",
"    all_stories = author_stories.union(author_favourites)\n",
"    return jaccard_index(all_stories, set(my_stories))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Ids of my favourite stories (as strings), with their titles for reference\n",
"my_fav_stories = [\n",
"    \"8096183\",  # Harry Potter and the Natural 20\n",
"    \"9794740\",  # Pokemon: The Origin of Species\n",
"    \"9311012\",  # Lighting up the Dark\n",
"    \"5782108\",  # Harry Potter and the Methods of Rationality\n",
"    \"7354757\",  # The Game of Champions\n",
"    \"5193644\",  # Time Braid\n",
"    \"3695087\",  # Larceny, Lechery and Luna Lovegood\n",
"    \"9669819\",  # The Two Year Emperor\n",
"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Score every author row against my favourites and store it as a column\n",
"authors[\"similarity\"] = authors.apply(\n",
"    lambda row: author_similarity(row, my_fav_stories), axis=1\n",
")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sort_values replaces DataFrame.sort, which was removed from pandas\n",
"authors.sort_values(\"similarity\", ascending=False).head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" similarity | \n",
"
\n",
" \n",
" author | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 4976703 | \n",
" alexanderwales | \n",
" 0.384615 | \n",
"
\n",
" \n",
" 5118664 | \n",
" daystar721 | \n",
" 0.333333 | \n",
"
\n",
" \n",
" 3989854 | \n",
" Sir Poley | \n",
" 0.222222 | \n",
"
\n",
" \n",
" 4767519 | \n",
" Scientist's Thesis | \n",
" 0.187500 | \n",
"
\n",
" \n",
" 3344060 | \n",
" Velorien | \n",
" 0.166667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
" name similarity\n",
"author \n",
"4976703 alexanderwales 0.384615\n",
"5118664 daystar721 0.333333\n",
"3989854 Sir Poley 0.222222\n",
"4767519 Scientist's Thesis 0.187500\n",
"3344060 Velorien 0.166667"
]
}
],
"prompt_number": 7
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scoring Stories\n",
"\n",
"Scoring each story by the average similarity of the authors who wrote or favourited it"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# The sum of the similarity of every author who has favourited this story + the writer's similarity.\n",
"# Start from the writer's similarity: map each story's author id through the\n",
"# authors' scores (label lookup; replaces the removed .ix indexer, and an\n",
"# author id missing from `authors` yields NaN).\n",
"stories[\"sim_total\"] = stories[\"author\"].map(authors[\"similarity\"])\n",
"\n",
"# The total number of times this story has been favourited + written (1)\n",
"stories[\"sim_count\"] = 1\n",
"\n",
"# Add each author's similarity to every story they favourited\n",
"for author_id, author in authors.iterrows():\n",
"    author_favs = favourite_stories.get(author_id, Series())\n",
"    stories.loc[author_favs, \"sim_total\"] += author[\"similarity\"]\n",
"    stories.loc[author_favs, \"sim_count\"] += 1\n",
"\n",
"# Mean similarity over the writer and everyone who favourited the story\n",
"stories[\"sim_score\"] = stories[\"sim_total\"].div(stories[\"sim_count\"])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Stories by average score"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sort_values replaces DataFrame.sort, which was removed from pandas\n",
"stories.sort_values(\"sim_score\", ascending=False)[[\"title\", \"sim_total\", \"sim_count\", \"sim_score\"]].head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sim_total | \n",
" sim_count | \n",
" sim_score | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 10327510 | \n",
" A Bluer Shade of White | \n",
" 0.384615 | \n",
" 1 | \n",
" 0.384615 | \n",
"
\n",
" \n",
" 10023949 | \n",
" Harry Potter and the Philosopher\\'s Zombie | \n",
" 0.717949 | \n",
" 2 | \n",
" 0.358974 | \n",
"
\n",
" \n",
" 9676374 | \n",
" Daystar\\'s Remix of Rationality | \n",
" 0.333333 | \n",
" 1 | \n",
" 0.333333 | \n",
"
\n",
" \n",
" 9794740 | \n",
" Pokemon: The Origin of Species | \n",
" 0.967949 | \n",
" 4 | \n",
" 0.241987 | \n",
"
\n",
" \n",
" 9658524 | \n",
" Branches on the Tree of Time | \n",
" 0.469361 | \n",
" 2 | \n",
" 0.234681 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
" title sim_total sim_count \\\n",
"story \n",
"10327510 A Bluer Shade of White 0.384615 1 \n",
"10023949 Harry Potter and the Philosopher\\'s Zombie 0.717949 2 \n",
"9676374 Daystar\\'s Remix of Rationality 0.333333 1 \n",
"9794740 Pokemon: The Origin of Species 0.967949 4 \n",
"9658524 Branches on the Tree of Time 0.469361 2 \n",
"\n",
" sim_score \n",
"story \n",
"10327510 0.384615 \n",
"10023949 0.358974 \n",
"9676374 0.333333 \n",
"9794740 0.241987 \n",
"9658524 0.234681 "
]
}
],
"prompt_number": 9
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Stories by total similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sort_values replaces DataFrame.sort, which was removed from pandas\n",
"stories.sort_values(\"sim_total\", ascending=False)[[\"title\", \"sim_total\", \"sim_count\", \"sim_score\"]].head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sim_total | \n",
" sim_count | \n",
" sim_score | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 8096183 | \n",
" Harry Potter and the Natural 20 | \n",
" 1.573355 | \n",
" 18 | \n",
" 0.087409 | \n",
"
\n",
" \n",
" 5782108 | \n",
" Harry Potter and the Methods of Rationality | \n",
" 1.486540 | \n",
" 18 | \n",
" 0.082586 | \n",
"
\n",
" \n",
" 9794740 | \n",
" Pokemon: The Origin of Species | \n",
" 0.967949 | \n",
" 4 | \n",
" 0.241987 | \n",
"
\n",
" \n",
" 10023949 | \n",
" Harry Potter and the Philosopher\\'s Zombie | \n",
" 0.717949 | \n",
" 2 | \n",
" 0.358974 | \n",
"
\n",
" \n",
" 10360716 | \n",
" The Metropolitan Man | \n",
" 0.705458 | \n",
" 5 | \n",
" 0.141092 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
" title sim_total sim_count \\\n",
"story \n",
"8096183 Harry Potter and the Natural 20 1.573355 18 \n",
"5782108 Harry Potter and the Methods of Rationality 1.486540 18 \n",
"9794740 Pokemon: The Origin of Species 0.967949 4 \n",
"10023949 Harry Potter and the Philosopher\\'s Zombie 0.717949 2 \n",
"10360716 The Metropolitan Man 0.705458 5 \n",
"\n",
" sim_score \n",
"story \n",
"8096183 0.087409 \n",
"5782108 0.082586 \n",
"9794740 0.241987 \n",
"10023949 0.358974 \n",
"10360716 0.141092 "
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Stories by times favourited"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# sort_values replaces DataFrame.sort, which was removed from pandas\n",
"stories.sort_values(\"sim_count\", ascending=False)[[\"title\", \"sim_total\", \"sim_count\", \"sim_score\"]].head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sim_total | \n",
" sim_count | \n",
" sim_score | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 8096183 | \n",
" Harry Potter and the Natural 20 | \n",
" 1.573355 | \n",
" 18 | \n",
" 0.087409 | \n",
"
\n",
" \n",
" 5782108 | \n",
" Harry Potter and the Methods of Rationality | \n",
" 1.486540 | \n",
" 18 | \n",
" 0.082586 | \n",
"
\n",
" \n",
" 2731239 | \n",
" Team 8 | \n",
" 0.193825 | \n",
" 14 | \n",
" 0.013845 | \n",
"
\n",
" \n",
" 5193644 | \n",
" Time Braid | \n",
" 0.683962 | \n",
" 13 | \n",
" 0.052612 | \n",
"
\n",
" \n",
" 5409165 | \n",
" It\\'s For a Good Cause, I Swear! | \n",
" 0.058134 | \n",
" 11 | \n",
" 0.005285 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
" title sim_total sim_count \\\n",
"story \n",
"8096183 Harry Potter and the Natural 20 1.573355 18 \n",
"5782108 Harry Potter and the Methods of Rationality 1.486540 18 \n",
"2731239 Team 8 0.193825 14 \n",
"5193644 Time Braid 0.683962 13 \n",
"5409165 It\\'s For a Good Cause, I Swear! 0.058134 11 \n",
"\n",
" sim_score \n",
"story \n",
"8096183 0.087409 \n",
"5782108 0.082586 \n",
"2731239 0.013845 \n",
"5193644 0.052612 \n",
"5409165 0.005285 "
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}