{
"metadata": {
"name": "",
"signature": "sha256:ed3733b1a5e9cf118ddb4e83b19a41d1804684eeb665bec6cae02a0bb0245bc6"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metafiction\n",
"## Filtered Data Analysis"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"import matplotlib.pyplot as plt\n",
"# %matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Subset Similarity\n",
"\n",
"I'm not interested in all stories, just certain genres and categories.\n",
"\n",
"It would be nice to be able to focus only on those."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def jaccard_index(set1, set2):\n",
" i_count = len(set1.intersection(set2))\n",
" u_count = len(set1.union(set2))\n",
" return 0 if u_count == 0 else i_count/float(u_count)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import namedtuple\n",
"Metafiction = namedtuple(\"Metafiction\", [\"authors\", \"stories\", \"favourite_authors\", \n",
" \"favourite_stories\", \"genres\", \"categories\"])\n",
"\n",
"def story_subset(genres_list=None, categories_list=None):\n",
"    \"\"\"Return the set of story ids matching the given genres and categories.\n",
"\n",
"    genres_list / categories_list: column labels of `genres` / `categories`.\n",
"    A story matches a list when the sum of its values over the selected\n",
"    columns is greater than zero. None means no filter on that dimension\n",
"    (every id in that table's index is kept).\n",
"\n",
"    The result is the intersection of the two per-dimension id sets.\n",
"    \"\"\"\n",
"    def matching_ids(table, labels):\n",
"        # `is None` rather than `== None`: comparing an array-like filter\n",
"        # with `==` is element-wise and cannot be used as a condition.\n",
"        if labels is None:\n",
"            return set(table.index)\n",
"        return set(table[table[labels].sum(axis=1) > 0].index)\n",
"\n",
"    g_story_set = matching_ids(genres, genres_list)\n",
"    c_story_set = matching_ids(categories, categories_list)\n",
"    return g_story_set.intersection(c_story_set)\n",
" \n",
"\n",
"def metafiction_subset(ids=()):\n",
"    \"\"\"Return a Metafiction whose datasets only refer to the passed story ids.\n",
"\n",
"    ids: any iterable of story ids (list, set, ...). Ids that are not\n",
"        present in a table's index are ignored.\n",
"\n",
"    How each dataset is narrowed:\n",
"      * stories -- rows whose index label is in `ids`\n",
"      * authors -- rows whose index label appears in the \"author\" column\n",
"        of the kept stories\n",
"      * favourite_stories / favourite_authors -- entries whose value is one\n",
"        of the kept story ids / kept author labels\n",
"      * genres / categories -- rows in `ids`, keeping only the columns whose\n",
"        sum over those rows is greater than zero\n",
"\n",
"    Rows come back in the order of the source table, not the order of `ids`.\n",
"    \"\"\"\n",
"    # Materialise once: recent pandas rejects sets as indexers, and a\n",
"    # generator could only be consumed a single time.\n",
"    ids = list(ids)\n",
"\n",
"    def rows_for(table, labels):\n",
"        # Boolean-mask selection replaces the `.ix` indexer, which newer\n",
"        # pandas releases no longer provide.\n",
"        return table[table.index.isin(labels)]\n",
"\n",
"    def drop_unused_columns(table):\n",
"        # Keep only the columns that are set for at least one remaining row.\n",
"        return table[table.columns[table.sum() > 0]]\n",
"\n",
"    f_stories = rows_for(stories, ids)\n",
"    f_authors = rows_for(authors, f_stories[\"author\"].drop_duplicates())\n",
"    f_fav_stories = favourite_stories[favourite_stories.isin(ids)]\n",
"    f_fav_authors = favourite_authors[favourite_authors.isin(f_authors.index)]\n",
"    f_genres = drop_unused_columns(rows_for(genres, ids))\n",
"    f_categories = drop_unused_columns(rows_for(categories, ids))\n",
"\n",
"    return Metafiction(authors=f_authors, stories=f_stories, favourite_authors=f_fav_authors,\n",
"                       favourite_stories=f_fav_stories, genres=f_genres, categories=f_categories)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An `author_similarity` that accepts a passed metafiction object"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def author_similarity(author, my_stories, mfobj):\n",
"    \"\"\"Score how closely an author's stories overlap with `my_stories`.\n",
"\n",
"    author: object whose `.name` identifies the author; when called through\n",
"        `DataFrame.apply(axis=1)` this is the row and `.name` its index label.\n",
"    my_stories: collection of story ids to compare against.\n",
"    mfobj: object exposing `stories` (with an \"author\" column) and\n",
"        `favourite_stories` (looked up by the author's label).\n",
"\n",
"    Returns the Jaccard index between `my_stories` and the union of the\n",
"    stories written by the author and the stories found for the author in\n",
"    `favourite_stories`.\n",
"    \"\"\"\n",
"    author_id = author.name\n",
"    written = set(mfobj.stories[mfobj.stories[\"author\"] == author_id].index)\n",
"\n",
"    if author_id in mfobj.favourite_stories:\n",
"        favs = mfobj.favourite_stories.loc[author_id]\n",
"        # A label that occurs only once can come back as a bare scalar;\n",
"        # wrap it so a string id is not split into single characters.\n",
"        favourited = set([favs]) if np.isscalar(favs) else set(favs)\n",
"    else:\n",
"        favourited = set()\n",
"\n",
"    return jaccard_index(written.union(favourited), my_stories)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Stories about __Harry Potter__, __Naruto__ and __Pokemon__"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"story_ids = story_subset(categories_list=[u\"Harry Potter\", u\"Naruto\", u\"Pok\u00e9mon\"])\n",
"mfobj = metafiction_subset(story_ids)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"my_fav_stories = [\"8096183\", # Harry Potter and the Natural 20\n",
" \"9794740\", # Pokemon, The Origin of Species\n",
" \"9311012\", # Lighting up the Dark\n",
" \"5782108\", # Harry Potter and the Methods of Rationality\n",
" \"7354757\", # The Game of Champions\n",
" \"5193644\", # Time Braid\n",
" \"3695087\", # Larceny, Lechery and Luna Lovegood\n",
" \"9669819\", # The Two Year Emperor\n",
" ]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"filtered_favs = set(my_fav_stories).intersection(story_ids)\n",
"filtered_favs"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"{'3695087', '5193644', '5782108', '7354757', '8096183', '9311012', '9794740'}"
]
}
],
"prompt_number": 8
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scoring Authors"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"auth_sim_series = mfobj.authors.apply(author_similarity, axis=1, args=(filtered_favs, mfobj))\n",
"auth_sim_series.name = \"similarity\"\n",
"auth_sim = mfobj.authors.join(auth_sim_series)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Five most similar authors (sort_values replaces the removed DataFrame.sort)\n",
"auth_sim.sort_values(\"similarity\", ascending=False).head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" similarity | \n",
"
\n",
" \n",
" author | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 4976703 | \n",
" alexanderwales | \n",
" 0.500000 | \n",
"
\n",
" \n",
" 5118664 | \n",
" daystar721 | \n",
" 0.400000 | \n",
"
\n",
" \n",
" 2269863 | \n",
" Less Wrong | \n",
" 0.185185 | \n",
"
\n",
" \n",
" 5111102 | \n",
" EagleJarl | \n",
" 0.176471 | \n",
"
\n",
" \n",
" 3344060 | \n",
" Velorien | \n",
" 0.166667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
" name similarity\n",
"author \n",
"4976703 alexanderwales 0.500000\n",
"5118664 daystar721 0.400000\n",
"2269863 Less Wrong 0.185185\n",
"5111102 EagleJarl 0.176471\n",
"3344060 Velorien 0.166667"
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scoring Stories"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"story_sim_values = DataFrame(index=mfobj.stories.index, columns=[\"sim_total\", \"sim_count\", \"similarity\"])\n",
"\n",
"# the writer's similarity + count\n",
"story_sim_values[\"sim_total\"] = auth_sim.ix[mfobj.stories.ix[story_sim_values.index][\"author\"].values][\"similarity\"].values\n",
"story_sim_values[\"sim_count\"] = 1\n",
"\n",
"for author in auth_sim.iterrows():\n",
" author_favs = mfobj.favourite_stories.get(author[0], Series())\n",
" story_sim_values.loc[author_favs, \"sim_total\"] += author[1][\"similarity\"]\n",
" story_sim_values.loc[author_favs, \"sim_count\"] += 1\n",
" \n",
"story_sim_values[\"similarity\"] = story_sim_values[\"sim_total\"].div(story_sim_values[\"sim_count\"])\n",
"story_sim = mfobj.stories.join(story_sim_values)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stories by similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Top five stories by average similarity (sort_values replaces the removed DataFrame.sort)\n",
"story_sim.sort_values(\"similarity\", ascending=False).head(5)[[\"title\", \"sim_total\", \"sim_count\", \"similarity\"]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sim_total | \n",
" sim_count | \n",
" similarity | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 10023949 | \n",
" Harry Potter and the Philosopher\\'s Zombie | \n",
" 0.900000 | \n",
" 2 | \n",
" 0.450000 | \n",
"
\n",
" \n",
" 9676374 | \n",
" Daystar\\'s Remix of Rationality | \n",
" 0.400000 | \n",
" 1 | \n",
" 0.400000 | \n",
"
\n",
" \n",
" 9794740 | \n",
" Pokemon: The Origin of Species | \n",
" 0.953571 | \n",
" 3 | \n",
" 0.317857 | \n",
"
\n",
" \n",
" 5300280 | \n",
" The Natural History of Pokemon | \n",
" 0.400000 | \n",
" 2 | \n",
" 0.200000 | \n",
"
\n",
" \n",
" 10069991 | \n",
" A Wizard Named Harry in 505 Words | \n",
" 0.176471 | \n",
" 1 | \n",
" 0.176471 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
" title sim_total sim_count \\\n",
"story \n",
"10023949 Harry Potter and the Philosopher\\'s Zombie 0.900000 2 \n",
"9676374 Daystar\\'s Remix of Rationality 0.400000 1 \n",
"9794740 Pokemon: The Origin of Species 0.953571 3 \n",
"5300280 The Natural History of Pokemon 0.400000 2 \n",
"10069991 A Wizard Named Harry in 505 Words 0.176471 1 \n",
"\n",
" similarity \n",
"story \n",
"10023949 0.450000 \n",
"9676374 0.400000 \n",
"9794740 0.317857 \n",
"5300280 0.200000 \n",
"10069991 0.176471 "
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stories by total similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Top five stories by summed similarity (sort_values replaces the removed DataFrame.sort)\n",
"story_sim.sort_values(\"sim_total\", ascending=False).head(5)[[\"title\", \"sim_total\", \"sim_count\", \"similarity\"]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sim_total | \n",
" sim_count | \n",
" similarity | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 8096183 | \n",
" Harry Potter and the Natural 20 | \n",
" 1.920222 | \n",
" 17 | \n",
" 0.112954 | \n",
"
\n",
" \n",
" 5782108 | \n",
" Harry Potter and the Methods of Rationality | \n",
" 1.576022 | \n",
" 16 | \n",
" 0.098501 | \n",
"
\n",
" \n",
" 9794740 | \n",
" Pokemon: The Origin of Species | \n",
" 0.953571 | \n",
" 3 | \n",
" 0.317857 | \n",
"
\n",
" \n",
" 10023949 | \n",
" Harry Potter and the Philosopher\\'s Zombie | \n",
" 0.900000 | \n",
" 2 | \n",
" 0.450000 | \n",
"
\n",
" \n",
" 5193644 | \n",
" Time Braid | \n",
" 0.899144 | \n",
" 10 | \n",
" 0.089914 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
" title sim_total sim_count \\\n",
"story \n",
"8096183 Harry Potter and the Natural 20 1.920222 17 \n",
"5782108 Harry Potter and the Methods of Rationality 1.576022 16 \n",
"9794740 Pokemon: The Origin of Species 0.953571 3 \n",
"10023949 Harry Potter and the Philosopher\\'s Zombie 0.900000 2 \n",
"5193644 Time Braid 0.899144 10 \n",
"\n",
" similarity \n",
"story \n",
"8096183 0.112954 \n",
"5782108 0.098501 \n",
"9794740 0.317857 \n",
"10023949 0.450000 \n",
"5193644 0.089914 "
]
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stories by times favourited"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Top five stories by number of contributions (sort_values replaces the removed DataFrame.sort)\n",
"story_sim.sort_values(\"sim_count\", ascending=False).head(5)[[\"title\", \"sim_total\", \"sim_count\", \"similarity\"]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" sim_total | \n",
" sim_count | \n",
" similarity | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 8096183 | \n",
" Harry Potter and the Natural 20 | \n",
" 1.920222 | \n",
" 17 | \n",
" 0.112954 | \n",
"
\n",
" \n",
" 5782108 | \n",
" Harry Potter and the Methods of Rationality | \n",
" 1.576022 | \n",
" 16 | \n",
" 0.098501 | \n",
"
\n",
" \n",
" 2731239 | \n",
" Team 8 | \n",
" 0.345915 | \n",
" 12 | \n",
" 0.028826 | \n",
"
\n",
" \n",
" 5409165 | \n",
" It\\'s For a Good Cause, I Swear! | \n",
" 0.098002 | \n",
" 11 | \n",
" 0.008909 | \n",
"
\n",
" \n",
" 5453054 | \n",
" His Own Man | \n",
" 0.091417 | \n",
" 10 | \n",
" 0.009142 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
" title sim_total sim_count \\\n",
"story \n",
"8096183 Harry Potter and the Natural 20 1.920222 17 \n",
"5782108 Harry Potter and the Methods of Rationality 1.576022 16 \n",
"2731239 Team 8 0.345915 12 \n",
"5409165 It\\'s For a Good Cause, I Swear! 0.098002 11 \n",
"5453054 His Own Man 0.091417 10 \n",
"\n",
" similarity \n",
"story \n",
"8096183 0.112954 \n",
"5782108 0.098501 \n",
"2731239 0.028826 \n",
"5409165 0.008909 \n",
"5453054 0.009142 "
]
}
],
"prompt_number": 14
}
],
"metadata": {}
}
]
}