{
"metadata": {
"name": "",
"signature": "sha256:e4e2772e77a6ae10b4577869fb64e58993cddf98f20c992cc470c03d19ca7bc9"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metafiction\n",
"## Data Wrangling"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"import matplotlib.pyplot as plt\n",
"# %matplotlib inline"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"\n",
"metafiction = [json.loads(x) for x in open(\"metafiction.dat\")]\n",
"len(metafiction)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"100"
]
}
],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### authors"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"author_list = [{\"author\": rec[\"id\"], \"name\": rec[\"name\"]} for rec in metafiction]\n",
"\n",
"len(author_list)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"100"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for record in metafiction:\n",
" for story in record[\"favourite-stories\"]:\n",
" author_list.append({\"author\": story[\"author\"]})\n",
" for author in record[\"favourite-authors\"]:\n",
" author_list.append({\"author\": author})\n",
"\n",
"len(author_list)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"10694"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"authors = DataFrame(author_list)\n",
"authors.drop_duplicates([\"author\"], inplace=True)\n",
"authors.set_index([\"author\"], inplace=True)\n",
"\n",
"len(authors)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"5170"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"authors.ix[[0]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
"
\n",
" \n",
" author | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 5111102 | \n",
" EagleJarl | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
" name\n",
"author \n",
"5111102 EagleJarl"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## stories"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"story_list = []\n",
"\n",
"for record in metafiction:\n",
" story_list.extend(record[\"author-stories\"])\n",
" story_list.extend(record[\"favourite-stories\"])\n",
"\n",
"len(story_list)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"11052"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stories = DataFrame(story_list)\n",
"\n",
"## rename columns\n",
"columns = stories.columns.values\n",
"columns[3] = u\"is_complete\"\n",
"columns[4] = u\"submitted\"\n",
"columns[5] = u\"updated\"\n",
"columns[9] = u\"story\"\n",
"stories.columns = columns \n",
"\n",
"stories.drop_duplicates([\"story\"], inplace=True)\n",
"stories.set_index(\"story\", inplace=True)\n",
"stories[\"submitted\"] = stories[\"submitted\"].astype(\"datetime64\")\n",
"stories[\"updated\"] = stories[\"updated\"].astype(\"datetime64\")\n",
"\n",
"len(stories)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"9089"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"stories.ix[[0]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" author | \n",
" categories | \n",
" chapters | \n",
" is_complete | \n",
" submitted | \n",
" updated | \n",
" favourites | \n",
" follows | \n",
" genres | \n",
" language | \n",
" rating | \n",
" reviews | \n",
" title | \n",
" word-count | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 9669819 | \n",
" 5111102 | \n",
" [Dungeons and Dragons] | \n",
" 76 | \n",
" False | \n",
" 2013-09-08 11:03:42 | \n",
" 2014-12-06 17:56:42 | \n",
" 425 | \n",
" 483 | \n",
" [Adventure, Fantasy] | \n",
" English | \n",
" T | \n",
" 773 | \n",
" The Two Year Emperor | \n",
" 309723 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
" author categories chapters is_complete \\\n",
"story \n",
"9669819 5111102 [Dungeons and Dragons] 76 False \n",
"\n",
" submitted updated favourites follows \\\n",
"story \n",
"9669819 2013-09-08 11:03:42 2014-12-06 17:56:42 425 483 \n",
"\n",
" genres language rating reviews title \\\n",
"story \n",
"9669819 [Adventure, Fantasy] English T 773 The Two Year Emperor \n",
"\n",
" word-count \n",
"story \n",
"9669819 309723 "
]
}
],
"prompt_number": 9
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## favourites"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"favourite_author_list = []\n",
"favourite_story_list = []\n",
"\n",
"for record in metafiction:\n",
" for author in record[\"favourite-authors\"]:\n",
" favourite_author_list.append({\"author\": record[\"id\"],\n",
" \"favourite_author\": author})\n",
" for story in record[\"favourite-stories\"]:\n",
" favourite_story_list.append({\"author\": record[\"id\"],\n",
" \"favourite_story\": story[\"id\"]})\n",
" \n",
"(len(favourite_author_list), len(favourite_story_list))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"(1211, 9383)"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"favourite_authors = DataFrame(favourite_author_list)\n",
"favourite_authors.set_index(\"author\", inplace=True)\n",
"favourite_authors = favourite_authors[\"favourite_author\"]\n",
"\n",
"favourite_stories = DataFrame(favourite_story_list)\n",
"favourite_stories.set_index(\"author\", inplace=True)\n",
"favourite_stories = favourite_stories[\"favourite_story\"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"favourite_authors.ix[[0]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"author\n",
"5111102 4976703\n",
"Name: favourite_author, dtype: object"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"favourite_stories.ix[[0]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"author\n",
"5111102 8096183\n",
"Name: favourite_story, dtype: object"
]
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## genres and categories"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"genre_list = sorted(set.union(*[set(g) for g in stories[\"genres\"]]))\n",
"genres = DataFrame(data=np.zeros((len(stories), len(genre_list))), columns=genre_list, index=stories.index)\n",
"\n",
"category_list = sorted(set.union(*[set(c) for c in stories[\"categories\"]]))\n",
"categories = DataFrame(data=np.zeros((len(stories), len(category_list))), columns=category_list, index=stories.index)\n",
"\n",
"for story in stories.index:\n",
" genres.ix[story, stories.ix[story, \"genres\"]] = 1\n",
" categories.ix[story, stories.ix[story, \"categories\"]] = 1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"genres.ix[[0]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Adventure | \n",
" Angst | \n",
" Comfort | \n",
" Crime | \n",
" Drama | \n",
" Family | \n",
" Fantasy | \n",
" Friendship | \n",
" Horror | \n",
" Humor | \n",
" ... | \n",
" Mystery | \n",
" Parody | \n",
" Poetry | \n",
" Romance | \n",
" Sci-Fi | \n",
" Spiritual | \n",
" Supernatural | \n",
" Suspense | \n",
" Tragedy | \n",
" Western | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 9669819 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
1 rows \u00d7 21 columns
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
" Adventure Angst Comfort Crime Drama Family Fantasy Friendship \\\n",
"story \n",
"9669819 1 0 0 0 0 0 1 0 \n",
"\n",
" Horror Humor ... Mystery Parody Poetry Romance Sci-Fi \\\n",
"story ... \n",
"9669819 0 0 ... 0 0 0 0 0 \n",
"\n",
" Spiritual Supernatural Suspense Tragedy Western \n",
"story \n",
"9669819 0 0 0 0 0 \n",
"\n",
"[1 rows x 21 columns]"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"categories.ix[[0]]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" .hack/SIGN | \n",
" 10th Kingdom | \n",
" 1984 | \n",
" 24 | \n",
" 30 Rock | \n",
" A Certain Scientific Railgun/\u3068\u3042\u308b\u79d1\u5b66\u306e\u8d85\u96fb\u78c1\u7832 | \n",
" A song of Ice and Fire | \n",
" A-Team | \n",
" Addams Family | \n",
" Advance Wars | \n",
" ... | \n",
" Yami no Matsuei | \n",
" Young Justice | \n",
" Young Wizards | \n",
" Yu Yu Hakusho | \n",
" Yu-Gi-Oh | \n",
" Zatch Bell | \n",
" Zoids | \n",
" iCarly | \n",
" the X-Men | \n",
" xxxHOLiC | \n",
"
\n",
" \n",
" story | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 9669819 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
1 rows \u00d7 541 columns
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 16,
"text": [
" .hack/SIGN 10th Kingdom 1984 24 30 Rock \\\n",
"story \n",
"9669819 0 0 0 0 0 \n",
"\n",
" A Certain Scientific Railgun/\u3068\u3042\u308b\u79d1\u5b66\u306e\u8d85\u96fb\u78c1\u7832 A song of Ice and Fire \\\n",
"story \n",
"9669819 0 0 \n",
"\n",
" A-Team Addams Family Advance Wars ... Yami no Matsuei \\\n",
"story ... \n",
"9669819 0 0 0 ... 0 \n",
"\n",
" Young Justice Young Wizards Yu Yu Hakusho Yu-Gi-Oh Zatch Bell \\\n",
"story \n",
"9669819 0 0 0 0 0 \n",
"\n",
" Zoids iCarly the X-Men xxxHOLiC \n",
"story \n",
"9669819 0 0 0 0 \n",
"\n",
"[1 rows x 541 columns]"
]
}
],
"prompt_number": 16
}
],
"metadata": {}
}
]
}