{ "metadata": { "name": "", "signature": "sha256:e4e2772e77a6ae10b4577869fb64e58993cddf98f20c992cc470c03d19ca7bc9" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Metafiction\n", "## Data Wrangling" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "import json\n",
  "\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "from pandas import DataFrame, Series\n",
  "import matplotlib.pyplot as plt\n",
  "# %matplotlib inline"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Each line of the data file is parsed as one JSON record; the context\n",
  "# manager closes the file handle once every line has been read.\n",
  "with open(\"metafiction.dat\") as data_file:\n",
  "    metafiction = [json.loads(line) for line in data_file]\n",
  "\n",
  "len(metafiction)"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ "100" ] } ], "prompt_number": 2 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## authors" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Top-level records: the only entries that carry a name next to the id.\n",
  "profile_authors = [{\"author\": rec[\"id\"], \"name\": rec[\"name\"]} for rec in metafiction]\n",
  "\n",
  "len(profile_authors)"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "100" ] } ], "prompt_number": 3 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Authors that are only referenced (through a favourite story or as a\n",
  "# favourite author) are recorded by id alone. They go into a fresh list so\n",
  "# that re-running this cell cannot append the same entries twice.\n",
  "referenced_authors = []\n",
  "for record in metafiction:\n",
  "    referenced_authors.extend({\"author\": story[\"author\"]} for story in record[\"favourite-stories\"])\n",
  "    referenced_authors.extend({\"author\": author} for author in record[\"favourite-authors\"])\n",
  "\n",
  "# Named entries come first, so drop_duplicates (keep first) retains the names.\n",
  "author_list = profile_authors + referenced_authors\n",
  "\n",
  "len(author_list)"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "10694" ] } ], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# One row per distinct author id, indexed by that id.\n",
  "authors = (\n",
  "    DataFrame(author_list)\n",
  "    .drop_duplicates([\"author\"])\n",
  "    .set_index([\"author\"])\n",
  ")\n",
  "\n",
  "len(authors)"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ "5170" ] } ], "prompt_number": 5 },
{ "cell_type": "code", "collapsed": false, "input": [ "authors.iloc[[0]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
author
5111102 EagleJarl
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " name\n", "author \n", "5111102 EagleJarl" ] } ], "prompt_number": 6 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## stories" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "story_list = []\n",
  "\n",
  "# Collect every story record: the author's own stories and their favourites.\n",
  "for record in metafiction:\n",
  "    story_list.extend(record[\"author-stories\"])\n",
  "    story_list.extend(record[\"favourite-stories\"])\n",
  "\n",
  "len(story_list)"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "11052" ] } ], "prompt_number": 7 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# The renames below are positional, so pin the column order explicitly:\n",
  "# sorting alphabetically reproduces the layout shown in the recorded output\n",
  "# no matter in which order DataFrame() discovers the dict keys.\n",
  "stories = DataFrame(story_list).sort_index(axis=1)\n",
  "\n",
  "## rename columns\n",
  "# NOTE(review): positions 3, 4, 5 and 9 refer to the alphabetically sorted raw\n",
  "# field names; confirm them against metafiction.dat if the schema changes.\n",
  "columns = stories.columns.tolist()  # work on a copy, not the Index's own buffer\n",
  "columns[3] = u\"is_complete\"\n",
  "columns[4] = u\"submitted\"\n",
  "columns[5] = u\"updated\"\n",
  "columns[9] = u\"story\"\n",
  "stories.columns = columns\n",
  "\n",
  "# Keep the first record seen for each story id and index the frame by it.\n",
  "stories = stories.drop_duplicates([\"story\"]).set_index(\"story\")\n",
  "\n",
  "# An explicit unit is required: newer pandas rejects the unit-less \"datetime64\".\n",
  "for date_column in (\"submitted\", \"updated\"):\n",
  "    stories[date_column] = stories[date_column].astype(\"datetime64[ns]\")\n",
  "\n",
  "len(stories)"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "9089" ] } ], "prompt_number": 8 },
{ "cell_type": "code", "collapsed": false, "input": [ "stories.iloc[[0]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
authorcategorieschaptersis_completesubmittedupdatedfavouritesfollowsgenreslanguageratingreviewstitleword-count
story
9669819 5111102 [Dungeons and Dragons] 76 False2013-09-08 11:03:422014-12-06 17:56:42 425 483 [Adventure, Fantasy] English T 773 The Two Year Emperor 309723
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ " author categories chapters is_complete \\\n", "story \n", "9669819 5111102 [Dungeons and Dragons] 76 False \n", "\n", " submitted updated favourites follows \\\n", "story \n", "9669819 2013-09-08 11:03:42 2014-12-06 17:56:42 425 483 \n", "\n", " genres language rating reviews title \\\n", "story \n", "9669819 [Adventure, Fantasy] English T 773 The Two Year Emperor \n", "\n", " word-count \n", "story \n", "9669819 309723 " ] } ], "prompt_number": 9 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## favourites" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "# One {author, favourite} pair per favourite, in record order.\n",
  "favourite_author_list = [\n",
  "    {\"author\": record[\"id\"], \"favourite_author\": author}\n",
  "    for record in metafiction\n",
  "    for author in record[\"favourite-authors\"]\n",
  "]\n",
  "favourite_story_list = [\n",
  "    {\"author\": record[\"id\"], \"favourite_story\": story[\"id\"]}\n",
  "    for record in metafiction\n",
  "    for story in record[\"favourite-stories\"]\n",
  "]\n",
  "\n",
  "(len(favourite_author_list), len(favourite_story_list))"
 ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "(1211, 9383)" ] } ], "prompt_number": 10 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Series of favourites keyed by the favouriting author (the index repeats).\n",
  "favourite_authors = DataFrame(favourite_author_list).set_index(\"author\")[\"favourite_author\"]\n",
  "favourite_stories = DataFrame(favourite_story_list).set_index(\"author\")[\"favourite_story\"]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 },
{ "cell_type": "code", "collapsed": false, "input": [ "favourite_authors.iloc[[0]]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout",
"prompt_number": 12, "text": [ "author\n", "5111102 4976703\n", "Name: favourite_author, dtype: object" ] } ], "prompt_number": 12 },
{ "cell_type": "code", "collapsed": false, "input": [ "favourite_stories.iloc[[0]]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ "author\n", "5111102 8096183\n", "Name: favourite_story, dtype: object" ] } ], "prompt_number": 13 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## genres and categories" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "def indicator_frame(label_lists):\n",
  "    \"\"\"Build a 0/1 table: one row per entry of label_lists, one column per label.\n",
  "\n",
  "    label_lists is a Series whose values are lists of labels, such as\n",
  "    stories[\"genres\"]. The result shares its index, has one alphabetically\n",
  "    sorted column per distinct label, and holds 1.0 where the row carries\n",
  "    that label and 0.0 elsewhere.\n",
  "    \"\"\"\n",
  "    # set().union(...) also copes with an empty Series, unlike set.union(*[]).\n",
  "    labels = sorted(set().union(*label_lists))\n",
  "    column_of = {label: position for position, label in enumerate(labels)}\n",
  "    # Fill a plain array first; per-row label lookups on the frame are far slower.\n",
  "    flags = np.zeros((len(label_lists), len(labels)))\n",
  "    for row, row_labels in enumerate(label_lists):\n",
  "        for label in row_labels:\n",
  "            flags[row, column_of[label]] = 1\n",
  "    return DataFrame(data=flags, columns=labels, index=label_lists.index)\n",
  "\n",
  "genres = indicator_frame(stories[\"genres\"])\n",
  "categories = indicator_frame(stories[\"categories\"])\n",
  "\n",
  "# Sorted label lists, kept under their original names.\n",
  "genre_list = list(genres.columns)\n",
  "category_list = list(categories.columns)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 },
{ "cell_type": "code", "collapsed": false, "input": [ "genres.iloc[[0]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AdventureAngstComfortCrimeDramaFamilyFantasyFriendshipHorrorHumor...MysteryParodyPoetryRomanceSci-FiSpiritualSupernaturalSuspenseTragedyWestern
story
9669819 1 0 0 0 0 0 1 0 0 0... 0 0 0 0 0 0 0 0 0 0
\n", "

1 rows \u00d7 21 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ " Adventure Angst Comfort Crime Drama Family Fantasy Friendship \\\n", "story \n", "9669819 1 0 0 0 0 0 1 0 \n", "\n", " Horror Humor ... Mystery Parody Poetry Romance Sci-Fi \\\n", "story ... \n", "9669819 0 0 ... 0 0 0 0 0 \n", "\n", " Spiritual Supernatural Suspense Tragedy Western \n", "story \n", "9669819 0 0 0 0 0 \n", "\n", "[1 rows x 21 columns]" ] } ], "prompt_number": 15 },
{ "cell_type": "code", "collapsed": false, "input": [ "categories.iloc[[0]]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
.hack/SIGN10th Kingdom19842430 RockA Certain Scientific Railgun/\u3068\u3042\u308b\u79d1\u5b66\u306e\u8d85\u96fb\u78c1\u7832A song of Ice and FireA-TeamAddams FamilyAdvance Wars...Yami no MatsueiYoung JusticeYoung WizardsYu Yu HakushoYu-Gi-OhZatch BellZoidsiCarlythe X-MenxxxHOLiC
story
9669819 0 0 0 0 0 0 0 0 0 0... 0 0 0 0 0 0 0 0 0 0
\n", "

1 rows \u00d7 541 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ " .hack/SIGN 10th Kingdom 1984 24 30 Rock \\\n", "story \n", "9669819 0 0 0 0 0 \n", "\n", " A Certain Scientific Railgun/\u3068\u3042\u308b\u79d1\u5b66\u306e\u8d85\u96fb\u78c1\u7832 A song of Ice and Fire \\\n", "story \n", "9669819 0 0 \n", "\n", " A-Team Addams Family Advance Wars ... Yami no Matsuei \\\n", "story ... \n", "9669819 0 0 0 ... 0 \n", "\n", " Young Justice Young Wizards Yu Yu Hakusho Yu-Gi-Oh Zatch Bell \\\n", "story \n", "9669819 0 0 0 0 0 \n", "\n", " Zoids iCarly the X-Men xxxHOLiC \n", "story \n", "9669819 0 0 0 0 \n", "\n", "[1 rows x 541 columns]" ] } ], "prompt_number": 16 } ], "metadata": {} } ] }