{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Some analysis on my ~7500 Reddit comments" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ " \n", "\n", "\n", " \n", "\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import json\n", "import pandas as pd\n", "import datetime\n", "from bokeh.plotting import figure, output_notebook, show\n", "from bokeh.charts import Bar\n", "from bokeh.models.grids import Grid\n", "from bokeh.models.axes import LinearAxis, CategoricalAxis\n", "from bokeh.models.ranges import FactorRange\n", "from bokeh.models.renderers import GlyphRenderer\n", "from bokeh.models.glyphs import Text\n", "from bokeh.palettes import Blues9, Oranges3, GnBu3\n", "import matplotlib.pyplot as plt\n", "from IPython.display import Markdown, display\n", "output_notebook()\n", "\n", "# Define a helper function for pretty printing out Python output\n", "def printmd(string):\n", " display(Markdown(string))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Firstly, we want to define a function that will grab all of our comment data and convert it to dictionaries from JSON." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def parse_comment_file():\n", " \"\"\"\n", " Parse each comment in the JSON file into a dict and then add it to a list, then return the list\n", " \"\"\"\n", " # Empty list for storage\n", " comment_list = []\n", "\n", " # Open the file\n", " with open('nikskoCommentsToMarch2016.json', 'r') as file:\n", " # For every line\n", " for line in file:\n", " # Parse the line into a dict\n", " comment_dict = json.loads(line)\n", " # Put the dict on the end of the list\n", " comment_list.append(comment_dict)\n", "\n", " return comment_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can do some analysis. First we'll clean up the data a little." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Parse the file\n", "comment_list = parse_comment_file()\n", "\n", "# Sort the list based on the UTC time\n", "comment_list.sort(key=lambda x: x['created_utc'])\n", "\n", "# Throw this into a pandas dataframe\n", "comment_frame = pd.DataFrame(comment_list)\n", "\n", "# Fix the datatypes on the columns\n", "comment_frame[['created_utc', 'score']] = comment_frame[['created_utc', 'score']].astype(int)\n", "comment_frame[['subreddit', 'body']] = comment_frame[['subreddit', 'body']].astype(str)\n", "\n", "# Convert from unix UTC dates to local times\n", "comment_frame['created_datetime'] = pd.to_datetime(comment_frame['created_utc'], unit='s')\n", "\n", "# Set the created_datetime as the index\n", "comment_frame = comment_frame.set_index('created_datetime')\n", "\n", "# Localize the timestamps to UTC, then convert to Melbourne time\n", "comment_frame = comment_frame.tz_localize('UTC')\n", "comment_frame = comment_frame.tz_convert('Australia/Melbourne')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, some basic stats." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/markdown": [ "Total number of comments:" ], "text/plain": [ "