{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "https://plot.ly/python/google_big_query/\n", "\n", "**Table of Contents**\n", "\n", "
\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# to communicate with Google BigQuery\n", "from pandas.io import gbq" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import plotly.plotly as py\n", "import plotly.graph_objs as go\n", "from plotly.tools import FigureFactory as FF" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "top10_active_users_query = \"\"\"\n", "SELECT\n", " author AS User,\n", " count(author) as Stories\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "GROUP BY\n", " User\n", "ORDER BY\n", " Stories DESC\n", "LIMIT\n", " 10\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "project_id = 'tak2016-144221'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get below running, needed this library:\n", "\n", "https://github.com/google/google-api-python-client\n", "\n", "\n", "```bash\n", "$ pip install google-api-python-client --user\n", "```" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 18.1 Mb\n", "\n", "Retrieving results...\n", "Got 10 rows.\n", "\n", "Total time taken 1.97 s.\n", "Finished at 2016-09-22 17:24:16.\n" ] } ], "source": [ "top10_active_users_df = gbq.read_gbq(top10_active_users_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserStories
0cwan7077
1shawndumas6602
2evo_95659
3nickb4322
4iProject4266
5bootload4212
6edw5193844
7ColinWright3766
8nreece3724
9tokenadult3659
\n", "
" ], "text/plain": [ " User Stories\n", "0 cwan 7077\n", "1 shawndumas 6602\n", "2 evo_9 5659\n", "3 nickb 4322\n", "4 iProject 4266\n", "5 bootload 4212\n", "6 edw519 3844\n", "7 ColinWright 3766\n", "8 nreece 3724\n", "9 tokenadult 3659" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "\n", "top10_active_users_df" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_10_users_table = FF.create_table(top10_active_users_df)\n", "py.iplot(top_10_users_table, filename='plotly/top-10-active-users')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Top 10 Hacker News Submissions (by score)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "top10_story_query = \"\"\"\n", "SELECT\n", " title,\n", " score,\n", " time_ts AS timestamp\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "ORDER BY\n", " score DESC\n", "LIMIT\n", " 10\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 113.1 Mb\n", "\n", "Retrieving results...\n", "Got 10 rows.\n", "\n", "Total time taken 1.5 s.\n", "Finished at 2016-09-22 17:26:09.\n" ] } ], "source": [ "top10_story_df = gbq.read_gbq(top10_story_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create a table figure from the DataFrame\n", "top10_story_figure = FF.create_table(top10_story_df)\n", "\n", "# Scatter trace for the bubble chart timeseries\n", "story_timeseries_trace = go.Scatter(\n", " x=top10_story_df['timestamp'],\n", " y=top10_story_df['score'],\n", " xaxis='x2',\n", " yaxis='y2',\n", " mode='markers',\n", " text=top10_story_df['title'],\n", " marker=dict(\n", " color=[80 + i*5 for i in range(10)],\n", " size=top10_story_df['score']/50,\n", " showscale=False\n", " )\n", ")\n", "\n", "# Add the trace data to the figure\n", "top10_story_figure['data'].extend(go.Data([story_timeseries_trace]))\n", "\n", "# Subplot layout\n", "top10_story_figure.layout.yaxis.update({'domain': [0, .45]})\n", "top10_story_figure.layout.yaxis2.update({'domain': [.6, 1]})\n", "\n", "# Y-axis of the graph should be anchored with X-axis\n", "top10_story_figure.layout.yaxis2.update({'anchor': 'x2'})\n", "top10_story_figure.layout.xaxis2.update({'anchor': 'y2'})\n", "\n", "# Add the height and title attribute\n", "top10_story_figure.layout.update({'height':900})\n", "top10_story_figure.layout.update({'title': 'Highest Scoring Submissions on Hacker News'})\n", "\n", "# Update the background color for plot and paper\n", "top10_story_figure.layout.update({'paper_bgcolor': 'rgb(243, 243, 243)'})\n", "top10_story_figure.layout.update({'plot_bgcolor': 'rgb(243, 243, 243)'})\n", "\n", "# Add the margin to make subplot titles visible\n", "top10_story_figure.layout.margin.update({'t':75, 'l':50})\n", "top10_story_figure.layout.yaxis2.update({'title': 'Upvote Score'})\n", "top10_story_figure.layout.xaxis2.update({'title': 'Post Time'})" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "py.image.save_as(top10_story_figure, filename='top10-posts.png')\n", "py.iplot(top10_story_figure, filename='plotly/highest-scoring-submissions')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# From which Top-level domain (TLD) most of the stories come?\n", "- Here we have used the url-function [TLD](https://cloud.google.com/bigquery/query-reference#tld) from BigQuery's query syntax.\n", "- We collect the domain for all URLs with their respective count, and group them by it." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tld_share_query = \"\"\"\n", "SELECT\n", " TLD(url) AS domain,\n", " count(score) AS stories\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "GROUP BY\n", " domain\n", "ORDER BY\n", " stories DESC\n", "LIMIT 10\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 134.6 Mb\n", "\n", "Retrieving results...\n", "Got 10 rows.\n", "\n", "Total time taken 4.13 s.\n", "Finished at 2016-09-22 17:27:28.\n" ] } ], "source": [ "tld_share_df = gbq.read_gbq(tld_share_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
domainstories
0.com1324329
1.org120716
2None111744
3.net58754
4.co.uk43955
5.io23507
6.edu14727
7.co10692
8.me10565
9.info8121
\n", "
" ], "text/plain": [ " domain stories\n", "0 .com 1324329\n", "1 .org 120716\n", "2 None 111744\n", "3 .net 58754\n", "4 .co.uk 43955\n", "5 .io 23507\n", "6 .edu 14727\n", "7 .co 10692\n", "8 .me 10565\n", "9 .info 8121" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tld_share_df" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = tld_share_df['domain']\n", "values = tld_share_df['stories']\n", "\n", "tld_share_trace = go.Pie(labels=labels, values=values)\n", "data = [tld_share_trace]\n", "\n", "layout = go.Layout(\n", " title='Submissions shared by Top-level domains'\n", ")\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig, filename='plotly/toplevel-domains')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Public response to the \"Who Is Hiring?\" posts\n", "- There is an account on Hacker News by the name [whoishiring](https://news.ycombinator.com/user?id=whoishiring).\n", "- This account automatically submits a 'Who is Hiring?' post at 11 AM Eastern time on the first weekday of every month.\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "wih_query = \"\"\"\n", "SELECT\n", " id,\n", " title,\n", " score,\n", " time_ts\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "WHERE\n", " author == 'whoishiring' AND\n", " LOWER(title) contains 'who is hiring?'\n", "ORDER BY\n", " time\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 146.1 Mb\n", "\n", "Retrieving results...\n", "Got 52 rows.\n", "\n", "Total time taken 1.83 s.\n", "Finished at 2016-09-22 17:28:45.\n" ] } ], "source": [ "wih_df = gbq.read_gbq(wih_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trace = go.Scatter(\n", " x=wih_df['time_ts'],\n", " y=wih_df['score'],\n", " mode='markers+lines',\n", " text=wih_df['title'],\n", " marker=dict(\n", " size=wih_df['score']/50\n", " )\n", ")\n", "\n", "layout = go.Layout(\n", " title='Public response to the \"Who Is Hiring?\" posts',\n", " xaxis=dict(\n", " title=\"Post Time\"\n", " ),\n", " yaxis=dict(\n", " title=\"Upvote Score\"\n", " )\n", ")\n", "\n", "data = [trace]\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig, filename='plotly/whoishiring-public-response')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Submission Traffic Volume in a Week\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "week_traffic_query = \"\"\"\n", "SELECT\n", " DAYOFWEEK(time_ts) as Weekday,\n", " count(DAYOFWEEK(time_ts)) as story_counts\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "GROUP BY\n", " Weekday\n", "ORDER BY\n", " Weekday\n", "\"\"\"\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 14.8 Mb\n", "\n", "Retrieving results...\n", "Got 8 rows.\n", "\n", "Total time taken 2.45 s.\n", "Finished at 2016-09-22 17:29:14.\n" ] } ], "source": [ "week_traffic_df = gbq.read_gbq(week_traffic_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "week_traffic_df['Day'] = ['NULL', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']\n", "week_traffic_df = week_traffic_df.drop(week_traffic_df.index[0])" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trace = go.Scatter(\n", " x=week_traffic_df['Day'],\n", " y=week_traffic_df['story_counts'],\n", " mode='lines',\n", " text=week_traffic_df['Day']\n", ")\n", "\n", "layout = go.Layout(\n", " title='Submission Traffic Volume (Week Days)',\n", " xaxis=dict(\n", " title=\"Day of the Week\"\n", " ),\n", " yaxis=dict(\n", " title=\"Total Submissions\"\n", " )\n", ")\n", "\n", "data = [trace]\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "py.iplot(fig, filename='plotly/submission-traffic-volume')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Programming Language Trend on HackerNews\n", "We will compare the trends for the Python and PHP programming languages, using the Hacker News post titles." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "python_query = \"\"\"\n", "SELECT\n", " YEAR(time_ts) as years,\n", " COUNT(YEAR(time_ts )) as trends\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "WHERE\n", " LOWER(title) contains 'python'\n", "GROUP BY\n", " years\n", "ORDER BY\n", " years\n", "\"\"\"\n", "\n", "php_query = \"\"\"\n", "SELECT\n", " YEAR(time_ts) as years,\n", " COUNT(YEAR(time_ts )) as trends\n", "FROM\n", " [fh-bigquery:hackernews.stories]\n", "WHERE\n", " LOWER(title) contains 'php'\n", "GROUP BY\n", " years\n", "ORDER BY\n", " years\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 99.0 Mb\n", "\n", "Retrieving results...\n", "Got 9 rows.\n", "\n", "Total time taken 1.79 s.\n", "Finished at 2016-09-22 17:29:58.\n" ] } ], "source": [ "python_df = gbq.read_gbq(python_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 99.0 Mb\n", "\n", "Retrieving results...\n", "Got 9 rows.\n", "\n", "Total time taken 2.9 s.\n", "Finished at 2016-09-22 17:30:04.\n" ] } ], "source": [ "php_df = gbq.read_gbq(php_query, project_id=project_id)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trace1 = go.Scatter(\n", " x=python_df['years'],\n", " y=python_df['trends'],\n", " mode='lines',\n", " line=dict(color='rgba(115,115,115,1)', width=4),\n", " connectgaps=True,\n", ")\n", "\n", "trace2 = go.Scatter(\n", " x=[python_df['years'][0], python_df['years'][8]],\n", " y=[python_df['trends'][0], python_df['trends'][8]],\n", " mode='markers',\n", " marker=dict(color='rgba(115,115,115,1)', size=8)\n", ")\n", "\n", "trace3 = go.Scatter(\n", " x=php_df['years'],\n", " y=php_df['trends'],\n", " mode='lines',\n", " line=dict(color='rgba(189,189,189,1)', width=4),\n", " connectgaps=True,\n", ")\n", "\n", "trace4 = go.Scatter(\n", " x=[php_df['years'][0], php_df['years'][8]],\n", " y=[php_df['trends'][0], php_df['trends'][8]],\n", " mode='markers',\n", " marker=dict(color='rgba(189,189,189,1)', size=8)\n", ")\n", "\n", "traces = [trace1, trace2, trace3, trace4]\n", "\n", "layout = go.Layout(\n", " xaxis=dict(\n", " showline=True,\n", " showgrid=False,\n", " showticklabels=True,\n", " linecolor='rgb(204, 204, 204)',\n", " linewidth=2,\n", " autotick=False,\n", " ticks='outside',\n", " tickcolor='rgb(204, 204, 204)',\n", " tickwidth=2,\n", " ticklen=5,\n", " tickfont=dict(\n", " family='Arial',\n", " size=12,\n", " color='rgb(82, 82, 82)',\n", " ),\n", " ),\n", " yaxis=dict(\n", " showgrid=False,\n", " zeroline=False,\n", " showline=False,\n", " showticklabels=False,\n", " ),\n", " autosize=False,\n", " margin=dict(\n", " autoexpand=False,\n", " l=100,\n", " r=20,\n", " t=110,\n", " ),\n", " showlegend=False,\n", ")\n", "\n", "annotations = []\n", "\n", "annotations.append(\n", " dict(xref='paper', x=0.95, y=python_df['trends'][8],\n", " xanchor='left', yanchor='middle',\n", " text='Python',\n", " font=dict(\n", " family='Arial',\n", " size=14,\n", " color='rgba(49,130,189, 1)'\n", " ),\n", " showarrow=False)\n", ")\n", "\n", "annotations.append(\n", " dict(xref='paper', x=0.95, y=php_df['trends'][8],\n", " xanchor='left', yanchor='middle',\n", " text='PHP',\n", " font=dict(\n", " family='Arial',\n", " size=14,\n", " color='rgba(49,130,189, 1)'\n", " ),\n", " showarrow=False)\n", ")\n", "\n", "annotations.append(\n", " dict(xref='paper', yref='paper', x=0.5, y=-0.1,\n", " xanchor='center', yanchor='top',\n", " text='Source: Hacker News submissions with the title containing Python/PHP',\n", " font=dict(\n", " family='Arial',\n", " size=12,\n", " color='rgb(150,150,150)'\n", " ),\n", " showarrow=False)\n", ")\n", "\n", "layout['annotations'] = annotations\n", "\n", "fig = go.Figure(data=traces, layout=layout)\n", "py.iplot(fig, filename='plotly/programming-language-trends')" ] } ], "metadata": { "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }