{ "cells": [ { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import plotly.graph_objs as go\n", "from plotly.offline import download_plotlyjs, init_notebook_mode, iplot\n", "\n", "init_notebook_mode(connected=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use the `gbq.read_gbq` function to read BigQuery datasets into Pandas `DataFrame` objects." ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "from pandas.io import gbq" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use `linregress` function for linear regression of scatter plots." ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from scipy.stats import linregress" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Collection" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read the post [Using Google BigQuery with Plotly and Pandas](moderndata.plot.ly/using-google-bigquery-with-plotly-and-pandas/) to create a new project." ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": true }, "outputs": [], "source": [ "project_id = 'sixth-edition-678'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This query will collect the `timestamp`, `package name`, and `total download count` columns from the table (on a daily basis)." ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": true }, "outputs": [], "source": [ "daily_download_query = \"\"\"\n", "SELECT\n", " DATE(timestamp) as day,\n", " MONTH(timestamp) as month,\n", " file.project,\n", " COUNT(*) as total_downloads,\n", "FROM\n", " TABLE_DATE_RANGE(\n", " [the-psf:pypi.downloads],\n", " TIMESTAMP(\"20120701\"),\n", " CURRENT_TIMESTAMP()\n", " )\n", "WHERE\n", " file.project = '{0}'\n", "GROUP BY\n", " day, file.project, month\n", "ORDER BY\n", " day asc\n", "\"\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following function run the query and returns a DataFrame object, if successful." ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def package_df(package):\n", " \"\"\" Return the query result as a pandas.DataFrame object\n", " \n", " param: package(str): Name of the package on PyPI\n", " \"\"\"\n", " \n", " try:\n", " df = gbq.read_gbq(daily_download_query.format(package), project_id=project_id)\n", " return df\n", " except:\n", " raise IOError" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will construct different DataFrames for each package." ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 35.8 Gb\n", "\n", "Retrieving results...\n", " Got page: 1; 100.0% done. Elapsed 13.68 s.\n", "Got 144 rows.\n", "\n", "Total time taken 14.04 s.\n", "Finished at 2016-08-29 13:04:38.\n" ] } ], "source": [ "plotly_df = package_df('plotly')" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 35.8 Gb\n", "\n", "Retrieving results...\n", " Got page: 1; 100.0% done. Elapsed 15.13 s.\n", "Got 144 rows.\n", "\n", "Total time taken 15.15 s.\n", "Finished at 2016-08-29 13:04:53.\n" ] } ], "source": [ "bokeh_df = package_df('bokeh')" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", " Elapsed 14.52 s. Waiting...\n", "Query done.\n", "Processed: 35.8 Gb\n", "\n", "Retrieving results...\n", " Got page: 1; 100.0% done. Elapsed 15.37 s.\n", "Got 144 rows.\n", "\n", "Total time taken 15.39 s.\n", "Finished at 2016-08-29 13:05:09.\n" ] } ], "source": [ "matplotlib_df = package_df('matplotlib')" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 35.8 Gb\n", "\n", "Retrieving results...\n", " Got page: 1; 100.0% done. Elapsed 13.45 s.\n", "Got 144 rows.\n", "\n", "Total time taken 13.47 s.\n", "Finished at 2016-08-29 13:05:22.\n" ] } ], "source": [ "mpld3_df = package_df('mpld3')" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requesting query... ok.\n", "Query running...\n", "Query done.\n", "Processed: 35.8 Gb\n", "\n", "Retrieving results...\n", " Got page: 1; 100.0% done. Elapsed 14.06 s.\n", "Got 144 rows.\n", "\n", "Total time taken 14.08 s.\n", "Finished at 2016-08-29 13:05:36.\n" ] } ], "source": [ "vincent_df = package_df('vincent')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inspection for Missing Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using a simple `TimeDelta` calculation, we can find if some rows are missing from the DataFrame." ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "77 rows are missing in the DataFrame.\n" ] } ], "source": [ "from datetime import datetime, timedelta\n", "\n", "# Number of rows in the DataFrame\n", "actual_rows = len(plotly_df)\n", "\n", "start_date = datetime.strptime(plotly_df.iloc[0]['day'], '%Y-%m-%d') # 2016-01-22\n", "end_date = datetime.strptime(plotly_df.iloc[actual_rows - 1]['day'], '%Y-%m-%d') # 2016-08-29\n", "\n", "# Expected rows if there was no missing data (day)\n", "expected_rows = (end_date - start_date).days + 1\n", "\n", "if (actual_rows != expected_rows):\n", " print \"{0} rows are missing in the DataFrame.\".format(expected_rows - actual_rows)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We find that there are no rows from **2016-03-06** to **2016-05-21**." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Transformation\n", "\n", "Here, we will concate the missing values in the DataFrames." ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "collapsed": false }, "outputs": [], "source": [ "missing_data_start_date = '2016-03-06'\n", "missing_data_end_date = '2016-05-21'\n", "\n", "# starting/ending date for missing data and time differene (1 day)\n", "s = datetime.strptime(missing_data_start_date, '%Y-%m-%d')\n", "e = datetime.strptime(missing_data_end_date, '%Y-%m-%d')\n", "diff = timedelta(days=1)\n", "\n", "# generate all the missing dates in the same format\n", "missing_dates = []\n", "missing_dates_month = []\n", "\n", "while (s <= e):\n", " missing_dates.append(s.strftime('%Y-%m-%d'))\n", " missing_dates_month.append(int(s.strftime('%m')[1]))\n", " s += diff\n", " \n", "missing_row_count = len(missing_dates) # 77" ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def append_missing_data(dataframe, package):\n", " \"\"\"Append the missing dates DataFrame to a given DataFrame\n", " \n", " param: dataframe(pandas.DataFrame): DataFrame to append\n", " param: package(str): Name of package on PyPI\n", " \"\"\"\n", " \n", " missing_dates_df = pd.DataFrame({'day': missing_dates,\n", " 'month': missing_dates_month,\n", " 'file_project': [package for i in range(missing_row_count)],\n", " 'total_downloads': [0 for i in range(missing_row_count)]}\n", " )\n", " \n", " # place the appended columns at their right place by sorting\n", " new_df = pd.concat([dataframe, missing_dates_df])\n", " \n", " return new_df.sort_values('day')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Updated DataFrames with the recovered missing data." ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "collapsed": true }, "outputs": [], "source": [ "bokeh_df = append_missing_data(bokeh_df, 'bokeh')\n", "matplotlib_df = append_missing_data(matplotlib_df, 'matplotlib')\n", "mpld3_df = append_missing_data(mpld3_df, 'mpld3')\n", "plotly_df = append_missing_data(plotly_df, 'plotly')\n", "vincent_df = append_missing_data(vincent_df, 'vincent')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Package Downloads Comparison (Daily)" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trace1 = go.Scatter(\n", " x=plotly_df['day'],\n", " y=plotly_df['total_downloads'],\n", " name='Plotly',\n", " mode='lines',\n", " line=dict(width=0.5,\n", " color='rgb(10. 240, 10)'),\n", " fill='tonexty'\n", ")\n", "\n", "trace2 = go.Scatter(\n", " x=bokeh_df['day'],\n", " y=bokeh_df['total_downloads'],\n", " name='Bokeh',\n", " mode='lines',\n", " line=dict(width=0.5,\n", " color='rgb(42, 77, 20)'),\n", " fill='tonexty'\n", ")\n", "\n", "trace3 = go.Scatter(\n", " x=mpld3_df['day'],\n", " y=mpld3_df['total_downloads'],\n", " name='MPLD3',\n", " mode='lines',\n", " line=dict(width=0.5,\n", " color='rgb(20, 33, 61)'),\n", " fill='tonexty'\n", ")\n", "\n", "trace4 = go.Scatter(\n", " x=vincent_df['day'],\n", " y=vincent_df['total_downloads'],\n", " name='Vincent',\n", " mode='lines',\n", " line=dict(width=0.5,\n", " color='rgb(0, 0, 0)'),\n", " fill='tonexty'\n", ")\n", "\n", "data = [trace1, trace2, trace3, trace4]\n", "\n", "layout = go.Layout(\n", " title='Package Downloads Comparison (Daily)',\n", " showlegend=True,\n", " xaxis=dict(\n", " type='category',\n", " showgrid=False\n", " ),\n", " yaxis=dict(\n", " title='No. of downloads (daily)',\n", " type='linear',\n", " range=[1, 10000]\n", " ),\n", " plot_bgcolor='rgba(250, 250, 250, 1)',\n", " shapes=[\n", " dict(\n", " type='line',\n", " xref='x',\n", " yref='y',\n", " x0='45',\n", " y0='2000',\n", " x1='120',\n", " y1='2000'\n", " )\n", " ],\n", " annotations=[\n", " dict(\n", " x=75,\n", " y=2400,\n", " xref='x',\n", " yref='y',\n", " text=\"PyPI's stats collection service was down from March 6 to May 21\",\n", " showarrow=False\n", " ),\n", " dict(\n", " x=115,\n", " y=9600,\n", " xref='x',\n", " yref='y',\n", " text='From Jan 22, 2016 To Aug 29, 2016',\n", " showarrow=False\n", " ),\n", " dict(\n", " x=121,\n", " y=2000,\n", " xref='x',\n", " yref='y',\n", " text=\"\",\n", " showarrow=True,\n", " ay=0,\n", " ax=-5\n", " ),\n", " dict(\n", " x=45,\n", " y=2000,\n", " xref='x',\n", " yref='y',\n", " text=\"\",\n", " showarrow=True,\n", " ay=0,\n", " ax=5\n", " )\n", " ]\n", ")\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Package Downloads Comparison (Monthly)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset was created on Jan 22, 2016. We will use these months on the x-axis." ] }, { "cell_type": "code", "execution_count": 129, "metadata": { "collapsed": true }, "outputs": [], "source": [ "months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We are using pandas' `groupby` method to gather all the row by their `month` value and then adding their count to find out 'total downloads' in the month." ] }, { "cell_type": "code", "execution_count": 130, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_downloads
month
16791
225920
36008
40
515946
652043
761338
892813
\n", "
" ], "text/plain": [ " total_downloads\n", "month \n", "1 6791\n", "2 25920\n", "3 6008\n", "4 0\n", "5 15946\n", "6 52043\n", "7 61338\n", "8 92813" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "plotly_df.groupby('month').sum()" ] }, { "cell_type": "code", "execution_count": 131, "metadata": { "collapsed": true }, "outputs": [], "source": [ "trace1 = go.Bar(x=months, y=plotly_df.groupby('month').sum()['total_downloads'], name='Plotly')\n", "trace2 = go.Bar(x=months, y=vincent_df.groupby('month').sum()['total_downloads'], name='Vincent')\n", "trace3 = go.Bar(x=months, y=bokeh_df.groupby('month').sum()['total_downloads'], name='Bokeh')\n", "trace4 = go.Bar(x=months, y=mpld3_df.groupby('month').sum()['total_downloads'], name='MPLD3')" ] }, { "cell_type": "code", "execution_count": 132, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data = [trace1, trace2, trace3, trace4]\n", "\n", "layout = go.Layout(\n", " barmode='group',\n", " title=\"Package Downloads Comparison (PyPI)\",\n", " yaxis=dict(\n", " title='No. of downloads (monthly)'\n", " ),\n", " xaxis=dict(\n", " title='Month'\n", " ),\n", " annotations=[\n", " dict(\n", " x=3,\n", " y=0,\n", " xref='x',\n", " yref='y',\n", " text=\"PyPI's stats collection service
was down from March 6 to May 21\",\n", " showarrow=True,\n", " arrowhead=2,\n", " ax=0,\n", " ay=-150\n", " ),\n", " dict(\n", " x=3.7,\n", " y=90000,\n", " xref='x',\n", " yref='y',\n", " text='From Jan 22, 2016 To Aug 29, 2016',\n", " showarrow=False\n", " )\n", " ]\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Growth of Plotly package downloads" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Following the tutorial [Linear fit in Python](https://plot.ly/python/linear-fits/), we will try to find an\n", "approximate regression line for the scatter graph of Plotly package's downloads." ] }, { "cell_type": "code", "execution_count": 155, "metadata": { "collapsed": true }, "outputs": [], "source": [ "xvals = np.arange(0, len(plotly_df))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following `traces` are for the package downloads scatter plot (for each package)." ] }, { "cell_type": "code", "execution_count": 207, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trace1 = go.Scatter(\n", " x=xvals[:44], \n", " y=plotly_df['total_downloads'].iloc[:44], \n", " mode='markers',\n", " marker=go.Marker(color='rgb(255, 127, 14)',size=5,symbol='x'),\n", " name='Plotly Downloads'\n", ")\n", "\n", "trace2 = go.Scatter(\n", " x=xvals[121:], \n", " y=plotly_df['total_downloads'].iloc[121:],\n", " mode='markers',\n", " marker=go.Marker(color='rgb(255, 127, 14)',size=5,symbol='x'),\n", " name='Plotly Downloads',\n", " showlegend=False\n", ")\n", "\n", "# linear regression line for Plotly package downloads\n", "pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])\n", "plotly_line = pslope*xvals + pintercept\n", "\n", "trace3 = go.Scatter(\n", " x=xvals, \n", " y=plotly_line, \n", " mode='lines',\n", " marker=go.Marker(color='rgb(10, 20, 30)'),\n", " name='Plotly Regression Line',\n", " line=dict(\n", " color='rgba(10, 10, 10, 1)',\n", " width=1,\n", " dash='longdashdot'\n", " )\n", ")\n", "\n", "layout = go.Layout(\n", " title='Linear Regression Line for Plotly\\'s Package Downloads Growth',\n", " yaxis = dict(\n", " title='No. of downloads (daily)'\n", " ),\n", " xaxis = dict(\n", " title='# days'\n", " ),\n", " annotations=[\n", " dict(\n", " x=85,\n", " y=2000,\n", " xref='x',\n", " yref='y',\n", " text=\"Y = 13.29X - 282.55\",\n", " showarrow=False\n", " )\n", " ]\n", ")\n", "\n", "data = [trace1, trace2, trace3]\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similary, we can find the approximate growth line for 'Matplotlib'." ] }, { "cell_type": "code", "execution_count": 204, "metadata": { "collapsed": true }, "outputs": [], "source": [ "mslope, mintercept, mr_value, mp_value, mstd_err = linregress(xvals, matplotlib_df['total_downloads'])\n", "matplotlib_line = mslope*xvals + mintercept" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Daily download counts for 'Matplotlib' ranges around 7000-8000 as of now." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Let's find out how much time will it take for Plotly to reach that level." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using the Plotly's growth line equation $Y = 13.29X - 282.55$, we can find out the approximate no. of days for downloads to reach 8000." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$Y(8000)$, results in X = 624 (nearest integer value), where current day index is 220 as of Aug 29, 2016.\n", "\n", "#### That means it will take around 404 days for Plotly's download range to reach 8000." ] }, { "cell_type": "code", "execution_count": 229, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# linear regression line for Plotly package downloads\n", "pslope, pintercept, pr_value, pp_value, pstd_err = linregress(xvals, plotly_df['total_downloads'])\n", "plotly_line = pslope*xvals + pintercept\n", "\n", "trace1 = go.Scatter(\n", " x=xvals, \n", " y=plotly_line, \n", " mode='lines',\n", " marker=go.Marker(color='rgb(10, 20, 30)'),\n", " name='Plotly Regression (Actual)',\n", " line=dict(\n", " color='rgba(10, 10, 10, 1)',\n", " width=1,\n", " dash='longdashdot'\n", " )\n", ")\n", "\n", "future_xvals = np.arange(221, 221 + 404)\n", "\n", "trace2 = go.Scatter(\n", " x=future_xvals, \n", " y=pslope*future_xvals+pintercept, \n", " mode='lines',\n", " marker=go.Marker(color='rgb(10, 20, 30)'),\n", " name='Plotly Regression (Prediction)',\n", " line=dict(\n", " color='rgba(10, 10, 10, 1)',\n", " width=1,\n", " dash='dot'\n", " )\n", ")\n", "\n", "layout = go.Layout(\n", " title='Prediction for Plotly\\'s Package Downloads Growth',\n", " yaxis = dict(\n", " title='No. of downloads (daily)'\n", " ),\n", " xaxis = dict(\n", " title='# days'\n", " ),\n", " annotations=[\n", " dict(\n", " x=85,\n", " y=2000,\n", " xref='x',\n", " yref='y',\n", " text=\"Y = 13.29X - 282.55\",\n", " showarrow=False\n", " ),\n", " dict(\n", " x=400,\n", " y=7800,\n", " xref='x',\n", " yref='y',\n", " text=\"Current download range for Matplotlib\",\n", " showarrow=False\n", " )\n", " ],\n", " shapes=[\n", " dict(\n", " type='line',\n", " xref='x',\n", " yref='y',\n", " x0=0,\n", " y0=8000,\n", " x1=624,\n", " y1=8000,\n", " line=dict(\n", " color='rgba(10, 10, 10, 1)',\n", " width=1,\n", " dash='solid'\n", " )\n", " ),\n", " dict(\n", " type='line',\n", " xref='x',\n", " yref='y',\n", " x0=624,\n", " y0=0,\n", " x1=624,\n", " y1=8000,\n", " line=dict(\n", " color='rgba(10, 10, 10, 1)',\n", " width=1,\n", " dash='solid'\n", " )\n", " )\n", " ]\n", ")\n", "\n", "data = [trace1, trace2]\n", "\n", "fig = go.Figure(data=data, layout=layout)\n", "iplot(fig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }