{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Plotly demo with spark\n", "\n", "https://plot.ly/python/apache-spark/\n", "\n", " **Table of Contents**\n", "\n", "
\n", " \n", " \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Check pyspark is loaded correctly" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/takanori/mybin/spark-2.0.0-bin-hadoop2.7\n" ] } ], "source": [ "import os\n", "import sys\n", "\n", "spark_home = os.environ.get('SPARK_HOME', None)\n", "print spark_home" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from __future__ import print_function #python 3 support\n", "print(sc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Spark Context object loaded nicely :).\n", "\n", "How about ``sqlcontext``?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from pyspark.sql import SQLContext\n", "sqlContext = SQLContext(sc)\n", "print(sqlContext)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Download bike data" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "--2016-09-22 15:49:51-- https://raw.githubusercontent.com/anabranch/Interactive-Graphs-with-Plotly/master/btd2.json\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.20.133\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.20.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 41125386 (39M) [text/plain]\n", "Saving to: ‘btd2.json’\n", "\n", " 0K .......... .......... .......... .......... .......... 0% 219M 0s\n", " 50K .......... .......... .......... .......... .......... 0% 402M 0s\n", " 100K .......... .......... .......... .......... .......... 0% 408M 0s\n", " 150K .......... .......... .......... .......... .......... 0% 382M 0s\n", " 200K .......... .......... .......... .......... .......... 0% 8.15M 1s\n", " 250K .......... .......... .......... .......... .......... 0% 7.07M 2s\n", " 300K .......... .......... .......... .......... .......... 0% 16.5M 2s\n", " 350K .......... .......... .......... .......... .......... 0% 4.44M 3s\n", " 400K .......... .......... .......... .......... .......... 1% 7.13M 3s\n", " 450K .......... .......... .......... .......... .......... 1% 14.0M 3s\n", " 500K .......... .......... .......... .......... .......... 1% 6.76M 3s\n", " 550K .......... .......... .......... .......... .......... 1% 6.98M 3s\n", " 600K .......... .......... .......... .......... .......... 1% 7.04M 4s\n", " 650K .......... .......... .......... .......... .......... 1% 20.3M 3s\n", " 700K .......... .......... .......... .......... .......... 1% 7.09M 4s\n", " 750K .......... .......... .......... .......... .......... 1% 6.18M 4s\n", " 800K .......... .......... .......... .......... .......... 2% 7.69M 4s\n", " 850K .......... .......... .......... .......... .......... 2% 7.78M 4s\n", " 900K .......... .......... .......... .......... .......... 2% 17.9M 4s\n", " 950K .......... .......... .......... .......... .......... 2% 6.93M 4s\n", " 1000K .......... .......... .......... .......... .......... 2% 6.80M 4s\n", " 1050K .......... .......... .......... .......... .......... 2% 6.88M 4s\n", " 1100K .......... .......... .......... .......... .......... 2% 6.87M 4s\n", " 1150K .......... .......... .......... .......... .......... 2% 6.29M 4s\n", " 1200K .......... .......... .......... .......... .......... 3% 67.3M 4s\n", " 1250K .......... .......... .......... .......... .......... 3% 6.84M 4s\n", " 1300K .......... .......... .......... .......... .......... 3% 6.09M 4s\n", " 1350K .......... .......... .......... .......... .......... 3% 6.70M 4s\n", " 1400K .......... .......... .......... .......... .......... 3% 54.7M 4s\n", " 1450K .......... .......... .......... .......... .......... 3% 6.91M 4s\n", " 1500K .......... .......... .......... .......... .......... 3% 6.87M 4s\n", " 1550K .......... .......... .......... .......... .......... 3% 6.39M 4s\n", " 1600K .......... .......... .......... .......... .......... 4% 6.70M 4s\n", " 1650K .......... .......... .......... .......... .......... 4% 7.22M 4s\n", " 1700K .......... .......... .......... .......... .......... 4% 37.6M 4s\n", " 1750K .......... .......... .......... .......... .......... 4% 6.87M 4s\n", " 1800K .......... .......... .......... .......... .......... 4% 6.53M 4s\n", " 1850K .......... .......... .......... .......... .......... 4% 6.78M 4s\n", " 1900K .......... .......... .......... .......... .......... 4% 7.13M 4s\n", " 1950K .......... .......... .......... .......... .......... 4% 6.80M 4s\n", " 2000K .......... .......... .......... .......... .......... 5% 40.6M 4s\n", " 2050K .......... .......... .......... .......... .......... 5% 6.46M 4s\n", " 2100K .......... .......... .......... .......... .......... 5% 6.77M 4s\n", " 2150K .......... .......... .......... .......... .......... 5% 7.28M 4s\n", " 2200K .......... .......... .......... .......... .......... 5% 37.0M 4s\n", " 2250K .......... .......... .......... .......... .......... 5% 6.86M 4s\n", " 2300K .......... .......... .......... .......... .......... 5% 6.85M 4s\n", " 2350K .......... .......... .......... .......... .......... 5% 6.71M 4s\n", " 2400K .......... .......... .......... .......... .......... 6% 8.05M 4s\n", " 2450K .......... .......... .......... .......... .......... 6% 21.1M 4s\n", " 2500K .......... .......... .......... .......... .......... 6% 6.76M 4s\n", " 2550K .......... .......... .......... .......... .......... 6% 6.85M 4s\n", " 2600K .......... .......... .......... .......... .......... 6% 6.80M 4s\n", " 2650K .......... .......... .......... .......... .......... 6% 46.5M 4s\n", " 2700K .......... .......... .......... .......... .......... 6% 6.40M 4s\n", " 2750K .......... .......... .......... .......... .......... 6% 7.18M 4s\n", " 2800K .......... .......... .......... .......... .......... 7% 7.46M 4s\n", " 2850K .......... .......... .......... .......... .......... 7% 6.73M 4s\n", " 2900K .......... .......... .......... .......... .......... 7% 7.66M 4s\n", " 2950K .......... .......... .......... .......... .......... 7% 26.6M 4s\n", " 3000K .......... .......... .......... .......... .......... 7% 6.83M 4s\n", " 3050K .......... .......... .......... .......... .......... 7% 6.77M 4s\n", " 3100K .......... .......... .......... .......... .......... 7% 8.92M 4s\n", " 3150K .......... .......... .......... .......... .......... 7% 5.43M 4s\n", " 3200K .......... .......... .......... .......... .......... 8% 57.0M 4s\n", " 3250K .......... .......... .......... .......... .......... 8% 6.69M 4s\n", " 3300K .......... .......... .......... .......... .......... 8% 6.77M 4s\n", " 3350K .......... .......... .......... .......... .......... 8% 6.45M 4s\n", " 3400K .......... .......... .......... .......... .......... 8% 66.7M 4s\n", " 3450K .......... .......... .......... .......... .......... 8% 7.07M 4s\n", " 3500K .......... .......... .......... .......... .......... 8% 6.26M 4s\n", " 3550K .......... .......... .......... .......... .......... 8% 6.28M 4s\n", " 3600K .......... .......... .......... .......... .......... 9% 2.88M 4s\n", " 3650K .......... .......... .......... .......... .......... 9% 32.5M 4s\n", " 3700K .......... .......... .......... .......... .......... 9% 6.80M 4s\n", " 3750K .......... .......... .......... .......... .......... 9% 6.32M 4s\n", " 3800K .......... .......... .......... .......... .......... 9% 7.36M 4s\n", " 3850K .......... .......... .......... .......... .......... 9% 54.7M 4s\n", " 3900K .......... .......... .......... .......... .......... 9% 6.66M 4s\n", " 3950K .......... .......... .......... .......... .......... 9% 6.79M 4s\n", " 4000K .......... .......... .......... .......... .......... 10% 6.63M 4s\n", " 4050K .......... .......... .......... .......... .......... 10% 6.87M 4s\n", " 4100K .......... .......... .......... .......... .......... 10% 27.6M 4s\n", " 4150K .......... .......... .......... .......... .......... 10% 7.89M 4s\n", " 4200K .......... .......... .......... .......... .......... 10% 5.85M 4s\n", " 4250K .......... .......... .......... .......... .......... 10% 6.54M 4s\n", " 4300K .......... .......... .......... .......... .......... 10% 49.3M 4s\n", " 4350K .......... .......... .......... .......... .......... 10% 5.06M 4s\n", " 4400K .......... .......... .......... .......... .......... 11% 6.75M 4s\n", " 4450K .......... .......... .......... .......... .......... 11% 6.74M 4s\n", " 4500K .......... .......... .......... .......... .......... 11% 11.4M 4s\n", " 4550K .......... .......... .......... .......... .......... 11% 7.05M 4s\n", " 4600K .......... .......... .......... .......... .......... 11% 12.6M 4s\n", " 4650K .......... .......... .......... .......... .......... 11% 6.86M 4s\n", " 4700K .......... .......... .......... .......... .......... 11% 7.81M 4s\n", " 4750K .......... .......... .......... .......... .......... 11% 6.13M 4s\n", " 4800K .......... .......... .......... .......... .......... 12% 9.07M 4s\n", " 4850K .......... .......... .......... .......... .......... 12% 6.91M 4s\n", " 4900K .......... .......... .......... .......... .......... 12% 3.98M 4s\n", " 4950K .......... .......... .......... .......... .......... 12% 7.10M 4s\n", " 5000K .......... .......... .......... .......... .......... 12% 13.0M 4s\n", " 5050K .......... .......... .......... .......... .......... 12% 6.49M 4s\n", " 5100K .......... .......... .......... .......... .......... 12% 12.4M 4s\n", " 5150K .......... .......... .......... .......... .......... 12% 6.33M 4s\n", " 5200K .......... .......... .......... .......... .......... 13% 6.70M 4s\n", " 5250K .......... .......... .......... .......... .......... 13% 8.85M 4s\n", " 5300K .......... .......... .......... .......... .......... 13% 6.77M 4s\n", " 5350K .......... .......... .......... .......... .......... 13% 9.48M 4s\n", " 5400K .......... .......... .......... .......... .......... 13% 9.72M 4s\n", " 5450K .......... .......... .......... .......... .......... 13% 7.08M 4s\n", " 5500K .......... .......... .......... .......... .......... 13% 7.86M 4s\n", " 5550K .......... .......... .......... .......... .......... 13% 6.09M 4s\n", " 5600K .......... .......... .......... .......... .......... 14% 7.02M 4s\n", " 5650K .......... .......... .......... .......... .......... 14% 24.9M 4s\n", " 5700K .......... .......... .......... .......... .......... 14% 6.75M 4s\n", " 5750K .......... .......... .......... .......... .......... 14% 8.10M 4s\n", " 5800K .......... .......... .......... .......... .......... 14% 6.20M 4s\n", " 5850K .......... .......... .......... .......... .......... 14% 13.4M 4s\n", " 5900K .......... .......... .......... .......... .......... 14% 6.57M 4s\n", " 5950K .......... .......... .......... .......... .......... 14% 4.86M 4s\n", " 6000K .......... .......... .......... .......... .......... 15% 22.3M 4s\n", " 6050K .......... .......... .......... .......... .......... 15% 6.84M 4s\n", " 6100K .......... .......... .......... .......... .......... 15% 6.42M 4s\n", " 6150K .......... .......... .......... .......... .......... 15% 9.29M 4s\n", " 6200K .......... .......... .......... .......... .......... 15% 5.37M 4s\n", " 6250K .......... .......... .......... .......... .......... 15% 48.8M 4s\n", " 6300K ........**OUTPUT MUTED**" ] } ], "source": [ "%%bash\n", "wget https://raw.githubusercontent.com/anabranch/Interactive-Graphs-with-Plotly/master/btd2.json" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "btd = sqlContext.read.json('btd2.json')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(btd))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- Bike #: string (nullable = true)\n", " |-- Duration: string (nullable = true)\n", " |-- End Date: string (nullable = true)\n", " |-- End Station: string (nullable = true)\n", " |-- End Terminal: string (nullable = true)\n", " |-- Start Date: string (nullable = true)\n", " |-- Start Station: string (nullable = true)\n", " |-- Start Terminal: string (nullable = true)\n", " |-- Subscription Type: string (nullable = true)\n", " |-- Trip ID: string (nullable = true)\n", " |-- Zip Code: string (nullable = true)\n", "\n" ] } ], "source": [ "btd.printSchema()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[Row(Bike #=u'520', Duration=u'63', End Date=u'8/29/13 14:14', End Station=u'South Van Ness at Market', End Terminal=u'66', Start Date=u'8/29/13 14:13', Start Station=u'South Van Ness at Market', Start Terminal=u'66', Subscription Type=u'Subscriber', Trip ID=u'4576', Zip Code=u'94127'),\n", " Row(Bike #=u'661', Duration=u'70', End Date=u'8/29/13 14:43', End Station=u'San Jose City Hall', End Terminal=u'10', Start Date=u'8/29/13 14:42', Start Station=u'San Jose City Hall', Start Terminal=u'10', Subscription Type=u'Subscriber', Trip ID=u'4607', Zip Code=u'95138'),\n", " Row(Bike #=u'48', Duration=u'71', End Date=u'8/29/13 10:17', End Station=u'Mountain View City Hall', End Terminal=u'27', Start Date=u'8/29/13 10:16', Start Station=u'Mountain View City Hall', Start Terminal=u'27', Subscription Type=u'Subscriber', Trip ID=u'4130', Zip Code=u'97214')]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "btd.take(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Register dataframe as table to use SQL commands!" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sqlContext.registerDataFrameAsTable(btd, \"bay_area_bike\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# now i can use sql commands with table named *bay_area_bike*\n", "df2 = sqlContext.sql(\"SELECT Duration as d1 from bay_area_bike where Duration < 7200\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- d1: string (nullable = true)\n", "\n" ] } ], "source": [ "df2.printSchema()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now let's visualize!" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = Data([Histogram(x=df2.toPandas()['d1'])])\n", "\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/takanori/.local/lib/python2.7/site-packages/plotly/plotly/plotly.py:236: UserWarning:\n", "\n", "Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:\n", "(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.\n", "(2) Trying using the image API to return an image instead of a graph URL\n", "(3) Use matplotlib\n", "(4) See if you can create your visualization with fewer data points\n", "\n", "If the visualization you're using aggregates points (e.g., box plot, histogram, etc.) you can disregard this warning.\n", "\n" ] }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "py.iplot(data, filename=\"spark/less_2_hour_rides\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "That was simple and we can see that plotly was able to handle the data without issue. We can see that big uptick in rides that last less than ~30 minutes (2000 seconds) - so let's look at that distribution.\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df3 = sqlContext.sql(\"SELECT Duration as d1 from bay_area_bike where Duration < 2000\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "A great thing about Apache Spark is that you can sample easily from large datasets, you just set the amount you would like to sample and you're all set.\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "s1 = df2.sample(False, 0.05, 20)\n", "s2 = df3.sample(False, 0.05, 2500)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = Data([\n", " Histogram(x=s1.toPandas()['d1'], name=\"Large Sample\"),\n", " Histogram(x=s2.toPandas()['d1'], name=\"Small Sample\")\n", " ])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Plotly converts those samples into beautifully overlayed histograms. This is a great way to eyeball different distributions." ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "py.iplot(data, filename=\"spark/sample_rides\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What's really powerful about Plotly is sharing this data is simple. I can take the above graph and change the styling or bins visually. A common workflow is to make a rough sketch of the graph in code, then make a more refined version with notes to share with management like the one below. Plotly's online interface allows you to edit graphs in other languages as well." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import plotly.tools as tls\n", "tls.embed(\"https://plot.ly/~bill_chambers/101\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# PySpark Dataframe to Pandas Dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's check out bike rentals from individual stations. We can do a groupby with Spark DataFrames just as we might in Pandas. We've also seen at this point how easy it is to convert a Spark DataFrame to a pandas DataFrame." ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/takanori/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning:\n", "\n", "sort(columns=....) is deprecated, use sort_values(by=.....)\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Start Stationcount
64San Francisco Caltrain (Townsend at 4th)9838
50Harry Bridges Plaza (Ferry Building)7343
54Embarcadero at Sansome6545
6Market at Sansome5922
27Temporary Transbay Terminal (Howard at Beale)5113
26Market at 4th5030
662nd at Townsend4987
58San Francisco Caltrain 2 (330 Townsend)4976
28Steuart at Market4913
14Townsend at 7th4493
612nd at South Park4458
48Grant Avenue at Columbus Avenue4004
36Powell Street BART3836
02nd at Folsom3776
30South Van Ness at Market3521
40Market at 10th3511
32Embarcadero at Bryant3497
7Spear at Folsom3423
60Howard at 2nd3263
15Civic Center BART (7th at Market)3074
44Beale at Market3057
24Embarcadero at Folsom2931
51Mechanics Plaza (Market at Battery)2868
10Commercial at Montgomery2834
3Powell at Post (Union Square)2824
65Embarcadero at Vallejo2785
455th at Howard2635
19Post at Kearney2503
5Yerba Buena Center of the Arts (3rd @ Howard)2487
38Clay at Battery2419
.........
46San Pedro Square715
25Mountain View City Hall630
12San Salvador at 1st597
8MLK Library528
9Japantown496
21SJSU - San Salvador at 9th489
39University and Emerson434
23Palo Alto Caltrain Station431
68SJSU 4th at San Carlos389
33Redwood City Caltrain Station378
67St James Park366
63Cowper at University355
18San Jose Civic Center346
53Arena Green / SAP Center339
42Adobe on Almaden335
1California Ave Caltrain Station297
13Rengstorff Avenue / California Street248
31San Antonio Caltrain Station238
37Evelyn Park and Ride218
43Broadway St at Battery St201
59Park at Olive189
56Castro Street and El Camino Real132
47Redwood City Medical Center123
57San Antonio Shopping Center108
49San Mateo County Center101
62Franklin at Maple99
17Broadway at Main45
41Redwood City Public Library44
22San Jose Government Center23
20Mezes Park3
\n", "

69 rows × 2 columns

\n", "
" ], "text/plain": [ " Start Station count\n", "64 San Francisco Caltrain (Townsend at 4th) 9838\n", "50 Harry Bridges Plaza (Ferry Building) 7343\n", "54 Embarcadero at Sansome 6545\n", "6 Market at Sansome 5922\n", "27 Temporary Transbay Terminal (Howard at Beale) 5113\n", "26 Market at 4th 5030\n", "66 2nd at Townsend 4987\n", "58 San Francisco Caltrain 2 (330 Townsend) 4976\n", "28 Steuart at Market 4913\n", "14 Townsend at 7th 4493\n", "61 2nd at South Park 4458\n", "48 Grant Avenue at Columbus Avenue 4004\n", "36 Powell Street BART 3836\n", "0 2nd at Folsom 3776\n", "30 South Van Ness at Market 3521\n", "40 Market at 10th 3511\n", "32 Embarcadero at Bryant 3497\n", "7 Spear at Folsom 3423\n", "60 Howard at 2nd 3263\n", "15 Civic Center BART (7th at Market) 3074\n", "44 Beale at Market 3057\n", "24 Embarcadero at Folsom 2931\n", "51 Mechanics Plaza (Market at Battery) 2868\n", "10 Commercial at Montgomery 2834\n", "3 Powell at Post (Union Square) 2824\n", "65 Embarcadero at Vallejo 2785\n", "45 5th at Howard 2635\n", "19 Post at Kearney 2503\n", "5 Yerba Buena Center of the Arts (3rd @ Howard) 2487\n", "38 Clay at Battery 2419\n", ".. ... ...\n", "46 San Pedro Square 715\n", "25 Mountain View City Hall 630\n", "12 San Salvador at 1st 597\n", "8 MLK Library 528\n", "9 Japantown 496\n", "21 SJSU - San Salvador at 9th 489\n", "39 University and Emerson 434\n", "23 Palo Alto Caltrain Station 431\n", "68 SJSU 4th at San Carlos 389\n", "33 Redwood City Caltrain Station 378\n", "67 St James Park 366\n", "63 Cowper at University 355\n", "18 San Jose Civic Center 346\n", "53 Arena Green / SAP Center 339\n", "42 Adobe on Almaden 335\n", "1 California Ave Caltrain Station 297\n", "13 Rengstorff Avenue / California Street 248\n", "31 San Antonio Caltrain Station 238\n", "37 Evelyn Park and Ride 218\n", "43 Broadway St at Battery St 201\n", "59 Park at Olive 189\n", "56 Castro Street and El Camino Real 132\n", "47 Redwood City Medical Center 123\n", "57 San Antonio Shopping Center 108\n", "49 San Mateo County Center 101\n", "62 Franklin at Maple 99\n", "17 Broadway at Main 45\n", "41 Redwood City Public Library 44\n", "22 San Jose Government Center 23\n", "20 Mezes Park 3\n", "\n", "[69 rows x 2 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dep_stations = btd.groupBy(btd['Start Station']).count().toPandas().sort('count', ascending=False)\n", "dep_stations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "Now that we've got a better sense of which stations might be interesting to look at, let's graph out, the number of trips leaving from the top two stations over time.\n" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "64 San Francisco Caltrain (Townsend at 4th)\n", "50 Harry Bridges Plaza (Ferry Building)\n", "54 Embarcadero at Sansome\n", "Name: Start Station, dtype: object" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dep_stations['Start Station'][:3] # top 3 stations" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def transform_df(df):\n", " df['counts'] = 1\n", " df['Start Date'] = df['Start Date'].apply(pd.to_datetime)\n", " return df.set_index('Start Date').resample('D', how='sum')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/takanori/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning:\n", "\n", "how in .resample() is deprecated\n", "the new syntax is .resample(...).sum()\n", "\n" ] } ], "source": [ "pop_stations = [] # being popular stations - we could easily extend this to more stations\n", "for station in dep_stations['Start Station'][:3]:\n", " temp = transform_df(btd.where(btd['Start Station'] == station).select(\"Start Date\").toPandas())\n", " pop_stations.append(\n", " Scatter(\n", " x=temp.index,\n", " y=temp.counts,\n", " name=station\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "\n", "data = Data(pop_stations)\n", "py.iplot(data, filename=\"spark/over_time\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }