{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\"Dask\n",
    "\n",
    "DataFrames on a Cluster\n",
    "=======================\n",
    "\n",
    "\"Pandas\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parallelize Pandas with Dask.dataframe\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel only needs to run it once.\n",
    "import dask.dataframe as dd\n",
    "import pandas as pd\n",
    "from dask.distributed import Client, progress"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Everything a reader may need to change to run this elsewhere.\n",
    "\n",
    "# host:port of an already-running dask.distributed scheduler.\n",
    "SCHEDULER_ADDRESS = '10.200.30.241:8786'\n",
    "\n",
    "# Single CSV read eagerly with pandas, used only to infer column dtypes.\n",
    "DTYPE_SAMPLE_CSV = '/data/jcrist/airline/1990.csv'\n",
    "\n",
    "# Glob of the CSV files that make up the dask DataFrame.\n",
    "AIRLINE_CSV_GLOB = '/data/jcrist/airline/198*.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "client = Client(SCHEDULER_ADDRESS)\n",
    "\n",
    "# Start from a clean cluster. NOTE: restart() discards any work and data\n",
    "# currently held by the workers, so do not run this on a shared scheduler.\n",
    "client.restart()\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read one file with pandas to learn the column dtypes, then hand those\n",
    "# dtypes to dask so every file matched by the glob is parsed the same way.\n",
    "dtype_sample = pd.read_csv(DTYPE_SAMPLE_CSV)\n",
    "dtypes = dtype_sample.dtypes.to_dict()\n",
    "\n",
    "# Lazy: this only builds a task graph, no CSV data is loaded yet.\n",
    "df_lazy = dd.read_csv(AIRLINE_CSV_GLOB, dtype=dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Ask the cluster to load the data and keep it on the workers. persist()\n",
    "# returns right away; `df` is the frame used in the rest of the notebook.\n",
    "df = client.persist(df_lazy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Progress bar for the background load started by persist().\n",
    "progress(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Dask DataFrames\n",
    "---------------\n",
    "\n",
    "* Coordinate many Pandas DataFrames across a cluster\n",
    "* Faithfully implement a subset of the Pandas API\n",
    "* Use Pandas under the hood (for speed and maturity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%time len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Top 10 airports by mean departure delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Lazy expression: nothing is computed until .compute() in the next cell.\n",
    "top10_mean_delay = df.DepDelay.groupby(df.Origin).mean().nlargest(10)\n",
    "top10_mean_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "top10_mean_delay.compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Maximum departure delay from EWR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Lazy expression: filter to rows whose Origin is 'EWR', then take the max.\n",
    "ewr_max_delay = df.DepDelay[df.Origin == 'EWR'].max()\n",
    "ewr_max_delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ewr_max_delay.compute()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}