{ "cells": [ { "cell_type": "code", "execution_count": 225, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import zipfile\n", "\n", "import requests # not a dependency of engarde\n", "\n", "import engarde.checks as ck\n", "\n", "%matplotlib inline\n", "pd.options.display.max_rows = 20" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Just a couple quick examples." ] }, { "cell_type": "code", "execution_count": 213, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# This will take a few minutes\n", "\n", "r = requests.get(\"http://www.transtats.bts.gov/Download/On_Time_On_Time_Performance_2015_1.zip\",\n", " stream=True)\n", "with open(\"otp-1.zip\", \"wb\") as f:\n", " for chunk in r.iter_content(chunk_size=1024):\n", " f.write(chunk)\n", " f.flush()\n", "r.close()\n", "\n", "z = zipfile.ZipFile(\"otp-1.zip\")\n", "fp = z.extract('On_Time_On_Time_Performance_2015_1.csv')" ] }, { "cell_type": "code", "execution_count": 144, "metadata": { "collapsed": false }, "outputs": [], "source": [ "columns = ['FlightDate', 'Carrier', 'TailNum', 'FlightNum',\n", " 'Origin', 'OriginCityName', 'OriginStateName',\n", " 'Dest', 'DestCityName', 'DestStateName',\n", " 'DepTime', 'DepDelay', 'TaxiOut', 'WheelsOn',\n", " 'WheelsOn', 'TaxiIn', 'ArrTime', 'ArrDelay',\n", " 'Cancelled', 'Diverted', 'ActualElapsedTime',\n", " 'AirTime', 'Distance', 'CarrierDelay', 'WeatherDelay',\n", " 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']\n", "\n", "df = pd.read_csv('On_Time_On_Time_Performance_2015_1.csv', usecols=columns,\n", " dtype={'DepTime': str})\n", "dep_time = df.DepTime.fillna('').str.pad(4, side='left', fillchar='0')\n", "df['ts'] = pd.to_datetime(df.FlightDate + 'T' + dep_time,\n", " format='%Y-%m-%dT%H%M%S')\n", "df = df.drop(['FlightDate', 'DepTime'], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's suppose that down the road our probram can only handle certain carriers; an update to the data adding a new carrier would violate an assumpetion we hold. We'll use the `within_set` method to check our assumption" ] }, { "cell_type": "code", "execution_count": 226, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "WN 100042\n", "DL 64421\n", "EV 49925\n", "OO 48114\n", "AA 44059\n", "dtype: int64" ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "carriers = ['AA', 'AS', 'B6', 'DL', 'US', 'VX', 'WN', 'UA', 'NK', 'MQ', 'OO',\n", " 'EV', 'HA', 'F9']\n", "df.pipe(ck.within_set, items={'Carrier': carriers}).Carrier.value_counts().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Great, our assumption was true (at least for now).\n", "\n", "Surely, we can't count on each flight having a `Carrier`, `TailNum` and `FlightNum`, right?" ] }, { "cell_type": "code", "execution_count": 217, "metadata": { "collapsed": false }, "outputs": [ { "ename": "AssertionError", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpipe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mck\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnone_missing\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Carrier'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'TailNum'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'FlightNum'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/Users/tom.augspurger/Envs/py3/lib/python3.4/site-packages/pandas-0.16.2_11_gd8a2f30-py3.4-macosx-10.10-x86_64.egg/pandas/core/generic.py\u001b[0m in \u001b[0;36mpipe\u001b[0;34m(self, func, *args, **kwargs)\u001b[0m\n\u001b[1;32m 2111\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2112\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2113\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2115\u001b[0m \u001b[0;31m#----------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/tom.augspurger/sandbox/engarde/engarde/checks.py\u001b[0m in \u001b[0;36mnone_missing\u001b[0;34m(df, columns)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAssertionError\u001b[0m: " ] } ], "source": [ "df.pipe(ck.none_missing, columns=['Carrier', 'TailNum', 'FlightNum'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: this isn't too user-friendly yet. I'm planning to make the error messages more informative. Just haven't gotten there yet.\n", "That said, you wouldn't really use `engarde` to *determine* whether or not those columns are always not null. Instead, you might find that for January every flight has all of those fields, assume that hold generally, only to be surprised when next month you get a flight without a tail number." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Decorator Interface" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Each of your `checks` can also be written as a decorator on a function that returns a `DataFrame`.\n", "I *really* like how slick this is.\n", "\n", "Let's do a nonsense example. Suppose we want to show the counts of each `Carrier`, but our UI designer\n", "worries that if things are too spread out the bar graph will look weird (again, silly example). We'll\n", "assert that teh counts are within a comfortable range and that each count is within 3 standard deviations of the mean." ] }, { "cell_type": "code", "execution_count": 266, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import engarde.decorators as ed\n", "\n", "@ed.within_range({'Counts!': (4000, 110000)})\n", "@ed.within_n_std(3)\n", "def pretty_counts(df):\n", " return df.Carrier.value_counts().to_frame(name='Counts!')" ] }, { "cell_type": "code", "execution_count": 268, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Counts!
WN100042
DL64421
EV49925
OO48114
AA44059
UA38395
US33489
MQ29900
B621623
AS13257
NK8743
F96829
HA6440
VX4731
\n", "
" ], "text/plain": [ " Counts!\n", "WN 100042\n", "DL 64421\n", "EV 49925\n", "OO 48114\n", "AA 44059\n", "UA 38395\n", "US 33489\n", "MQ 29900\n", "B6 21623\n", "AS 13257\n", "NK 8743\n", "F9 6829\n", "HA 6440\n", "VX 4731" ] }, "execution_count": 268, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pretty_counts(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "No `AssertionError` was raised so we're all good.\n", "\n", "I'll typically find myself with a handful of functions that define my ETL process from a flat file to\n", "whatever the end result is. Each function takes and returns a `DataFrame`. This can be nicely layered into a pipeline. Using decorators allows you to make the checks without breaking up the flow of the pipeline." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }