{ "cells": [ { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Data handling\n", "import pandas as pd\n", "import numpy as np\n", "import dask.dataframe as dd\n", "\n", "# Plotting\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Show at most 10 rows when displaying a DataFrame; longer frames are truncated.\n", "pd.options.display.max_rows = 10" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Render matplotlib figures inline in the notebook output.\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 taugspurger staff 2.4G Sep 9 06:14 data/yellow_tripdata_2009-01.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.2G Sep 9 10:56 data/yellow_tripdata_2009-02.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.4G Sep 9 11:03 data/yellow_tripdata_2009-03.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.4G Sep 9 11:10 data/yellow_tripdata_2009-04.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.5G Sep 9 11:17 data/yellow_tripdata_2009-05.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.4G Sep 9 11:23 data/yellow_tripdata_2009-06.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.3G Sep 9 11:30 data/yellow_tripdata_2009-07.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.3G Sep 9 11:36 data/yellow_tripdata_2009-08.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.4G Sep 9 11:44 data/yellow_tripdata_2009-09.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.6G Sep 9 11:52 data/yellow_tripdata_2009-10.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.4G Sep 9 11:59 data/yellow_tripdata_2009-11.csv\r\n", "-rw-r--r-- 1 taugspurger staff 2.5G Sep 9 12:07 data/yellow_tripdata_2009-12.csv\r\n" ] } ], "source": [ "# List the monthly trip CSVs with their sizes. The explicit shell escape keeps this\n", "# working even when automagic is off or a variable named ls exists.\n", "!ls -lh data/*.csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 4s, sys: 5.2 s, total: 1min 9s\n", "Wall time: 1min 9s\n" ] } ], 
"source": [ "%%time\n", "# Read the two label columns as pandas categoricals rather than plain strings.\n", "dtype = {\n", "    'vendor_name': 'category',\n", "    'Payment_Type': 'category',\n", "}\n", "\n", "# Load the 2009-01 file with pandas, parsing both trip timestamp columns as datetimes.\n", "# NOTE(review): the first rows of this frame show Payment_Type values that differ only\n", "# by case (CASH, Credit, CREDIT); as categoricals they stay separate categories --\n", "# confirm whether they should be normalized.\n", "df = pd.read_csv(\n", "    \"data/yellow_tripdata_2009-01.csv\",\n", "    dtype=dtype,\n", "    parse_dates=['Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime'],\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | vendor_name | \n", "Trip_Pickup_DateTime | \n", "Trip_Dropoff_DateTime | \n", "Passenger_Count | \n", "Trip_Distance | \n", "Start_Lon | \n", "Start_Lat | \n", "Rate_Code | \n", "store_and_forward | \n", "End_Lon | \n", "End_Lat | \n", "Payment_Type | \n", "Fare_Amt | \n", "surcharge | \n", "mta_tax | \n", "Tip_Amt | \n", "Tolls_Amt | \n", "Total_Amt | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "VTS | \n", "2009-01-04 02:52:00 | \n", "2009-01-04 03:02:00 | \n", "1 | \n", "2.63 | \n", "-73.991957 | \n", "40.721567 | \n", "NaN | \n", "NaN | \n", "-73.993803 | \n", "40.695922 | \n", "CASH | \n", "8.9 | \n", "0.5 | \n", "NaN | \n", "0.00 | \n", "0.0 | \n", "9.40 | \n", "
1 | \n", "VTS | \n", "2009-01-04 03:31:00 | \n", "2009-01-04 03:38:00 | \n", "3 | \n", "4.55 | \n", "-73.982102 | \n", "40.736290 | \n", "NaN | \n", "NaN | \n", "-73.955850 | \n", "40.768030 | \n", "Credit | \n", "12.1 | \n", "0.5 | \n", "NaN | \n", "2.00 | \n", "0.0 | \n", "14.60 | \n", "
2 | \n", "VTS | \n", "2009-01-03 15:43:00 | \n", "2009-01-03 15:57:00 | \n", "5 | \n", "10.35 | \n", "-74.002587 | \n", "40.739748 | \n", "NaN | \n", "NaN | \n", "-73.869983 | \n", "40.770225 | \n", "Credit | \n", "23.7 | \n", "0.0 | \n", "NaN | \n", "4.74 | \n", "0.0 | \n", "28.44 | \n", "
3 | \n", "DDS | \n", "2009-01-01 20:52:58 | \n", "2009-01-01 21:14:00 | \n", "1 | \n", "5.00 | \n", "-73.974267 | \n", "40.790955 | \n", "NaN | \n", "NaN | \n", "-73.996558 | \n", "40.731849 | \n", "CREDIT | \n", "14.9 | \n", "0.5 | \n", "NaN | \n", "3.05 | \n", "0.0 | \n", "18.45 | \n", "
4 | \n", "DDS | \n", "2009-01-24 16:18:23 | \n", "2009-01-24 16:24:56 | \n", "1 | \n", "0.40 | \n", "-74.001580 | \n", "40.719382 | \n", "NaN | \n", "NaN | \n", "-74.008378 | \n", "40.720350 | \n", "CASH | \n", "3.7 | \n", "0.0 | \n", "NaN | \n", "0.00 | \n", "0.0 | \n", "3.70 | \n", "