{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": "## Manipulating data" }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n%matplotlib inline\ndata = pd.read_csv('data/nyc_data.csv', parse_dates=['pickup_datetime',\n 'dropoff_datetime'])\nfare = pd.read_csv('data/nyc_fare.csv', parse_dates=['pickup_datetime'])" }, { "cell_type": "markdown", "metadata": {}, "source": "### Selecting data" }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": " trip_distance trip_time_in_secs\n0 0.61 300\n1 3.28 960\n2 1.50 386" }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": "data[['trip_distance', 'trip_time_in_secs']].head(3)" }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": "medallion 76942C3205E17D7E7FE5A9F709D16434\nhack_license 25BA06A87905667AA1FE5990E33F0E2E\nvendor_id VTS\nrate_code 1\nstore_and_fwd_flag NaN\npickup_datetime 2013-01-01 00:00:00\ndropoff_datetime 2013-01-01 00:05:00\npassenger_count 3\ntrip_time_in_secs 300\ntrip_distance 0.61\npickup_longitude -73.95592\npickup_latitude 40.78189\ndropoff_longitude -73.96318\ndropoff_latitude 40.77783\nName: 0, dtype: object" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": "data.loc[0]" }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": "data.loc[[0, 100000]]" }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": " trip_distance trip_time_in_secs\n1000 1.00 441\n1010 3.80 691\n....\n1990 0.13 60\n2000 9.60 963" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": "data.loc[1000:2000:10,\n ['trip_distance', 'trip_time_in_secs']]" }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": "data.loc[data.trip_distance>50]" }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": "from ipywidgets import interact" }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": "@interact\ndef show_nrows(distance_threshold=(0, 200)):\n return len(data.loc[data.trip_distance > distance_threshold])" }, { "cell_type": "markdown", "metadata": {}, "source": "### Computing with numbers" }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": "data['trip_time_in_mins'] = data.trip_time_in_secs / 60.0" }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": " trip_time_in_secs trip_time_in_mins\n0 300 5.000000\n1 960 16.000000\n2 386 6.433333" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": "data[['trip_time_in_secs', 'trip_time_in_mins']].head(3)" }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 0.61\n1 3.28\n2 1.50\n3 0.00\n4 1.31\nName: trip_distance, dtype: float64" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": "a = data.trip_distance[:5]\na" }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": "2 1.50\n3 0.00\n4 1.31\n5 5.81\nName: trip_distance, dtype: float64" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": "b = data.trip_distance[2:6]\nb" }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 NaN\n1 NaN\n2 3.00\n3 0.00\n4 2.62\n5 NaN\nName: trip_distance, dtype: float64" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": "a + b" }, { "cell_type": "markdown", "metadata": {}, "source": "### Working with text" }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 76942C3205E17D7E7FE5A9F709D16434\n1 517C6B330DBB3F055D007B07512628B3\n2 ED15611F168E41B33619C83D900FE266\nName: medallion, dtype: object" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": "data.medallion.head(3)" }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 7694\n1 517C\n2 ED15\nName: medallion, dtype: object" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": "data.medallion.str.slice(0, 4).head(3)" }, { "cell_type": "markdown", "metadata": {}, "source": "### Working with dates and times" }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": "0 1\n200000 6\n400000 5\n600000 0\n800000 1\ndtype: int64" }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": "data.pickup_datetime.dt.dayofweek[::200000]" }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": "7716" }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": "day_p = data.pickup_datetime.dt.day\nday_d = data.dropoff_datetime.dt.day\nselection = (day_p != day_d)\nprint(len(data.loc[selection]))\ndata.loc[selection].head(3)" }, { "cell_type": "markdown", "metadata": {}, "source": "### Handling missing data" } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 0 }