{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## Complex operations"
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn\n%matplotlib inline\n# Load both CSV tables, parsing the listed timestamp columns as datetimes.\ndata = pd.read_csv('data/nyc_data.csv',\n                   parse_dates=['pickup_datetime', 'dropoff_datetime'])\nfare = pd.read_csv('data/nyc_fare.csv', parse_dates=['pickup_datetime'])"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### Group-by"
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": "# Group the rows by the ISO week number of the pickup timestamp.\n# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;\n# `dt.isocalendar().week` (pandas >= 1.1) yields the same week numbers.\nweekly = data.groupby(data.pickup_datetime.dt.isocalendar().week)"
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "52"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": "len(weekly)"
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "1 17042\n2 15941\n3 17017\ndtype: int64"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": "# Number of rows in each weekly group.\ny = weekly.size()\ny.head(3)"
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "1 2013-01-01 00:00:00\n2 2013-01-07 00:03:00\n3 2013-01-14 00:00:51\nName: pickup_datetime, dtype: datetime64[ns]"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": "# First pickup timestamp (in row order) of each weekly group; used below as\n# the x coordinate of that week.\nx = weekly.pickup_datetime.first()\nx.head(3)"
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": "# Plot the weekly counts against each week's first pickup timestamp.\npd.Series(y.values, index=x).plot()\nplt.ylim(0)  # Set the lower y value to 0.\nplt.xlabel('Week')  # Label of the x axis.\nplt.ylabel('Taxi rides')  # Label of the y axis."
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### Joins"
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "13407\n tip_amount\nmedallion\n00005007A9F30E289E760362F69E4EAD 1.815854\n000318C2E3E6381580E5C99910A60668 2.857222\n000351EDC735C079246435340A54C7C1 2.099111"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": "# Mean tip per medallion, computed over rows with a strictly positive tip only.\n# The result is indexed by medallion and has a single `tip_amount` column.\ntip = fare[['medallion', 'tip_amount']] \\\n      .loc[fare.tip_amount>0].groupby('medallion').mean()\nprint(len(tip))\ntip.head(3)"
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": "# Histogram of the per-medallion mean tip; linspace gives 100 bin edges\n# (99 bins) spanning 0 to 6.\ntip.hist(bins=np.linspace(0., 6., 100))\nplt.xlabel('Average tip')\nplt.ylabel('Number of taxis')"
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": "# Left join: keep every row of `data` and attach the mean tip of its medallion,\n# matching the `medallion` column against the index of `tip`. Rows whose\n# medallion has no entry in `tip` get NaN.\ndata_merged = pd.merge(data, tip, how='left', left_on='medallion', right_index=True)\ndata_merged.head(3)"
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 0
}