{ "metadata": { "name": "", "signature": "sha256:2eb84660a936bd5ce6798b3d1c7ffe703e63b796a44be086e91fca1ff95cbf80" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "3. Cleaning the dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook will perform the process that was applied to the August's data in the [previous notebook](2. Exploring a subdataset.ipynb) to the rest of months. So, if you're just reading the project you can skip this file and open the [next notebook](4. Obtaining a sample and improving the data.ipynb)." ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "3.1 Preparing the notebook" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline\n", "%config InlineBackend.figure_format='retina'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "import os\n", "\n", "import numpy as np\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "# VARIABLES\n", "\n", "# Paths.\n", "faredata_path = '../data/faredata/trip_fare_{0}.csv'\n", "tripdata_path = '../data/tripdata/trip_data_{0}.csv'\n", "cleaned_data_path = '../data/cleaned/cleaned_{0}.csv'\n", "\n", "\n", "# Tip percentage variables.\n", "tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount']\n", "tip_perc_column_name = 'tip_perc'\n", "\n", "\n", "# Coordinates variables.\n", "min_x = -74.361107\n", "min_y = 40.459518\n", "\n", "max_x = -71.903083\n", "max_y = 41.175342\n", "\n", "step_x = 0.003205 # 270 meters (885 feet) approximately.\n", "step_y = 0.002429 # 270 meters (885 feet) approximately.\n", "\n", "# Number of maximum areas per axis in the 'area matrix'.\n", "max_area_x = 766 # int((max_x - min_x) / step_x)\n", "max_area_y = 294 # int((max_y - min_y) / step_y)\n", "\n", "# Number of columns of the 'area matrix'.\n", "num_columns = 767 # max_area_x + 1\n", "\n", "# The area at the top right corner in the 'area matrix'.\n", "max_area = 226264 # (max_area_y * num_columns) + max_area_x\n", "\n", "# Minimum number of pickups or dropoffs per area.\n", "min_area_number = 20\n", "\n", "# Columns for creating the pickup and dropoff area attributes.\n", "area_column_names = ['pickup_area', 'dropoff_area']\n", "area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']\n", "\n", "# Order of the columns to apply it at before saving.\n", "column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount',\n", " 'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance',\n", " 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# FUNCTIONS\n", "\n", "# Function for caculating the tip percentage of a trip.\n", "def calculate_tip_perc(row):\n", " subtotal = row.fare_amount + row.surcharge + row.mta_tax\n", " tip = row.tip_amount / subtotal\n", " tip_perc = tip * 100\n", " \n", " return pd.Series({tip_perc_column_name: tip_perc})\n", "\n", "\n", "# Functions for calculating. the pickup and dropoff areas in the 'area matrix'.\n", "def get_1d_area(coor, min_coor, step_coor):\n", " return int((coor - min_coor) / step_coor)\n", "\n", "def get_area(area_x, area_y, columns):\n", " return (area_y * columns) + area_x\n", "\n", "def calculate_area(row):\n", " pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x)\n", " pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y)\n", " pickup_area = get_area(pickup_x, pickup_y, num_columns)\n", " \n", " dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x)\n", " dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y)\n", " dropoff_area = get_area(dropoff_x, dropoff_y, num_columns)\n", " \n", " return pd.Series({\n", " area_column_names[0]: pickup_area,\n", " area_column_names[1]: dropoff_area\n", " })\n", "\n", "\n", "# This auxiliary function applies another one to every row in a DataFrame for creating new columns.\n", "def iterate_and_apply(dataframe, function, necesary_columns):\n", " perform = True\n", " step = 100000\n", " start = 0\n", " to = step\n", " \n", " while perform:\n", " new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1)\n", " if len(new_columns) == 0:\n", " perform = False\n", " else:\n", " dataframe.update(new_columns)\n", " new_columns = None\n", " start += step\n", " to += step\n", " \n", " return dataframe" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# Creating the directory for saving the cleaned files if it doesn't exist.\n", "if not os.path.exists('../data/cleaned/'):\n", " os.makedirs('../data/cleaned/')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "3.2. Performing the process" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# The range of files to clean.\n", "months = range(1, 8) + range(9, 13)\n", "for month in months:\n", " \n", " # Cleaning 'fare' data.\n", " \n", " data = pd.read_csv(faredata_path.format(month), skipinitialspace=True)\n", " \n", " payment_type = (data.payment_type == 'CRD')\n", " fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0))\n", " surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0))\n", " mta_tax = (data.mta_tax == 0.5)\n", " tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0))\n", " tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0))\n", "\n", " data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount]\n", " data.drop(['payment_type'], axis=1, inplace=True)\n", " \n", " payment_type = None\n", " fare_amount = None\n", " surcharge = None\n", " mta_tax = None\n", " tip_amount = None\n", " tolls_amount = None\n", " \n", " # Calculating the tip percentage.\n", " \n", " data[tip_perc_column_name] = np.nan\n", " data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns)\n", " data.drop(['mta_tax'], axis=1, inplace=True)\n", " \n", " tip_perc = (data.tip_perc <= 50.0)\n", " data = data[tip_perc]\n", " tip_perc = None\n", " \n", " # Adding the trip data file.\n", " \n", " tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13])\n", " \n", " for column in tripdata.columns:\n", " data[column] = np.nan\n", " \n", " '''\n", " Careful! The next line use a lot of memory, a least more than 8GB.\n", " It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame:\n", " Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them\n", " into the data variable in a loop.\n", " '''\n", " data.update(tripdata)\n", " tripdata = None\n", " \n", " # Cleaning 'trip' data.\n", " \n", " rate_code = (data.rate_code == 1.0)\n", " passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0))\n", " trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0))\n", " trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0))\n", " pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342))\n", " pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083))\n", " dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342))\n", " dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083))\n", " \n", " data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude\n", " & dropoff_latitude & dropoff_longitude]\n", " data.drop(['rate_code'], axis=1, inplace=True)\n", " \n", " rate_code = None\n", " passenger_count = None\n", " trip_time_in_secs = None\n", " trip_distance = None\n", " pickup_latitude = None\n", " pickup_longitude = None\n", " dropoff_latitude = None\n", " dropoff_longitude = None\n", " \n", " # Filtering the coordinates of unusal places.\n", " \n", " for column in area_column_names:\n", " data[column] = np.nan\n", " \n", " data = iterate_and_apply(data, calculate_area, area_necesary_columns)\n", " \n", " aux_pickup = data.groupby(['pickup_area']).size()\n", " aux_pickup = aux_pickup[aux_pickup >= min_area_number]\n", " aux_pickup = data['pickup_area'].isin(aux_pickup.index)\n", " \n", " aux_dropoff = data.groupby(['dropoff_area']).size()\n", " aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number]\n", " aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index)\n", " \n", " data = data[aux_pickup & aux_dropoff]\n", " data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True)\n", "\n", " aux_pickup = None\n", " aux_dropoff = None\n", " \n", " # Saving the file.\n", " \n", " data = data.reindex_axis(column_order, axis=1)\n", " data.to_csv(cleaned_data_path.format(month), index=True)\n", " data = None" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 } ], "metadata": {} } ] }