{
 "metadata": {
  "name": "",
  "signature": "sha256:2eb84660a936bd5ce6798b3d1c7ffe703e63b796a44be086e91fca1ff95cbf80"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "3. Cleaning the dataset"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "This notebook will perform the process that was applied to the August's data in the [previous notebook](2. Exploring a subdataset.ipynb) to the rest of months. So, if you're just reading the project you can skip this file and open the [next notebook](4. Obtaining a sample and improving the data.ipynb)."
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "3.1 Preparing the notebook"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "%config InlineBackend.figure_format='retina'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "\n",
      "import numpy as np\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# VARIABLES\n",
      "\n",
      "# Paths.\n",
      "faredata_path = '../data/faredata/trip_fare_{0}.csv'\n",
      "tripdata_path = '../data/tripdata/trip_data_{0}.csv'\n",
      "cleaned_data_path = '../data/cleaned/cleaned_{0}.csv'\n",
      "\n",
      "\n",
      "# Tip percentage variables.\n",
      "tip_perc_necesary_columns = ['fare_amount', 'surcharge', 'mta_tax', 'tip_amount']\n",
      "tip_perc_column_name = 'tip_perc'\n",
      "\n",
      "\n",
      "# Coordinates variables.\n",
      "min_x = -74.361107\n",
      "min_y = 40.459518\n",
      "\n",
      "max_x = -71.903083\n",
      "max_y = 41.175342\n",
      "\n",
      "step_x = 0.003205  # 270 meters (885 feet) approximately.\n",
      "step_y = 0.002429  # 270 meters (885 feet) approximately.\n",
      "\n",
      "# Number of maximum areas per axis in the 'area matrix'.\n",
      "max_area_x = 766  # int((max_x - min_x) / step_x)\n",
      "max_area_y = 294  # int((max_y - min_y) / step_y)\n",
      "\n",
      "# Number of columns of the 'area matrix'.\n",
      "num_columns = 767  # max_area_x + 1\n",
      "\n",
      "# The area at the top right corner in the 'area matrix'.\n",
      "max_area = 226264  # (max_area_y * num_columns) + max_area_x\n",
      "\n",
      "# Minimum number of pickups or dropoffs per area.\n",
      "min_area_number = 20\n",
      "\n",
      "# Columns for creating the pickup and dropoff area attributes.\n",
      "area_column_names = ['pickup_area', 'dropoff_area']\n",
      "area_necesary_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']\n",
      "\n",
      "# Order of the columns to apply it at before saving.\n",
      "column_order = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'fare_amount', 'surcharge', 'tip_amount',\n",
      "                'tip_perc', 'tolls_amount', 'total_amount', 'passenger_count', 'trip_time_in_secs', 'trip_distance',\n",
      "                'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# FUNCTIONS\n",
      "\n",
      "# Function for caculating the tip percentage of a trip.\n",
      "def calculate_tip_perc(row):\n",
      "    subtotal = row.fare_amount + row.surcharge + row.mta_tax\n",
      "    tip = row.tip_amount / subtotal\n",
      "    tip_perc = tip * 100\n",
      "    \n",
      "    return pd.Series({tip_perc_column_name: tip_perc})\n",
      "\n",
      "\n",
      "# Functions for calculating. the pickup and dropoff areas in the 'area matrix'.\n",
      "def get_1d_area(coor, min_coor, step_coor):\n",
      "    return int((coor - min_coor) / step_coor)\n",
      "\n",
      "def get_area(area_x, area_y, columns):\n",
      "    return (area_y * columns) + area_x\n",
      "\n",
      "def calculate_area(row):\n",
      "    pickup_x = get_1d_area(row.pickup_longitude, min_x, step_x)\n",
      "    pickup_y = get_1d_area(row.pickup_latitude, min_y, step_y)\n",
      "    pickup_area = get_area(pickup_x, pickup_y, num_columns)\n",
      "    \n",
      "    dropoff_x = get_1d_area(row.dropoff_longitude, min_x, step_x)\n",
      "    dropoff_y = get_1d_area(row.dropoff_latitude, min_y, step_y)\n",
      "    dropoff_area = get_area(dropoff_x, dropoff_y, num_columns)\n",
      "    \n",
      "    return pd.Series({\n",
      "        area_column_names[0]: pickup_area,\n",
      "        area_column_names[1]: dropoff_area\n",
      "    })\n",
      "\n",
      "\n",
      "# This auxiliary function applies another one to every row in a DataFrame for creating new columns.\n",
      "def iterate_and_apply(dataframe, function, necesary_columns):\n",
      "    perform = True\n",
      "    step = 100000\n",
      "    start = 0\n",
      "    to = step\n",
      "    \n",
      "    while perform:\n",
      "        new_columns = dataframe[start:to][necesary_columns].apply(function, axis=1)\n",
      "        if len(new_columns) == 0:\n",
      "            perform = False\n",
      "        else:\n",
      "            dataframe.update(new_columns)\n",
      "        new_columns = None\n",
      "        start += step\n",
      "        to += step\n",
      "    \n",
      "    return dataframe"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Creating the directory for saving the cleaned files if it doesn't exist.\n",
      "if not os.path.exists('../data/cleaned/'):\n",
      "    os.makedirs('../data/cleaned/')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "3.2. Performing the process"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The range of files to clean.\n",
      "months = range(1, 8) + range(9, 13)\n",
      "for month in months:\n",
      "    \n",
      "    # Cleaning 'fare' data.\n",
      "    \n",
      "    data = pd.read_csv(faredata_path.format(month), skipinitialspace=True)\n",
      "    \n",
      "    payment_type = (data.payment_type == 'CRD')\n",
      "    fare_amount = ((data.fare_amount >= 3.0) & (data.fare_amount <= 200.0))\n",
      "    surcharge = ((data.surcharge == 0.0) | (data.surcharge == 0.5) | (data.surcharge == 1.0))\n",
      "    mta_tax = (data.mta_tax == 0.5)\n",
      "    tip_amount = ((data.tip_amount >= 0.0) & (data.tip_amount <= 100.0))\n",
      "    tolls_amount = ((data.tolls_amount >= 0.0) & (data.tolls_amount <= 30.0))\n",
      "\n",
      "    data = data[payment_type & fare_amount & surcharge & mta_tax & tip_amount & tolls_amount]\n",
      "    data.drop(['payment_type'], axis=1, inplace=True)\n",
      "    \n",
      "    payment_type = None\n",
      "    fare_amount = None\n",
      "    surcharge = None\n",
      "    mta_tax = None\n",
      "    tip_amount = None\n",
      "    tolls_amount = None\n",
      "    \n",
      "    # Calculating the tip percentage.\n",
      "    \n",
      "    data[tip_perc_column_name] = np.nan\n",
      "    data = iterate_and_apply(data, calculate_tip_perc, tip_perc_necesary_columns)\n",
      "    data.drop(['mta_tax'], axis=1, inplace=True)\n",
      "    \n",
      "    tip_perc = (data.tip_perc <= 50.0)\n",
      "    data = data[tip_perc]\n",
      "    tip_perc = None\n",
      "    \n",
      "    # Adding the trip data file.\n",
      "    \n",
      "    tripdata = pd.read_csv(tripdata_path.format(month), skipinitialspace=True, usecols=[3, 7, 8, 9, 10, 11, 12, 13])\n",
      "    \n",
      "    for column in tripdata.columns:\n",
      "        data[column] = np.nan\n",
      "    \n",
      "    '''\n",
      "    Careful! The next line use a lot of memory, a least more than 8GB.\n",
      "    It's showed here beacuse it's simplier that the another way I know to update the 'fare' DataFrame:\n",
      "        Read the 'trip' file with NumPy for creating a lot of little DataFrames and update them\n",
      "        into the data variable in a loop.\n",
      "    '''\n",
      "    data.update(tripdata)\n",
      "    tripdata = None\n",
      "    \n",
      "    # Cleaning 'trip' data.\n",
      "    \n",
      "    rate_code = (data.rate_code == 1.0)\n",
      "    passenger_count = ((data.passenger_count >= 1.0) & (data.passenger_count <= 6.0))\n",
      "    trip_time_in_secs = ((data.trip_time_in_secs > 0.0) & (data.trip_time_in_secs <= 3600.0))\n",
      "    trip_distance = ((data.trip_distance > 0.0) & (data.trip_distance <= 25.0))\n",
      "    pickup_latitude = ((data.pickup_latitude >= 40.459518) & (data.pickup_latitude <= 41.175342))\n",
      "    pickup_longitude = ((data.pickup_longitude >= -74.361107) & (data.pickup_longitude <= -71.903083))\n",
      "    dropoff_latitude = ((data.dropoff_latitude >= 40.459518) & (data.dropoff_latitude <= 41.175342))\n",
      "    dropoff_longitude = ((data.dropoff_longitude >= -74.361107) & (data.dropoff_longitude <= -71.903083))\n",
      "    \n",
      "    data = data[rate_code & passenger_count & trip_time_in_secs & trip_distance & pickup_latitude & pickup_longitude\n",
      "                & dropoff_latitude & dropoff_longitude]\n",
      "    data.drop(['rate_code'], axis=1, inplace=True)\n",
      "    \n",
      "    rate_code = None\n",
      "    passenger_count = None\n",
      "    trip_time_in_secs = None\n",
      "    trip_distance = None\n",
      "    pickup_latitude = None\n",
      "    pickup_longitude = None\n",
      "    dropoff_latitude = None\n",
      "    dropoff_longitude = None\n",
      "    \n",
      "    # Filtering the coordinates of unusal places.\n",
      "    \n",
      "    for column in area_column_names:\n",
      "        data[column] = np.nan\n",
      "    \n",
      "    data = iterate_and_apply(data, calculate_area, area_necesary_columns)\n",
      "    \n",
      "    aux_pickup = data.groupby(['pickup_area']).size()\n",
      "    aux_pickup = aux_pickup[aux_pickup >= min_area_number]\n",
      "    aux_pickup = data['pickup_area'].isin(aux_pickup.index)\n",
      "    \n",
      "    aux_dropoff = data.groupby(['dropoff_area']).size()\n",
      "    aux_dropoff = aux_dropoff[aux_dropoff >= min_area_number]\n",
      "    aux_dropoff = data['dropoff_area'].isin(aux_dropoff.index)\n",
      "    \n",
      "    data = data[aux_pickup & aux_dropoff]\n",
      "    data.drop(['pickup_area', 'dropoff_area'], axis=1, inplace=True)\n",
      "\n",
      "    aux_pickup = None\n",
      "    aux_dropoff = None\n",
      "    \n",
      "    # Saving the file.\n",
      "    \n",
      "    data = data.reindex_axis(column_order, axis=1)\n",
      "    data.to_csv(cleaned_data_path.format(month), index=True)\n",
      "    data = None"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    }
   ],
   "metadata": {}
  }
 ]
}