{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Madrid Mobility Data\n", "> Preprocessing Madrid CRTM survey data\n", "\n", "- toc: true\n", "- branch: master\n", "- badges: false\n", "- hide_binder_badge: true\n", "- comments: true\n", "- author: Alexandra Kapp\n", "- categories: [mobility, preprocessing]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Madrid (CRTM: Consorcio Regional de Transportes de Madrid), unlike most other cities, provides a rich open data set on survey data on mobility behaviour of ~85.000 people with ~220.000 individual trips.\n", "\n", "The description and results can be found here (in spanish):\n", "https://www.crtm.es/media/712934/edm18_sintesis.pdf\n", "\n", "For our [article](https://correlaid.org/blog/gender_bias_and_mobility/) we did some preprocessing:\n", " - merging the different data sets\n", " - translate Spanish codes to English and join with data\n", " - simplify some answer options (e.g. modal choice or trip motive)\n", " \n", "**If you want to use this data set for your own project, this preproessing might help you get started.**" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# import packages\n", "import os\n", "import requests\n", "import pandas as pd\n", "import numpy as np\n", "from datetime import time\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download the data\n", "\n", "Download all four datasets from here: https://crtm.maps.arcgis.com/apps/MinimalGallery/index.html?appid=a60bb2f0142b440eadee1a69a11693fc\n", "and store in *data\\raw\\*" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Folder ../data/raw already exists.\n", "Folder ../data/interim already exists.\n", "Folder ../data/processed already exists.\n", "\n", "\"../data/raw\" folder contains:\n", "['.gitkeep', 'EDM2018HOGARES.xlsx', 'EDM2018INDIVIDUOS.xlsx', 'EDM2018VIAJES.xlsx', 'EDM2018XETAPAS.xlsx', 'public_transport_madrid', 'zt1259', 'zt208', 'zt84']\n" ] } ], "source": [ "# working directory\n", "path = os.getcwd()\n", "print(\"Current working directory:\", path)\n", "\n", "# create sub-folders in 'data' folder\n", "for x in ['raw', 'interim', 'processed']:\n", " temp = os.path.join('../data/', x)\n", " try:\n", " os.mkdir(temp)\n", " except:\n", " print('Folder', temp, 'already exists.')\n", " \n", "# download and save raw datasets\n", " # HOGARES\n", "url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/d9e8c48ae6a1474faa34083239007307/data'\n", "r = requests.get(url, allow_redirects=True)\n", "output = open('../data/raw/EDM2018HOGARES.xlsx', 'wb')\n", "output.write(r.content)\n", "output.close()\n", "\n", " # INDIVIDUOS\n", "url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/07dad41b543641d3964a68851fc9ad11/data'\n", "r = requests.get(url, allow_redirects=True)\n", "output = open('../data/raw/EDM2018INDIVIDUOS.xlsx', 'wb')\n", "output.write(r.content)\n", "output.close()\n", "\n", " # VIAJES\n", "url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/6afd4db8175d4902ada0803f08ccf50e/data'\n", "r = requests.get(url, allow_redirects=True)\n", "output = open('../data/raw/EDM2018VIAJES.xlsx', 'wb')\n", "output.write(r.content)\n", "output.close()\n", "\n", " # XETAPAS\n", "url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/81919e30e674422d93203a3190eafcdc/data'\n", "r = requests.get(url, allow_redirects=True)\n", "output = open('../data/raw/EDM2018XETAPAS.xlsx', 'wb')\n", "output.write(r.content)\n", "output.close()\n", "\n", "# display content of 'raw' folder\n", "print('\\n\"../data/raw\" folder contains:')\n", "print(os.listdir('../data/raw'))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Join datasets" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# read downloaded files\n", "ind = pd.read_excel('../data/raw/EDM2018INDIVIDUOS.xlsx')\n", "hog = pd.read_excel('../data/raw/EDM2018HOGARES.xlsx')\n", "via = pd.read_excel('../data/raw/EDM2018VIAJES.xlsx', dtype = {'VORIHORAINI':str, 'VDESHORAFIN':str}) # specify times to be read as strings\n", "etap = pd.read_excel('../data/raw/EDM2018XETAPAS.xlsx')\n", "\n", "# set unique identifiers as index for each DataFrame\n", "via.set_index([\"ID_HOGAR\", \"ID_IND\", \"ID_VIAJE\"], inplace = True)\n", "ind.set_index([\"ID_HOGAR\", \"ID_IND\"], inplace = True)\n", "hog.set_index(\"ID_HOGAR\", inplace = True)\n", "etap.set_index([\"ID_HOGAR\", \"ID_IND\", \"ID_VIAJE\"], inplace =True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "people = hog.join(ind, lsuffix = \"_hog\", rsuffix = \"_ind\")\n", "trips = people.join(via, rsuffix = \"_via\")\n", "legs = trips.join(etap, rsuffix = \"_etap\")\n", "\n", "# people is a dataset of households and unique people\n", "people.to_csv(r'..\\data\\interim\\people.csv', index = False)\n", "\n", "# trips includes households, people and unique trips\n", "trips.to_csv(r'..\\data\\interim\\trips.csv', index = False)\n", "\n", "# legs includes households, people, trips and unique legs\n", "legs.to_csv(r'..\\data\\interim\\legs.csv', index = False)\n", "\n", "## you likely want to work with \"trips\" for analysing mobility patterns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### join codes for survey answers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read codes from second tab of excel files\n", "codes_hog = pd.read_excel ('../data/raw/EDM2018HOGARES.xlsx', sheet_name = 1)\n", "codes_ind = pd.read_excel ('../data/raw/EDM2018INDIVIDUOS.xlsx', sheet_name = 1)\n", "codes_via = pd.read_excel ('../data/raw/EDM2018VIAJES.xlsx', sheet_name = 1)\n", "codes_etap = pd.read_excel ('../data/raw/EDM2018XETAPAS.xlsx', sheet_name = 1)\n", "\n", "# append all codes to a single file\n", "codes = codes_hog.append(codes_ind, ignore_index = True, sort = False\n", " ).append(codes_via, ignore_index = True, sort = False\n", " ).append(codes_etap, ignore_index = True, sort = False\n", " ).drop(columns = [\"Unnamed: 0\", \"Unnamed: 1\"])\n", "\n", "# forward fill varibale and specification to work as proper table (and not only visually in Excel)\n", "codes[\"VARIABLE\"] = codes.VARIABLE.fillna(method = \"ffill\")\n", "codes[\"ESPECIFICACIÓN\"] = codes[\"ESPECIFICACIÓN\"].fillna(method = \"ffill\")\n", "\n", "# split code and value \n", "codes['CODE'], codes['VALUE'] = codes['VALORES'].str.split(\".\", 1).str\n", "codes['CODE'], unused = codes.CODE.str.split(\" '\", 1).str\n", "\n", "codes.drop(index = codes[codes.VARIABLE.isna()].index, inplace = True)\n", "\n", "codes.to_csv('../data/interim/codes.csv', index = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Translate codes by using Google Sheets. \n", "\n", "![](my_icons/google_sheets_screenshot.png)\n", "\n", "(Google API Package for Python wasnt working? https://pypi.org/project/googletrans/ ) Option to use the Google translate API (https://cloud.google.com/translate/docs/reference/rest) but for this use case wasnt really worth to set it up.\n", "\n", "- Translation is stored in data\\processed\\codes_translated.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## create new variables" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "# read csv in case you already stored the csv and don't want to repeat the downloading steps\n", "# trips = pd.read_csv(r'..\\data\\interim\\trips.csv', dtype = {'VORIHORAINI':str, 'VDESHORAFIN':str})" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "codes = pd.read_csv(r'..\\data\\processed\\codes_translated.csv', dtype = {'CODE': float})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set value of sex to 'Man' and 'Woman' instead of numbers." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "code_sex = codes[codes.VARIABLE == \"C2SEXO\"][['CODE', 'VALUE_en']].rename({'VALUE_en': 'sex'}, axis = 1)\n", "trips = trips.join(code_sex.set_index('CODE'), on = \"C2SEXO\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute duration from start (VORIHORAINI) and endtime (VDESHORAFIN)." ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "mins = pd.to_numeric(trips.VDESHORAFIN.str.slice(2)) - pd.to_numeric(trips.VORIHORAINI.str.slice(2))\n", "hours = (pd.to_numeric(trips.VDESHORAFIN.str.slice(0, 2)) - pd.to_numeric(trips.VORIHORAINI.str.slice(0, 2))) * 60\n", "trips[\"duration\"] = mins + hours" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a datetime object from string." ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "trips['start_time'] = pd.to_datetime(trips.VORIHORAINI, format = '%H%M').dt.time" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Endtimes sometimes exceeds 24 hours. (If a trip starts at 11pm and ends at 1am, then the endtime is 2500). Datetime does not work with times > 24h. Therefore this needs to be fixed." ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "trips[\"end_time\"] = np.where(pd.to_numeric(trips['VDESHORAFIN']) > 2400, \n", " pd.to_numeric(trips['VDESHORAFIN']) - 2400, \n", " pd.to_numeric(trips['VDESHORAFIN']))" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "trips['end_time'] = pd.to_datetime(trips.end_time, format = '%H%M', errors = 'coerce').dt.time" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "# speed\n", "trips[\"speed\"] = np.where(trips.duration != 0, # skip if start and Endtime is the same\n", " trips.DISTANCIA_VIAJE / (trips.duration / 60), np.NaN)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "# remove implausible data\n", "trips.loc[trips.speed > 150, \"speed\"] = None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original survey has a 24 different modes:\n", "- 1: Renfe Cercanías (train)\n", "- 2: Autobus interurbano (intercity bus)\n", "- 3: Autobus urbano otro municipio (urban bus other municipalities)\n", "- 4: Metro (metro)\n", "- 5: Metro ligero/tranvía (light train)\n", "- 6: Autobus urbano Madrid EMT (urban bus Madrid)\n", "- 7: Resto renfe (train)\n", "- 8: Autobus discrecional (unscheduled bus)\n", "- 9: Autobus de largo recorrido (intercity bus)\n", "- 10: Taxi (taxi)\n", "- 11: Coche conductor particular (private car)\n", "- 12: Coche conductor empresa (business car)\n", "- 13: Coche conductor alquiler sin conductor (car rental without driver)\n", "- 14: Coche pasajero particular (private car passenger)\n", "- 15: Coche pasajero empresa (business car passenger)\n", "- 16: Coche pasajero alquiler con conductor (car rental with driver)\n", "- 17: Moto/ciclomotor particular (private motorcycle)\n", "- 18: Moto/ciclomotor publica (public motorcycle)\n", "- 19: Moto/ciclomotor empresa (business motorcycle)\n", "- 20: Bicicleta particular (private bicycle)\n", "- 21: Bicicleta publica (public bicycle)\n", "- 22: Bicicleta empresa (business bicycle)\n", "- 23: Otros (other)\n", "- 24: Andando/pie (walk)\n", "\n", "These are simplified to four modes:\n", "- public transport\n", "- car\n", "- other (including motorbicycle and bicycle)\n", "- walk" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# simplify mode\n", "code_mode = codes[codes['VARIABLE'] == 'MODO_PRIORITARIO'][['CODE', 'VALUE_en']].drop_duplicates()\n", "trips = trips.join(code_mode.set_index('CODE'), on = 'MODO_PRIORITARIO', how = \"left\").reset_index()\n", "trips.rename({\"VALUE_en\": \"mode\"}, axis = \"columns\", inplace = True)\n", "\n", "trips[\"mode_simple\"] = trips.MODO_PRIORITARIO\n", "trips.loc[trips.MODO_PRIORITARIO < 10, \"mode_simple\"] = \"public transport\"\n", "trips.loc[(trips.MODO_PRIORITARIO > 9) & (trips.MODO_PRIORITARIO < 17), \"mode_simple\"] = \"car\"\n", "trips.loc[(trips.MODO_PRIORITARIO > 16) & (trips.MODO_PRIORITARIO < 24), \"mode_simple\"] = \"other\"\n", "trips.loc[trips.MODO_PRIORITARIO == 24, \"mode_simple\"] = \"walk\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The original survey has 12 trip motives: \n", "- 1: Casa (Home)\n", "- 2: Trabajo (Work)\n", "- 3: Gestión de trabajo (work management?)\n", "- 4: Estudio (study)\n", "- 5: Compras (Purchases)\n", "- 6: Médico (Doctor)\n", "- 7: Acompañamiento a otra persona (Acompany another person)\n", "- 8: Ocio (leisure)\n", "- 9: Deporte/ dar un paseo (sport)\n", "- 10: Asunto personal (personal matter)\n", "- 11: Otro domicilio (other residence)\n", "- 12: Otros (other)\n", "\n", "These are simplified to 5 trip motives:\n", "- work\n", "- study\n", "- car / errand\n", "- leisure\n", "- other" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "# simplify trip motive\n", "code_motive = codes[codes['VARIABLE'] == 'MOTIVO_PRIORITARIO'][['CODE', 'VALUE_en']]\n", "trips = trips.join(code_motive.set_index('CODE'), on = 'MOTIVO_PRIORITARIO').reset_index()\n", "trips.rename({\"VALUE_en\": \"motive\"}, axis = \"columns\", inplace = True)\n", "\n", "trips.loc[:,\"motive_simple\"] = trips.motive\n", "trips.loc[(trips.MOTIVO_PRIORITARIO == 2) | (trips.MOTIVO_PRIORITARIO == 3), \"motive_simple\"] = \"work\"\n", "trips.loc[((trips.MOTIVO_PRIORITARIO >= 5) & (trips.MOTIVO_PRIORITARIO <= 8) | (trips.MOTIVO_PRIORITARIO == 10)), \"motive_simple\"] = \"care / errand\"\n", "trips.loc[(trips.MOTIVO_PRIORITARIO >= 8) & (trips.MOTIVO_PRIORITARIO <= 9), \"motive_simple\"] = \"leisure\"\n", "trips.loc[(trips.MOTIVO_PRIORITARIO == 1) | (trips.MOTIVO_PRIORITARIO == 11) | (trips.MOTIVO_PRIORITARIO == 12), \"motive_simple\"] = \"other\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Traffic can be divided into three main categories time wise: \n", "- rush hour (usually morning and afternoon)\n", "- off-peak (usually during the day)\n", "- low traffic (usually night and weekend)\n", "\n", "A new category 'daytime' is created for this." ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "# daytime: rush hour, off-peak, low traffic\n", "conditions = [\n", " ((trips.start_time >= time(7)) & (trips.start_time < time(9))) | ((trips.start_time >= time(17)) & (trips.start_time < time(20))),\n", " (trips.start_time >= time(9)) & (trips.start_time < time(17)),\n", " (trips.start_time >= time(20)) | (trips.start_time < time(7))\n", "]\n", "choices = ['rush hour (HVZ)', 'off-peak hour (NVZ)', 'low traffic time (SVZ)']\n", "trips['daytime'] = np.select(conditions, choices, default= None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Round age to 5 to get larger groups with the same age and make it easier to work with." ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "# create variable for rounded age\n", "trips[\"rounded_age\"] = 5 * (trips.EDAD_FIN / 5).round()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create four different age groups:\n", "- kids (<= 18)\n", "- young adults (18-25)\n", "- adults (25 - 55)\n", "- seniors (> 55)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "# age group\n", "conditions = [\n", " trips.EDAD_FIN <= 18,\n", " (trips.EDAD_FIN > 18) & (trips.EDAD_FIN <= 25),\n", " (trips.EDAD_FIN > 25) & (trips.EDAD_FIN <= 55),\n", " trips.EDAD_FIN > 55]\n", "choices = ['kids', 'young adults', 'adults', 'seniors']\n", "trips['age_group'] = np.select(conditions, choices, default= None)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "trips.drop([\"index\"], axis = 1, inplace = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Store preprocessed data to csv-File to work with in your use case." ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "trips.to_csv(r'..\\data\\processed\\trips_custom_variables.csv', index = False)" ] } ], "metadata": { "kernelspec": { "display_name": "genmob_kernel", "language": "python", "name": "genmob_kernel" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }