{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "import os # used for file system operations\n", "import json # prevalent input/output data format\n", "from hashlib import sha256 # cryptographic hashing for personal data for anonymization\n", "\n", "import pandas as pd # working with data frames, a versatile tabular data structure\n", "from geopy.geocoders import Nominatim # use Nominatim, the OpenStreetMap geocoder service (from address to geo-location)\n", "from geopy.extra.rate_limiter import RateLimiter # rate-limit support for geocoder services\n", "\n", "pd.options.display.max_columns = 50\n", "pd.options.display.max_rows = 100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading original data\n", "\n", "Original data set from company X, to be anonymized." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original data, 4108 rows and 20 columns\n" ] } ], "source": [ "data_location = 'data/RAW.csv' # original data in CSV format\n", "df = pd.read_csv(data_location)\n", "df = df.drop('Source', axis='columns') # drop superfluous column\n", "print(f'Original data, {df.shape[0]} rows and {df.shape[1]} columns')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data pre-processing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert currency to numeric (DKK)\n", "Fix `Cost` and `NightlyRate` columns to be numeric. The source data is formatted as danish kroner." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [], "source": [ "df.Cost = df.Cost.str.replace('kr','').str.replace(',','').astype(float)\n", "df.NightlyRate = df.NightlyRate.str.replace('kr','').str.replace(',','').astype(float)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Consolidation\n", "\n", "Remove duplicate and incomplete rows." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New shape, rows and columns: (4069, 20)\n" ] } ], "source": [ "# Drop duplicate bookings (artefact of importing from different systems)\n", "df = df.sort_values('Status').drop_duplicates(subset=['Guest Name', 'Rental', 'Arrive', 'Depart'], keep='first')\n", "\n", "# Delete test bookings\n", "df = df.dropna(subset=['Guest Name', 'Rental'], how='any')\n", "\n", "print('New shape, rows and columns:', df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Anonymization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Anonymize property location" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the original data, the property's exact address is available in the `Rental_Address` column." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "missing property locations: 28\n", "unique property locations: 173\n" ] } ], "source": [ "print('missing property locations:', df.Rental_Address.isna().sum())\n", "\n", "unique_property_locations = df.Rental_Address.dropna().unique().tolist()\n", "print('unique property locations:', len(unique_property_locations))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "geolocator = Nominatim(user_agent='CovidBnB') # create OpenStreetMap Nominatim geolocator (https://wiki.openstreetmap.org/wiki/Nominatim)\n", "geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.1) # ensure no more than 10 Nominatim queries per second" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "location_cache_file = 'cache/location_cache.json'\n", "if os.path.exists(location_cache_file): # if location cache exists,\n", " with open(location_cache_file, 'r') as fd:\n", " location_cache = json.load(fd)\n", "else:\n", " location_cache = {} # if no location cache found, start empty dictionary as location cache\n", "location_not_found = [] # list of location we could not find" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Could not find 52 locations, 4 unique\n" ] } ], "source": [ "def anonymize_address(property_location):\n", " '''Anonymize full input address to postal code.'''\n", " if pd.isna(property_location):\n", " return None\n", " cached_location = location_cache.get(property_location, False) # try to read from location cache\n", " if cached_location:\n", " return cached_location # cache hit, found address in location cache, return cached postal code\n", " else:\n", " try:\n", " location_geo = geocode(property_location, addressdetails=True) # not found in cache, geolocate with address details\n", " postcode = location_geo.raw['address']['postcode'] # extract postal code from address details response\n", " location_cache[property_location] = postcode # add value to cache\n", " return postcode\n", " except:\n", " location_not_found.append(property_location) # if not found, add to list of locations not found \n", " return None\n", "\n", "df['postal_code'] = df.Rental_Address.apply(anonymize_address).astype(str)\n", "\n", "with open(location_cache_file, 'w') as fd:\n", " json.dump(location_cache, fd) # write cache to file\n", "\n", "print(f'Could not find {len(location_not_found)} locations, {len(set(location_not_found))} unique')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Anonymize guest names and host property names" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | Arrive | \n", "Depart | \n", "Received | \n", "Status | \n", "Adults | \n", "Cost | \n", "Nights | \n", "Last_modified | \n", "NightlyRate | \n", "Guest Location | \n", "postal_code | \n", "Guest Anon | \n", "Rental Anon | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "5/16/2019 | \n", "5/23/2019 | \n", "5/14/2019 13:20 | \n", "canceled | \n", "3.0 | \n", "7682.50 | \n", "7 | \n", "8/28/2019 15:12 | \n", "1019.0 | \n", "NaN | \n", "2300 | \n", "e8d072e5 | \n", "3fd1e124 | \n", "
1 | \n", "4/1/2020 | \n", "4/4/2020 | \n", "2/17/2020 00:26 | \n", "canceled | \n", "3.0 | \n", "1037.81 | \n", "3 | \n", "3/12/2020 17:17 | \n", "346.0 | \n", "NaN | \n", "2200 | \n", "99076ea9 | \n", "c74a5829 | \n", "
2 | \n", "7/4/2020 | \n", "7/5/2020 | \n", "3/4/2019 13:37 | \n", "canceled | \n", "4.0 | \n", "977.50 | \n", "1 | \n", "8/15/2019 15:28 | \n", "978.0 | \n", "NaN | \n", "1799 | \n", "794d4098 | \n", "d9dacb3e | \n", "
3 | \n", "4/9/2020 | \n", "4/14/2020 | \n", "2/17/2020 00:05 | \n", "canceled | \n", "3.0 | \n", "419.10 | \n", "5 | \n", "3/15/2020 18:01 | \n", "84.0 | \n", "NaN | \n", "2200 | \n", "245bb108 | \n", "c74a5829 | \n", "
4 | \n", "7/4/2020 | \n", "7/7/2020 | \n", "2/16/2020 22:59 | \n", "canceled | \n", "2.0 | \n", "5537.23 | \n", "3 | \n", "7/8/2020 19:28 | \n", "1679.0 | \n", "Houston, TX | \n", "1051 | \n", "fcac9b2c | \n", "0fd1eb2e | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4064 | \n", "6/24/2019 | \n", "6/26/2019 | \n", "6/21/2019 10:51 | \n", "confirmed | \n", "2.0 | \n", "2827.10 | \n", "2 | \n", "6/21/2019 10:52 | \n", "1239.0 | \n", "NaN | \n", "1454 | \n", "0948688e | \n", "6361b74c | \n", "
4065 | \n", "6/24/2019 | \n", "6/27/2019 | \n", "6/21/2019 13:57 | \n", "confirmed | \n", "1.0 | \n", "4028.15 | \n", "3 | \n", "6/21/2019 13:58 | \n", "1243.0 | \n", "NaN | \n", "1454 | \n", "d9db51e4 | \n", "154dcb23 | \n", "
4066 | \n", "7/5/2019 | \n", "7/7/2019 | \n", "6/21/2019 14:53 | \n", "confirmed | \n", "6.0 | \n", "5486.24 | \n", "2 | \n", "6/21/2019 14:58 | \n", "2393.0 | \n", "Plymouth, United Kingdom | \n", "2200 | \n", "d068cc67 | \n", "7f9711ed | \n", "
4067 | \n", "9/2/2019 | \n", "9/4/2019 | \n", "9/1/2019 15:48 | \n", "confirmed | \n", "2.0 | \n", "2226.22 | \n", "2 | \n", "9/1/2019 15:52 | \n", "863.0 | \n", "St Petersburg, FL | \n", "1063 | \n", "0ef3d2da | \n", "eeeb0cf5 | \n", "
4068 | \n", "8/8/2020 | \n", "8/11/2020 | \n", "8/7/2020 23:45 | \n", "confirmed | \n", "2.0 | \n", "6326.97 | \n", "3 | \n", "8/7/2020 23:47 | \n", "1792.0 | \n", "NaN | \n", "nan | \n", "ecd81df9 | \n", "e28934a2 | \n", "
4069 rows × 13 columns
\n", "\n", " | Arrive | \n", "Depart | \n", "Received | \n", "Status | \n", "Adults | \n", "Cost | \n", "Nights | \n", "Last_modified | \n", "NightlyRate | \n", "Guest Location | \n", "postal_code | \n", "Guest Anon | \n", "Rental Anon | \n", "property_lat | \n", "property_lon | \n", "guest_lat | \n", "guest_lon | \n", "guest_country | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "5/16/2019 | \n", "5/23/2019 | \n", "5/14/2019 13:20 | \n", "canceled | \n", "3.0 | \n", "7682.50 | \n", "7 | \n", "8/28/2019 15:12 | \n", "1019.0 | \n", "NaN | \n", "2300 | \n", "e8d072e5 | \n", "3fd1e124 | \n", "55.651306275214914 | \n", "12.603239181235617 | \n", "None | \n", "None | \n", "None | \n", "
1 | \n", "4/1/2020 | \n", "4/4/2020 | \n", "2/17/2020 00:26 | \n", "canceled | \n", "3.0 | \n", "1037.81 | \n", "3 | \n", "3/12/2020 17:17 | \n", "346.0 | \n", "NaN | \n", "2200 | \n", "99076ea9 | \n", "c74a5829 | \n", "55.69475043151038 | \n", "12.550190147092675 | \n", "None | \n", "None | \n", "None | \n", "
2 | \n", "7/4/2020 | \n", "7/5/2020 | \n", "3/4/2019 13:37 | \n", "canceled | \n", "4.0 | \n", "977.50 | \n", "1 | \n", "8/15/2019 15:28 | \n", "978.0 | \n", "NaN | \n", "1799 | \n", "794d4098 | \n", "d9dacb3e | \n", "55.66627486937496 | \n", "12.534955500625006 | \n", "None | \n", "None | \n", "None | \n", "
3 | \n", "4/9/2020 | \n", "4/14/2020 | \n", "2/17/2020 00:05 | \n", "canceled | \n", "3.0 | \n", "419.10 | \n", "5 | \n", "3/15/2020 18:01 | \n", "84.0 | \n", "NaN | \n", "2200 | \n", "245bb108 | \n", "c74a5829 | \n", "55.69475043151038 | \n", "12.550190147092675 | \n", "None | \n", "None | \n", "None | \n", "
4 | \n", "7/4/2020 | \n", "7/7/2020 | \n", "2/16/2020 22:59 | \n", "canceled | \n", "2.0 | \n", "5537.23 | \n", "3 | \n", "7/8/2020 19:28 | \n", "1679.0 | \n", "Houston, TX | \n", "1051 | \n", "fcac9b2c | \n", "0fd1eb2e | \n", "55.679985160377335 | \n", "12.590608396226418 | \n", "29.7589382 | \n", "-95.3676974 | \n", "us | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4064 | \n", "6/24/2019 | \n", "6/26/2019 | \n", "6/21/2019 10:51 | \n", "confirmed | \n", "2.0 | \n", "2827.10 | \n", "2 | \n", "6/21/2019 10:52 | \n", "1239.0 | \n", "NaN | \n", "1454 | \n", "0948688e | \n", "6361b74c | \n", "55.67840178857141 | \n", "12.569364442857141 | \n", "None | \n", "None | \n", "None | \n", "
4065 | \n", "6/24/2019 | \n", "6/27/2019 | \n", "6/21/2019 13:57 | \n", "confirmed | \n", "1.0 | \n", "4028.15 | \n", "3 | \n", "6/21/2019 13:58 | \n", "1243.0 | \n", "NaN | \n", "1454 | \n", "d9db51e4 | \n", "154dcb23 | \n", "55.67840178857141 | \n", "12.569364442857141 | \n", "None | \n", "None | \n", "None | \n", "
4066 | \n", "7/5/2019 | \n", "7/7/2019 | \n", "6/21/2019 14:53 | \n", "confirmed | \n", "6.0 | \n", "5486.24 | \n", "2 | \n", "6/21/2019 14:58 | \n", "2393.0 | \n", "Plymouth, United Kingdom | \n", "2200 | \n", "d068cc67 | \n", "7f9711ed | \n", "55.69475043151038 | \n", "12.550190147092675 | \n", "50.3712659 | \n", "-4.1425658 | \n", "gb | \n", "
4067 | \n", "9/2/2019 | \n", "9/4/2019 | \n", "9/1/2019 15:48 | \n", "confirmed | \n", "2.0 | \n", "2226.22 | \n", "2 | \n", "9/1/2019 15:52 | \n", "863.0 | \n", "St Petersburg, FL | \n", "1063 | \n", "0ef3d2da | \n", "eeeb0cf5 | \n", "55.67796323809524 | \n", "12.583763952380954 | \n", "27.7703796 | \n", "-82.6695085 | \n", "us | \n", "
4068 | \n", "8/8/2020 | \n", "8/11/2020 | \n", "8/7/2020 23:45 | \n", "confirmed | \n", "2.0 | \n", "6326.97 | \n", "3 | \n", "8/7/2020 23:47 | \n", "1792.0 | \n", "NaN | \n", "nan | \n", "ecd81df9 | \n", "e28934a2 | \n", "None | \n", "None | \n", "None | \n", "None | \n", "None | \n", "
4069 rows × 18 columns
\n", "