{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "import os # used for file system operations\n", "import json # prevalent input/output data format\n", "from hashlib import sha256 # cryptographic hashing for personal data for anonymization\n", "\n", "import pandas as pd # working with data frames, a versatile tabular data structure\n", "from geopy.geocoders import Nominatim # use Nominatim, the OpenStreetMap geocoder service (from address to geo-location)\n", "from geopy.extra.rate_limiter import RateLimiter # rate-limit support for geocoder services\n", "\n", "pd.options.display.max_columns = 50\n", "pd.options.display.max_rows = 100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading original data\n", "\n", "Original data set from company X, to be anonymized." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original data, 4108 rows and 20 columns\n" ] } ], "source": [ "data_location = 'data/RAW.csv' # original data in CSV format\n", "df = pd.read_csv(data_location)\n", "df = df.drop('Source', axis='columns') # drop superfluous column\n", "print(f'Original data, {df.shape[0]} rows and {df.shape[1]} columns')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data pre-processing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert currency to numeric (DKK)\n", "Fix `Cost` and `NightlyRate` columns to be numeric. The source data is formatted as danish kroner." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [], "source": [ "df.Cost = df.Cost.str.replace('kr','').str.replace(',','').astype(float)\n", "df.NightlyRate = df.NightlyRate.str.replace('kr','').str.replace(',','').astype(float)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Consolidation\n", "\n", "Remove duplicate and incomplete rows." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New shape, rows and columns: (4069, 20)\n" ] } ], "source": [ "# Drop duplicate bookings (artefact of importing from different systems)\n", "df = df.sort_values('Status').drop_duplicates(subset=['Guest Name', 'Rental', 'Arrive', 'Depart'], keep='first')\n", "\n", "# Delete test bookings\n", "df = df.dropna(subset=['Guest Name', 'Rental'], how='any')\n", "\n", "print('New shape, rows and columns:', df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Anonymization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Anonymize property location" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the original data, the property's exact address is available in the `Rental_Address` column." 
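, "\n", "\n", "To anonymize the location, the cells below reduce each address to its postal code with the Nominatim geocoder. A minimal sketch of that lookup, using a made-up address purely for illustration (the actual cells below add a result cache and rate limiting on top of this):\n", "\n", "```python\n", "from geopy.geocoders import Nominatim\n", "\n", "geolocator = Nominatim(user_agent='CovidBnB')\n", "location = geolocator.geocode('Rådhuspladsen 1, København', addressdetails=True)  # hypothetical address, illustration only\n", "if location is not None:\n", "    print(location.raw['address']['postcode'])  # keep only the postal code, e.g. '1550'\n", "```"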
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "missing property locations: 28\n", "unique property locations: 173\n" ] } ], "source": [ "print('missing property locations:', df.Rental_Address.isna().sum())\n", "\n", "unique_property_locations = df.Rental_Address.dropna().unique().tolist()\n", "print('unique property locations:', len(unique_property_locations))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "geolocator = Nominatim(user_agent='CovidBnB') # create OpenStreetMap Nominatim geolocator (https://wiki.openstreetmap.org/wiki/Nominatim)\n", "geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.1) # ensure no more than 10 Nominatim queries per second" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "location_cache_file = 'cache/location_cache.json'\n", "if os.path.exists(location_cache_file): # if location cache exists,\n", " with open(location_cache_file, 'r') as fd:\n", " location_cache = json.load(fd)\n", "else:\n", " location_cache = {} # if no location cache found, start empty dictionary as location cache\n", "location_not_found = [] # list of location we could not find" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Could not find 52 locations, 4 unique\n" ] } ], "source": [ "def anonymize_address(property_location):\n", " '''Anonymize full input address to postal code.'''\n", " if pd.isna(property_location):\n", " return None\n", " cached_location = location_cache.get(property_location, False) # try to read from location cache\n", " if cached_location:\n", " return cached_location # cache hit, found address in location cache, return cached postal code\n", " else:\n", " try:\n", " location_geo = geocode(property_location, addressdetails=True) # not found in cache, geolocate with address details\n", " postcode = location_geo.raw['address']['postcode'] # extract postal code from address details response\n", " location_cache[property_location] = postcode # add value to cache\n", " return postcode\n", " except:\n", " location_not_found.append(property_location) # if not found, add to list of locations not found \n", " return None\n", "\n", "df['postal_code'] = df.Rental_Address.apply(anonymize_address).astype(str)\n", "\n", "with open(location_cache_file, 'w') as fd:\n", " json.dump(location_cache, fd) # write cache to file\n", "\n", "print(f'Could not find {len(location_not_found)} locations, {len(set(location_not_found))} unique')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Anonymize guest names and host property names" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " Arrive Depart Received Status Adults Cost \\\n", "0 5/16/2019 5/23/2019 5/14/2019 13:20 canceled 3.0 7682.50 \n", "1 4/1/2020 4/4/2020 2/17/2020 00:26 canceled 3.0 1037.81 \n", "2 7/4/2020 7/5/2020 3/4/2019 13:37 canceled 4.0 977.50 \n", "3 4/9/2020 4/14/2020 2/17/2020 00:05 canceled 3.0 419.10 \n", "4 7/4/2020 7/7/2020 2/16/2020 22:59 canceled 2.0 5537.23 \n", "... ... ... ... ... ... ... \n", "4064 6/24/2019 6/26/2019 6/21/2019 10:51 confirmed 2.0 2827.10 \n", "4065 6/24/2019 6/27/2019 6/21/2019 13:57 confirmed 1.0 4028.15 \n", "4066 7/5/2019 7/7/2019 6/21/2019 14:53 confirmed 6.0 5486.24 \n", "4067 9/2/2019 9/4/2019 9/1/2019 15:48 confirmed 2.0 2226.22 \n", "4068 8/8/2020 8/11/2020 8/7/2020 23:45 confirmed 2.0 6326.97 \n", "\n", " Nights Last_modified NightlyRate Guest Location \\\n", "0 7 8/28/2019 15:12 1019.0 NaN \n", "1 3 3/12/2020 17:17 346.0 NaN \n", "2 1 8/15/2019 15:28 978.0 NaN \n", "3 5 3/15/2020 18:01 84.0 NaN \n", "4 3 7/8/2020 19:28 1679.0 Houston, TX \n", "... ... ... ... ... \n", "4064 2 6/21/2019 10:52 1239.0 NaN \n", "4065 3 6/21/2019 13:58 1243.0 NaN \n", "4066 2 6/21/2019 14:58 2393.0 Plymouth, United Kingdom \n", "4067 2 9/1/2019 15:52 863.0 St Petersburg, FL \n", "4068 3 8/7/2020 23:47 1792.0 NaN \n", "\n", " postal_code Guest Anon Rental Anon \n", "0 2300 e8d072e5 3fd1e124 \n", "1 2200 99076ea9 c74a5829 \n", "2 1799 794d4098 d9dacb3e \n", "3 2200 245bb108 c74a5829 \n", "4 1051 fcac9b2c 0fd1eb2e \n", "... ... ... ... \n", "4064 1454 0948688e 6361b74c \n", "4065 1454 d9db51e4 154dcb23 \n", "4066 2200 d068cc67 7f9711ed \n", "4067 1063 0ef3d2da eeeb0cf5 \n", "4068 nan ecd81df9 e28934a2 \n", "\n", "[4069 rows x 13 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def anonymize_text(text, length=8):\n", " '''Anonymize a given text using the SHA256 cryptographic hash function. 
Default is to use first 8 characters of hash.'''\n", " byte_text = text.encode('utf8')\n", " text_hash = sha256(byte_text).hexdigest()\n", " return text_hash[:length]\n", "\n", "df['Guest Anon'] = df['Guest Name'].apply(anonymize_text)\n", "df['Rental Anon'] = df['Rental'].apply(anonymize_text)\n", "\n", "# Drop columns with personally identifiable information (PII)\n", "drop_pii_columns = ['Guest Name', 'First Name', 'Last Name', 'Guest_Name', 'Rental_Name', \n", " 'Rental', 'ID', 'Altered', 'Phone', 'Rental_Address']\n", "df = df.drop(drop_pii_columns, axis=1).reset_index(drop=True)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Geolocation of anonymized property" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "postal_code_cache_file = 'cache/postal_code_cache.json'\n", "if os.path.exists(postal_code_cache_file): # if postal code cache exists\n", " with open(postal_code_cache_file, 'r') as fd:\n", " postal_code_cache = json.load(fd) # read postal code cache from file\n", "else:\n", " postal_code_cache = {} # if no postal code cache found, start empty dictionary as cache" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def geolocate_postalcode(postalcode):\n", " '''Geolocate a postal code to a pandas Series with lat and lon.'''\n", " if postalcode in ['None', 'nan'] or postalcode is None or pd.isnull(postalcode):\n", " return pd.Series({'property_lat': None, 'property_lon': None})\n", " lat_lon = postal_code_cache.get(postalcode) # try to read lat and lon from postal code cache\n", " if lat_lon:\n", " # if lat lon from postal code cache is a hit, return a pandas Series\n", " return pd.Series({'property_lat': lat_lon[0], 'property_lon': lat_lon[1]})\n", " else:\n", " # if cache is not hit, geocode the postal code (country is always Denmark)\n", " geo = geocode(query={'postalcode': postalcode, 'country': 'Denmark'})\n", " postal_code_cache[postalcode] = (geo.raw['lat'], geo.raw['lon']) # add fetched data to cache\n", " return pd.Series({'property_lat': geo.raw['lat'], 'property_lon': geo.raw['lon']})\n", "\n", "lat_lon_df = df.postal_code.apply(geolocate_postalcode) # create data frame for lat lon\n", "\n", "with open(postal_code_cache_file, 'w') as fd:\n", " json.dump(postal_code_cache, fd) # write cache to file\n", "df = pd.concat([df, lat_lon_df], axis='columns') # concatenate lat lon data frame by columns (horizontally)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Geolocate guest origin" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "guest_origin_cache_file = 'cache/guest_origin_cache.json'\n", "if os.path.exists(guest_origin_cache_file): # if guest origin cache exists\n", " with open(guest_origin_cache_file, 'r') as fd:\n", " guest_origin_cache = json.load(fd) # read guest origin cache from file\n", "else:\n", " guest_origin_cache = {} # if no guest origin cache found, start empty dictionary as cache\n", "guest_origin_not_found = []" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Problem locating: Lombardy, Italy, Lombardy, Italy\n", "Problem locating: Sundbyberg, Sweden, Toronto, Canada\n", "Problem locating: Stoyanka, Ukraine\n", "Problem locating: Nes Municipality, Faroe Islands\n", "Problem locating: Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom, Greatford, United Kingdom, 
Greatford, United Kingdom\n", "Problem locating: Gothenburg, Sweden, Oslo, Norway\n", "Problem locating: Barcelona, Spain, Barcelona, Spain\n", "Problem locating: Berlin, Germany, Berlin, Germany\n", "Problem locating: Hong Kong SAR, China\n" ] } ], "source": [ "def geolocate_guest_origin(location):\n", " '''Geolocate guest location.'''\n", " if pd.isnull(location):\n", " return pd.Series({'guest_lat': None, 'guest_lon': None, 'guest_country': None})\n", " lat_lon = guest_origin_cache.get(location)\n", " if lat_lon:\n", " return pd.Series({'guest_lat': lat_lon[0], 'guest_lon': lat_lon[1], 'guest_country': lat_lon[2]})\n", " elif location in guest_origin_not_found:\n", " return pd.Series({'guest_lat': None, 'guest_lon': None, 'guest_country': None})\n", " else:\n", " try:\n", " geo = geocode(query=location, addressdetails=True)\n", " guest_origin_cache[location] = (geo.raw['lat'], geo.raw['lon'], geo.raw['address']['country_code'])\n", " print(location, (geo.raw['lat'], geo.raw['lon']), geo.raw['address']['country_code'])\n", " return pd.Series({'guest_lat': geo.raw['lat'], 'guest_lon': geo.raw['lon'], \n", " 'guest_country': geo.raw['address']['country_code']})\n", " except:\n", " guest_origin_not_found.append(location)\n", " print('Problem locating:', location)\n", " return pd.Series({'guest_lat': None, 'guest_lon': None, 'guest_country': None})\n", "\n", "guest_lat_lon_df = df['Guest Location'].apply(geolocate_guest_origin)\n", "\n", "with open(guest_origin_cache_file, 'w') as fd:\n", " json.dump(guest_origin_cache, fd) # write cache to file\n", " \n", "df = pd.concat([df, guest_lat_lon_df], axis='columns')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save anonymized data as CSV" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " Arrive Depart Received Status Adults Cost \\\n", "0 5/16/2019 5/23/2019 5/14/2019 13:20 canceled 3.0 7682.50 \n", "1 4/1/2020 4/4/2020 2/17/2020 00:26 canceled 3.0 1037.81 \n", "2 7/4/2020 7/5/2020 3/4/2019 13:37 canceled 4.0 977.50 \n", "3 4/9/2020 4/14/2020 2/17/2020 00:05 canceled 3.0 419.10 \n", "4 7/4/2020 7/7/2020 2/16/2020 22:59 canceled 2.0 5537.23 \n", "... ... ... ... ... ... ... \n", "4064 6/24/2019 6/26/2019 6/21/2019 10:51 confirmed 2.0 2827.10 \n", "4065 6/24/2019 6/27/2019 6/21/2019 13:57 confirmed 1.0 4028.15 \n", "4066 7/5/2019 7/7/2019 6/21/2019 14:53 confirmed 6.0 5486.24 \n", "4067 9/2/2019 9/4/2019 9/1/2019 15:48 confirmed 2.0 2226.22 \n", "4068 8/8/2020 8/11/2020 8/7/2020 23:45 confirmed 2.0 6326.97 \n", "\n", " Nights Last_modified NightlyRate Guest Location \\\n", "0 7 8/28/2019 15:12 1019.0 NaN \n", "1 3 3/12/2020 17:17 346.0 NaN \n", "2 1 8/15/2019 15:28 978.0 NaN \n", "3 5 3/15/2020 18:01 84.0 NaN \n", "4 3 7/8/2020 19:28 1679.0 Houston, TX \n", "... ... ... ... ... \n", "4064 2 6/21/2019 10:52 1239.0 NaN \n", "4065 3 6/21/2019 13:58 1243.0 NaN \n", "4066 2 6/21/2019 14:58 2393.0 Plymouth, United Kingdom \n", "4067 2 9/1/2019 15:52 863.0 St Petersburg, FL \n", "4068 3 8/7/2020 23:47 1792.0 NaN \n", "\n", " postal_code Guest Anon Rental Anon property_lat \\\n", "0 2300 e8d072e5 3fd1e124 55.651306275214914 \n", "1 2200 99076ea9 c74a5829 55.69475043151038 \n", "2 1799 794d4098 d9dacb3e 55.66627486937496 \n", "3 2200 245bb108 c74a5829 55.69475043151038 \n", "4 1051 fcac9b2c 0fd1eb2e 55.679985160377335 \n", "... ... ... ... ... \n", "4064 1454 0948688e 6361b74c 55.67840178857141 \n", "4065 1454 d9db51e4 154dcb23 55.67840178857141 \n", "4066 2200 d068cc67 7f9711ed 55.69475043151038 \n", "4067 1063 0ef3d2da eeeb0cf5 55.67796323809524 \n", "4068 nan ecd81df9 e28934a2 None \n", "\n", " property_lon guest_lat guest_lon guest_country \n", "0 12.603239181235617 None None None \n", "1 12.550190147092675 None None None \n", "2 12.534955500625006 None None None \n", "3 12.550190147092675 None None None \n", "4 12.590608396226418 29.7589382 -95.3676974 us \n", "... ... ... ... ... \n", "4064 12.569364442857141 None None None \n", "4065 12.569364442857141 None None None \n", "4066 12.550190147092675 50.3712659 -4.1425658 gb \n", "4067 12.583763952380954 27.7703796 -82.6695085 us \n", "4068 None None None None \n", "\n", "[4069 rows x 18 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "df.to_csv('data/processed.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 2 }