{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "58ab37b6", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.200222Z", "start_time": "2024-05-17T13:20:50.188014Z" } }, "outputs": [], "source": [ "import geopy\n", "from geopy.extra.rate_limiter import RateLimiter\n", "\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import geopandas as gpd\n", "\n", "from matplotlib import pyplot as plt\n", "import shapely\n", "\n", "%matplotlib inline\n", "\n", "import contextily as ctx\n", "import requests\n", "import plotly.express as px\n", "from zipfile import ZipFile\n", "import json\n", "import random\n", "import os, urllib\n", "import seaborn as sbn\n", "\n", "#from tqdm.auto import tqdm, trange\n", "from tqdm import tqdm, trange\n", "tqdm.pandas()\n", "\n", "from geopy_nominatim_wrapper import NominatimWrapper\n", "from geopy_bestaddress import BestAddress\n", "from geopy_pelias import Pelias\n", "from geopy_bepelias import BePelias\n", "\n", "\n", "from credentials import (here_api_key, \n", " bing_api_key, \n", " mapbox_api_key, \n", " tomtom_api_key, \n", " google_api_key, \n", " arcgis_username, arcgis_password,\n", " best_client_id, best_client_secret, best_hostname)\n", "\n", "\n", "\n", "fig_path = \"output/geocoding/figs\"\n", "data_dir = \"data/geocoding\"\n", "output_dir=\"output/geocoding/\"\n", "# from fpdf import FPDF " ] }, { "cell_type": "code", "execution_count": null, "id": "beb6e357", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.283170Z", "start_time": "2024-05-17T13:20:50.202973Z" } }, "outputs": [], "source": [ "# dataset = \"rrn_1000\"\n", "# dataset = \"kbo_1000\"\n", "# dataset = \"best_1000\"\n", "# dataset = \"best2_1000\"\n", "# dataset = \"rep_1000\"\n", "# dataset = \"resto_1000\"\n", "\n", "dataset = \"kbo_10000\"\n", "# dataset = \"rrn_10000\"\n", "# dataset = \"rep_10000\"\n", "# dataset = \"best_10000\"" ] }, { "cell_type": "code", "execution_count": null, "id": "1b48d835", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.338338Z", "start_time": "2024-05-17T13:20:50.285163Z" } }, "outputs": [], "source": [ "crs = 'epsg:3857'\n", "osm_crs= 'epsg:4326'" ] }, { "cell_type": "code", "execution_count": null, "id": "2734364b", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.389917Z", "start_time": "2024-05-17T13:20:50.341296Z" } }, "outputs": [], "source": [ "def_figsize = (10,8)" ] }, { "cell_type": "code", "execution_count": null, "id": "99213f34", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.441356Z", "start_time": "2024-05-17T13:20:50.391916Z" } }, "outputs": [], "source": [ "os.makedirs(f\"{output_dir}/geocoded_data/\", exist_ok=True)\n", "os.makedirs(f\"{output_dir}/reports/\", exist_ok=True)\n", "os.makedirs(fig_path, exist_ok=True)\n", "os.makedirs(data_dir, exist_ok=True)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5b32450f", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.492025Z", "start_time": "2024-05-17T13:20:50.443459Z" } }, "outputs": [], "source": [ "def add_basemap(ax, zoom='auto'):\n", " \"\"\"\n", " Add a basemap on a plot. Tries first default (Stamen) basemap. If errors, tries OpenStreetMap.Mapnik\n", " \n", " Parameters\n", " ----------\n", " ax: matplotlib axes\n", " \n", " Returns\n", " -------\n", " None\n", " \"\"\"\n", "\n", " try: \n", "# ctx.add_basemap(ax, zoom=zoom)\n", "# except requests.HTTPError:\n", "# print(\"Default basemap doesn't work...\")\n", " ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, zoom=zoom)\n", " except ValueError as e:\n", " print(\"Value error...\")\n", " print(e)\n", " except NameError as e:\n", " print(\"contextily not installed\")" ] }, { "cell_type": "code", "execution_count": null, "id": "cf4dc166", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.568237Z", "start_time": "2024-05-17T13:20:50.494535Z" } }, "outputs": [], "source": [ "def set_optimal_limits(ax, df):\n", " \"\"\"\n", " Adapt xlim/ylim to a GeoDataFrame point plot to avoid plot to be too wide when \n", " points are horizontally aligned, and to narrow when points are vertically aligned\n", "\n", " Usage : \n", " \n", " ax = df.plot()\n", " set_optimal_limits(ax, df)\n", " \n", " Parameters\n", " ----------\n", " ax: AxesSubplot\n", " plot to resize\n", " df: GeoDataFrame\n", " data to be plotted\n", "\n", " Returns\n", " -------\n", " None\n", " \"\"\"\n", " \n", " plot_ratio = 1.5 # optimal ratio between \"one horizontal degree\" and \"one vertical degree\". It depends of the CRS. \n", " # For \"polar\" CRS, it may also depend of the place on the globe\n", "\n", " minimal_width=200\n", " \n", " margins = 1.1 # Avoid having dots on edges of the plot\n", " \n", "\n", " # Compute dimension of the data\n", " xmin, ymin, xmax, ymax = df.total_bounds\n", " height = (ymax - ymin) \n", " width = (xmax - xmin)\n", " \n", " opt_height = max(height, width / plot_ratio, minimal_width / plot_ratio)\n", " opt_width = max(width , height*plot_ratio, minimal_width)\n", " \n", "# print(xmin, ymin, xmax, ymax)\n", "# print(width, height, opt_width, opt_height)\n", " # If plot is too narrow, increase xmin. If plot is too wide, increase ylim\n", "\n", " if opt_height > height :\n", " ymid = (ymax+ymin)/2\n", " mid_height = opt_height * margins / 2\n", " ax.set_ylim(ymid - mid_height, ymid + mid_height)\n", " if opt_width > width:\n", " xmid = (xmax+xmin)/2\n", " mid_width = opt_width* margins/2\n", " ax.set_xlim(xmid - mid_width, xmid + mid_width)" ] }, { "cell_type": "code", "execution_count": null, "id": "93ee5b8a", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.627134Z", "start_time": "2024-05-17T13:20:50.570356Z" } }, "outputs": [], "source": [ "def pdf_savefig():\n", " try: \n", " pdf.savefig(bbox_inches='tight')\n", " except AttributeError:\n", " print(\"PDF probably closed\")" ] }, { "cell_type": "code", "execution_count": null, "id": "2b8514b1", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.677767Z", "start_time": "2024-05-17T13:20:50.629387Z" } }, "outputs": [], "source": [ "# plot_address(geocoded_data, geocoded_data.iloc[5].address)" ] }, { "cell_type": "markdown", "id": "318d1fe5", "metadata": {}, "source": [ "# Prepare geocoders" ] }, { "cell_type": "code", "execution_count": null, "id": "23fe71c7", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.738211Z", "start_time": "2024-05-17T13:20:50.681978Z" } }, "outputs": [], "source": [ "def get_precision(record):\n", " \n", " precision_field= {\n", " \"nominatim\": \"class\",\n", " \"nominatim_loc\": \"class\",\n", " \"nominatim_wrapper\": (\"metadata\", \"placeRank\"),\n", "# \"nominatim_wrapper2\": \"place_rank\",\n", " \"here\": \"MatchLevel\",\n", " \"photon\": (\"properties\", \"type\"),\n", " \"bing\": \"entityType\",\n", " \"mapbox\": (\"properties\", \"accuracy\"),\n", " \"tomtom\": \"type\",\n", " \"trillium\": \"match_level\",\n", " \"google\":(\"geometry\", \"location_type\"),\n", " \"bestaddress\": \"precision\",\n", " \"lpost_bestaddress\": \"precision\",\n", " \"be_pelias\": \"precision\",\n", " }\n", " \n", " label_mapping = {\n", " \"nominatim\": {\"building\": \"building\",\n", " \"amenity\": \"building\",\n", " \"shop\": \"building\",\n", " \"place\": \"building\",\n", " \"highway\": \"street\"},\n", " \n", " \n", " \"nominatim_loc\": {\"building\": \"building\",\n", " \"amenity\": \"building\",\n", " \"shop\": \"building\",\n", " \"place\": \"building\",\n", " \"highway\": \"street\"},\n", " \n", " \"nominatim_wrapper\": {\"30\": \"building\", \n", " \"27\": \"street\",\n", " \"26\": \"street\",\n", " \"25\": \"city\",\n", " \"22\": \"city\",\n", " \"21\": \"city\",\n", " \"20\": \"city\",\n", " \"19\": \"city\",\n", " \"18\": \"city\",\n", " \"16\": \"city\",\n", " \"14\": \"city\"},\n", "\n", " \"here\": {\"houseNumber\": \"building\",\n", " \"street\": \"street\",\n", " \"city\": \"city\",\n", " \"postalCode\": \"city\"},\n", " \"photon\": {\n", " \"house\": \"building\",\n", " \"locality\": \"city\",\n", " \"district\": \"city\",\n", " \"city\": \"city\",\n", " \"street\": \"street\"},\n", " \"bing\": { \"Address\": \"building\",\n", " \"RoadBlock\": \"street\",\n", " \"PopulatedPlace\": \"city\",\n", " \"Postcode1\": \"city\",\n", " \"CountryRegion\": \"country\"\n", " },\n", " \"mapbox\": {\"rooftop\": \"building\",\n", " \"point\": \"building\",\n", " \"interpolated\": \"building\",\n", " \"street\": \"street\"},\n", " \n", " \"tomtom\": {\"Point Address\": \"building\",\n", " \"Street\": \"street\",\n", " \"Address Range\": \"street\",\n", " \"Cross Street\": \"street\",\n", " \"Geography\": \"city\"},\n", " \"trillium\": {0:\"building\",\n", " 2: \"city\",\n", " 3: \"street\"},\n", " \"google\" : {\n", " \"ROOFTOP\": \"building\",\n", " \"RANGE_INTERPOLATED\": \"building\",\n", " \"GEOMETRIC_CENTER\": \"street\",\n", " \"APPROXIMATE\": \"city\"\n", " },\n", " \"bestaddress\":{\n", " \"building\": \"building\",\n", " \"street\":\"street\",\n", " \"country\": \"country\",\n", " \"city\":\"city\"\n", " },\n", " \"lpost_bestaddress\":{\n", " \"building\": \"building\",\n", " \"street\":\"street\",\n", " \"country\": \"country\"\n", " },\n", " \n", " \"be_pelias\": {\n", " \"address\": \"building\",\n", " \"address_00\": \"country\",\n", " \"address_streetcenter\": \"street\",\n", " \"address_interpol\": \"building\",\n", " \"street_interpol\": \"building\",\n", " \"street_00\": \"country\",\n", " \"street\": \"street\",\n", " \"city\": \"city\",\n", " \"country\": \"country\",\n", " },\n", "\n", "\n", " }\n", " \n", "# if \"pelias\" in record.geocoder:\n", " \n", "# if record.location.point == geopy.Point(0,0):\n", "# return \"country\"\n", " \n", "# raw_label = record.location.raw[\"properties\"]\n", " \n", "# if 'interpolated' in record.location.raw[\"bepelias\"] and record.location.raw[\"bepelias\"][\"interpolated\"]==\"street_center\":\n", "# return \"street\" \n", "\n", "# if (raw_label[\"match_type\"] in [\"exact\", \"interpolated\"] or raw_label[\"accuracy\"]==\"point\") and 'housenumber' in raw_label:\n", "# return \"building\"\n", " \n", "\n", "# if \"street\" in raw_label:\n", "# return \"street\"\n", "# if \"region\" in raw_label:\n", "# return \"city\"\n", "# return \"country\"\n", " \n", " if \"jcd\" in record.geocoder:\n", " \n", " if record.location.point == geopy.Point(0,0):\n", " return \"country\"\n", " return \"building\"\n", " \n", " if \"arcgis\" in record.geocoder:\n", " attr = record.location.raw[\"attributes\"]\n", " if \"AddNum\" in attr and len(attr[\"AddNum\"])>0:\n", " return \"building\"\n", " if \"StName\" in attr and len(attr[\"StName\"])>0:\n", " return \"street\"\n", " if \"City\" in attr and len(attr[\"City\"])>0:\n", " return \"city\"\n", " \n", " return \"[UNKNOWN]\"\n", " \n", " try: \n", " # print(record.geocoder)\n", " f = precision_field[record.geocoder]\n", " # print(f)\n", " mapper = label_mapping[record.geocoder]\n", " if isinstance(f, str):\n", " raw_label = record.location.raw[f]\n", " else: \n", " raw_label = record.location.raw[f[0]][f[1]]\n", " return mapper[raw_label] if raw_label in mapper else f\"[UNKNOWN - {raw_label}]\"\n", " except KeyError: \n", " if record.geocoder == \"mapbox\":\n", " return \"city\"\n", " return \"[UNKNOWN]\"\n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "24853111", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.800502Z", "start_time": "2024-05-17T13:20:50.740143Z" }, "tags": [] }, "outputs": [], "source": [ "# geocoded_data.loc[151].location.raw\n", "# geocoded_data.loc[0].location.raw" ] }, { "cell_type": "code", "execution_count": null, "id": "ce2729b5", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.856218Z", "start_time": "2024-05-17T13:20:50.802958Z" }, "scrolled": true }, "outputs": [], "source": [ "# get_precision(geocoded_data.loc[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "a8c93483", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.940561Z", "start_time": "2024-05-17T13:20:50.858369Z" } }, "outputs": [], "source": [ "geocoders = {}" ] }, { "cell_type": "code", "execution_count": null, "id": "42723411", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:50.987452Z", "start_time": "2024-05-17T13:20:50.957898Z" } }, "outputs": [], "source": [ "from geopy.geocoders import Nominatim\n", "geocoders['nominatim'] = Nominatim(user_agent=\"smalsresearch\", domain=\"172.27.0.64:8080\", scheme=\"http\", timeout=1000)\n", "# geocoders['nominatim'] = Nominatim(user_agent=\"smalsresearch\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "690a72a6", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.061507Z", "start_time": "2024-05-17T13:20:50.989611Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoders['nominatim_wrapper'] = NominatimWrapper(user_agent=\"smalsresearch\", scheme=\"http\", timeout=1000, domain=\"172.27.0.64:5000\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7dbdff0a", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.116731Z", "start_time": "2024-05-17T13:20:51.063585Z" } }, "outputs": [], "source": [ "geocoders['bestaddress'] = BestAddress(user_agent=\"smalsresearch\", scheme=\"https\", timeout=1000, \n", " client_id=best_client_id, client_secret=best_client_secret,\n", " domain=best_hostname,\n", " verbose=False\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "7d8c98a7", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.185823Z", "start_time": "2024-05-17T13:20:51.119167Z" } }, "outputs": [], "source": [ "# from geopy.geocoders import Pelias\n", "geocoders['pelias'] = Pelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4005\", scheme=\"http\", timeout=1000)\n", "geocoders['pelias_struct'] = Pelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4005\", scheme=\"http\", timeout=1000)\n", "geocoders['pelias_struct_noloc'] = Pelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4005\", scheme=\"http\", timeout=1000, with_localities=False)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8bd21dd8", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.408406Z", "start_time": "2024-05-17T13:20:51.337778Z" } }, "outputs": [], "source": [ "# from geopy.geocoders import Pelias\n", "# For peliase installed through bePelias procedure\n", "geocoders['be_pelias_basic_unstruct'] = Pelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4000\", scheme=\"http\", timeout=1000)\n", "geocoders['be_pelias_basic_struct'] = Pelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4000\", scheme=\"http\", timeout=1000)\n", "geocoders['be_pelias_basic_struct_noloc'] = Pelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4000\", scheme=\"http\", timeout=1000, with_localities=False)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "587b7e22", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.432853Z", "start_time": "2024-05-17T13:20:51.410420Z" } }, "outputs": [], "source": [ "# geocoders['be_pelias'].geocode({\"street\": \"Rue des Vierges\", \"housenumber\": \"25\", \"postcode\": \"9600\", \"city\": \"renaix\"}).raw" ] }, { "cell_type": "code", "execution_count": null, "id": "ac14f51a", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.492779Z", "start_time": "2024-05-17T13:20:51.434872Z" } }, "outputs": [], "source": [ "geocoders['be_pelias_basic'] = BePelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4001\", scheme=\"http\", timeout=1000,\n", "# mode=\"simple\")\n", " mode=\"basic\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "572ed926", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.543586Z", "start_time": "2024-05-17T13:20:51.494606Z" } }, "outputs": [], "source": [ "geocoders['be_pelias_simple'] = BePelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4001\", scheme=\"http\", timeout=1000,\n", " mode=\"simple\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1781c844", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.597924Z", "start_time": "2024-05-17T13:20:51.545688Z" } }, "outputs": [], "source": [ "geocoders['be_pelias'] = BePelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4001\", scheme=\"http\", timeout=1000,\n", "# mode=\"simple\")\n", " mode=\"advanced\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f857bfdb", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.643785Z", "start_time": "2024-05-17T13:20:51.599711Z" } }, "outputs": [], "source": [ "# geocoders['be_pelias_basic_struct'] = BePelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4001\", scheme=\"http\", timeout=1000,\n", "# # mode=\"simple\")\n", "# mode=\"struct\")\n", "\n", "# geocoders['be_pelias_basic_struct_noloc'] = BePelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4001\", scheme=\"http\", timeout=1000,\n", "# # mode=\"simple\")\n", "# mode=\"struct_noloc\")\n", "\n", "# geocoders['be_pelias_basic_unstruct'] = BePelias(user_agent=\"smalsresearch\", domain=\"172.27.0.64:4001\", scheme=\"http\", timeout=1000,\n", "# # mode=\"simple\")\n", "# mode=\"unstruct\")\n", "geocoders['be_pelias'].geocode({\"street\": \"Avenue Fonsny\", \"housenumber\": \"20\", \"postcode\": \"1060\", \"city\": \"Saint-Gilles\"})" ] }, { "cell_type": "code", "execution_count": null, "id": "c291bdb6", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.720397Z", "start_time": "2024-05-17T13:20:51.645727Z" } }, "outputs": [], "source": [ "from geopy.geocoders import Here\n", "geocoders['here'] = Here(apikey=here_api_key)" ] }, { "cell_type": "code", "execution_count": null, "id": "930897d9", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.771992Z", "start_time": "2024-05-17T13:20:51.727800Z" } }, "outputs": [], "source": [ "from geopy.geocoders import Bing\n", "geocoders['bing'] = Bing(api_key=bing_api_key)" ] }, { "cell_type": "code", "execution_count": null, "id": "200fe417", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.826761Z", "start_time": "2024-05-17T13:20:51.774045Z" } }, "outputs": [], "source": [ "from geopy.geocoders import Photon\n", "# geocoders['photon'] = Photon(domain=\"photon.komoot.io\")\n", "\n", "geocoders['photon'] = Photon(domain=\"127.0.0.1:2322\", scheme=\"http\")\n", "\n", "# geocoders['photon'].geocode(\"Avenue Fonsny 20, 1060 Bruxelles\").raw" ] }, { "cell_type": "code", "execution_count": null, "id": "9febad3e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.881590Z", "start_time": "2024-05-17T13:20:51.828603Z" } }, "outputs": [], "source": [ "from geopy.geocoders import MapBox\n", "geocoders['mapbox'] = MapBox(api_key=mapbox_api_key)" ] }, { "cell_type": "code", "execution_count": null, "id": "ad827b2e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.932723Z", "start_time": "2024-05-17T13:20:51.883540Z" } }, "outputs": [], "source": [ "from geopy.geocoders import TomTom\n", "geocoders['tomtom'] =TomTom(api_key=tomtom_api_key)" ] }, { "cell_type": "code", "execution_count": null, "id": "e050da29", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:51.982985Z", "start_time": "2024-05-17T13:20:51.934643Z" } }, "outputs": [], "source": [ "from geopy.geocoders import GoogleV3\n", "geocoders[\"google\"] = GoogleV3(api_key=google_api_key)" ] }, { "cell_type": "code", "execution_count": null, "id": "467819fc", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:52.058354Z", "start_time": "2024-05-17T13:20:51.984663Z" } }, "outputs": [], "source": [ "from geopy.geocoders import ArcGIS\n", "arcgis = ArcGIS(username=arcgis_username, password=arcgis_password, referer=\"www.smalsresearch.Be\")\n", "arcgis._geocode = arcgis.geocode\n", "arcgis.geocode = lambda x: arcgis._geocode(x, out_fields=\"*\")\n", "geocoders[\"arcgis\"] = arcgis" ] }, { "cell_type": "code", "execution_count": null, "id": "d4f20d78", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:52.108706Z", "start_time": "2024-05-17T13:20:52.060219Z" } }, "outputs": [], "source": [ "from matplotlib.backends.backend_pdf import PdfPages\n" ] }, { "cell_type": "code", "execution_count": null, "id": "41d97cc8", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.091420Z", "start_time": "2024-05-17T13:20:52.110832Z" }, "scrolled": true }, "outputs": [], "source": [ "# geocoders['be_pelias'].geocode(\"1060 bxl, belgique\").raw" ] }, { "cell_type": "code", "execution_count": null, "id": "43219655", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.096131Z", "start_time": "2024-05-17T13:20:53.093625Z" } }, "outputs": [], "source": [ "# geocoders['nominatim_wrapper'].geocode(\"Rue de Namur, 5190 Spy\").raw" ] }, { "cell_type": "code", "execution_count": null, "id": "52ea0507", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.609089Z", "start_time": "2024-05-17T13:20:53.098116Z" } }, "outputs": [], "source": [ "# geocoders['nominatim_wrapper'].geocode(\"Dorpstraat, Geldrode\").raw" ] }, { "cell_type": "markdown", "id": "50a58b96", "metadata": { "ExecuteTime": { "end_time": "2021-09-06T13:04:40.465866Z", "start_time": "2021-09-06T13:04:40.442905Z" } }, "source": [ "# Load data\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c94a8933", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.687548Z", "start_time": "2024-05-17T13:20:53.611330Z" } }, "outputs": [], "source": [ "data = pd.read_csv(f\"{data_dir}/{dataset}.csv.gz\", dtype=str)\n", "\n", "mandatory_columns = [\"street\", \"housenumber\", \"postcode\", \"city\"]\n", "for f in mandatory_columns:\n", " assert f in data, f\"Field {f} is mandatory !!\"\n" ] }, { "cell_type": "code", "execution_count": null, "id": "06f50a4e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.762044Z", "start_time": "2024-05-17T13:20:53.689665Z" } }, "outputs": [], "source": [ "data[\"address\"] = data.street.fillna(\"\")+\", \"+data.housenumber.fillna(\"\")+\", \"+data.postcode+\" \"+data.city+\", Belgique\"\n", "data" ] }, { "cell_type": "code", "execution_count": null, "id": "03c1c973", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.814069Z", "start_time": "2024-05-17T13:20:53.764098Z" } }, "outputs": [], "source": [ "data = data.reset_index().rename(columns={\"index\": \"reference_key\"})" ] }, { "cell_type": "code", "execution_count": null, "id": "5f7bb2b5", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.871903Z", "start_time": "2024-05-17T13:20:53.815813Z" } }, "outputs": [], "source": [ "# data" ] }, { "cell_type": "code", "execution_count": null, "id": "04834dd1", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:53.989965Z", "start_time": "2024-05-17T13:20:53.874128Z" } }, "outputs": [], "source": [ "pdf = PdfPages(f\"output/geocoding/reports/report_{dataset}.pdf\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4bcf7df6", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:54.209455Z", "start_time": "2024-05-17T13:20:53.992076Z" }, "tags": [] }, "outputs": [], "source": [ "metadata= [[\"Dataset\", dataset],\n", " [\"Nb records\", data.shape[0]]]\n", "\n", "\n", "fig, ax =plt.subplots()\n", "ax.set_axis_off() \n", "tbl = ax.table(cellText = [ r[1:] for r in metadata],\n", " rowLabels = [r[0] for r in metadata],\n", " rowColours = [\"palegreen\"] * len(metadata),\n", " loc=\"upper left\")\n", "tbl.scale(1,2)\n", "pdf_savefig()" ] }, { "cell_type": "markdown", "id": "7a8dfba5", "metadata": {}, "source": [ "# Geocode" ] }, { "cell_type": "code", "execution_count": null, "id": "98620a16", "metadata": { "ExecuteTime": { "end_time": "2023-09-29T08:54:50.842892Z", "start_time": "2023-09-29T08:54:50.838299Z" } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "be449a25", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:54.215529Z", "start_time": "2024-05-17T13:20:54.211696Z" } }, "outputs": [], "source": [ "try: \n", " no_res_timing = pd.read_pickle(f\"output/geocoding/no_res_timing_{dataset}.pkl\")\n", "except FileNotFoundError: \n", " no_res_timing = pd.DataFrame(columns =[\"address\", \"geocoder\", \"duration\"] )" ] }, { "cell_type": "code", "execution_count": null, "id": "1f0e87bd", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:54.301020Z", "start_time": "2024-05-17T13:20:54.217188Z" } }, "outputs": [], "source": [ "import datetime\n", "#no_res_timing ={}\n", "def geocode_and_timeit(geocoder, addr):\n", " global no_res_timing\n", " start = datetime.datetime.now()\n", " res = geocoder(addr)\n", " if res:\n", " res.raw.update({\"duration\": datetime.datetime.now()-start})\n", " else: \n", " geocoder_name = list(filter(lambda y: y[1] == geocoder.__self__ , [(x, geocoders[x]) for x in geocoders]))[0][0]\n", " \n", " no_res_timing = pd.concat([no_res_timing, pd.DataFrame([{\"address\": addr,\n", " \"duration\": (datetime.datetime.now()-start).total_seconds(), \n", " \"geocoder\": geocoder_name}])])\n", " return res" ] }, { "cell_type": "code", "execution_count": null, "id": "8dd9c21f", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.253878Z", "start_time": "2024-05-17T13:20:54.303576Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data_r = [] #gpd.GeoDataFrame()\n", "coders=[\"nominatim_wrapper\", \"nominatim\", \"here\", \"bing\", \"mapbox\", \"tomtom\", \"google\", \"bestaddress\", \n", " \"ngi_pelias\"]\n", "coders=[\"nominatim_wrapper\", \"nominatim\", \"here\", \"bing\", \"google\", \"tomtom\", \"ngi_pelias\"]\n", "\n", "coders=[\"nominatim\", \"be_pelias\", \"here\", \"bing\", \"google\", \"tomtom\"]\n", "\n", "# coders=[\"here\", \"bing\", \"google\", \"nominatim\", \"nominatim_wrapper\", \"be_pelias\"]\n", "\n", "# coders=[\"pelias_struct_noloc\", \"pelias_struct\", \"pelias\"]\n", "\n", "# coders=[\"be_pelias_basic_struct_noloc\", \"be_pelias_basic_struct\", \"be_pelias_basic_unstruct\"]\n", "\n", "# coders=[\"be_pelias\"]\n", "\n", "# coders=[\"be_pelias\", \"pelias_struct_noloc\", \"pelias_struct\", \"pelias\"]\n", "\n", "#coders=[\"be_pelias_basic_struct_noloc\", \"be_pelias_basic_struct\", \"be_pelias_basic_unstruct\", \"pelias_struct_noloc\", \"pelias_struct\", \"pelias\"]\n", "\n", "# coders=[ \"bing\", \"google\", \"here\", \"tomtom\", \"nominatim\", \"nominatim_wrapper\", \"pelias\", \"ngi_pelias\", \"jcd\"]\n", "# coders=[ \"bing\", \"google\", \"here\", \"tomtom\", \"nominatim\", \"pelias\", \"photon\"]\n", "\n", "# coders = [\"here\", \"bing\", \"tomtom\", \"google\", \"nominatim_wrapper\", \"nominatim\"]\n", "# coders = [\"here\", \"bing\", \"tomtom\", \"google\", \"nominatim_loc\"]\n", "\n", "# coders = [\"nominatim\"]\n", "# from_file = coders\n", "# from_file = [] \n", "\n", "# coders =[\"pelias\", \"pelias_struct\", \"ngi_pelias_basic\", \"ngi_pelias_simple\", \"ngi_pelias\"]\n", "\n", "# from_file = coders\n", "\n", "from_file = coders\n", "\n", "# coders = [\"be_pelias\"]\n", "from_file=[] \n", "\n", "\n", "\n", "delay={\"mapbox\":0.5, \"nominatim_wrapper\":0.01, \"nominatim\":0.01, \"lpost_bestaddress\": 0.01, \"google\":0.05, \"tomtom\": 0.4, \n", " \"bing\": 0.05, \"arcgis\": 0.05}\n", "for c in coders:\n", " if \"pelias\" in c:\n", " delay[c] = 0.01\n", "\n", "\n", "for coder in coders: #geocoders:\n", " print(coder)\n", " \n", " if coder in from_file:\n", " print(\"Load from local file...\")\n", " g_data = pd.read_pickle(f\"output/geocoding/geocoded_data/geocoded_{dataset}_{coder}.pkl\")\n", " else:\n", " \n", " # Erase no-res timing for this coder for any previous run\n", " no_res_timing = no_res_timing[no_res_timing.geocoder != coder]\n", " \n", " g = geocoders[coder]\n", " g.geocode_timeit = lambda x: geocode_and_timeit(g.geocode, x)\n", " geocode = RateLimiter(g.geocode_timeit, min_delay_seconds=delay[coder] if coder in delay else 0.2)\n", "\n", " if coder in [\"bestaddress\", \"be_pelias\", 'be_pelias_basic', 'be_pelias_simple', \"pelias_struct\", \"pelias_struct_noloc\",\n", " \"be_pelias_basic_struct\", \"be_pelias_basic_struct_noloc\"]: # structured addresses\n", " g_data = data.assign(location=data[[\"street\", \"housenumber\", \"postcode\", \"city\"]].apply(dict, axis=1).progress_apply(geocode)).assign(geocoder=coder)\n", "\n", " else: \n", " g_data = data.assign(location=(data['address'].str.replace(\"/\", \"%2F\") if coder==\"tomtom\" else data['address']).progress_apply(geocode)).assign(geocoder=coder)\n", "# g_data = data.assign(location=data['address'].progress_apply(geocode)).assign(geocoder=coder)\n", " g_data.to_pickle(f\"output/geocoding/geocoded_data/geocoded_{dataset}_{coder}.pkl\")\n", " \n", " geocoded_data_r.append(g_data)\n", " display(g_data)" ] }, { "cell_type": "markdown", "id": "20eda90e", "metadata": { "ExecuteTime": { "end_time": "2022-05-06T11:23:47.951791Z", "start_time": "2022-05-06T11:23:47.946802Z" } }, "source": [ "## Gather results" ] }, { "cell_type": "code", "execution_count": null, "id": "d6aed4b7", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.412660Z", "start_time": "2024-05-17T13:20:56.321333Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data = pd.concat( geocoded_data_r).reset_index(drop=True)\n", "geocoded_data = geocoded_data[geocoded_data.location.notnull()]\n", "geocoded_data" ] }, { "cell_type": "code", "execution_count": null, "id": "2654c77a-01cd-4d1b-8762-b448c2703520", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a470c20e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.451072Z", "start_time": "2024-05-17T13:20:56.414510Z" } }, "outputs": [], "source": [ "data[~data.reference_key.isin(geocoded_data[geocoded_data.geocoder==\"be_pelias\"].reference_key)]" ] }, { "cell_type": "code", "execution_count": null, "id": "8e7fb22c-c5e3-42ea-8aea-bba8d3ad21ea", "metadata": {}, "outputs": [], "source": [ "data.loc[[780]].apply(geocoders[\"be_pelias\"].geocode, axis=1).iloc[0]#.raw" ] }, { "cell_type": "code", "execution_count": null, "id": "64631711-a69d-437f-adda-7d3bbeba73ea", "metadata": {}, "outputs": [], "source": [ "# if \"be_pelias\" in geocoders:\n", "# print(geocoded_data[geocoded_data.geocoder==\"be_pelias\"].location.apply(lambda r : r.raw[\"bepelias\"][\"call_cnt\"]).value_counts().sort_index())#.plot.bar())\n", "# geocoders\n", "# geocoded_data.iloc[0].location.raw[\"precision\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "5265d688-e993-4768-9163-6eda01a37065", "metadata": {}, "outputs": [], "source": [ "if \"be_pelias\" in coders:\n", " geocoded_data[geocoded_data.geocoder==\"be_pelias\"].location.apply(lambda r : r.raw[\"bepelias\"][\"peliasCallCount\"]).value_counts().sort_index().plot.bar()" ] }, { "cell_type": "code", "execution_count": null, "id": "570c91c0-5d8b-4796-bfc1-10081fffbac8", "metadata": {}, "outputs": [], "source": [ "if \"be_pelias\" in coders:\n", " print(\"Mean Pelias call count: \", end=\"\")\n", " print(geocoded_data[geocoded_data.geocoder==\"be_pelias\"].location.apply(lambda r : r.raw[\"bepelias\"][\"peliasCallCount\"]).mean()) # 3.137413741374137 -> 2.648164816481648 --> 2.3695369536953694" ] }, { "cell_type": "code", "execution_count": null, "id": "780fdb41", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.576639Z", "start_time": "2024-05-17T13:20:56.524248Z" } }, "outputs": [], "source": [ "geocoded_data.geocoder.value_counts()" ] }, { "cell_type": "markdown", "id": "73e6ff22", "metadata": { "ExecuteTime": { "end_time": "2022-01-27T07:06:32.229136Z", "start_time": "2022-01-27T07:06:31.112564Z" } }, "source": [ "# Duration" ] }, { "cell_type": "code", "execution_count": null, "id": "063d1d76", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.641969Z", "start_time": "2024-05-17T13:20:56.578622Z" } }, "outputs": [], "source": [ "geocoded_data[\"duration\"] = geocoded_data[\"location\"].apply(lambda loc: loc.raw[\"duration\"] if loc and \"duration\" in loc.raw else None).dt.total_seconds()" ] }, { "cell_type": "code", "execution_count": null, "id": "a95dac82", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.683138Z", "start_time": "2024-05-17T13:20:56.643636Z" } }, "outputs": [], "source": [ "geocoded_data.groupby(\"geocoder\").address.count().sum()#[\"duration\"]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "356465fe", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.731840Z", "start_time": "2024-05-17T13:20:56.685016Z" } }, "outputs": [], "source": [ "no_res_timing.to_pickle(f\"output/geocoding/no_res_timing_{dataset}.pkl\")" ] }, { "cell_type": "code", "execution_count": null, "id": "eae4f2dd", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.802716Z", "start_time": "2024-05-17T13:20:56.733627Z" } }, "outputs": [], "source": [ "geocoded_data_duration = pd.concat([geocoded_data.assign(with_result=True), \n", " no_res_timing[no_res_timing.geocoder.isin(coders)].assign(with_result=False)])\n", "geocoded_data_duration = geocoded_data_duration.reset_index()[[\"address\", \"geocoder\", \"duration\", \"with_result\"]]\n", "geocoded_data_duration[\"address\"] = geocoded_data_duration[\"address\"].astype(str)" ] }, { "cell_type": "code", "execution_count": null, "id": "5dc14f87", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.851327Z", "start_time": "2024-05-17T13:20:56.804716Z" } }, "outputs": [], "source": [ "geocoded_data_duration.sort_values(\"duration\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b41436c7", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:20:56.897808Z", "start_time": "2024-05-17T13:20:56.853247Z" } }, "outputs": [], "source": [ "geocoded_data_duration.duration.quantile(0.99)" ] }, { "cell_type": "code", "execution_count": null, "id": "67e93704", "metadata": { "ExecuteTime": { "end_time": "2024-04-11T11:44:10.113810Z", "start_time": "2024-04-11T11:44:09.627437Z" } }, "outputs": [], "source": [ "plt.figure(figsize=def_figsize)\n", "sbn.histplot(geocoded_data_duration[geocoded_data_duration.duration0.005 else \"\", fontproperties={\"size\":\"x-small\"})\n", "\n", "\n", " plt.savefig(f'{fig_path}/matching_rate_{dataset}.png', dpi=150, bbox_inches='tight')\n", "\n", " sbn.set(font_scale=1)\n", " \n", " return mr_prec\n" ] }, { "cell_type": "code", "execution_count": null, "id": "336bfc22", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:00.311498Z", "start_time": "2024-05-17T13:21:00.254064Z" } }, "outputs": [], "source": [ "import matplotlib\n", "matplotlib.__version__\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9e123b10", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:01.644784Z", "start_time": "2024-05-17T13:21:00.313288Z" } }, "outputs": [], "source": [ "mr_prec = plot_matching_rate(geocoded_data)\n", "pdf_savefig()" ] }, { "cell_type": "code", "execution_count": null, "id": "eeba208a", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:01.655122Z", "start_time": "2024-05-17T13:21:01.646870Z" } }, "outputs": [], "source": [ "# no builing MR (for Nominatim):\n", "# best : 31.6+0.4 = 32 %\n", "# kbo : 24+1 = 25 %\n", "# resto : 13.1 + 4.5 = 17.5 \n", "# be_pelias\t96.9%\t1.9%\t0.4%\t0.8%\n", "with pd.option_context('display.float_format', '{:0.1%}'.format):\n", " display(mr_prec.fillna(\"\"))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a6cb7730", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:01.733128Z", "start_time": "2024-05-17T13:21:01.657024Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data[(geocoded_data.geocoder==\"be_pelias\") & (geocoded_data.precision==\"city\")]" ] }, { "cell_type": "code", "execution_count": null, "id": "b9d2a585-b2aa-4277-ba86-15a83ee6d252", "metadata": {}, "outputs": [], "source": [ "def has_boxinfo(f):\n", " try: \n", " return \"box_info\" in f.raw[\"properties\"][\"addendum\"][\"best\"]\n", " except KeyError:\n", " return False" ] }, { "cell_type": "code", "execution_count": null, "id": "38ba18bc-0046-4b68-a81b-a53b63858e49", "metadata": {}, "outputs": [], "source": [ "geocoded_data[(geocoded_data.geocoder==\"be_pelias\") & (geocoded_data.precision==\"country\") & geocoded_data.location.apply(has_boxinfo)]" ] }, { "cell_type": "code", "execution_count": null, "id": "e4641afd", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:02.147646Z", "start_time": "2024-05-17T13:21:02.062386Z" } }, "outputs": [], "source": [ "# geocoded_data\n", "addr_in_bru = geocoded_data[geocoded_data.postcode.astype(int).between(1000, 1299)].address\n", "geocoded_data_bru = geocoded_data[geocoded_data.address.isin(addr_in_bru)] \n", "\n", "# mr_prec = plot_matching_rate(geocoded_data_bru, f\"Matching rate - Precision (within Brussels - {dataset})\")\n", "# pdf_savefig()" ] }, { "cell_type": "code", "execution_count": null, "id": "d89d77d9", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:02.355774Z", "start_time": "2024-05-17T13:21:02.204021Z" }, "scrolled": true }, "outputs": [], "source": [ "if \"be_pelias\" in geocoded_data.geocoder.unique():\n", " vc = geocoded_data[geocoded_data.geocoder==\"be_pelias\"].location.apply(lambda rec: rec.raw[\"bepelias\"][\"callType\"]+\" - \"+(rec.raw[\"bepelias\"][\"transformers\"] or \"*\")).value_counts()\n", " display(vc)\n", " \n", " _vc = vc.reset_index()\n", " _vc.location = _vc.location.where(_vc[\"count\"]>200, \"other\")\n", " _vc.groupby(\"location\")[\"count\"].sum().sort_values(ascending=False).rename(\"\").plot.pie(autopct='%1.1f%%', \n", " title=f\"Transformer ({dataset})\")\n", "\n", "# vc.plot.pie()" ] }, { "cell_type": "markdown", "id": "2ce3a333", "metadata": { "ExecuteTime": { "end_time": "2023-06-12T12:47:05.367541Z", "start_time": "2023-06-12T12:47:05.367526Z" } }, "source": [ "# Distance to median statistics" ] }, { "cell_type": "markdown", "id": "ad76b5e6", "metadata": {}, "source": [ "## Extract location" ] }, { "cell_type": "code", "execution_count": null, "id": "b3ffd957-2e26-4f44-bbc6-72dd9f69ed63", "metadata": {}, "outputs": [], "source": [ "geocoded_data[\"location\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "0f31c752-72d0-4bfc-8ead-153703a91635", "metadata": {}, "outputs": [], "source": [ "# geocoded_data.loc[58123]#.location\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3ab4cb67", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:02.788119Z", "start_time": "2024-05-17T13:21:02.712145Z" } }, "outputs": [], "source": [ "lat = geocoded_data[\"location\"].apply(lambda loc: loc.point.latitude if loc and loc.point.latitude >0 else None)\n", "lng = geocoded_data[\"location\"].apply(lambda loc: loc.point.longitude if loc else None)\n", "geocoded_data[\"point\"] = np.where(lat.notnull(), gpd.points_from_xy(lng, lat), pd.NA)" ] }, { "cell_type": "code", "execution_count": null, "id": "31030439", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:02.845867Z", "start_time": "2024-05-17T13:21:02.789941Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data[geocoded_data.point.isnull()]#.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "d4abfd39", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:03.267425Z", "start_time": "2024-05-17T13:21:03.261866Z" } }, "outputs": [], "source": [ "geocoded_data = geocoded_data[geocoded_data.point.notnull()] " ] }, { "cell_type": "code", "execution_count": null, "id": "0615dc8a", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:03.389143Z", "start_time": "2024-05-17T13:21:03.269424Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data = gpd.GeoDataFrame(geocoded_data).set_geometry(\"point\").set_crs(osm_crs, allow_override=True).to_crs(crs)\n", "geocoded_data" ] }, { "cell_type": "code", "execution_count": null, "id": "9fd83cfd-e4ce-4846-bb7c-41187903c974", "metadata": {}, "outputs": [], "source": [ "# geocoded_data[geocoded_data.geocoder==\"be_pelias\"].is_in_belgium.value_counts()\n", "geocoded_data" ] }, { "cell_type": "code", "execution_count": null, "id": "bcbc15f8", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:06.021512Z", "start_time": "2024-05-17T13:21:03.391253Z" }, "scrolled": true }, "outputs": [], "source": [ "from matplotlib.colors import ListedColormap\n", "\n", "colors = ListedColormap([f\"C{i}\" for i in range(geocoded_data.geocoder.nunique())], name=\"mycmap\")\n", "ax=geocoded_data.plot(\"geocoder\", legend=True, figsize=def_figsize, cmap=colors)\n", "plt.title(f\"Global view ({dataset})\")\n", "add_basemap(ax)\n", "# img_name=f\"{fig_path}/global_{dataset}.png\"\n", "# plt.savefig(img_name, bbox_inches='tight')\n", "# pdf_add_image(pdf, img_name)\n", "# pdf.add_page()\n", "# pdf.image(f\"{fig_path}/global_{dataset}.png\", 0, 0, 290)\n", "plt.axis(\"off\")\n", "pdf_savefig()\n", "# plt.show()\n", "# plt.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "95ebc2fb-f3ef-4ffb-af53-8713267f375a", "metadata": {}, "outputs": [], "source": [ "# colors = ListedColormap([f\"C{i}\" for i in range(geocoded_data.precision.nunique())], name=\"mycmap\")\n", "# ax=geocoded_data[geocoded_data.geocoder==\"be_pelias\"].plot(\"precision\", legend=True, figsize=def_figsize, cmap=colors)\n", "# add_basemap(ax)" ] }, { "cell_type": "code", "execution_count": null, "id": "0eb156dd", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:06.337648Z", "start_time": "2024-05-17T13:21:06.023563Z" }, "scrolled": true }, "outputs": [], "source": [ "belgium_boundaries = gpd.read_file(\"data/boundaries.geojson\", driver=\"GeoJSON\")\n", "belgium_boundaries = belgium_boundaries[belgium_boundaries.name == \"BEL\"].geometry.iloc[0].simplify(1000).buffer(1000)\n", "# belgium_boundaries" ] }, { "cell_type": "code", "execution_count": null, "id": "e19de26d", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:06.342631Z", "start_time": "2024-05-17T13:21:06.339670Z" } }, "outputs": [], "source": [ "import fiona" ] }, { "cell_type": "code", "execution_count": null, "id": "d08ff1d7", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:07.365135Z", "start_time": "2024-05-17T13:21:06.344610Z" } }, "outputs": [], "source": [ "geocoded_data[\"is_in_belgium\"]= geocoded_data.geometry.within(belgium_boundaries)\n", "geocoded_data[\"is_in_belgium\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "ffb5b035", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:07.378641Z", "start_time": "2024-05-17T13:21:07.367097Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data[~geocoded_data.is_in_belgium]" ] }, { "cell_type": "code", "execution_count": null, "id": "904b03d1", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:07.477020Z", "start_time": "2024-05-17T13:21:07.380514Z" } }, "outputs": [], "source": [ "geocoded_data.groupby(\"geocoder\").is_in_belgium.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "eb06ee4e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:07.535694Z", "start_time": "2024-05-17T13:21:07.479179Z" } }, "outputs": [], "source": [ "geocoded_data[~geocoded_data.is_in_belgium]" ] }, { "cell_type": "code", "execution_count": null, "id": "b6db3b15", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:09.864743Z", "start_time": "2024-05-17T13:21:07.537606Z" } }, "outputs": [], "source": [ "ax=geocoded_data[geocoded_data.is_in_belgium].plot(\"geocoder\", legend=True, figsize=def_figsize)\n", "plt.title(\"Belgian view\")\n", "add_basemap(ax)\n", "pdf_savefig()" ] }, { "cell_type": "code", "execution_count": null, "id": "f23d15ff-394c-4137-ad93-ada2e7d03b23", "metadata": {}, "outputs": [], "source": [ "# geocoded_data[geocoded_data.is_in_belgium].drop(columns=[\"location\"]).explore(\"geocoder\")" ] }, { "cell_type": "markdown", "id": "44b36007", "metadata": {}, "source": [ "## Compute (distance to) median" ] }, { "cell_type": "code", "execution_count": null, "id": "b5878b81", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:09.870505Z", "start_time": "2024-05-17T13:21:09.867078Z" } }, "outputs": [], "source": [ "def get_median_point(bloc):\n", " bloc = gpd.GeoSeries(bloc.dropna())\n", " x = np.median(bloc.x)\n", " y = np.median(bloc.y)\n", " return shapely.geometry.Point(x, y), bloc.shape[0]\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "5251d20f", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:09.927804Z", "start_time": "2024-05-17T13:21:09.872548Z" } }, "outputs": [], "source": [ "# geocoded_data.address.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": null, "id": "ca93cfc0", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:10.684054Z", "start_time": "2024-05-17T13:21:09.930058Z" } }, "outputs": [], "source": [ "median_geocoders = coders # [\"here\", \"bing\", \"mapbox\", \"tomtom\", \"nominatim_wrapper\", \"google\", \"bestaddress\"] # Median based on those geocoders\n", "median_points = geocoded_data[geocoded_data.geocoder.isin(median_geocoders)&( geocoded_data.precision==\"building\") ].groupby(\"address\").point.apply(get_median_point).apply(pd.Series)#.rename(\"median_point\").reset_index()\n", "median_points = median_points.rename(columns={0: \"median_point\", 1: \"nb_points\"}).reset_index()\n", "median_points = median_points.set_geometry(\"median_point\").set_crs(geocoded_data.crs)\n", "median_points" ] }, { "cell_type": "code", "execution_count": null, "id": "07f93dc8", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:10.780616Z", "start_time": "2024-05-17T13:21:10.686328Z" } }, "outputs": [], "source": [ "median_points.nb_points.value_counts().plot.pie()" ] }, { "cell_type": "code", "execution_count": null, "id": "5a124318", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:10.788640Z", "start_time": "2024-05-17T13:21:10.782877Z" } }, "outputs": [], "source": [ "median_points = median_points[median_points.nb_points >= min(3,median_points.nb_points.max()) ]\n", "median_points.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "1a7d0cf4", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:10.855951Z", "start_time": "2024-05-17T13:21:10.790548Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data = geocoded_data.merge(median_points, how=\"left\")#.dropna()\n", "# geocoded_data" ] }, { "cell_type": "code", "execution_count": null, "id": "42dc3eab", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:10.929451Z", "start_time": "2024-05-17T13:21:10.857669Z" }, "scrolled": true }, "outputs": [], "source": [ "geocoded_data[\"dist_to_median\"] = geocoded_data.point.distance(geocoded_data.median_point.set_crs(geocoded_data.crs))\n", "geocoded_data.sort_values(\"dist_to_median\")#.tail(100)" ] }, { "cell_type": "code", "execution_count": null, "id": "097d8ac3", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:10.957420Z", "start_time": "2024-05-17T13:21:10.931412Z" } }, "outputs": [], "source": [ "# geocoded_data[geocoded_data.median_point.isnull()].sort_values(\"address\")" ] }, { "cell_type": "code", "execution_count": null, "id": "2fbdb298", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:11.008631Z", "start_time": "2024-05-17T13:21:10.959270Z" } }, "outputs": [], "source": [ "# geocoded_data.sort_values(\"dist_to_median\").tail(40).geocoder.value_counts().plot.bar()" ] }, { "cell_type": "code", "execution_count": null, "id": "bc1c9059", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:11.089510Z", "start_time": "2024-05-17T13:21:11.011023Z" } }, "outputs": [], "source": [ "def plot_address(geocoded_data, a):\n", " df = geocoded_data[(geocoded_data.address==a)]\n", " display(df)\n", " \n", "\n", " ax = df.assign(label=df.geocoder + \" (\"+df.dist_to_median.fillna(-1).astype(int).astype(str) + \" - \" + df.precision + \")\").plot(\n", " \"label\", \n", " legend=True, \n", " markersize=100,\n", " figsize=def_figsize\n", " )\n", "\n", " if df.dist_to_median.isnull().all():\n", " print(\"No median point\")\n", " else: \n", " med=df[[\"median_point\"]].drop_duplicates().set_geometry(\"median_point\")\n", " \n", " med.plot(ax=ax, color=\"red\", marker=\"x\", markersize=100)\n", " ax.set_title(a)\n", " set_optimal_limits(ax, df)\n", " add_basemap(ax)\n", " plt.show() \n", " return ax" ] }, { "cell_type": "code", "execution_count": null, "id": "0525395b", "metadata": { "ExecuteTime": { "end_time": "2024-04-11T11:44:30.012536Z", "start_time": "2024-04-11T11:44:29.930474Z" } }, "outputs": [], "source": [ "# plot_address(geocoded_data, geocoded_data.loc[46].address)" ] }, { "cell_type": "code", "execution_count": null, "id": "1657ae8e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:14.809356Z", "start_time": "2024-05-17T13:21:11.144975Z" }, "scrolled": true }, "outputs": [], "source": [ "for a in geocoded_data.sort_values(\"dist_to_median\", ascending=False).address.unique()[0:3]:\n", " #print(a)\n", " plot_address(geocoded_data, a)\n", " " ] }, { "cell_type": "markdown", "id": "2423cc1b", "metadata": {}, "source": [ "## Median pertinence\n", "\n", "How many close to median ?" ] }, { "cell_type": "code", "execution_count": null, "id": "11394aa2", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:14.824969Z", "start_time": "2024-05-17T13:21:14.811863Z" } }, "outputs": [], "source": [ "nb_close_to_median = geocoded_data.assign(close_to_median = geocoded_data.dist_to_median < 100).groupby(\"address\").close_to_median.sum()\n", "nb_close_to_median" ] }, { "cell_type": "code", "execution_count": null, "id": "f046f4d6", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:14.886397Z", "start_time": "2024-05-17T13:21:14.826698Z" }, "scrolled": true }, "outputs": [], "source": [ "median_points = median_points.merge(nb_close_to_median.rename(\"nb_close_to_median\").reset_index())" ] }, { "cell_type": "code", "execution_count": null, "id": "e0e41e04", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:14.944033Z", "start_time": "2024-05-17T13:21:14.888331Z" } }, "outputs": [], "source": [ "median_points.nb_close_to_median.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "acf88c5c", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:15.000983Z", "start_time": "2024-05-17T13:21:14.946084Z" } }, "outputs": [], "source": [ "median_points[median_points.nb_close_to_median>=2]" ] }, { "cell_type": "code", "execution_count": null, "id": "bb5afa1c", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:15.044657Z", "start_time": "2024-05-17T13:21:15.002924Z" } }, "outputs": [], "source": [ "# nb_close_to_median.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "063cab3f", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:15.109477Z", "start_time": "2024-05-17T13:21:15.046678Z" } }, "outputs": [], "source": [ "geocoded_data = geocoded_data.merge(median_points, how=\"left\")" ] }, { "cell_type": "code", "execution_count": null, "id": "ee68566e", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:15.155231Z", "start_time": "2024-05-17T13:21:15.111269Z" } }, "outputs": [], "source": [ "geocoded_data.loc[geocoded_data.nb_close_to_median <2, \"dist_to_median\"] = pd.NA" ] }, { "cell_type": "code", "execution_count": null, "id": "7851eb88", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:15.219535Z", "start_time": "2024-05-17T13:21:15.156956Z" } }, "outputs": [], "source": [ "geocoded_data[geocoded_data.dist_to_median.notnull()].address.nunique()" ] }, { "cell_type": "code", "execution_count": null, "id": "2a1862c5", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:17.474342Z", "start_time": "2024-05-17T13:21:15.221269Z" }, "scrolled": true }, "outputs": [], "source": [ "for a in nb_close_to_median[nb_close_to_median==0].iloc[0:4].index:\n", " if geocoded_data[geocoded_data.address == a].median_point.notnull().any():\n", " plot_address(geocoded_data[geocoded_data.median_point.notnull()], a)" ] }, { "cell_type": "code", "execution_count": null, "id": "c578d59a", "metadata": { "ExecuteTime": { "end_time": "2024-05-17T13:21:17.693038Z", "start_time": "2024-05-17T13:21:17.476590Z" } }, "outputs": [], "source": [ "geocoded_data.to_pickle(f\"output/geocoding/geocoded_data/geocoded_{dataset}_all.pkl\")" ] }, { "cell_type": "markdown", "id": "d1311903", "metadata": { "ExecuteTime": { "end_time": "2021-09-10T07:34:51.827274Z", "start_time": "2021-09-10T07:34:51.822752Z" } }, "source": [ "## Distance to median" ] }, { "cell_type": "markdown", "id": "d5bb34d9", "metadata": {}, "source": [ "### Average" ] }, { "cell_type": "code", "execution_count": null, "id": "96dc30ba", "metadata": { "ExecuteTime": { "end_time": "2024-04-11T11:44:36.210927Z", "start_time": "2024-04-11T11:44:36.104418Z" } }, "outputs": [], "source": [ "# d2m = geocoded_data.groupby(\"geocoder\").dist_to_median.mean()\n", "d2m = geocoded_data.groupby(\"geocoder\").dist_to_median.mean()\n", "d2m.plot.bar()\n", "# plt.savefig(f\"{fig_path}/dist2med_{dataset}.png\")\n", "d2m" ] }, { "cell_type": "code", "execution_count": null, "id": "ac527f02", "metadata": { "ExecuteTime": { "end_time": "2024-04-11T11:44:36.318830Z", "start_time": "2024-04-11T11:44:36.212658Z" } }, "outputs": [], "source": [ "# skipping non reliable median\n", "\n", "# d2m = geocoded_data[geocoded_data.is_reliable].groupby(\"geocoder\").dist_to_median.mean()\n", "d2m = geocoded_data[geocoded_data.is_in_belgium].groupby(\"geocoder\").dist_to_median.mean()\n", "d2m.plot.bar()\n", "# plt.savefig(f\"{fig_path}/dist2med_reliable_{dataset}.png\")\n", "d2m" ] }, { "cell_type": "code", "execution_count": null, "id": "f7ffc914-5e26-4feb-abdc-416de19fdb27", "metadata": {}, "outputs": [], "source": [ "thresh=100\n", "ax=(geocoded_data[geocoded_data.dist_to_median