{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "Licensed to the Apache Software Foundation (ASF) under one\n", "or more contributor license agreements. See the NOTICE file\n", "distributed with this work for additional information\n", "regarding copyright ownership. The ASF licenses this file\n", "to you under the Apache License, Version 2.0 (the\n", "\"License\"); you may not use this file except in compliance\n", "with the License. You may obtain a copy of the License at\n", " http://www.apache.org/licenses/LICENSE-2.0\n", "Unless required by applicable law or agreed to in writing,\n", "software distributed under the License is distributed on an\n", "\"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", "KIND, either express or implied. See the License for the\n", "specific language governing permissions and limitations\n", "under the License.\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "import os\n", "\n", "import geopandas as gpd\n", "from pyspark.sql import SparkSession\n", "\n", "from sedona.spark import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1\n", "Ivy Default Cache set to: /root/.ivy2/cache\n", "The jars for the packages stored in: /root/.ivy2/jars\n", "org.apache.sedona#sedona-spark-3.4_2.12 added as a dependency\n", "org.datasyslab#geotools-wrapper added as a dependency\n", "uk.co.gresearch.spark#spark-extension_2.12 added as a dependency\n", ":: resolving dependencies :: org.apache.spark#spark-submit-parent-df60331d-6671-41a0-bd2d-449a1210d0b2;1.0\n", "\tconfs: [default]\n", "\tfound org.apache.sedona#sedona-spark-3.4_2.12;1.5.1 in central\n", "\tfound org.apache.sedona#sedona-common;1.5.1 in central\n", "\tfound org.apache.commons#commons-math3;3.6.1 in central\n", "\tfound org.locationtech.jts#jts-core;1.19.0 in central\n", "\tfound org.wololo#jts2geojson;0.16.1 in central\n", "\tfound org.locationtech.spatial4j#spatial4j;0.8 in central\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ ":: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\tfound com.google.geometry#s2-geometry;2.0.0 in central\n", "\tfound com.google.guava#guava;25.1-jre in central\n", "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", "\tfound org.checkerframework#checker-qual;2.0.0 in central\n", "\tfound com.google.errorprone#error_prone_annotations;2.1.3 in central\n", "\tfound com.google.j2objc#j2objc-annotations;1.1 in central\n", "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.14 in central\n", "\tfound com.uber#h3;4.1.1 in central\n", "\tfound net.sf.geographiclib#GeographicLib-Java;1.52 in central\n", "\tfound com.github.ben-manes.caffeine#caffeine;2.9.2 in central\n", "\tfound org.checkerframework#checker-qual;3.10.0 in central\n", "\tfound com.google.errorprone#error_prone_annotations;2.5.1 in central\n", "\tfound org.apache.sedona#sedona-spark-common-3.4_2.12;1.5.1 in central\n", "\tfound commons-lang#commons-lang;2.6 in central\n", "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.5.0 in central\n", "\tfound org.beryx#awt-color-factory;1.0.0 in central\n", "\tfound org.datasyslab#geotools-wrapper;1.5.1-28.2 in central\n", "\tfound uk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 in central\n", "\tfound com.github.scopt#scopt_2.12;4.1.0 in central\n", ":: resolution report :: resolve 304ms :: artifacts dl 9ms\n", "\t:: modules in use:\n", "\tcom.github.ben-manes.caffeine#caffeine;2.9.2 from central in [default]\n", "\tcom.github.scopt#scopt_2.12;4.1.0 from central in [default]\n", "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", "\tcom.google.errorprone#error_prone_annotations;2.5.1 from central in [default]\n", "\tcom.google.geometry#s2-geometry;2.0.0 from central in [default]\n", "\tcom.google.guava#guava;25.1-jre from central in [default]\n", "\tcom.google.j2objc#j2objc-annotations;1.1 from central in [default]\n", "\tcom.uber#h3;4.1.1 from central in [default]\n", "\tcommons-lang#commons-lang;2.6 from central in [default]\n", "\tnet.sf.geographiclib#GeographicLib-Java;1.52 from central in [default]\n", "\torg.apache.commons#commons-math3;3.6.1 from central in [default]\n", "\torg.apache.sedona#sedona-common;1.5.1 from central in [default]\n", "\torg.apache.sedona#sedona-spark-3.4_2.12;1.5.1 from central in [default]\n", "\torg.apache.sedona#sedona-spark-common-3.4_2.12;1.5.1 from central in [default]\n", "\torg.beryx#awt-color-factory;1.0.0 from central in [default]\n", "\torg.checkerframework#checker-qual;3.10.0 from central in [default]\n", "\torg.codehaus.mojo#animal-sniffer-annotations;1.14 from central in [default]\n", "\torg.datasyslab#geotools-wrapper;1.5.1-28.2 from central in [default]\n", "\torg.locationtech.jts#jts-core;1.19.0 from central in [default]\n", "\torg.locationtech.spatial4j#spatial4j;0.8 from central in [default]\n", "\torg.scala-lang.modules#scala-collection-compat_2.12;2.5.0 from central in [default]\n", "\torg.wololo#jts2geojson;0.16.1 from central in [default]\n", "\tuk.co.gresearch.spark#spark-extension_2.12;2.11.0-3.4 from central in [default]\n", "\t:: evicted modules:\n", "\torg.checkerframework#checker-qual;2.0.0 by [org.checkerframework#checker-qual;3.10.0] in [default]\n", "\tcom.google.errorprone#error_prone_annotations;2.1.3 by [com.google.errorprone#error_prone_annotations;2.5.1] in [default]\n", "\t---------------------------------------------------------------------\n", "\t| | modules || artifacts |\n", "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", "\t---------------------------------------------------------------------\n", "\t| default | 25 | 0 | 0 | 2 || 23 | 0 |\n", "\t---------------------------------------------------------------------\n", ":: retrieving :: org.apache.spark#spark-submit-parent-df60331d-6671-41a0-bd2d-449a1210d0b2\n", "\tconfs: [default]\n", "\t0 artifacts copied, 23 already retrieved (0kB/7ms)\n", "24/01/20 23:12:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" ] } ], "source": [ "config = SedonaContext.builder() .\\\n", " config('spark.jars.packages',\n", " 'org.apache.sedona:sedona-spark-3.4_2.12:1.5.1,'\n", " 'org.datasyslab:geotools-wrapper:1.5.1-28.2,'\n", " 'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4'). \\\n", " config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all'). \\\n", " getOrCreate()\n", "\n", "sedona = SedonaContext.create(config)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Geometry Constructors" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ST_Point" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------+\n", "| arealandmark|\n", "+-----------------+\n", "|POINT (1.1 101.1)|\n", "|POINT (2.1 102.1)|\n", "|POINT (3.1 103.1)|\n", "|POINT (4.1 104.1)|\n", "|POINT (5.1 105.1)|\n", "+-----------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "point_csv_df = sedona.read.format(\"csv\").\\\n", " option(\"delimiter\", \",\").\\\n", " option(\"header\", \"false\").\\\n", " load(\"data/testpoint.csv\")\n", "\n", "point_csv_df.createOrReplaceTempView(\"pointtable\")\n", "\n", "point_df = sedona.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable\")\n", "point_df.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ST_GeomFromText" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------+--------------------+\n", "| name| countyshape|\n", "+----------------+--------------------+\n", "| Cuming County|POLYGON ((-97.019...|\n", "|Wahkiakum County|POLYGON ((-123.43...|\n", "| De Baca County|POLYGON ((-104.56...|\n", "|Lancaster County|POLYGON ((-96.910...|\n", "| Nuckolls County|POLYGON ((-98.273...|\n", "+----------------+--------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "polygon_wkt_df = sedona.read.format(\"csv\").\\\n", " option(\"delimiter\", \"\\t\").\\\n", " option(\"header\", \"false\").\\\n", " load(\"data/county_small.tsv\")\n", "\n", "polygon_wkt_df.createOrReplaceTempView(\"polygontable\")\n", "polygon_df = sedona.sql(\"select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable\")\n", "polygon_df.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ST_GeomFromWKB" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------+--------------------+\n", "| name| countyshape|\n", "+----------------+--------------------+\n", "| Cuming County|POLYGON ((-97.019...|\n", "|Wahkiakum County|POLYGON ((-123.43...|\n", "| De Baca County|POLYGON ((-104.56...|\n", "|Lancaster County|POLYGON ((-96.910...|\n", "| Nuckolls County|POLYGON ((-98.273...|\n", "+----------------+--------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "polygon_wkb_df = sedona.read.format(\"csv\").\\\n", " option(\"delimiter\", \"\\t\").\\\n", " option(\"header\", \"false\").\\\n", " load(\"data/county_small_wkb.tsv\")\n", "\n", "polygon_wkb_df.createOrReplaceTempView(\"polygontable\")\n", "polygon_df = sedona.sql(\"select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable\")\n", "polygon_df.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ST_GeomFromGeoJSON" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| countyshape|\n", "+--------------------+\n", "|POLYGON ((-87.621...|\n", "|POLYGON ((-85.719...|\n", "|POLYGON ((-86.000...|\n", "|POLYGON ((-86.574...|\n", "|POLYGON ((-85.382...|\n", "+--------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "polygon_json_df = sedona.read.format(\"csv\").\\\n", " option(\"delimiter\", \"\\t\").\\\n", " option(\"header\", \"false\").\\\n", " load(\"data/testPolygon.json\")\n", "\n", "polygon_json_df.createOrReplaceTempView(\"polygontable\")\n", "polygon_df = sedona.sql(\"select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable\")\n", "polygon_df.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Spatial Operations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Spatial Join - Distance Join" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "== Physical Plan ==\n", "BroadcastIndexJoin pointshape2#257: geometry, LeftSide, LeftSide, Inner, INTERSECTS, ( **org.apache.spark.sql.sedona_sql.expressions.ST_Distance** < 2.0) ST_INTERSECTS(pointshape1#232, pointshape2#257)\n", ":- SpatialIndex pointshape1#232: geometry, RTREE, false, false, 2.0\n", ": +- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point** AS pointshape1#232, abc AS name1#233]\n", ": +- FileScan csv [_c0#228,_c1#229] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/opt/workspace/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n", "+- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point** AS pointshape2#257, def AS name2#258]\n", " +- FileScan csv [_c0#253,_c1#254] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/opt/workspace/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>\n", "\n", "\n", "+-----------------+-----+-----------------+-----+\n", "| pointshape1|name1| pointshape2|name2|\n", "+-----------------+-----+-----------------+-----+\n", "|POINT (1.1 101.1)| abc|POINT (1.1 101.1)| def|\n", "|POINT (2.1 102.1)| abc|POINT (1.1 101.1)| def|\n", "|POINT (1.1 101.1)| abc|POINT (2.1 102.1)| def|\n", "|POINT (2.1 102.1)| abc|POINT (2.1 102.1)| def|\n", "|POINT (3.1 103.1)| abc|POINT (2.1 102.1)| def|\n", "+-----------------+-----+-----------------+-----+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "point_csv_df_1 = sedona.read.format(\"csv\").\\\n", " option(\"delimiter\", \",\").\\\n", " option(\"header\", \"false\").load(\"data/testpoint.csv\")\n", "\n", "point_csv_df_1.createOrReplaceTempView(\"pointtable\")\n", "\n", "point_df1 = sedona.sql(\"SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1, \\'abc\\' as name1 from pointtable\")\n", "point_df1.createOrReplaceTempView(\"pointdf1\")\n", "\n", "point_csv_df2 = sedona.read.format(\"csv\").\\\n", " option(\"delimiter\", \",\").\\\n", " option(\"header\", \"false\").load(\"data/testpoint.csv\")\n", "\n", "point_csv_df2.createOrReplaceTempView(\"pointtable\")\n", "point_df2 = sedona.sql(\"select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2, \\'def\\' as name2 from pointtable\")\n", "point_df2.createOrReplaceTempView(\"pointdf2\")\n", "\n", "distance_join_df = sedona.sql(\"select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2\")\n", "distance_join_df.explain()\n", "distance_join_df.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Spatial Join - Range Join and RDD API Join\n", "\n", "Please refer to the example - airports per country: https://github.com/apache/sedona/blob/master/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Converting GeoPandas to Apache Sedona" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "gdf = gpd.read_file(\"data/gis_osm_pois_free_1.shp\")\n", "gdf = gdf.replace(pd.NA, '')\n", "osm_points = sedona.createDataFrame(\n", " gdf\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- osm_id: string (nullable = true)\n", " |-- code: long (nullable = true)\n", " |-- fclass: string (nullable = true)\n", " |-- name: string (nullable = true)\n", " |-- geometry: geometry (nullable = true)\n", "\n" ] } ], "source": [ "osm_points.printSchema()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+----+---------+--------------+--------------------+\n", "| osm_id|code| fclass| name| geometry|\n", "+--------+----+---------+--------------+--------------------+\n", "|26860257|2422|camp_site| de Kroon|POINT (15.3393145...|\n", "|26860294|2406| chalet|Leśne Ustronie|POINT (14.8709625...|\n", "|29947493|2402| motel| |POINT (15.0946636...|\n", "|29947498|2602| atm| |POINT (15.0732014...|\n", "|29947499|2401| hotel| |POINT (15.0696777...|\n", "+--------+----+---------+--------------+--------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "osm_points.show(5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "osm_points.createOrReplaceTempView(\"points\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "transformed_df = sedona.sql(\n", " \"\"\"\n", " SELECT osm_id,\n", " code,\n", " fclass,\n", " name,\n", " ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom \n", " FROM points\n", " \"\"\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+----+---------+--------------+--------------------+\n", "| osm_id|code| fclass| name| geom|\n", "+--------+----+---------+--------------+--------------------+\n", "|26860257|2422|camp_site| de Kroon|POINT (250776.778...|\n", "|26860294|2406| chalet|Leśne Ustronie|POINT (221076.709...|\n", "|29947493|2402| motel| |POINT (233902.541...|\n", "|29947498|2602| atm| |POINT (232447.203...|\n", "|29947499|2401| hotel| |POINT (232208.377...|\n", "+--------+----+---------+--------------+--------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "transformed_df.show(5)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "transformed_df.createOrReplaceTempView(\"points_2180\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "neighbours_within_1000m = sedona.sql(\"\"\"\n", " SELECT a.osm_id AS id_1,\n", " b.osm_id AS id_2,\n", " a.geom \n", " FROM points_2180 AS a, points_2180 AS b \n", " WHERE ST_Distance(a.geom,b.geom) < 50\n", " \"\"\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "24/01/20 23:12:40 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "+----------+---------+--------------------+\n", "| id_1| id_2| geom|\n", "+----------+---------+--------------------+\n", "| 197624402|197624402|POINT (203703.035...|\n", "| 197663196|197663196|POINT (203936.327...|\n", "| 197953474|197953474|POINT (203724.746...|\n", "|1074233127|262310516|POINT (203524.110...|\n", "| 262310516|262310516|POINT (203507.730...|\n", "|1074233123|262310516|POINT (203505.198...|\n", "|1074232906|270281140|POINT (202816.420...|\n", "| 270281140|270281140|POINT (202809.394...|\n", "| 270306609|270306609|POINT (203639.141...|\n", "|1257728000|270306746|POINT (203730.740...|\n", "| 270306746|270306746|POINT (203694.827...|\n", "|1401424769|270306746|POINT (203717.829...|\n", "| 280402616|280402616|POINT (204597.633...|\n", "| 839725400|280402616|POINT (204584.576...|\n", "| 293896571|293896571|POINT (203064.162...|\n", "|3256728465|293896571|POINT (203078.302...|\n", "| 945009922|311395425|POINT (197087.969...|\n", "|6339786017|311395425|POINT (197102.731...|\n", "| 825853330|311395425|POINT (197088.273...|\n", "| 311395425|311395425|POINT (197093.445...|\n", "+----------+---------+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "neighbours_within_1000m.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Converting Apache Sedona to GeoPandas" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "24/01/20 23:12:41 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.\n" ] } ], "source": [ "df = neighbours_within_1000m.toPandas()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "gdf = gpd.GeoDataFrame(df, geometry=\"geom\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
| \n", " | id_1 | \n", "id_2 | \n", "geom | \n", "
|---|---|---|---|
| 0 | \n", "197624402 | \n", "197624402 | \n", "POINT (203703.036 418398.613) | \n", "
| 1 | \n", "197663196 | \n", "197663196 | \n", "POINT (203936.327 418662.604) | \n", "
| 2 | \n", "197953474 | \n", "197953474 | \n", "POINT (203724.747 418602.854) | \n", "
| 3 | \n", "1074233127 | \n", "262310516 | \n", "POINT (203524.111 417303.619) | \n", "
| 4 | \n", "262310516 | \n", "262310516 | \n", "POINT (203507.731 417345.373) | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 65670 | \n", "2276133152 | \n", "6785548357 | \n", "POINT (254883.018 569956.340) | \n", "
| 65671 | \n", "6785548358 | \n", "6785548358 | \n", "POINT (255246.168 569632.391) | \n", "
| 65672 | \n", "1836163571 | \n", "6797128792 | \n", "POINT (245838.356 547922.793) | \n", "
| 65673 | \n", "6797128792 | \n", "6797128792 | \n", "POINT (245819.115 547966.886) | \n", "
| 65674 | \n", "6817416704 | \n", "6817416704 | \n", "POINT (286325.570 557253.517) | \n", "
65675 rows × 3 columns
\n", "