{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Nearest Points Exercise\n", "\n", "Tamás Gál (tamas.gal@fau.de)\n", "\n", "The latest version of this notebook is available at [https://github.com/escape2020/school2021](https://github.com/escape2020/school2021)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python version: 3.9.4 | packaged by conda-forge | (default, May 10 2021, 22:10:52) \n", "[Clang 11.1.0 ]\n", "NumPy version: 1.20.3\n" ] } ], "source": [ "import numpy as np\n", "import sys\n", "\n", "print(f\"Python version: {sys.version}\\n\"\n", " f\"NumPy version: {np.__version__}\")\n", "\n", "rng = np.random.default_rng(42) # initialise our random number generator" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Given an array of points (in 3D), find the nearest point for each one." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([[0.77395605, 0.43887844, 0.85859792],\n", " [0.69736803, 0.09417735, 0.97562235],\n", " [0.7611397 , 0.78606431, 0.12811363],\n", " ...,\n", " [0.82221355, 0.64818373, 0.78175705],\n", " [0.42816986, 0.63793674, 0.856229 ],\n", " [0.63106544, 0.34767363, 0.66252959]])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "N = 500\n", "n_dims = 3\n", "points = rng.random((N, n_dims))\n", "points" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Solution" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([[0.67185419, 0.96058696, 0.37091232],\n", " [0.42508177, 0.81212296, 0.50576231],\n", " [0.73657309, 0.45970946, 0.21549514],\n", " [0.74520384, 0.13115517, 0.19858366]])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "N = 4\n", "n_dims = 3\n", "points = rng.random((N, n_dims))\n", "points" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 3, 2])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)\n", "distances[np.diag_indices_from(distances)] = np.inf\n", "np.argmin(distances, axis=1)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Discussion" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([[0.67185419, 0.96058696, 0.37091232],\n", " [0.42508177, 0.81212296, 0.50576231],\n", " [0.73657309, 0.45970946, 0.21549514],\n", " [0.74520384, 0.13115517, 0.19858366]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "points" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "(4, 3)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "points.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Adding some extra dimensions to expand the data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([[[0.67185419, 0.96058696, 0.37091232]],\n", "\n", " [[0.42508177, 0.81212296, 0.50576231]],\n", "\n", " [[0.73657309, 0.45970946, 0.21549514]],\n", "\n", " [[0.74520384, 0.13115517, 0.19858366]]])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reshaped_points = points.reshape(N, 1, n_dims)\n", "reshaped_points" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "(4, 1, 3)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reshaped_points.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Calculating the differences (results in vectors pointing to each other)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "array([[[ 0. , 0. , 0. ],\n", " [ 0.24677242, 0.148464 , -0.13484999],\n", " [-0.0647189 , 0.5008775 , 0.15541718],\n", " [-0.07334965, 0.82943179, 0.17232866]],\n", "\n", " [[-0.24677242, -0.148464 , 0.13484999],\n", " [ 0. , 0. , 0. ],\n", " [-0.31149133, 0.3524135 , 0.29026717],\n", " [-0.32012208, 0.68096779, 0.30717865]],\n", "\n", " [[ 0.0647189 , -0.5008775 , -0.15541718],\n", " [ 0.31149133, -0.3524135 , -0.29026717],\n", " [ 0. , 0. , 0. ],\n", " [-0.00863075, 0.3285543 , 0.01691148]],\n", "\n", " [[ 0.07334965, -0.82943179, -0.17232866],\n", " [ 0.32012208, -0.68096779, -0.30717865],\n", " [ 0.00863075, -0.3285543 , -0.01691148],\n", " [ 0. , 0. , 0. ]]])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "differences = reshaped_points - points\n", "differences" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(4, 1, 3) - (4, 3) => (4, 4, 3)\n" ] } ], "source": [ "print(reshaped_points.shape, \" - \", points.shape, \" => \", differences.shape)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "array([[0. , 0.31799797, 0.52841395, 0.85031432],\n", " [0.31799797, 0. , 0.55269987, 0.81274473],\n", " [0.52841395, 0.55269987, 0. , 0.32910244],\n", " [0.85031432, 0.81274473, 0.32910244, 0. ]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distances = np.linalg.norm(differences, axis=2)\n", "distances" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "(4, 4)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distances.shape" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "array([[0. , 0.31799797, 0.52841395, 0.85031432],\n", " [0.31799797, 0. , 0.55269987, 0.81274473],\n", " [0.52841395, 0.55269987, 0. , 0.32910244],\n", " [0.85031432, 0.81274473, 0.32910244, 0. ]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distances" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# get rid of self-distances first ;)\n", "distances[np.diag_indices_from(distances)] = np.inf" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([[ inf, 0.31799797, 0.52841395, 0.85031432],\n", " [0.31799797, inf, 0.55269987, 0.81274473],\n", " [0.52841395, 0.55269987, inf, 0.32910244],\n", " [0.85031432, 0.81274473, 0.32910244, inf]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distances" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 3, 2])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nearest_points = np.argmin(distances, axis=1)\n", "nearest_points" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Comparing to `scikit-learn`" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 3, 2])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nearest_points" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 3, 2])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.neighbors import NearestNeighbors\n", "_, indices = NearestNeighbors().fit(points).kneighbors(points, 2)\n", "indices[:, 1]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "84 µs ± 1.67 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], "source": [ "%timeit _, indices = NearestNeighbors().fit(points).kneighbors(points, 2)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11.6 µs ± 229 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" ] } ], "source": [ "%%timeit\n", "distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)\n", "distances[np.diag_indices_from(distances)] = np.inf\n", "np.argmin(distances, axis=1)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "fragment" } }, "source": [ "#### Depending on the data size, our implementation may be faster!\n", "\n", "...but it also requires much more memory O(N^2) due to the blow-up-trick." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.62682498, 0.7472698 , 0.89468789],\n", " [0.2725865 , 0.11072426, 0.95604666],\n", " [0.15442309, 0.19766698, 0.29132945],\n", " ...,\n", " [0.46039559, 0.13064526, 0.39747408],\n", " [0.4116557 , 0.18771592, 0.75630711],\n", " [0.32140068, 0.58898729, 0.51573018]])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "N = 1000\n", "n_dims = 3\n", "points = rng.random((N, n_dims))\n", "points" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "892 µs ± 7.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], "source": [ "%timeit _, indices = NearestNeighbors().fit(points).kneighbors(points, 2)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "22.9 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)\n", "distances[np.diag_indices_from(distances)] = np.inf\n", "np.argmin(distances, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "@webio": { "lastCommId": "e800f9477722428abc03f280b811860b", "lastKernelId": "cc438d68-999a-42f6-91d0-437b70ceb0c8" }, "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 4 }