{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Nearest Points Exercise\n",
    "\n",
    "Tamás Gál (tamas.gal@fau.de)\n",
    "\n",
    "The latest version of this notebook is available at [https://github.com/escape2020/school2021](https://github.com/escape2020/school2021)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Python version:  3.9.4 | packaged by conda-forge | (default, May 10 2021, 22:10:52) \n",
      "[Clang 11.1.0 ]\n",
      "NumPy version:   1.20.3\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import sys\n",
    "\n",
    "print(f\"Python version:  {sys.version}\\n\"\n",
    "      f\"NumPy version:   {np.__version__}\")\n",
    "\n",
    "rng = np.random.default_rng(42)  # initialise our random number generator"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Given an array of points (in 3D), find the nearest point for each one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.77395605, 0.43887844, 0.85859792],\n",
       "       [0.69736803, 0.09417735, 0.97562235],\n",
       "       [0.7611397 , 0.78606431, 0.12811363],\n",
       "       ...,\n",
       "       [0.82221355, 0.64818373, 0.78175705],\n",
       "       [0.42816986, 0.63793674, 0.856229  ],\n",
       "       [0.63106544, 0.34767363, 0.66252959]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "N = 500\n",
    "n_dims = 3\n",
    "points = rng.random((N, n_dims))\n",
    "points"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.67185419, 0.96058696, 0.37091232],\n",
       "       [0.42508177, 0.81212296, 0.50576231],\n",
       "       [0.73657309, 0.45970946, 0.21549514],\n",
       "       [0.74520384, 0.13115517, 0.19858366]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "N = 4\n",
    "n_dims = 3\n",
    "points = rng.random((N, n_dims))\n",
    "points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 3, 2])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)\n",
    "distances[np.diag_indices_from(distances)] = np.inf\n",
    "np.argmin(distances, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Discussion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.67185419, 0.96058696, 0.37091232],\n",
       "       [0.42508177, 0.81212296, 0.50576231],\n",
       "       [0.73657309, 0.45970946, 0.21549514],\n",
       "       [0.74520384, 0.13115517, 0.19858366]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 3)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "points.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Adding some extra dimensions to expand the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[[0.67185419, 0.96058696, 0.37091232]],\n",
       "\n",
       "       [[0.42508177, 0.81212296, 0.50576231]],\n",
       "\n",
       "       [[0.73657309, 0.45970946, 0.21549514]],\n",
       "\n",
       "       [[0.74520384, 0.13115517, 0.19858366]]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reshaped_points = points.reshape(N, 1, n_dims)\n",
    "reshaped_points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 1, 3)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reshaped_points.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Calculating the differences (results in vectors pointing to each other)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[[ 0.        ,  0.        ,  0.        ],\n",
       "        [ 0.24677242,  0.148464  , -0.13484999],\n",
       "        [-0.0647189 ,  0.5008775 ,  0.15541718],\n",
       "        [-0.07334965,  0.82943179,  0.17232866]],\n",
       "\n",
       "       [[-0.24677242, -0.148464  ,  0.13484999],\n",
       "        [ 0.        ,  0.        ,  0.        ],\n",
       "        [-0.31149133,  0.3524135 ,  0.29026717],\n",
       "        [-0.32012208,  0.68096779,  0.30717865]],\n",
       "\n",
       "       [[ 0.0647189 , -0.5008775 , -0.15541718],\n",
       "        [ 0.31149133, -0.3524135 , -0.29026717],\n",
       "        [ 0.        ,  0.        ,  0.        ],\n",
       "        [-0.00863075,  0.3285543 ,  0.01691148]],\n",
       "\n",
       "       [[ 0.07334965, -0.82943179, -0.17232866],\n",
       "        [ 0.32012208, -0.68096779, -0.30717865],\n",
       "        [ 0.00863075, -0.3285543 , -0.01691148],\n",
       "        [ 0.        ,  0.        ,  0.        ]]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "differences = reshaped_points - points\n",
    "differences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4, 1, 3)  -  (4, 3)  =>  (4, 4, 3)\n"
     ]
    }
   ],
   "source": [
    "print(reshaped_points.shape, \" - \", points.shape, \" => \", differences.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.31799797, 0.52841395, 0.85031432],\n",
       "       [0.31799797, 0.        , 0.55269987, 0.81274473],\n",
       "       [0.52841395, 0.55269987, 0.        , 0.32910244],\n",
       "       [0.85031432, 0.81274473, 0.32910244, 0.        ]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distances = np.linalg.norm(differences, axis=2)\n",
    "distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 4)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distances.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.31799797, 0.52841395, 0.85031432],\n",
       "       [0.31799797, 0.        , 0.55269987, 0.81274473],\n",
       "       [0.52841395, 0.55269987, 0.        , 0.32910244],\n",
       "       [0.85031432, 0.81274473, 0.32910244, 0.        ]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# get rid of self-distances first ;)\n",
    "distances[np.diag_indices_from(distances)] = np.inf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[       inf, 0.31799797, 0.52841395, 0.85031432],\n",
       "       [0.31799797,        inf, 0.55269987, 0.81274473],\n",
       "       [0.52841395, 0.55269987,        inf, 0.32910244],\n",
       "       [0.85031432, 0.81274473, 0.32910244,        inf]])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 3, 2])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nearest_points = np.argmin(distances, axis=1)\n",
    "nearest_points"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Comparing to `scikit-learn`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 3, 2])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nearest_points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 3, 2])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.neighbors import NearestNeighbors\n",
    "_, indices = NearestNeighbors().fit(points).kneighbors(points, 2)\n",
    "indices[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84 µs ± 1.67 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit _, indices = NearestNeighbors().fit(points).kneighbors(points, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11.6 µs ± 229 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)\n",
    "distances[np.diag_indices_from(distances)] = np.inf\n",
    "np.argmin(distances, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "#### Depending on the data size, our implementation may be faster!\n",
    "\n",
    "...but it also requires much more memory O(N^2) due to the blow-up-trick."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.62682498, 0.7472698 , 0.89468789],\n",
       "       [0.2725865 , 0.11072426, 0.95604666],\n",
       "       [0.15442309, 0.19766698, 0.29132945],\n",
       "       ...,\n",
       "       [0.46039559, 0.13064526, 0.39747408],\n",
       "       [0.4116557 , 0.18771592, 0.75630711],\n",
       "       [0.32140068, 0.58898729, 0.51573018]])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "N = 1000\n",
    "n_dims = 3\n",
    "points = rng.random((N, n_dims))\n",
    "points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "892 µs ± 7.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit _, indices = NearestNeighbors().fit(points).kneighbors(points, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22.9 ms ± 1.57 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "distances = np.linalg.norm((points.reshape(N, 1, n_dims) - points), axis=2)\n",
    "distances[np.diag_indices_from(distances)] = np.inf\n",
    "np.argmin(distances, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "@webio": {
   "lastCommId": "e800f9477722428abc03f280b811860b",
   "lastKernelId": "cc438d68-999a-42f6-91d0-437b70ceb0c8"
  },
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}