{ "cells": [ { "cell_type": "markdown", "source": [ "# Distances\n", "\n", "In this notebook, we will use aeon for time series distance computation.\n", "\n", "### Preliminaries" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 9, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", "from aeon.datasets import load_macroeconomic\n", "from aeon.distances import distance" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "## Distances\n", "\n", "The goal of a distance computation is to measure the similarity between the time series\n", "'x' and 'y'. A distance function should take x and y as parameters and return a float\n", "that is the computed distance between x and y. The value returned should be 0.0 when\n", "the time series are exactly the same, and a value greater than 0.0 that is a measure of\n", "distance between them, when they are different.\n", "\n", "Take the following four time series:" ], "metadata": { "collapsed": false } },
{ "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "# load_macroeconomic needs the optional statsmodels package: pip install statsmodels\n", "X = load_macroeconomic()\n", "# Skip the first 3 observations and cut the rest into four equal-length series, one\n", "# per made-up country (np.split with an integer requires an even division).\n", "country_d, country_c, country_b, country_a = np.split(X[\"realgdp\"].to_numpy()[3:], 4)\n", "\n", "# Legend labels match the variable names used in the rest of the notebook.\n", "plt.plot(country_a, label=\"Country A\")\n", "plt.plot(country_b, label=\"Country B\")\n", "plt.plot(country_c, label=\"Country C\")\n", "plt.plot(country_d, label=\"Country D\")\n", "plt.xlabel(\"Quarters from 1959\")\n", "plt.ylabel(\"Real GDP\")\n", "plt.legend()" ], "metadata": { "collapsed": false } },
{ "cell_type": "markdown", "source": [ "The above shows a made up scenario comparing the GDP growth of four countries (country\n", "A, B, C and D) by quarter from 1959. 
If our task is to determine how different country\n", "C is from our other countries, one way to do this is to measure the distance between\n", "each pair of countries.\n", "\n", "How to use the distance module to perform tasks such as these will now be outlined." ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "## Distance module\n", "\n", "To begin using the distance module we need at least two time series, x and y, and they\n", "must be numpy arrays. We've established the various time series we'll be using for this\n", "example above as country_a, country_b, country_c and country_d. To compute the distance\n", "between x and y we can use the Euclidean distance as shown:" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "outputs": [ { "data": { "text/plain": "27014.721294922445" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Simple euclidean distance\n", "distance(country_a, country_b, metric=\"euclidean\")" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "As shown above, taking the distance between country_a and country_b returns a single\n", "float that represents their dissimilarity (distance). 
We can do the same again but compare\n", "country_d to country_a:" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "data": { "text/plain": "58340.14674572803" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "distance(country_a, country_d, metric=\"euclidean\")" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Now we can compare the result of the distance computation and we find that country_a is\n", "closer to country_b than to country_d (27014.7 < 58340.1).\n", "\n", "We can further confirm this result by looking at the graph above: the orange line\n", "(country_b) is closer to the blue line (country_a) than the red line (country_d) is.\n", "\n", "### Different metric parameters\n", "\n", "Above we used the metric \"euclidean\". While Euclidean distance is appropriate for simple\n", "examples such as the one above, it has been shown to be inadequate when we have larger\n", "and more complex time series (particularly multivariate). While the merits of each\n", "different distance won't be described here (see documentation for descriptions of each),\n", "a large number of specialised time series distances have been implemented to get better\n", "accuracy in distance computation. These are:\n", "\n", "'euclidean', 'squared', 'dtw', 'ddtw', 'wdtw', 'wddtw', 'lcss', 'edr', 'erp'\n", "\n", "All of the above can be used as a metric parameter. 
This will now be demonstrated:" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Euclidean distance: 58340.14674572803\n", "Squared euclidean distance: 3403572722.3130813\n", "Dynamic time warping distance: 3403572722.3130813\n", "Derivative dynamic time warping distance: 175072.58701887555\n", "Weighted dynamic time warping distance: 1701786361.1565406\n", "Weighted derivative dynamic time warping distance: 87536.29350943778\n", "Longest common subsequence distance: 1.0\n", "Edit distance for real sequences distance: 1.0\n", "Edit distance for real penalty distance: 411654.25899999996\n" ] } ], "source": [ "# Printed label for each metric name, in the order the results are shown.\n", "metric_labels = {\n", "    \"euclidean\": \"Euclidean distance: \",\n", "    \"squared\": \"Squared euclidean distance: \",\n", "    \"dtw\": \"Dynamic time warping distance: \",\n", "    \"ddtw\": \"Derivative dynamic time warping distance: \",\n", "    \"wdtw\": \"Weighted dynamic time warping distance: \",\n", "    \"wddtw\": \"Weighted derivative dynamic time warping distance: \",\n", "    \"lcss\": \"Longest common subsequence distance: \",\n", "    \"edr\": \"Edit distance for real sequences distance: \",\n", "    \"erp\": \"Edit distance for real penalty distance: \",\n", "}\n", "\n", "# The same pair of series is compared under every metric.\n", "for metric, label in metric_labels.items():\n", "    print(label, distance(country_a, country_d, metric=metric))" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "While many of the above use euclidean distance at their core, they change how it is\n", "used to account for various problems we encounter with 
time series data such as:\n", "alignment, phase, shape, dimensions etc. As mentioned, for specific details on how to\n", "best use each distance and what it does, see the documentation for that distance.\n", "\n", "### Custom parameters for distances\n", "\n", "In addition, each distance has a different set of parameters. How these are passed to\n", "the 'distance' function will now be outlined using the 'dtw' example. As stated, for\n", "the specific parameters of each distance please refer to the documentation.\n", "\n", "DTW is an $O(n^2)$ algorithm, and as such a point of focus has been trying to optimise the\n", "algorithm. A proposal to improve performance is to restrict the potential alignment\n", "path by putting a 'bound' on values to consider when looking for an alignment. While\n", "there have been many bounding algorithms proposed, the most popular is known as\n", "Sakoe-Chiba's bounding window.\n", "\n", "First, here is a bounding matrix that considers all indexes in x and y:" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 3, "outputs": [ { "data": { "text/plain": "array([[ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True],\n [ True, True, True, True, True]])" }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from aeon.distances import create_bounding_matrix\n", "\n", "first_ts_size = 10\n", "second_ts_size = 5\n", "\n", "create_bounding_matrix(first_ts_size, second_ts_size)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "The above shows a matrix that maps each index in 'x' to each index in 'y'. 
Each value\n", "that is considered in the computation is set to True (in this instance we want a full\n", "bounding matrix so all values are set to True). As such, if we were to run DTW with this\n", "full bounding matrix, all of these indexes would be considered.\n", "\n", "However, it is sometimes (depending on the dataset) not necessary (and sometimes\n", "detrimental to the result) to consider all indexes. We can use a bounding technique\n", "like Sakoe-Chiba's to limit the potential warping paths. This is done by setting a window\n", "size that will restrict the indexes that are considered. Below we create a bounding\n", "matrix again, but only considering 20% of the indexes in x and y:" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 5, "outputs": [ { "data": { "text/plain": "array([[ True, True, True, True, True],\n [ True, True, True, True, True],\n [False, True, True, True, True],\n [False, False, True, True, True],\n [False, False, False, True, True],\n [False, False, False, False, False],\n [False, False, False, False, False],\n [False, False, False, False, False],\n [False, False, False, False, False],\n [False, False, False, False, False]])" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "create_bounding_matrix(first_ts_size, second_ts_size, window=0.2)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "This bounding matrix produces a diagonal shape over the matrix. This restricts the\n", "warping and therefore greatly increases the speed at which the distance is computed, as\n", "far fewer potential warping paths are considered." ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "With that basic introduction to bounding algorithms and why we may want to use them,\n", "how do we use one in our distance computation? 
One way is to pass a `window` argument to `distance`:" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dynamic time warping distance with Sakoe-Chiba: 312.6023735352965\n" ] } ], "source": [ "# Create two random unaligned time series to better illustrate the difference\n", "\n", "rng = np.random.RandomState(42)\n", "n_timestamps, n_features = 10, 19\n", "x = rng.randn(n_timestamps, n_features)\n", "y = rng.randn(n_timestamps, n_features)\n", "\n", "# Restrict the warping path with a Sakoe-Chiba window by passing `window`; 0.2 is the\n", "# same 20% window used for the bounding matrix above (see documentation for details):\n", "print(\n", "    \"Dynamic time warping distance with Sakoe-Chiba: \",\n", "    distance(x, y, metric=\"dtw\", window=0.2),\n", ")" ], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }