{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "time_series_anomaly_detection_with_pca.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyPiLAjS/qzwEfo/w4/Y3WMy", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "" ] }, { "cell_type": "code", "metadata": { "id": "8Gsf6OuJiQtt" }, "source": [ "#!pip install netdata_pandas" ], "execution_count": 1, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Hw5a7pZ1iaar", "outputId": "cf6a4f94-e1be-497b-a62e-6160075bfbbe" }, "source": [ "import pandas as pd \n", "import numpy as np\n", "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "from scipy.spatial.distance import cdist\n", "\n", "from netdata_pandas.data import get_data\n", "\n", "\n", "def anomaly_scores(pca, X):\n", " \"\"\"Given a fitted pca model and some X feature vectors, compute an anomaly score as the sum of weighted euclidean distance between each sample to the\n", " hyperplane constructed by the selected eigenvectors. \n", " \"\"\"\n", " return np.sum(cdist(X, pca.components_) / pca.explained_variance_ratio_, axis=1).ravel()\n", "\n", "\n", "def preprocess_df(df, lags_n, diffs_n, smooth_n, diffs_abs=False, abs_features=True):\n", " \"\"\"Given a pandas dataframe preprocess it to take differences, add smoothing, and lags as specified. 
\n", " \"\"\"\n", " if diffs_n >= 1:\n", " # take differences\n", " df = df.diff(diffs_n).dropna()\n", " # abs diffs if defined\n", " if diffs_abs == True:\n", " df = abs(df)\n", " if smooth_n >= 2:\n", " # apply a rolling average to smooth out the data a bit\n", " df = df.rolling(smooth_n).mean().dropna()\n", " if lags_n >= 1:\n", " # for each dimension add a new columns for each of lags_n lags of the differenced and smoothed values for that dimension\n", " df_columns_new = [f'{col}_lag{n}' for n in range(lags_n+1) for col in df.columns]\n", " df = pd.concat([df.shift(n) for n in range(lags_n + 1)], axis=1).dropna()\n", " df.columns = df_columns_new\n", " # sort columns to have lagged values next to each other for clarity when looking at the feature vectors\n", " df = df.reindex(sorted(df.columns), axis=1)\n", "\n", " # abs all features if specified\n", " if abs_features == True:\n", " df = abs(df)\n", " \n", " return df" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/trio/_core/_multierror.py:464: RuntimeWarning: IPython detected, but you already have a custom exception handler installed. 
I'll skip installing Trio's custom handler, but this means MultiErrors will not show full tracebacks.\n", " category=RuntimeWarning,\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "uyk2s3C1iads" }, "source": [ "# inputs \n", "host = 'london.my-netdata.io' # pull from 'london' netdata demo host\n", "after = -3600 # last 60 minutes\n", "before = 0 # starting from now\n", "dims = ['system.cpu|system'] # let's just look at system cpu data\n", "\n", "# params\n", "n_train = 3000 # use the last 50 minutes of data to train on\n", "diffs_n = 1 # take differences\n", "lags_n = 3 # include 3 lags in the feature vector\n", "smooth_n = 3 # smooth the latest values to be included in the feature vector" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 252 }, "id": "fBnkg8Fziaf5", "outputId": "c99c2d57-627b-4af0-e0fc-6989bfb8ea9b" }, "source": [ "# get raw data\n", "df = get_data(\n", " hosts=[host], \n", " charts=list(set([d.split('|')[0] for d in dims])), \n", " after=after, \n", " before=before, \n", " index_as_datetime=True\n", ")\n", "df = df[dims]\n", "\n", "# look at raw data\n", "print(df.shape)\n", "display(df.head())" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(3600, 1)\n" ] }, { "output_type": "display_data", "data": { "text/html": [ "
\n", " | system.cpu|system | \n", "
---|---|
time_idx | \n", "\n", " |
2021-10-11 14:55:56 | \n", "0.501253 | \n", "
2021-10-11 14:55:57 | \n", "0.503778 | \n", "
2021-10-11 14:55:58 | \n", "1.256281 | \n", "
2021-10-11 14:55:59 | \n", "0.502513 | \n", "
2021-10-11 14:56:00 | \n", "0.503778 | \n", "