{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "time_series_anomaly_detection_with_pca.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyPiLAjS/qzwEfo/w4/Y3WMy",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "code",
"metadata": {
"id": "8Gsf6OuJiQtt"
},
"source": [
"#!pip install netdata_pandas"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Hw5a7pZ1iaar",
"outputId": "cf6a4f94-e1be-497b-a62e-6160075bfbbe"
},
"source": [
"import pandas as pd \n",
"import numpy as np\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from scipy.spatial.distance import cdist\n",
"\n",
"from netdata_pandas.data import get_data\n",
"\n",
"\n",
"def anomaly_scores(pca, X):\n",
"    \"\"\"Score each row of X against the components of a fitted pca model.\n",
"\n",
"    For every sample the euclidean distance to each vector in `pca.components_` is computed,\n",
"    divided by the matching entry of `pca.explained_variance_ratio_`, and the weighted\n",
"    distances are summed across components.\n",
"\n",
"    Returns a flat array holding one anomaly score per row of X.\n",
"    \"\"\"\n",
"    # (n_samples, n_components) matrix of sample-to-component distances\n",
"    component_distances = cdist(X, pca.components_)\n",
"    # broadcasting divides each column by the explained variance ratio of its component\n",
"    weighted_distances = component_distances / pca.explained_variance_ratio_\n",
"    return weighted_distances.sum(axis=1).ravel()\n",
"\n",
"\n",
"def preprocess_df(df, lags_n, diffs_n, smooth_n, diffs_abs=False, abs_features=True):\n",
"    \"\"\"Preprocess a pandas dataframe into feature vectors by differencing, smoothing and lagging.\n",
"\n",
"    Steps run in this order: differences, rolling mean, lags, column sort, absolute values.\n",
"    Rows left with NaN values after a step are dropped, so the result can have fewer rows\n",
"    than the input.\n",
"\n",
"    Args:\n",
"        df: pandas dataframe with one column per dimension.\n",
"        lags_n: number of lags to add per column. When >= 1 every column is expanded into\n",
"            `{col}_lag{n}` columns for n in 0..lags_n.\n",
"        diffs_n: periods passed to `DataFrame.diff`. Differencing is skipped when < 1.\n",
"        smooth_n: window of the rolling mean. Smoothing is skipped when < 2.\n",
"        diffs_abs: if truthy, take the absolute value of the differences.\n",
"        abs_features: if truthy, take the absolute value of the final features.\n",
"\n",
"    Returns:\n",
"        The preprocessed dataframe with columns sorted by name.\n",
"    \"\"\"\n",
"    if diffs_n >= 1:\n",
"        # take differences\n",
"        df = df.diff(diffs_n).dropna()\n",
"        # abs diffs if defined\n",
"        if diffs_abs:\n",
"            df = df.abs()\n",
"    if smooth_n >= 2:\n",
"        # apply a rolling average to smooth out the data a bit\n",
"        df = df.rolling(smooth_n).mean().dropna()\n",
"    if lags_n >= 1:\n",
"        # for each dimension add a new column for each of lags_n lags of the differenced and smoothed values\n",
"        # (name order must match the concat order below: all columns at lag 0, then lag 1, ...)\n",
"        df_columns_new = [f'{col}_lag{n}' for n in range(lags_n + 1) for col in df.columns]\n",
"        df = pd.concat([df.shift(n) for n in range(lags_n + 1)], axis=1).dropna()\n",
"        df.columns = df_columns_new\n",
"    # sort columns to have lagged values next to each other for clarity when looking at the feature vectors\n",
"    df = df.reindex(sorted(df.columns), axis=1)\n",
"\n",
"    # abs all features if specified\n",
"    if abs_features:\n",
"        df = df.abs()\n",
"\n",
"    return df"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/trio/_core/_multierror.py:464: RuntimeWarning: IPython detected, but you already have a custom exception handler installed. I'll skip installing Trio's custom handler, but this means MultiErrors will not show full tracebacks.\n",
" category=RuntimeWarning,\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uyk2s3C1iads"
},
"source": [
"# inputs \n",
"host = 'london.my-netdata.io' # pull from 'london' netdata demo host\n",
"after = -3600 # last 60 minutes\n",
"before = 0 # starting from now\n",
"dims = ['system.cpu|system'] # let's just look at system cpu data\n",
"\n",
"# params\n",
"n_train = 3000 # use the last 50 minutes of data to train on\n",
"diffs_n = 1 # take differences\n",
"lags_n = 3 # include 3 lags in the feature vector\n",
"smooth_n = 3 # smooth the latest values to be included in the feature vector"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 252
},
"id": "fBnkg8Fziaf5",
"outputId": "c99c2d57-627b-4af0-e0fc-6989bfb8ea9b"
},
"source": [
"# pull the charts that hold the requested dimensions (the chart is the part of each dim before '|')\n",
"# and keep only the dimensions of interest\n",
"df = get_data(\n",
"    hosts=[host],\n",
"    charts=list({dim.split('|')[0] for dim in dims}),\n",
"    after=after,\n",
"    before=before,\n",
"    index_as_datetime=True,\n",
")[dims]\n",
"\n",
"# peek at the raw data\n",
"print(df.shape)\n",
"display(df.head())"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(3600, 1)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/html": [
"
| \n", " | system.cpu|system | \n", "
|---|---|
| time_idx | \n", "\n", " |
| 2021-10-11 14:55:56 | \n", "0.501253 | \n", "
| 2021-10-11 14:55:57 | \n", "0.503778 | \n", "
| 2021-10-11 14:55:58 | \n", "1.256281 | \n", "
| 2021-10-11 14:55:59 | \n", "0.502513 | \n", "
| 2021-10-11 14:56:00 | \n", "0.503778 | \n", "