{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Normal Distribution and 3 Sigma Rule\n", "![normal.png](normal.png)\n", "\n", "\n", "## Anomaly/Outlier\n", "If a test_point is $3\\sigma$ away from the mean $\\mu$, it can be classified as an anomaly\n", "\n", "## Is there an anomaly?\n", "![war.png](war.png)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data = np.array([2, 3, 4,2,3,2,2,2,3,486])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50.9, 145.03478893010464)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m , s = data.mean(), data.std()\n", "m , s" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def anomalyDetector(data, test_point):\n", " m , s = data.mean(), data.std()\n", " return np.abs(test_point - m) > 3 * s" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "anomalyDetector(data, test_point = 486)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "485.9" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "50.9 + 3 * 145" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A better way of doing anomaly detection\n", " - Remove the max %5 of data points\n", " - Remove the min %5 of data points\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
02
13
24
32
43
52
62
72
83
9486
\n", "
" ], "text/plain": [ " 0\n", "0 2\n", "1 3\n", "2 4\n", "3 2\n", "4 3\n", "5 2\n", "6 2\n", "7 2\n", "8 3\n", "9 486" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "df = pd.DataFrame(data)\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
count10.000000
mean50.900000
std152.880091
min2.000000
25%2.000000
50%2.500000
75%3.000000
max486.000000
\n", "
" ], "text/plain": [ " 0\n", "count 10.000000\n", "mean 50.900000\n", "std 152.880091\n", "min 2.000000\n", "25% 2.000000\n", "50% 2.500000\n", "75% 3.000000\n", "max 486.000000" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2.0, 269.0999999999995)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qmin, qmax = float(df.quantile(.05)), float(df.quantile(.95))\n", "qmin, qmax" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2, 3, 4, 2, 3, 2, 2, 2, 3])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[(data >= qmin) & (data <= qmax)]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def anomalyDetector(data, test_point):\n", " # Remove the max %5 of data points\n", " # Remove the min %5 of data points\n", " df = pd.DataFrame(data)\n", " qmin, qmax = float(df.quantile(.05)), float(df.quantile(.95))\n", " data = data[(data >= qmin) & (data <= qmax)]\n", " \n", " m , s = data.mean(), data.std()\n", " return np.abs(test_point - m) > 3 * s" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 2, 3, 4, 2, 3, 2, 2, 2, 3, 486])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "anomalyDetector(data, test_point = 486)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
02
13
24
32
43
52
62
72
83
9486
\n", "
" ], "text/plain": [ " 0\n", "0 2\n", "1 3\n", "2 4\n", "3 2\n", "4 3\n", "5 2\n", "6 2\n", "7 2\n", "8 3\n", "9 486" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "def anomalyDetector(data, test_point = None):\n", " df = pd.DataFrame(data)\n", " qmin, qmax = float(df.quantile(.05)), float(df.quantile(.95))\n", " \n", " # Remove the max %5 and min %5 of data points\n", " data = data[(data >= qmin) & (data <= qmax)]\n", " m , s = data.mean(), data.std()\n", " \n", " if test_point:\n", " return np.abs(test_point - m) > 3 * s\n", " else:\n", " anomalies = df.apply(lambda x: np.abs(x - m) > 3 * s)\n", " idx = anomalies.values.reshape(-1)\n", " return idx" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, False, False, False, False, False,\n", " True])" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "idx = anomalyDetector(data)\n", "idx" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([486])" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[idx]" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "anomalyDetector(data, test_point = 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }