{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Normal Distribution and 3 Sigma Rule\n", "\n", "\n", "\n", "## Anomaly/Outlier\n", "If a test_point is $3\\sigma$ away from the mean $\\mu$, it can be classified as an anomaly\n", "\n", "## Is there an anomaly?\n", "" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data = np.array([2, 3, 4,2,3,2,2,2,3,486])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50.9, 145.03478893010464)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m , s = data.mean(), data.std()\n", "m , s" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def anomalyDetector(data, test_point):\n", " m , s = data.mean(), data.std()\n", " return np.abs(test_point - m) > 3 * s" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "anomalyDetector(data, test_point = 486)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "485.9" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "50.9 + 3 * 145" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A better way of doing anomaly detection\n", " - Remove the max %5 of data points\n", " - Remove the min %5 of data points\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>486</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0\n", "0 2\n", "1 3\n", "2 4\n", "3 2\n", "4 3\n", "5 2\n", "6 2\n", "7 2\n", "8 3\n", "9 486" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "df = pd.DataFrame(data)\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>10.000000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>50.900000</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>152.880091</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>2.000000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>2.000000</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>2.500000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>3.000000</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>486.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0\n", "count 10.000000\n", "mean 50.900000\n", "std 152.880091\n", "min 2.000000\n", "25% 2.000000\n", "50% 2.500000\n", "75% 3.000000\n", "max 486.000000" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2.0, 269.0999999999995)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qmin, qmax = float(df.quantile(.05)), float(df.quantile(.95))\n", "qmin, qmax" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2, 3, 4, 2, 3, 2, 2, 2, 3])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[(data >= qmin) & (data <= qmax)]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def anomalyDetector(data, test_point):\n", " # Remove the max %5 of data points\n", " # Remove the min %5 of data points\n", " df = pd.DataFrame(data)\n", " qmin, qmax = float(df.quantile(.05)), float(df.quantile(.95))\n", " data = data[(data >= qmin) & (data <= qmax)]\n", " \n", " m , s = data.mean(), data.std()\n", " return np.abs(test_point - m) > 3 * s" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 2, 3, 4, 2, 3, 2, 2, 2, 3, 486])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "anomalyDetector(data, test_point = 486)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>486</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0\n", "0 2\n", "1 3\n", "2 4\n", "3 2\n", "4 3\n", "5 2\n", "6 2\n", "7 2\n", "8 3\n", "9 486" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "def anomalyDetector(data, test_point = None):\n", " df = pd.DataFrame(data)\n", " qmin, qmax = float(df.quantile(.05)), float(df.quantile(.95))\n", " \n", " # Remove the max %5 and min %5 of data points\n", " data = data[(data >= qmin) & (data <= qmax)]\n", " m , s = data.mean(), data.std()\n", " \n", " if test_point:\n", " return np.abs(test_point - m) > 3 * s\n", " else:\n", " anomalies = df.apply(lambda x: np.abs(x - m) > 3 * s)\n", " idx = anomalies.values.reshape(-1)\n", " return idx" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, False, False, False, False, False,\n", " True])" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "idx = anomalyDetector(data)\n", "idx" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([486])" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[idx]" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "anomalyDetector(data, test_point = 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }