## Normal Distribution and 3 Sigma Rule



## Anomaly/Outlier
If a test point $x$ lies more than $3\sigma$ away from the mean $\mu$ (i.e. $|x - \mu| > 3\sigma$), it can be classified as an anomaly.

## Is there an anomaly?


In [2]:
import numpy as np

In [3]:
# Nine small readings followed by one extreme value (486)
data = np.array([2, 3, 4, 2, 3, 2, 2, 2, 3, 486])

In [4]:
# Mean and standard deviation of the full sample (ndarray.std defaults to ddof=0)
m = data.mean()
s = data.std()
m, s

(50.9, 145.03478893010464)

In [5]:
def anomalyDetector(data, test_point):
    """Apply the plain 3-sigma rule to a single point.

    The mean and standard deviation are taken over all of ``data``
    (``data.mean()`` / ``data.std()``), so any extreme values contained in
    ``data`` contribute to the threshold themselves.

    Returns True when ``test_point`` is more than three standard deviations
    away from the mean, otherwise False.
    """
    center = data.mean()
    spread = data.std()
    distance = np.abs(test_point - center)
    return distance > 3 * spread

In [6]:
# 486 itself is part of `data`, so it inflates both the mean and the std used for the threshold
anomalyDetector(data, test_point = 486)

False

In [7]:
# Approximate upper 3-sigma bound using the rounded mean/std printed above.
# With the unrounded std (145.0347...) the bound is slightly above 486,
# which is why the detector did not flag 486.
50.9 + 3 * 145

485.9

## A better way of doing anomaly detection
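 - Remove the top 5% of data points (values above the 95th percentile)
 - Remove the bottom 5% of data points (values below the 5th percentile)
 - Compute $\mu$ and $\sigma$ on the remaining points and apply the $3\sigma$ rule with those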


In [9]:
import pandas as pd
# Single-column DataFrame view of the sample (the column is labelled 0)
df = pd.DataFrame(data)
df

Unnamed: 0,0
0,2
1,3
2,4
3,2
4,3
5,2
6,2
7,2
8,3
9,486


In [10]:
df.describe()

Unnamed: 0,0
count,10.0
mean,50.9
std,152.880091
min,2.0
25%,2.0
50%,2.5
75%,3.0
max,486.0


In [19]:
# Select column 0 first so quantile() returns plain scalars; calling float() on the
# one-element Series produced by DataFrame.quantile() is deprecated in recent pandas.
qmin, qmax = float(df[0].quantile(.05)), float(df[0].quantile(.95))
qmin, qmax

(2.0, 269.0999999999995)

In [21]:
# Keep only the points lying inside the [5%, 95%] quantile band
data[np.logical_and(qmin <= data, data <= qmax)]

array([2, 3, 4, 2, 3, 2, 2, 2, 3])

In [22]:
def anomalyDetector(data, test_point):
    """Apply the 3-sigma rule using statistics of the trimmed sample.

    Points below the 5% quantile or above the 95% quantile of ``data`` are
    dropped before the mean and standard deviation are computed, so extreme
    values no longer inflate the threshold.

    Parameters
    ----------
    data : array-like of numbers
        Reference sample. It is converted to a flat NumPy array, so plain
        lists are accepted and the standard deviation is the population
        one (``ndarray.std`` default, ddof=0).
    test_point : number
        Value to test.

    Returns
    -------
    bool-like
        True when ``test_point`` is more than three standard deviations away
        from the mean of the trimmed sample.
    """
    values = np.asarray(data).reshape(-1)

    # np.quantile yields the two cut-offs directly as scalars, which avoids
    # converting a one-element pandas Series with float() (deprecated).
    qmin, qmax = np.quantile(values, [0.05, 0.95])

    # Remove the top 5% and the bottom 5% of data points
    trimmed = values[(values >= qmin) & (values <= qmax)]

    m, s = trimmed.mean(), trimmed.std()
    return np.abs(test_point - m) > 3 * s

In [23]:
data

array([ 2, 3, 4, 2, 3, 2, 2, 2, 3, 486])

In [24]:
# After trimming, the mean and std come from the nine ordinary points only, so 486 is flagged
anomalyDetector(data, test_point = 486)

True

In [25]:
df

Unnamed: 0,0
0,2
1,3
2,4
3,2
4,3
5,2
6,2
7,2
8,3
9,486


In [67]:
def anomalyDetector(data, test_point = None):
    """Detect anomalies with the 3-sigma rule on a trimmed sample.

    The mean and standard deviation are computed after dropping every point
    below the 5% quantile or above the 95% quantile of ``data``.

    Parameters
    ----------
    data : array-like of numbers
        Reference sample. It is converted to a flat NumPy array, so the
        standard deviation is the population one (``ndarray.std`` default).
    test_point : number, optional
        When given (including ``0``), only this value is tested.
        When omitted, every element of ``data`` is tested.

    Returns
    -------
    bool-like or numpy.ndarray of bool
        For a ``test_point``: True if it is more than three standard
        deviations away from the trimmed mean. Otherwise a 1-D boolean mask
        with one entry per element of ``data`` (True marks an anomaly).
    """
    values = np.asarray(data).reshape(-1)

    # np.quantile yields the two cut-offs directly as scalars, which avoids
    # converting a one-element pandas Series with float() (deprecated).
    qmin, qmax = np.quantile(values, [0.05, 0.95])

    # Remove the top 5% and the bottom 5% of data points
    trimmed = values[(values >= qmin) & (values <= qmax)]
    m, s = trimmed.mean(), trimmed.std()

    # Compare against None explicitly: a test point of 0 is a legitimate
    # value and must not fall through to the "test everything" branch.
    if test_point is not None:
        return np.abs(test_point - m) > 3 * s

    # Vectorised check over the original (untrimmed) points
    return np.abs(values - m) > 3 * s

In [69]:
# Without a test_point the detector returns one boolean per element of `data`
idx = anomalyDetector(data)
idx

array([False, False, False, False, False, False, False, False, False,
 True])

In [70]:
# Boolean-mask indexing keeps only the points flagged as anomalies
data[idx]

array([486])

In [75]:
# Caveat: the trimmed sample has a very small spread, so even a moderate value such as 5 is flagged
anomalyDetector(data, test_point = 5)

True