{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Explainable Outlier Detection in the Titanic dataset\n", "\n", "This short notebook illustrates basic usage of the [OutlierTree](https://github.com/david-cortes/outliertree) library for explainable outlier detection using the Titanic dataset. For more details, you can check the package's documentation [here](http://outliertree.readthedocs.io/en/latest/).\n", "\n", "The dataset is very popular and can be downloaded from different sources, such as Kaggle or many university webpages. This notebook took it from the following link: http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv\n", "** *\n", "\n", "### Loading the raw data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | pclass | \n", "survived | \n", "name | \n", "sex | \n", "age | \n", "sibsp | \n", "parch | \n", "ticket | \n", "fare | \n", "cabin | \n", "embarked | \n", "boat | \n", "body | \n", "home.dest | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "1 | \n", "Allen, Miss. Elisabeth Walton | \n", "female | \n", "29.00 | \n", "0 | \n", "0 | \n", "24160 | \n", "211.3375 | \n", "B5 | \n", "S | \n", "2 | \n", "NaN | \n", "St Louis, MO | \n", "
1 | \n", "1 | \n", "1 | \n", "Allison, Master. Hudson Trevor | \n", "male | \n", "0.92 | \n", "1 | \n", "2 | \n", "113781 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "11 | \n", "NaN | \n", "Montreal, PQ / Chesterville, ON | \n", "
2 | \n", "1 | \n", "0 | \n", "Allison, Miss. Helen Loraine | \n", "female | \n", "2.00 | \n", "1 | \n", "2 | \n", "113781 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "NaN | \n", "NaN | \n", "Montreal, PQ / Chesterville, ON | \n", "
3 | \n", "1 | \n", "0 | \n", "Allison, Mr. Hudson Joshua Creighton | \n", "male | \n", "30.00 | \n", "1 | \n", "2 | \n", "113781 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "NaN | \n", "135.0 | \n", "Montreal, PQ / Chesterville, ON | \n", "
4 | \n", "1 | \n", "0 | \n", "Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | \n", "female | \n", "25.00 | \n", "1 | \n", "2 | \n", "113781 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "NaN | \n", "NaN | \n", "Montreal, PQ / Chesterville, ON | \n", "
\n", " | Pclass | \n", "Survived | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "Boat | \n", "Body | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Yes | \n", "Female | \n", "29.00 | \n", "0 | \n", "0 | \n", "211.3375 | \n", "B5 | \n", "S | \n", "2 | \n", "NaN | \n", "
1 | \n", "1 | \n", "Yes | \n", "Male | \n", "0.92 | \n", "1 | \n", "2 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "11 | \n", "NaN | \n", "
2 | \n", "1 | \n", "No | \n", "Female | \n", "2.00 | \n", "1 | \n", "2 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "NaN | \n", "NaN | \n", "
3 | \n", "1 | \n", "No | \n", "Male | \n", "30.00 | \n", "1 | \n", "2 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "NaN | \n", "135.0 | \n", "
4 | \n", "1 | \n", "No | \n", "Female | \n", "25.00 | \n", "1 | \n", "2 | \n", "151.5500 | \n", "C22 C26 | \n", "S | \n", "NaN | \n", "NaN | \n", "
\n", " | Pclass | \n", "Survived | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "Boat | \n", "Body | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
1146 | \n", "3 | \n", "No | \n", "Female | \n", "39.0 | \n", "0 | \n", "5 | \n", "29.125 | \n", "NaN | \n", "Q | \n", "NaN | \n", "327.0 | \n", "
1163 | \n", "3 | \n", "No | \n", "Male | \n", "NaN | \n", "0 | \n", "0 | \n", "24.150 | \n", "NaN | \n", "Q | \n", "NaN | \n", "NaN | \n", "
\n", " | suspicious_value | \n", "group_statistics | \n", "conditions | \n", "tree_depth | \n", "uses_NA_branch | \n", "outlier_score | \n", "
---|---|---|---|---|---|---|
1146 | \n", "{'column': 'Fare', 'value': 29.125, 'decimals'... | \n", "{'upper_thr': 15.5, 'pct_below': 0.97849462365... | \n", "[{'column': 'Embarked', 'comparison': '=', 'va... | \n", "4.0 | \n", "False | \n", "0.003805 | \n", "
1163 | \n", "{'column': 'Fare', 'value': 24.15, 'decimals': 0} | \n", "{'upper_thr': 15.5, 'pct_below': 0.97849462365... | \n", "[{'column': 'Embarked', 'comparison': '=', 'va... | \n", "4.0 | \n", "False | \n", "0.005227 | \n", "
\n", " | suspicious_value | \n", "group_statistics | \n", "conditions | \n", "tree_depth | \n", "uses_NA_branch | \n", "outlier_score | \n", "
---|---|---|---|---|---|---|
18 | \n", "{'column': 'Age', 'value': 32.0, 'decimals': 0} | \n", "{'lower_thr': 43.0, 'pct_above': 0.96, 'mean':... | \n", "[{'column': 'Cabin', 'comparison': 'in', 'valu... | \n", "3.0 | \n", "False | \n", "0.007545 | \n", "
170 | \n", "{'column': 'Fare', 'value': 0.0, 'decimals': 0} | \n", "{'lower_thr': 25.7417, 'pct_above': 0.98571428... | \n", "[{'column': 'Boat', 'comparison': 'in', 'value... | \n", "2.0 | \n", "False | \n", "0.015339 | \n", "
896 | \n", "{'column': 'Fare', 'value': 0.0, 'decimals': 0} | \n", "{'lower_thr': 3.1708, 'pct_above': 0.992156862... | \n", "[{'column': 'Pclass', 'comparison': '=', 'valu... | \n", "3.0 | \n", "False | \n", "0.011148 | \n", "
898 | \n", "{'column': 'Fare', 'value': 0.0, 'decimals': 0} | \n", "{'lower_thr': 3.1708, 'pct_above': 0.992156862... | \n", "[{'column': 'Pclass', 'comparison': '=', 'valu... | \n", "3.0 | \n", "False | \n", "0.011148 | \n", "
963 | \n", "{'column': 'Fare', 'value': 0.0, 'decimals': 0} | \n", "{'lower_thr': 3.1708, 'pct_above': 0.992156862... | \n", "[{'column': 'Pclass', 'comparison': '=', 'valu... | \n", "3.0 | \n", "False | \n", "0.011148 | \n", "
1044 | \n", "{'column': 'Fare', 'value': 15.5, 'decimals': 0} | \n", "{'upper_thr': 8.5167, 'pct_below': 0.967741935... | \n", "[{'column': 'Boat', 'comparison': 'in', 'value... | \n", "4.0 | \n", "False | \n", "0.002018 | \n", "
1146 | \n", "{'column': 'Fare', 'value': 29.125, 'decimals'... | \n", "{'upper_thr': 15.5, 'pct_below': 0.97849462365... | \n", "[{'column': 'Embarked', 'comparison': '=', 'va... | \n", "4.0 | \n", "False | \n", "0.003805 | \n", "
1163 | \n", "{'column': 'Fare', 'value': 24.15, 'decimals': 0} | \n", "{'upper_thr': 15.5, 'pct_below': 0.97849462365... | \n", "[{'column': 'Embarked', 'comparison': '=', 'va... | \n", "4.0 | \n", "False | \n", "0.005227 | \n", "
1254 | \n", "{'column': 'Fare', 'value': 0.0, 'decimals': 0} | \n", "{'lower_thr': 3.1708, 'pct_above': 0.992156862... | \n", "[{'column': 'Pclass', 'comparison': '=', 'valu... | \n", "3.0 | \n", "False | \n", "0.011148 | \n", "