{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ArbitraryOutlierCapper\n",
"The ArbitraryOutlierCapper() caps the maximum or minimum values of a variable\n",
"at an arbitrary value indicated by the user.\n",
"\n",
"The user must provide the maximum or minimum values that will be used\n",
"to cap each variable in a dictionary {feature : capping_value}."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# importing libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from feature_engine.outliers import ArbitraryOutlierCapper"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load titanic dataset from OpenML\n",
"\n",
"def load_titanic():\n",
"    \"\"\"Download the Titanic dataset from OpenML and apply basic cleaning.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The dataset with '?' placeholders turned into NaN, 'cabin' reduced\n",
"        to its first character, 'pclass' cast to object, missing 'embarked'\n",
"        filled with 'C', 'fare' and 'age' cast to float with missing values\n",
"        filled by the column median, and 'name' and 'ticket' dropped.\n",
"    \"\"\"\n",
"    data = pd.read_csv(\n",
"        'https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n",
"    data = data.replace('?', np.nan)\n",
"    # astype(str) turns NaN into 'nan', so missing cabins end up as 'n'\n",
"    data['cabin'] = data['cabin'].astype(str).str[0]\n",
"    data['pclass'] = data['pclass'].astype('O')\n",
"    # Assign the filled columns back: calling fillna(..., inplace=True) on a\n",
"    # column selection acts on a temporary object under pandas Copy-on-Write\n",
"    # and would leave `data` unchanged.\n",
"    data['embarked'] = data['embarked'].fillna('C')\n",
"    data['fare'] = data['fare'].astype('float')\n",
"    data['fare'] = data['fare'].fillna(data['fare'].median())\n",
"    data['age'] = data['age'].astype('float')\n",
"    data['age'] = data['age'].fillna(data['age'].median())\n",
"    data = data.drop(['name', 'ticket'], axis=1)\n",
"    return data\n",
"\n",
"# To plot histogram of given numerical feature\n",
"\n",
"\n",
"def plot_hist(data, col):\n",
"    \"\"\"Show a 30-bin histogram of column `col` of `data`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : object indexable by `col` (e.g. a DataFrame)\n",
"    col : str\n",
"        Name of the column to plot; also used in the figure title.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        The figure is displayed through plt.show().\n",
"    \"\"\"\n",
"    plt.figure(figsize=(8, 5))\n",
"    plt.hist(data[col], bins=30)\n",
"    plt.title(\"Distribution of \" + col)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | pclass | \n", "survived | \n", "sex | \n", "age | \n", "sibsp | \n", "parch | \n", "fare | \n", "cabin | \n", "embarked | \n", "boat | \n", "body | \n", "home.dest | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
214 | \n", "1 | \n", "1 | \n", "female | \n", "23.0 | \n", "1 | \n", "0 | \n", "113.275 | \n", "D | \n", "C | \n", "6 | \n", "NaN | \n", "Lexington, MA | \n", "
651 | \n", "3 | \n", "0 | \n", "male | \n", "30.0 | \n", "0 | \n", "0 | \n", "7.225 | \n", "n | \n", "C | \n", "NaN | \n", "NaN | \n", "Ottawa, ON | \n", "
930 | \n", "3 | \n", "0 | \n", "male | \n", "28.0 | \n", "1 | \n", "0 | \n", "7.750 | \n", "n | \n", "Q | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
992 | \n", "3 | \n", "0 | \n", "female | \n", "30.5 | \n", "0 | \n", "0 | \n", "7.750 | \n", "n | \n", "Q | \n", "NaN | \n", "61 | \n", "NaN | \n", "
718 | \n", "3 | \n", "0 | \n", "male | \n", "20.0 | \n", "0 | \n", "0 | \n", "7.050 | \n", "n | \n", "S | \n", "NaN | \n", "NaN | \n", "Portugal | \n", "