# Load titanic dataset from OpenML

TITANIC_URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'


def load_titanic(source=TITANIC_URL):
    """Read the Titanic passengers table and apply basic clean-up.

    Parameters
    ----------
    source : str, path or file-like, default ``TITANIC_URL``
        Passed straight to ``pd.read_csv``. The default is the OpenML CSV
        export; pass a local path or buffer to work offline.

    Returns
    -------
    pandas.DataFrame
        The table with:

        - ``'?'`` placeholders replaced by NaN,
        - ``cabin`` reduced to its first character (``'n'`` when missing),
        - ``pclass`` cast to object,
        - missing ``embarked`` filled with ``'C'``,
        - ``fare`` and ``age`` cast to float with missing values filled
          with the column median,
        - the ``name`` and ``ticket`` columns dropped.
    """
    data = pd.read_csv(source).replace('?', np.nan)

    # Missing cabins are labelled 'n'. The original got this implicitly from
    # astype(str) turning NaN into the string 'nan'; newer pandas string
    # dtypes keep NaN through astype(str), so the label is set explicitly.
    data['cabin'] = data['cabin'].fillna('n').astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')

    # Assign the filled column back rather than calling
    # data[col].fillna(..., inplace=True): that form is chained assignment
    # on a column selection and does not update `data` under Copy-on-Write.
    data['embarked'] = data['embarked'].fillna('C')
    for col in ('fare', 'age'):
        as_float = data[col].astype('float')
        data[col] = as_float.fillna(as_float.median())

    return data.drop(columns=['name', 'ticket'])


# To plot histogram of given numerical feature


def plot_hist(data, col):
    """Show a 30-bin histogram of ``data[col]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the variable to plot.
    col : str
        Name of the column to plot; also used in the title and x-axis label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(data[col], bins=30)
    ax.set_title("Distribution of " + col)
    ax.set_xlabel(col)
    ax.set_ylabel("Number of observations")
    plt.show()
\n", " | pclass | \n", "survived | \n", "sex | \n", "age | \n", "sibsp | \n", "parch | \n", "fare | \n", "cabin | \n", "embarked | \n", "boat | \n", "body | \n", "home.dest | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
675 | \n", "3 | \n", "0 | \n", "male | \n", "21.0 | \n", "0 | \n", "0 | \n", "7.775 | \n", "n | \n", "S | \n", "NaN | \n", "NaN | \n", "Brennes, Norway New York | \n", "
558 | \n", "2 | \n", "1 | \n", "female | \n", "18.0 | \n", "0 | \n", "2 | \n", "13.000 | \n", "n | \n", "S | \n", "16 | \n", "NaN | \n", "Finland / Minneapolis, MN | \n", "
194 | \n", "1 | \n", "0 | \n", "male | \n", "30.0 | \n", "0 | \n", "0 | \n", "26.000 | \n", "C | \n", "S | \n", "NaN | \n", "NaN | \n", "Brockton, MA | \n", "
217 | \n", "1 | \n", "0 | \n", "male | \n", "64.0 | \n", "0 | \n", "0 | \n", "26.000 | \n", "n | \n", "S | \n", "NaN | \n", "263 | \n", "Isle of Wight, England | \n", "
473 | \n", "2 | \n", "0 | \n", "male | \n", "28.0 | \n", "0 | \n", "0 | \n", "0.000 | \n", "n | \n", "S | \n", "NaN | \n", "NaN | \n", "Belfast | \n", "