{ "cells": [ { "cell_type": "markdown", "id": "checked-orange", "metadata": {}, "source": [ "# OutlierTrimmer\n", "The OutlierTrimmer() removes observations with outliers from the dataset.\n", "\n", "It works only with numerical variables. A list of variables can be indicated.\n", "Alternatively, the OutlierTrimmer() will select all numerical variables.\n", "\n", "The OutlierTrimmer() first calculates the maximum and/or minimum values\n", "beyond which a value will be considered an outlier, and thus removed.\n", "\n", "Limits are determined using:\n", "\n", "- a Gaussian approximation\n", "- the inter-quartile range (IQR) proximity rule\n", "- percentiles.\n", "\n", "### Example:" ] },
{ "cell_type": "code", "execution_count": 1, "id": "original-pasta", "metadata": {}, "outputs": [], "source": [ "# importing libraries\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "from feature_engine.outliers import OutlierTrimmer" ] },
{ "cell_type": "code", "execution_count": 2, "id": "planned-programmer", "metadata": {}, "outputs": [], "source": [
"# Helper functions used throughout the notebook\n",
"\n",
"\n",
"def load_titanic():\n",
"    \"\"\"Download the Titanic dataset from OpenML and apply basic cleaning.\n",
"\n",
"    Cleaning steps:\n",
"    - '?' placeholders are replaced by NaN;\n",
"    - 'cabin' is reduced to its first character (missing cabins become 'n');\n",
"    - 'pclass' is cast to object;\n",
"    - missing 'embarked' values are filled with 'C';\n",
"    - 'fare' and 'age' are cast to float and their missing values are\n",
"      filled with the column median;\n",
"    - the 'name' and 'ticket' columns are dropped.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The cleaned Titanic data.\n",
"    \"\"\"\n",
"    data = pd.read_csv(\n",
"        'https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n",
"    data = data.replace('?', np.nan)\n",
"    data['cabin'] = data['cabin'].astype(str).str[0]\n",
"    data['pclass'] = data['pclass'].astype('O')\n",
"\n",
"    # Assign the filled column back rather than calling\n",
"    # data[col].fillna(..., inplace=True): with pandas Copy-on-Write the\n",
"    # chained in-place call updates a temporary and leaves `data` unchanged.\n",
"    data['embarked'] = data['embarked'].fillna('C')\n",
"    data['fare'] = data['fare'].astype('float')\n",
"    data['fare'] = data['fare'].fillna(data['fare'].median())\n",
"    data['age'] = data['age'].astype('float')\n",
"    data['age'] = data['age'].fillna(data['age'].median())\n",
"\n",
"    data = data.drop(['name', 'ticket'], axis=1)\n",
"    return data\n",
"\n",
"\n",
"def plot_hist(data, col):\n",
"    \"\"\"Show a 30-bin histogram of the numerical column `col` of `data`.\"\"\"\n",
"    plt.figure(figsize=(8, 5))\n",
"    plt.hist(data[col], bins=30)\n",
"    plt.title(\"Distribution of \" + col)\n",
"    return plt.show()" ] },
{ "cell_type": "code", "execution_count": 3, "id": "objective-professor", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivedsexagesibspparchfarecabinembarkedboatbodyhome.dest
67530male21.0007.775nSNaNNaNBrennes, Norway New York
55821female18.00213.000nS16NaNFinland / Minneapolis, MN
19410male30.00026.000CSNaNNaNBrockton, MA
21710male64.00026.000nSNaN263Isle of Wight, England
47320male28.0000.000nSNaNNaNBelfast
\n", "
" ], "text/plain": [ " pclass survived sex age sibsp parch fare cabin embarked boat \\\n", "675 3 0 male 21.0 0 0 7.775 n S NaN \n", "558 2 1 female 18.0 0 2 13.000 n S 16 \n", "194 1 0 male 30.0 0 0 26.000 C S NaN \n", "217 1 0 male 64.0 0 0 26.000 n S NaN \n", "473 2 0 male 28.0 0 0 0.000 n S NaN \n", "\n", " body home.dest \n", "675 NaN Brennes, Norway New York \n", "558 NaN Finland / Minneapolis, MN \n", "194 NaN Brockton, MA \n", "217 263 Isle of Wight, England \n", "473 NaN Belfast " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Loading titanic dataset\n", "data = load_titanic()\n", "data.sample(5)" ] }, { "cell_type": "code", "execution_count": 4, "id": "nervous-interference", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train data shape before removing outliers: (916, 11)\n", "test data shape before removing outliers: (393, 11)\n" ] } ], "source": [ "# let's separate into training and testing set\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(data.drop('survived', axis=1),\n", " data['survived'],\n", " test_size=0.3,\n", " random_state=0)\n", "\n", "print(\"train data shape before removing outliers:\", X_train.shape)\n", "print(\"test data shape before removing outliers:\", X_test.shape)" ] }, { "cell_type": "code", "execution_count": 5, "id": "medium-chile", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max age: 80.0\n", "Max fare: 512.3292\n", "Min age: 0.1667\n", "Min fare: 0.0\n" ] } ], "source": [ "# let's find out the maximum Age and maximum Fare in the titanic\n", "\n", "print(\"Max age:\", data.age.max())\n", "print(\"Max fare:\", data.fare.max())\n", "\n", "print(\"Min age:\", data.age.min())\n", "print(\"Min fare:\", data.fare.min())" ] }, { "cell_type": "code", "execution_count": 6, "id": "suburban-mills", "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZM0lEQVR4nO3de7SldX3f8fdHhoAC4SKndJwBBpVqiKsOdkRc2gRBI+AFs1ZicHmZWLImbTHRhCSCGsUuabGNktomtEQQvCHESyBIjIhkGdsKDgjIReIEBplxYAaQmxgi+O0f+zeyHWY49zm/vef9Wmuv8+zfc/v+ztn7fM7ze579nFQVkiSpD09Z6AIkSdLjDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrM0B5L8ryR/PEfbOiDJQ0l2as//LslvzcW22/b+JsnKudreNPb7gSR3J7lze+9bGiXxc8zSk0uyFtgPeBR4DLgJ+DhwVlX9ZAbb+q2q+so01vk74JNV9dHp7Kuteyrw7Kp603TXnUtJDgBuAQ6sqo0LWYvUO4+Ypal5TVXtARwInA68Ezh7rneSZNFcb7MTBwD3GMrS5AxmaRqq6v6quhj4DWBlkucBJDk3yQfa9L5JLklyX5J7k/x9kqck+QSDgPrrNlT9R0mWJakkJyT5HvDVobbhkH5WkquSPJDkoiT7tH0dkWTdcI1J1iZ5eZKjgXcBv9H2d12b/9Oh8VbXe5LcnmRjko8n2bPN21zHyiTfa8PQ797W9ybJnm39TW1772nbfzlwGfCMVse5W1l37/Y925TkB2166dD8g5J8LcmDSb6S5M+SfHJo/uFJ/m/7nl+X5Igp/kil7hjM0gxU1VXAOuDfbmX2SW3eBIMh8HcNVqk3A99jcPS9e1X916F1fhn4BeCV29jlW4B/ByxmMKT+kSnU+CXgPwMXtP09fyuL/WZ7vAx4JrA78D+3WOalwHOAo4D3JvmFbezyfwB7tu38cqv5rW3Y/hjg+62O39zKuk8BPsZgROIA4Edb1PFp4Crg6cCpwJs3z0iyBPgi8AFgH+APgM8lmdhGnVLXDGZp5r7PIAi29GMGAXpgVf24qv6+Jr+Y49Sq+mFV/Wgb8z9RVTdU1Q+BPwZev/nisFl6I/Dhqrq1qh4CTgGO3+Jo/f1V9aOqug64DnhCwLdajgdOqaoHq2ot8CGGAvTJVNU9VfW5qnq4qh4ETmMQ7pvPT78QeG9V/XNVfR24eGj1NwGXVtWlVfWTqroMWA0cO51vhNQLg1mauSXAvVtp/2/AGuDLSW5NcvIUtnXHNObfDuwM7DulKp/cM9r2hre9iMGR/mbDV1E/zOCoekv7tpq23NaSqRSR5GlJ/ncbAn8A+BqwVwv8ZwD3VtXDQ6sMfz8OBH69DWPfl+Q+Bkf5i6eyb6k3BrM0A0leyCB0vr7lvHbEeFJVPRN4LfD7SY7aPHsbm5zsiHr/oekDGByV3w38EHjaUF07MRhCn+p2v88g2Ia3/Shw1yTrbenuVtOW21o/xfVPYjBc/qKq+nngl1p7gA3APkmeNrT88PfjDgYjCnsNPXarqtOn2QepCwazNA1Jfj7Jq4HPMPgI07e3ssyrkzw7SYD7GXzEavPHqu5icA52ut6U5JAWTv8J+GxVPQb8A7Brklcl2Rl4D7DL0Hp3AcuSbOu9fj7we+3iqt15/Jz0o9MprtVyIXBakj2SHAj8PvDJJ1/zp/ZgcF75vnZh2/uGtn07g6HpU5P8XJIXA68ZWveTwGuSvDLJTkl2bRfFLUUaQQazNDV/neRBBkdn7wY+DLx1G8seDHwFeAj4f8CfV9UVbd5/Ad7Thlz/YBr7/wRwLoNh5V2B34XBVeLAfwQ+yuDo9IcMLjzb7C/b13uSXLOV7Z7Ttv014Dbgn4DfmUZdw36n7f9WBiMJn27bn4o/BZ7K4Mj7G8CXtpj/RuDFwD0MLvK6AHgEoKruAI5jcJHdJgY/oz/E328aUd5gRNL
ISXIB8J2qet+kC0sjxr8oJXUvyQuTPKt9LvpoBkfIf7XAZUnzYlzvMiRpvPxL4PMMPse8DvgPVfWthS1Jmh8OZUuS1BGHsiVJ6ojBLElSR7o4x7zvvvvWsmXLFroMSZK2m6uvvvruqnrCPd27COZly5axevXqhS5DkqTtJsntW2t3KFuSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSNd3Ctb2p6WnfzFaa+z9vRXzUMlkvREHjFLktSRSYM5ya5JrkpyXZIbk7y/tZ+b5LYk17bH8taeJB9JsibJ9UleMM99kCRpbExlKPsR4MiqeijJzsDXk/xNm/eHVfXZLZY/Bji4PV4EnNm+SpKkSUx6xFwDD7WnO7dHPckqxwEfb+t9A9gryeLZlypJ0vib0jnmJDsluRbYCFxWVVe2Wae14eozkuzS2pYAdwytvq61SZKkSUwpmKvqsapaDiwFDkvyPOAU4LnAC4F9gHdOZ8dJViVZnWT1pk2bple1JEljalpXZVfVfcAVwNFVtaENVz8CfAw4rC22Hth/aLWlrW3LbZ1VVSuqasXExMSMipckadxM5arsiSR7temnAq8AvrP5vHGSAK8DbmirXAy8pV2dfThwf1VtmIfaJUkaO1O5KnsxcF6SnRgE+YVVdUmSryaZAAJcC/z7tvylwLHAGuBh4K1zXrUkSWNq0mCuquuBQ7fSfuQ2li/gxNmXJknSjsc7f0mS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1ZNJgTrJrkquSXJfkxiTvb+0HJbkyyZokFyT5uda+S3u+ps1fNs99kCRpbEzliPkR4Miqej6wHDg6yeHAB4EzqurZwA+AE9ryJwA/aO1ntOUkSdIUTBrMNfBQe7pzexRwJPDZ1n4e8Lo2fVx7Tpt/VJLMVcGSJI2zKZ1jTrJTkmuBjcBlwD8C91XVo22RdcCSNr0EuAOgzb8fePoc1ixJ0tiaUjBX1WNVtRxYChwGPHe2O06yKsnqJKs3bdo0281JkjQWpnVVdlXdB1wBvBjYK8miNmspsL5Nrwf2B2jz9wTu2cq2zqqqFVW1YmJiYmbVS5I0ZqZyVfZEkr3a9FOBVwA3MwjoX2uLrQQuatMXt+e0+V+tqprDmiVJGluLJl+ExcB5SXZiEOQXVtUlSW4CPpPkA8C3gLPb8mcDn0iyBrgXOH4e6pYkaSxNGsxVdT1w6Fbab2VwvnnL9n8Cfn1OqpMkaQfjnb8kSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOjJpMCfZP8kVSW5KcmOSt7f2U5OsT3Jtexw7tM4pSdYkuSXJK+ezA5IkjZNFU1jmUeCkqromyR7A1Ukua/POqKo/GV44ySHA8cAvAs8AvpLkX1XVY3NZuCRJ42jSI+aq2lBV17TpB4GbgSVPsspxwGeq6pGqug1YAxw2F8VKkjTupnWOOcky4FDgytb0tiTXJzknyd6tbQlwx9Bq63jyIJckSc2UgznJ7sDngHdU1QPAmcCzgOXABuBD09lxklVJVidZvWnTpumsKknS2JpSMCfZmUEof6qqPg9QVXdV1WNV9RPgL3h8uHo9sP/Q6ktb28+oqrOqakVVrZiYmJhNHyRJGhtTuSo7wNnAzVX14aH2xUOL/SpwQ5u
+GDg+yS5JDgIOBq6au5IlSRpfU7kq+yXAm4FvJ7m2tb0LeEOS5UABa4HfBqiqG5NcCNzE4IruE70iW5KkqZk0mKvq60C2MuvSJ1nnNOC0WdQlSdIOyTt/SZLUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHVk0mBOsn+SK5LclOTGJG9v7fskuSzJd9vXvVt7knwkyZok1yd5wXx3QpKkcTGVI+ZHgZOq6hDgcODEJIcAJwOXV9XBwOXtOcAxwMHtsQo4c86rliRpTE0azFW1oaquadMPAjcDS4DjgPPaYucBr2vTxwEfr4FvAHslWTzXhUuSNI6mdY45yTLgUOBKYL+q2tBm3Qns16aXAHcMrbautUmSpElMOZiT7A58DnhHVT0wPK+qCqjp7DjJqiSrk6zetGnTdFaVJGlsTSmYk+zMIJQ/VVWfb813bR6ibl83tvb1wP5Dqy9tbT+jqs6qqhVVtWJiYmKm9UuSNFamclV2gLOBm6vqw0OzLgZWtumVwEVD7W9pV2cfDtw/NOQtSZKexKIpLPMS4M3At5Nc29reBZwOXJjkBOB24PVt3qXAscAa4GHgrXNZsCRJ42zSYK6qrwPZxuyjtrJ8ASfOsi5JknZI3vlLkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHZk0mJOck2RjkhuG2k5Nsj7Jte1x7NC8U5KsSXJLklfOV+GSJI2jqRwxnwscvZX2M6pqeXtcCpDkEOB44BfbOn+eZKe5KlaSpHE3aTBX1deAe6e4veOAz1TVI1V1G7AGOGwW9UmStEOZzTnmtyW5vg11793algB3DC2zrrU9QZJVSVYnWb1p06ZZlCFJ0viYaTCfCTwLWA5sAD403Q1U1VlVtaKqVkxMTMywDEmSxsuMgrmq7qqqx6rqJ8Bf8Phw9Xpg/6FFl7Y2SZI0BTMK5iSLh57+KrD5iu2LgeOT7JLkIOBg4KrZlShJ0o5j0WQLJDkfOALYN8k64H3AEUmWAwWsBX4boKpuTHIhcBPwKHBiVT02L5VLkjSGJg3mqnrDVprPfpLlTwNOm01RkiTtqLzzlyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdWbTQBUgAy07+4rTXWXv6q+ahEklaWB4xS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1JFJgznJOUk2JrlhqG2fJJcl+W77undrT5KPJFmT5PokL5jP4iVJGjdTOWI+Fzh6i7aTgcur6mDg8vYc4Bjg4PZYBZw5N2VKkrRjmDSYq+prwL1bNB8HnNemzwNeN9T+8Rr4BrBXksVzVKskSWNvpueY96uqDW36TmC/Nr0EuGNouXWt7QmSrEqyOsnqTZs2zbAMSZLGy6wv/qqqAmoG651VVSuqasXExMRsy5AkaSzMNJjv2jxE3b5ubO3rgf2Hllva2iRJ0hTMNJgvBla26ZXARUPtb2lXZx8O3D805C1JkiaxaLIFkpwPHAHsm2Qd8D7gdODCJCcAtwOvb4tfChwLrAEeBt46DzVLkjS2Jg3mqnrDNmYdtZVlCzhxtkVpdC07+YsLXYIkjTTv/CV
JUkcMZkmSOjLpULakmQ/Rrz39VXNciaRx5xGzJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjixa6AGmmlp38xYUuQZLmnMEsdWamf3CsPf1Vc1yJpIXgULYkSR3xiFnb5FCxJG1/HjFLktQRg1mSpI4YzJIkdcRgliSpI7O6+CvJWuBB4DHg0apakWQf4AJgGbAWeH1V/WB2ZUqStGOYi6uyX1ZVdw89Pxm4vKpOT3Jye/7OOdiPpCcxk6vo/eyz1J/5GMo+DjivTZ8HvG4e9iFJ0liabTAX8OUkVydZ1dr2q6oNbfpOYL9Z7kOSpB3GbIeyX1pV65P8C+CyJN8ZnllVlaS2tmIL8lUABxxwwCzLkCRpPMzqiLmq1revG4EvAIcBdyVZDNC+btzGumdV1YqqWjExMTGbMiRJGhszDuYkuyXZY/M08CvADcDFwMq22ErgotkWKUnSjmI2Q9n7AV9Isnk7n66qLyX5JnBhkhOA24HXz75MSZJ2DDMO5qq6FXj+VtrvAY6aTVGSJO2o/O9S0jzyP3RJmi5vySlJUkc8YpY0bd5lTJo/HjFLktQRg1mSpI44lL0D8AIk9WCmr0OHwLWjGctg9heAJGlUOZQtSVJHxvKIWdLUeJpD6o9HzJIkdcRgliSpIwazJEkdMZglSeqIF39J6poff9SOxmCWpMY/AtQDg3mIb0pJ0kLzHLMkSR0xmCVJ6ohD2ZLGknc106gymOeA/zRe0nR5TYu2xaFsSZI64hHzAnGYTdJMOEI3/jxiliSpIx4xS9IsOQKmueQRsyRJHTGYJUnqiMEsSVJH5u0cc5Kjgf8O7AR8tKpOn699SZL64OezZ29egjnJTsCfAa8A1gHfTHJxVd00H/uTJG2bYTla5mso+zBgTVXdWlX/DHwGOG6e9iVJ0tiYr6HsJcAdQ8/XAS+ap31JkubB9vwYWO8fOdueowcL9jnmJKuAVe3pQ0lumcPN7wvcPYfbWyjj0g+wL72yL30al76MSz/IB+elLwdurXG+gnk9sP/Q86Wt7aeq6izgrPnYeZLVVbViPra9PY1LP8C+9Mq+9Glc+jIu/YDt25f5Osf8TeDgJAcl+TngeODiedqXJEljY16OmKvq0SRvA/6WwcelzqmqG+djX5IkjZN5O8dcVZcCl87X9icxL0PkC2Bc+gH2pVf2pU/j0pdx6Qdsx76kqrbXviRJ0iS8JackSR0Zq2BOcnSSW5KsSXLyQtczHUnOSbIxyQ1DbfskuSzJd9vXvReyxqlKsn+SK5LclOTGJG9v7SPXnyS7JrkqyXWtL+9v7QclubK91i5oFzl2L8lOSb6V5JL2fFT7sTbJt5Ncm2R1axu51xdAkr2SfDbJd5LcnOTFo9iXJM9pP4/NjweSvGMU+wKQ5Pfae/6GJOe33wXb5f0yNsE8dBvQY4BDgDckOWRhq5qWc4Gjt2g7Gbi8qg4GLm/PR8GjwElVdQhwOHBi+1mMYn8eAY6squcDy4GjkxwOfBA4o6qeDfwAOGHhSpyWtwM3Dz0f1X4AvKyqlg99hGUUX18w+J8CX6qq5wLPZ/DzGbm+VNUt7eexHPg3wMPAFxjBviRZAvwusKKqnsfgIubj2V7vl6oaiwfwYuBvh56fApyy0HVNsw/LgBuGnt8CLG7Ti4FbFrrGGfbrIgb3TR/p/gBPA65hcBe7u4FFrf1nXnu9PhjcT+By4EjgEiCj2I9W61pg3y3aRu71BewJ3Ea73meU+7JF/b8C/J9R7QuP371yHwYXSV8CvHJ7vV/G5oiZrd8GdMkC1TJX9quqDW36TmC/hSxmJpIsAw4FrmRE+9OGf68FNgKXAf8I3FdVj7ZFRuW19qfAHwE/ac+fzmj2A6CALye5ut1FEEbz9XUQsAn4WDvF8NEkuzGafRl2PHB+mx65vlTVeuB
PgO8BG4D7gavZTu+XcQrmsVaDP9FG6hL6JLsDnwPeUVUPDM8bpf5U1WM1GJ5byuAftDx3YSuaviSvBjZW1dULXcsceWlVvYDBqasTk/zS8MwRen0tAl4AnFlVhwI/ZIuh3hHqCwDtvOtrgb/cct6o9KWdBz+OwR9OzwB244mnGufNOAXzpLcBHUF3JVkM0L5uXOB6pizJzgxC+VNV9fnWPLL9Aaiq+4ArGAxh7ZVk830ARuG19hLgtUnWMvhvb0cyOLc5av0AfnpEQ1VtZHAe8zBG8/W1DlhXVVe2559lENSj2JfNjgGuqaq72vNR7MvLgduqalNV/Rj4PIP30HZ5v4xTMI/jbUAvBla26ZUMztV2L0mAs4Gbq+rDQ7NGrj9JJpLs1aafyuBc+c0MAvrX2mLd96WqTqmqpVW1jMF746tV9UZGrB8ASXZLssfmaQbnM29gBF9fVXUncEeS57Smo4CbGMG+DHkDjw9jw2j25XvA4Ume1n6fbf65bJ/3y0KfZJ/jE/bHAv/A4Bzguxe6nmnWfj6Dcxk/ZvBX9AkMzgFeDnwX+Aqwz0LXOcW+vJTBcNX1wLXtcewo9gf418C3Wl9uAN7b2p8JXAWsYTBkt8tC1zqNPh0BXDKq/Wg1X9ceN25+r4/i66vVvRxY3V5jfwXsPcJ92Q24B9hzqG1U+/J+4Dvtff8JYJft9X7xzl+SJHVknIayJUkaeQazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXk/wOn1so1cVz37wAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of age feature before capping outliers\n", "plot_hist(data, 'age')" ] }, { "cell_type": "code", "execution_count": 7, "id": "compatible-finish", "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYCElEQVR4nO3dfbTdVZ3f8fdHAjLjA+HhmsUkgeCQ6jBdFWlq49KpSkbLgzX84TC4HIk0s9LVUqvVWU58GKtdtsV2KSN9oGWJNfiMzFgySh1jwOXMWgMaFJAHlSsFkxhIRAgCoyP67R9nXz3EhHvuzb3J5uT9Wuuss39779/vt88ml09++/e7J6kqJElSH55ysAcgSZJ+yWCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLcyDJ/0zyJ3N0rBOSPJzksLb95SR/OBfHbsf7v0nWzNXxZnDe9yb5QZJ799H+L5Pc1z77sQd6fFIv4u8xS08syd3AIuAx4GfA7cAVwGVV9fNZHOsPq+pLM9jny8DHqupDMzlX2/fdwMlV9Qcz3XcuJTkB+DZwYlXt3Ev74cBDwMqquvlAj0/qiVfM0mj+WVU9AzgRuAj4Y+DyuT5JkgVzfcxOnADcv7dQbhYBRwK3zebgYzxvOgQZzNIMVNXuqtoI/D6wJsnfB0jykSTvbeXjknwuyYNJfpjkr5I8JclHGQTUX7Tl2rcmWZakkqxN8j3g2qG64bD5zSRfTfJQkquTHNPO9dIk24bHmOTuJL+b5Azg7cDvt/Pd3Np/sTTexvXOJPck2ZnkiiRHtbapcaxJ8r22DP2Ofc1NkqPa/rva8d7Zjv+7wCbgN9o4PrLHfn+PwdU0wINJrm31H0yytX3mG5P8ztA+705yVZKPJXkIeH07/+VJdiTZ3pbODxv9v67UB4NZmoWq+iqwDfidvTS/pbVNMLgSfPtgl3od8D0GV99Pr6r/PLTPS4DfAv7pPk55PvDPgeMZLKlfMsIYvwD8R+DT7XzP20u317fXy4BnA08H/tsefV4MPAdYBbwryW/t45T/FTiqHeclbcwXtGX7M4Hvt3G8fo9xfgf47ba5sKpOb+WvAacCxwCfAD6T5MihXVcDVwELgY8DH2EwNycDzwdeAczZvXnpQDGYpdn7PoPQ2NNPGQToiVX106r6q5r+YY53V9UjVfW3+2j/aFXdWlWPAH8CnDtHV4OvBT5QVXdV1cPA24Dz9rhaf09V/W2793sz8CsB38ZyHvC2qvpRVd0NvB943WwHVlUfq6r7q+qxqno/8FQGf0GY8jdV9X/aff5nAmcBb2rzuBO4uI1JelIxmKXZWwz8cC/1/wWYBL6Y5K4k60c41tYZtN8DHA4cN9Ion9hvtOMNH3sBgyv9KcNPUT/K4Kp6T8e1Me15rMWzHViSP0pyR5LdSR5kcDU+/JmH5+TEdv4d7RbCg8D/Ap412/NLB4vBLM1Ckn/EIHT+es+2dsX4lqp6NvAq4M1JVk017+OQ011RLx0qn8DgqvwHwCPArw+N6zAGS+ijHvf7DEJt+NiPAfdNs9+eftDGtOexts/wOAC0+8lvBc4Fjq6qhcBuIEPdhj/bVuAnwHFVtbC9nllVv430JGMwSzOQ5JlJXgl8isGvMH1zL31emeTkJGEQJj8Dpn6t6j4G92Bn6g+SnJLk14F/D1xVVT8D
vgMcmeTs9itH72Sw5DvlPmBZkn39rH8S+LdJTkrydH55T/qxmQyujeVK4D8keUaSE4E3Ax+byXGGPIPBXxB2AQuSvIvBcvW+zr8D+CLw/vbf6ClJfjPJS2Z5fumgMZil0fxFkh8xuDJ7B/AB4IJ99F0OfAl4GPgb4H9U1XWt7T8B72zLrX80g/N/lMHDTfcy+LWifwODp8SBfwV8iMHV6SMMHjyb8pn2fn+Sr+/luB9ux/4K8P+AHwNvmMG4hr2hnf8uBisJn2jHn42/BL7A4C8e97RxTbfcfz5wBIPfM3+AwYNhx8/y/NJB4xeMSJLUEa+YJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjnTxL7Icd9xxtWzZsoM9DEmSDogbb7zxB1U1sbe2LoJ52bJlbNmy5WAPQ5KkAyLJPftqcylbkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjXXxX9lxbtv7zs9rv7ovOnuORSJI0M14xS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqyLTBnOQ5SW4aej2U5E1JjkmyKcmd7f3o1j9JLkkymeSWJKfN/8eQJGk8TBvMVfXtqjq1qk4F/iHwKPBZYD2wuaqWA5vbNsCZwPL2WgdcOg/jliRpLM10KXsV8N2qugdYDWxo9RuAc1p5NXBFDVwPLExy/FwMVpKkcTfTYD4P+GQrL6qqHa18L7ColRcDW4f22dbqJEnSNEYO5iRHAK8CPrNnW1UVUDM5cZJ1SbYk2bJr166Z7CpJ0tiayRXzmcDXq+q+tn3f1BJ1e9/Z6rcDS4f2W9LqHqeqLquqFVW1YmJiYuYjlyRpDM0kmF/DL5exATYCa1p5DXD1UP357enslcDuoSVvSZL0BEb616WSPA14OfAvhqovAq5Msha4Bzi31V8DnAVMMniC+4I5G60kSWNupGCuqkeAY/eou5/BU9p79i3gwjkZnSRJhxi/+UuSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIyMFc5KFSa5K8q0kdyR5YZJjkmxKcmd7P7r1TZJLkkwmuSXJafP7ESRJGh+jXjF/EPhCVT0XeB5wB7Ae2FxVy4HNbRvgTGB5e60DLp3TEUuSNMamDeYkRwH/BLgcoKr+rqoeBFYDG1q3DcA5rbwauKIGrgcWJjl+jsctSdJYGuWK+SRgF/C/k3wjyYeSPA1YVFU7Wp97gUWtvBjYOrT/tlYnSZKmMUowLwBOAy6tqucDj/DLZWsAqqqAmsmJk6xLsiXJll27ds1kV0mSxtYowbwN2FZVN7TtqxgE9X1TS9TtfWdr3w4sHdp/Sat7nKq6rKpWVNWKiYmJ2Y5fkqSxMm0wV9W9wNYkz2lVq4DbgY3Amla3Bri6lTcC57ens1cCu4eWvCVJ0hNYMGK/NwAfT3IEcBdwAYNQvzLJWuAe4NzW9xrgLGASeLT1lSRJIxgpmKvqJmDFXppW7aVvARfu37AkSTo0+c1fkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjIwVzkruTfDPJTUm2tLpjkmxKcmd7P7rVJ8klSSaT3JLktPn8AJIkjZOZXDG/rKpOraoVbXs9sLmqlgOb2zbAmcDy9loHXDpXg5Ukadztz1L2amBDK28Azhmqv6IG
rgcWJjl+P84jSdIhY9RgLuCLSW5Msq7VLaqqHa18L7ColRcDW4f23dbqJEnSNBaM2O/FVbU9ybOATUm+NdxYVZWkZnLiFvDrAE444YSZ7CpJ0tga6Yq5qra3953AZ4EXAPdNLVG3952t+3Zg6dDuS1rdnse8rKpWVNWKiYmJ2X8CSZLGyLTBnORpSZ4xVQZeAdwKbATWtG5rgKtbeSNwfns6eyWwe2jJW5IkPYFRlrIXAZ9NMtX/E1X1hSRfA65Msha4Bzi39b8GOAuYBB4FLpjzUUuSNKamDeaqugt43l7q7wdW7aW+gAvnZHSSJB1i/OYvSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI6MHMxJDkvyjSSfa9snJbkhyWSSTyc5otU/tW1PtvZl8zR2SZLGzkyumN8I3DG0/T7g4qo6GXgAWNvq1wIPtPqLWz9JkjSCkYI5yRLgbOBDbTvA6cBVrcsG4JxWXt22ae2rWn9JkjSNUa+Y/xR4K/Dztn0s8GBVPda2twGLW3kxsBWgte9u/R8nybokW5Js2bVr1+xGL0nSmJk2mJO8EthZVTfO5Ymr6rKqWlFVKyYmJuby0JIkPWktGKHPi4BXJTkLOBJ4JvBBYGGSBe2qeAmwvfXfDiwFtiVZABwF3D/nI5ckaQxNe8VcVW+rqiVVtQw4D7i2ql4LXAe8unVbA1zdyhvbNq392qqqOR21JEljan9+j/mPgTcnmWRwD/nyVn85cGyrfzOwfv+GKEnSoWOUpexfqKovA19u5buAF+ylz4+B35uDsUmSdMjxm78kSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkemDeYkRyb5apKbk9yW5D2t/qQkNySZTPLpJEe0+qe27cnWvmyeP4MkSWNjlCvmnwCnV9XzgFOBM5KsBN4HXFxVJwMPAGtb/7XAA63+4tZPkiSNYNpgroGH2+bh7VXA6cBVrX4DcE4rr27btPZVSTJXA5YkaZyNdI85yWFJbgJ2ApuA7wIPVtVjrcs2YHErLwa2ArT23cCxczhmSZLG1kjBXFU/q6pTgSXAC4Dn7u+Jk6xLsiXJll27du3v4SRJGgszeiq7qh4ErgNeCCxMsqA1LQG2t/J2YClAaz8KuH8vx7qsqlZU1YqJiYnZjV6SpDEzylPZE0kWtvKvAS8H7mAQ0K9u3dYAV7fyxrZNa7+2qmoOxyxJ0thaMH0Xjgc2JDmMQZBfWVWfS3I78Kkk7wW+AVze+l8OfDTJJPBD4Lx5GLckSWNp2mCuqluA5++l/i4G95v3rP8x8HtzMjpJkg4xfvOXJEkdMZglSeqIwSxJUkcMZkmSOjLKU9mHjGXrPz/jfe6+6Ox5GIkk6VDlFbMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1JFpgznJ0iTXJbk9yW1J3tjqj0myKcmd7f3oVp8klySZTHJLktPm+0NIkjQuRrlifgx4S1WdAqwELkxyCrAe2FxVy4HNbRvgTGB5e60DLp3zUUuSNKamDeaq2lFVX2/lHwF3AIuB1cCG1m0DcE4rrwauqIHrgYVJjp/rgUuSNI5mdI85yTLg
+cANwKKq2tGa7gUWtfJiYOvQbttanSRJmsbIwZzk6cCfAW+qqoeG26qqgJrJiZOsS7IlyZZdu3bNZFdJksbWSMGc5HAGofzxqvrzVn3f1BJ1e9/Z6rcDS4d2X9LqHqeqLquqFVW1YmJiYrbjlyRprIzyVHaAy4E7quoDQ00bgTWtvAa4eqj+/PZ09kpg99CStyRJegILRujzIuB1wDeT3NTq3g5cBFyZZC1wD3Bua7sGOAuYBB4FLpjLAUuSNM6mDeaq+msg+2hetZf+BVy4n+OSJOmQ5Dd/SZLUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHVk2mBO8uEkO5PcOlR3TJJNSe5s70e3+iS5JMlkkluSnDafg5ckadyMcsX8EeCMPerWA5urajmwuW0DnAksb691wKVzM0xJkg4N0wZzVX0F+OEe1auBDa28AThnqP6KGrgeWJjk+DkaqyRJY2+295gXVdWOVr4XWNTKi4GtQ/22tbpfkWRdki1JtuzatWuWw5Akabzs98NfVVVAzWK/y6pqRVWtmJiY2N9hSJI0FmYbzPdNLVG3952tfjuwdKjfklYnSZJGMNtg3gisaeU1wNVD9ee3p7NXAruHlrwlSdI0FkzXIckngZcCxyXZBvw74CLgyiRrgXuAc1v3a4CzgEngUeCCeRizJElja9pgrqrX7KNp1V76FnDh/g5KkqRDld/8JUlSRwxmSZI6YjBLktQRg1mSpI4YzJIkdWTap7I1P5at//wBO9fdF519wM4lSdo/XjFLktQRr5j304G88pUkjT+vmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHVlwsAeg+bds/edntd/dF53d9bkkaRwZzNqn2YbsgTrXkyHMe/9c/kVK6s+8LGUnOSPJt5NMJlk/H+eQJGkczXkwJzkM+O/AmcApwGuSnDLX55EkaRzNx1L2C4DJqroLIMmngNXA7fNwLh3CxnUZdlw/l9SL3m8xzUcwLwa2Dm1vA/7xPJxH0pAD+UyApPlz0B7+SrIOWNc2H07y7Tk8/HHAD+bweIeasZ6/vG9eDz/WczdlnubwkJi7eeLczd5IczcPf+ZP3FfDfATzdmDp0PaSVvc4VXUZcNk8nJ8kW6pqxXwc+1Dg/M2eczd7zt3sOXez1+PczcdT2V8Dlic5KckRwHnAxnk4jyRJY2fOr5ir6rEk/xr4S+Aw4MNVddtcn0eSpHE0L/eYq+oa4Jr5OPaI5mWJ/BDi/M2eczd7zt3sOXez193cpaoO9hgkSVLjP2IhSVJHxi6Y/TrQJ5bkw0l2Jrl1qO6YJJuS3Nnej271SXJJm8tbkpx28EZ+8CVZmuS6JLcnuS3JG1u98zeNJEcm+WqSm9vcvafVn5TkhjZHn24PjJLkqW17srUvO6gfoANJDkvyjSSfa9vO3YiS3J3km0luSrKl1XX7cztWwezXgY7kI8AZe9StBzZX1XJgc9uGwTwub691wKUHaIy9egx4S1WdAqwELmx/vpy/6f0EOL2qngecCpyRZCXwPuDiqjoZeABY2/qvBR5o9Re3foe6NwJ3DG07dzPzsqo6dehXo7r9uR2rYGbo60Cr6u+Aqa8DVVNVXwF+uEf1amBDK28Azhmqv6IGrgcWJjn+gAy0Q1W1o6q+3so/YvA/ycU4f9Nqc/Bw2zy8vQo4Hbiq1e85d1NzehWwKkkOzGj7k2QJcDbwobYdnLv91e3P7bgF896+DnTxQRrLk8miqtrRyvcCi1rZ+dyHtjz4fOAG
nL+RtKXYm4CdwCbgu8CDVfVY6zI8P7+Yu9a+Gzj2gA64L38KvBX4eds+FuduJgr4YpIb27dOQsc/t/57zHqcqqokPqr/BJI8Hfgz4E1V9dDwxYjzt29V9TPg1CQLgc8Czz24I3pySPJKYGdV3ZjkpQd5OE9WL66q7UmeBWxK8q3hxt5+bsftinmkrwPVr7hvaqmmve9s9c7nHpIcziCUP15Vf96qnb8ZqKoHgeuAFzJYJpy6QBien1/MXWs/Crj/wI60Gy8CXpXkbga3504HPohzN7Kq2t7edzL4S+EL6PjndtyC2a8DnZ2NwJpWXgNcPVR/fntKcSWwe2jp55DT7tNdDtxRVR8YanL+ppFkol0pk+TXgJczuEd/HfDq1m3PuZua01cD19Yh+qULVfW2qlpSVcsY/D/t2qp6Lc7dSJI8LckzpsrAK4Bb6fnntqrG6gWcBXyHwf2rdxzs8fT2Aj4J7AB+yuDeyVoG9582A3cCXwKOaX3D4Cn37wLfBFYc7PEf5Ll7MYN7VbcAN7XXWc7fSHP3D4BvtLm7FXhXq3828FVgEvgM8NRWf2Tbnmztzz7Yn6GHF/BS4HPO3Yzm7NnAze1121Qu9Pxz6zd/SZLUkXFbypYk6UnNYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjvx/csKUYAcL4/kAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of fare feature before capping outliers\n", "plot_hist(data, 'fare')" ] }, { "cell_type": "markdown", "id": "weighted-palestinian", "metadata": {}, "source": [ "### Outlier trimming using Gaussian limits:\n", "The transformer will find the maximum and / or minimum values to\n", " trim the variables using the Gaussian approximation.\n", "\n", "\n", "- right tail: mean + 3* std\n", "- left tail: mean - 3* std" ] }, { "cell_type": "code", "execution_count": 8, "id": "micro-knitting", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OutlierTrimmer(variables=['age', 'fare'])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''Parameters\n", "----------\n", "\n", "capping_method : str, default=gaussian\n", " Desired capping method. Can take 'gaussian', 'iqr' or 'quantiles'.\n", " \n", "tail : str, default=right\n", " Whether to cap outliers on the right, left or both tails of the distribution.\n", " Can take 'left', 'right' or 'both'.\n", "\n", "fold: int or float, default=3\n", " How far out to to place the capping values. 
The number that will multiply\n", " the std or IQR to calculate the capping values.\n", "\n", "variables : list, default=None\n", "\n", "missing_values: string, default='raise'\n", " Indicates if missing values should be ignored or raised.'''\n", "\n", "# removing outliers based on right tail of age and fare columns using gaussian capping method\n", "trimmer = OutlierTrimmer(\n", " capping_method='gaussian', tail='right', fold=3, variables=['age', 'fare'])\n", "\n", "# fitting trimmer object to training data\n", "trimmer.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 9, "id": "revolutionary-giant", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'age': 67.49048447470315, 'fare': 174.78162171790441}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# here we can find the maximum caps allowed\n", "trimmer.right_tail_caps_" ] }, { "cell_type": "code", "execution_count": 10, "id": "requested-paint", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# this dictionary is empty, because we selected only right tail\n", "trimmer.left_tail_caps_" ] }, { "cell_type": "code", "execution_count": 15, "id": "extreme-contribution", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max age: 66.0\n", "Max fare: 164.8667\n" ] } ], "source": [ "# transforming the training and testing data\n", "train_t = trimmer.transform(X_train)\n", "test_t = trimmer.transform(X_test)\n", "\n", "# let's check the new maximum Age and maximum Fare in the titanic\n", "print(\"Max age:\", train_t.age.max())\n", "print(\"Max fare:\", train_t.fare.max())" ] }, { "cell_type": "code", "execution_count": 12, "id": "mobile-charger", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train data shape after removing outliers: (887, 11)\n", "29 observations are removed\n", 
"\n", "test data shape after removing outliers: (376, 11)\n", "17 observations are removed\n" ] } ], "source": [ "print(\"train data shape after removing outliers:\", train_t.shape)\n", "print(f\"{X_train.shape[0] - train_t.shape[0]} observations are removed\\n\")\n", "\n", "print(\"test data shape after removing outliers:\", test_t.shape)\n", "print(f\"{X_test.shape[0] - test_t.shape[0]} observations are removed\")" ] }, { "cell_type": "markdown", "id": "duplicate-automation", "metadata": {}, "source": [ "### Gaussian approximation trimming, both tails" ] }, { "cell_type": "code", "execution_count": 16, "id": "fifteen-parker", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OutlierTrimmer(fold=2, tail='both', variables=['fare', 'age'])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Trimming the outliers at both tails using gaussian method\n", "trimmer = OutlierTrimmer(\n", " capping_method='gaussian', tail='both', fold=2, variables=['fare', 'age'])\n", "trimmer.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 17, "id": "meaningful-kinase", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum caps : {'fare': -62.30099726608475, 'age': 4.681562024142586}\n", "Maximum caps : {'fare': 127.36509792110658, 'age': 54.92869998459104}\n" ] } ], "source": [ "print(\"Minimum caps :\", trimmer.left_tail_caps_)\n", "\n", "print(\"Maximum caps :\", trimmer.right_tail_caps_)" ] }, { "cell_type": "code", "execution_count": 18, "id": "confidential-tradition", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train data shape after removing outliers: (803, 11)\n", "113 observations are removed\n", "\n", "test data shape after removing outliers: (334, 11)\n", "59 observations are removed\n" ] } ], "source": [ "# transforming the training and testing data\n", "train_t = trimmer.transform(X_train)\n", "test_t = trimmer.transform(X_test)\n", 
"\n", "print(\"train data shape after removing outliers:\", train_t.shape)\n", "print(f\"{X_train.shape[0] - train_t.shape[0]} observations are removed\\n\")\n", "\n", "print(\"test data shape after removing outliers:\", test_t.shape)\n", "print(f\"{X_test.shape[0] - test_t.shape[0]} observations are removed\")" ] }, { "cell_type": "markdown", "id": "fundamental-address", "metadata": {}, "source": [ "### Inter Quartile Range, both tails\n", "The transformer will find the boundaries using the IQR proximity rule.\n", "**IQR limits:**\n", "\n", "- right tail: 75th quantile + 3* IQR\n", "- left tail: 25th quantile - 3* IQR\n", "\n", "where IQR is the inter-quartile range: 75th quantile - 25th quantile.\n" ] }, { "cell_type": "code", "execution_count": 19, "id": "closed-knight", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OutlierTrimmer(capping_method='iqr', tail='both', variables=['age', 'fare'])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# trimming at both tails using iqr capping method\n", "trimmer = OutlierTrimmer(\n", " capping_method='iqr', tail='both', variables=['age', 'fare'])\n", "\n", "trimmer.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 20, "id": "psychological-holmes", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum caps : {'age': -13.0, 'fare': -62.24179999999999}\n", "Maximum caps : {'age': 71.0, 'fare': 101.4126}\n" ] } ], "source": [ "print(\"Minimum caps :\", trimmer.left_tail_caps_)\n", "\n", "print(\"Maximum caps :\", trimmer.right_tail_caps_)" ] }, { "cell_type": "code", "execution_count": 21, "id": "neither-enlargement", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train data shape after removing outliers: (857, 11)\n", "59 observations are removed\n", "\n", "test data shape after removing outliers: (365, 11)\n", "28 observations are removed\n" ] } ], "source": [ "# transforming the 
training and testing data\n", "train_t = trimmer.transform(X_train)\n", "test_t = trimmer.transform(X_test)\n", "\n", "print(\"train data shape after removing outliers:\", train_t.shape)\n", "print(f\"{X_train.shape[0] - train_t.shape[0]} observations are removed\\n\")\n", "\n", "print(\"test data shape after removing outliers:\", test_t.shape)\n", "print(f\"{X_test.shape[0] - test_t.shape[0]} observations are removed\")" ] }, { "cell_type": "markdown", "id": "robust-highland", "metadata": {}, "source": [ "### percentiles or quantiles:\n", "The limits are given by the percentiles.\n", "- right tail: 98th percentile\n", "- left tail: 2nd percentile" ] }, { "cell_type": "code", "execution_count": 23, "id": "egyptian-northwest", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OutlierTrimmer(capping_method='quantiles', fold=0.02, tail='both',\n", " variables=['age', 'fare'])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# trimming at both tails using quantiles capping method\n", "trimmer = OutlierTrimmer(capping_method='quantiles',\n", " tail='both', fold=0.02, variables=['age', 'fare'])\n", "\n", "trimmer.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 24, "id": "banner-logistics", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum caps : {'age': 2.0, 'fare': 6.44125}\n", "Maximum caps : {'age': 61.69999999999993, 'fare': 211.5}\n" ] } ], "source": [ "print(\"Minimum caps :\", trimmer.left_tail_caps_)\n", "\n", "print(\"Maximum caps :\", trimmer.right_tail_caps_)" ] }, { "cell_type": "code", "execution_count": 25, "id": "familiar-climate", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train data shape after removing outliers: (852, 11)\n", "64 observations are removed\n", "\n", "test data shape after removing outliers: (358, 11)\n", "35 observations are removed\n" ] } ], "source": [ "# transforming the training and 
testing data\n", "train_t = trimmer.transform(X_train)\n", "test_t = trimmer.transform(X_test)\n", "\n", "print(\"train data shape after removing outliers:\", train_t.shape)\n", "print(f\"{X_train.shape[0] - train_t.shape[0]} observations are removed\\n\")\n", "\n", "print(\"test data shape after removing outliers:\", test_t.shape)\n", "print(f\"{X_test.shape[0] - test_t.shape[0]} observations are removed\")" ] }, { "cell_type": "code", "execution_count": 26, "id": "usual-playlist", "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYTklEQVR4nO3dfbRddX3n8ffHBLUKFZA7DAIhoKktumq0VweXD0WxFfEBnTWDsHyIlE50Bjs6xXEQrVCXdOgD2HHa4qRCwSeEiihVakVKi66p0KCICFqBBkkMSQQRBAcNfOePsyOHm5vcm3vOzf3lnPdrrbPO3r/99M0vOfnc/dv77JuqQpIkteFRC12AJEl6mMEsSVJDDGZJkhpiMEuS1BCDWZKkhhjMkiQ1xGCWhiDJh5L83pD2tSTJj5Ms6ub/IclvD2Pf3f7+NsmKYe1vB477/iQ/SHLHzj62tCuJ32OWti/JGmBfYDPwIHAj8BFgVVU9NId9/XZVfWkHtvkH4GNV9eEdOVa37WnAU6rq9Tu67TAlWQJ8BzioqjYuZC1S6zxjlmbnlVW1B3AQcAbwP4Bzhn2QJIuHvc9GLAHuNJSlmRnM0g6oqh9V1aXAa4EVSZ4OkOS8JO/vpvdJ8rkkdye5K8mXkzwqyUfpBdTfdEPV70yyNEklOSHJ94C/72vrD+knJ7kmyT1JPptk7+5YhydZ219jkjVJXpLkSOAU4LXd8b7RLf/50HhX13uS3JZkY5KPJHlCt2xLHSuSfK8bhn73tvomyRO67Td1+3tPt/+XAJcDT+rqOG+abffq+mxTkh920wf0LT84yVVJ7k3ypSR/nuRjfcsPS/J/uz7/RpLDZ/lXKjXHYJbmoKquAdYCL5hm8Undsgl6Q+Cn9DapNwDfo3f2vXtV/VHfNr8O/Arw0m0c8o3AbwH70RtS/+AsavwC8AfAhd3xnjHNam/qXi8CDgF2B/5syjrPB54KHAG8N8mvbOOQ/xt4QrefX+9qPr4btn8Z8P2ujjdNs+2jgL+iNyKxBPjJlDo+AVwDPBE4DXjDlgVJ9gc+D7wf2Bt4B3Bxkolt1Ck1zWCW5u779IJgqp/RC9CDqupnVfXlmvlmjtOq6r6q+sk2ln+0qm6oqvuA3wOO2XJz2IBeB5xVVbdW1Y+BdwHHTjlb//2q+klVfQP4BrBVwHe1HAu8q6rurao1wJn0Bej2VNWdVXVxVd1fVfcCp9ML9y3Xp58NvLeqflpVXwEu7dv89cBlVXVZVT1UVZcDq4GjdqQjpFYYzNLc7Q/cNU37HwM3A19McmuSk2exr9t3YPltwG7APrOqcvue1O2vf9+L6Z3pb9F/F/X99M6qp9qnq2nqvvafTRFJHpfk/3RD4PcAVwF7doH/JOCuqrq/b5P+/jgI+I/dMPbdSe6md5a/32yOLbXGYJbmIMmz6YXOV6Yu684YT6qqQ4BXAb+b5
Igti7exy5nOqA/sm15C76z8B8B9wOP66lpEbwh9tvv9Pr1g69/3ZmDDDNtN9YOupqn7WjfL7U+iN1z+76rqF4EXdu0B1gN7J3lc3/r9/XE7vRGFPftej6+qM3bwzyA1wWCWdkCSX0zyCuCT9L7C9M1p1nlFkqckCfAjel+x2vK1qg30rsHuqNcnObQLp/cBn6qqB4F/AR6b5OVJdgPeAzymb7sNwNIk2/qsXwD8t+7mqt15+Jr05h0prqvlIuD0JHskOQj4XeBj29/y5/agd1357u7GtlP79n0bvaHp05I8OslzgVf2bfsx4JVJXppkUZLHdjfFHYC0CzKYpdn5myT30js7ezdwFnD8NtZdBnwJ+DHwT8BfVNWV3bL/CbynG3J9xw4c/6PAefSGlR8L/Ffo3SUO/Bfgw/TOTu+jd+PZFn/dvd+Z5GvT7Pfcbt9XAf8K/D/gd3agrn6/0x3/VnojCZ/o9j8bfwr8Ar0z768CX5iy/HXAc4E76d3kdSHwAEBV3Q4cTe8mu030/o7+O/7/pl2UDxiRtMtJciHw7ao6dcaVpV2MP1FKal6SZyd5cve96CPpnSF/ZoHLkubFqD5lSNJo+bfAp+l9j3kt8J+r6usLW5I0PxzKliSpIQ5lS5LUEINZkqSGNHGNeZ999qmlS5cudBmSJO0011577Q+qaqtnujcRzEuXLmX16tULXYYkSTtNktuma3coW5KkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQwxmSZIaYjBLktQQg1mSpIYYzJIkNcRgliSpIU08K1vSw5ae/Pk5bbfmjJcPuRJJC8EzZkmSGmIwS5LUEINZkqSGGMySJDXEYJYkqSEGsyRJDTGYJUlqiMEsSVJDDGZJkhpiMEuS1JAZgznJgUmuTHJjkm8leVvXvneSy5N8t3vfq2tPkg8muTnJ9UmeNd9/CEmSRsVszpg3AydV1aHAYcCJSQ4FTgauqKplwBXdPMDLgGXdayVw9tCrliRpRM0YzFW1vqq+1k3fC9wE7A8cDZzfrXY+8Opu+mjgI9XzVWDPJPsNu3BJkkbRDl1jTrIUeCZwNbBvVa3vFt0B7NtN7w/c3rfZ2q5NkiTNYNbBnGR34GLg7VV1T/+yqiqgduTASVYmWZ1k9aZNm3ZkU0mSRtasgjnJbvRC+eNV9emuecOWIerufWPXvg44sG/zA7q2R6iqVVU1WVWTExMTc61fkqSRMpu7sgOcA9xUVWf1LboUWNFNrwA+29f+xu7u7MOAH/UNeUuSpO1YPIt1nge8Afhmkuu6tlOAM4CLkpwA3AYc0y27DDgKuBm4Hzh+mAVLkjTKZgzmqvoKkG0sPmKa9Qs4ccC6JEkaSz75S5KkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQwxmSZIaYjBLktQQg1mSpIYYzJIkNcRgliSpIQazJEkNMZglSWqIwSxJUkMMZkmSGmIwS5LUEINZkqSGGMySJDXEYJYkqSEGsyRJDTGYJUlqyIzBnOTcJBuT3NDXdmGS67rXmiTXde1Lk/ykb9mH5rF2SZJGzuJZrHMe8GfAR7Y0VNVrt0wnORP4Ud/6t1TV8iHVJ0nSWJkxmKvqqiRLp1uWJMAxwIuHXJckSWNp0GvMLwA2VNV3+9oOTvL1JP+Y5AUD7l+SpLEym6Hs7TkOuKBvfj2wpKruTPJrwGeSPK2q7pm6YZKVwEqAJUuWDFiGJEmjYc5nzEkWA/8euHBLW1U9UFV3dtPXArcAvzTd9lW1qqomq2pyYmJirmVIkjRSBhnKfgnw7apau6UhyUSSRd30IcAy4NbBSpQkaXzM5utSFwD/BDw1ydokJ3SLjuWRw9gALwSu774+9SngLVV11xDrlSRppM3mruzjttH+pmnaLgYuHrwsSZLGk0/+kiSpIQazJEkNMZglSWqIwSxJUkMMZkmSGmIwS5LUEINZkqSGGMySJDXEYJYkqSEGsyRJDTGYJUlqiMEsS
VJDDGZJkhpiMEuS1BCDWZKkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQwxmSZIaYjBLktSQGYM5yblJNia5oa/ttCTrklzXvY7qW/auJDcn+U6Sl85X4ZIkjaLZnDGfBxw5TfsHqmp597oMIMmhwLHA07pt/iLJomEVK0nSqJsxmKvqKuCuWe7vaOCTVfVAVf0rcDPwnAHqkyRprAxyjfmtSa7vhrr36tr2B27vW2dt1yZJkmZhrsF8NvBkYDmwHjhzR3eQZGWS1UlWb9q0aY5lSJI0WuYUzFW1oaoerKqHgL/k4eHqdcCBfase0LVNt49VVTVZVZMTExNzKUOSpJEzp2BOsl/f7GuALXdsXwocm+QxSQ4GlgHXDFaiJEnjY/FMKyS5ADgc2CfJWuBU4PAky4EC1gBvBqiqbyW5CLgR2AycWFUPzkvlkiSNoBmDuaqOm6b5nO2sfzpw+iBFSZI0rnzylyRJDTGYJUlqiMEsSVJDDGZJkhpiMEuS1BCDWZKkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQwxmSZIaYjBLktQQg1mSpIYYzJIkNcRgliSpIQazJEkNMZglSWqIwSxJUkMMZkmSGmIwS5LUkBmDOcm5STYmuaGv7Y+TfDvJ9UkuSbJn1740yU+SXNe9PjSPtUuSNHJmc8Z8HnDklLbLgadX1a8C/wK8q2/ZLVW1vHu9ZThlSpI0HmYM5qq6CrhrStsXq2pzN/tV4IB5qE2SpLEzjGvMvwX8bd/8wUm+nuQfk7xgCPuXJGlsLB5k4yTvBjYDH++a1gNLqurOJL8GfCbJ06rqnmm2XQmsBFiyZMkgZUiSNDLmfMac5E3AK4DXVVUBVNUDVXVnN30tcAvwS9NtX1WrqmqyqiYnJibmWoYkSSNlTsGc5EjgncCrqur+vvaJJIu66UOAZcCtwyhUkqRxMONQdpILgMOBfZKsBU6ldxf2Y4DLkwB8tbsD+4XA+5L8DHgIeEtV3TXtjiVJ0lZmDOaqOm6a5nO2se7FwMWDFiVJ0rjyyV+SJDXEYJYkqSEGsyRJDTGYJUlqiMEsSVJDDGZJkhpiMEuS1BCDWZKkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQwxmSZIaYjBLktQQg1mSpIYYzJIkNcRgliSpIQazJEkNMZglSWqIwSxJUkMMZkmSGjKrYE5ybpKNSW7oa9s7yeVJvtu979W1J8kHk9yc5Pokz5qv4iVJGjWzPWM+DzhyStvJwBVVtQy4opsHeBmwrHutBM4evExJksbDrIK5qq4C7prSfDRwfjd9PvDqvvaPVM9XgT2T7DeEWiVJGnmDXGPet6rWd9N3APt20/sDt/ett7ZrkyRJMxjKzV9VVUDtyDZJViZZnWT1pk2bhlGGJEm7vEGCecOWIerufWPXvg44sG+9A7q2R6iqVVU1WVWTExMTA5QhSdLoGCSYLwVWdNMrgM/2tb+xuzv7MOBHfUPekiRpOxbPZqUkFwCHA/skWQucCpwBXJTkBOA24Jhu9cuAo4CbgfuB44dcsyRJI2tWwVxVx21j0RHTrFvAiYMUJUnSuPLJX5IkNcRgliSpIQazJEkNMZglSWqIwSxJUkMMZkmSGmIwS5LUEINZkqSGGMySJDXEYJYkqSEGsyRJDTGYJUlqiMEsSVJDDGZJkhpiMEuS1BCDWZKkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQxbPdcMkTwUu7Gs6BHgvsCfwn4BNXfspVXXZXI8jSdI4mXMwV9V3gOUASRYB64BLgOOBD1TVnwyjQEmSxsmwhrKPAG6pqtuGtD9JksbSsIL5WOCCvvm3Jrk+yblJ9hrSMSRJGnkDB3OSRwOvAv66azobeDK9Ye71wJnb2G5lktVJVm/atGm6VSRJGjvDOGN+GfC1qtoAUFUbqurBqnoI+EvgOdNtVFWrqmqyqiYnJiaGUIYkSbu+YQTzcfQNYyfZr
2/Za4AbhnAMSZLGwpzvygZI8njgN4A39zX/UZLlQAFrpiyTJEnbMVAwV9V9wBOntL1hoIokSRpjPvlLkqSGGMySJDXEYJYkqSEGsyRJDTGYJUlqiMEsSVJDDGZJkhpiMEuS1BCDWZKkhhjMkiQ1xGCWJKkhBrMkSQ0xmCVJaojBLElSQwxmSZIaYjBLktQQg1mSpIYYzJIkNWTxQhcg7QqWnvz5OW235oyXD7kSSaPOM2ZJkhpiMEuS1BCDWZKkhgx8jTnJGuBe4EFgc1VNJtkbuBBYCqwBjqmqHw56LEmSRt2wzphfVFXLq2qymz8ZuKKqlgFXdPOSJGkG8zWUfTRwfjd9PvDqeTqOJEkjZRjBXMAXk1ybZGXXtm9Vre+m7wD2HcJxJEkaecP4HvPzq2pdkn8DXJ7k2/0Lq6qS1NSNuhBfCbBkyZIhlCGNN79rLY2GgYO5qtZ17xuTXAI8B9iQZL+qWp9kP2DjNNutAlYBTE5ObhXc0nyZa4C1fixJo2Ggoewkj0+yx5Zp4DeBG4BLgRXdaiuAzw5yHEmSxsWgZ8z7Apck2bKvT1TVF5L8M3BRkhOA24BjBjyOJEljYaBgrqpbgWdM034ncMQg+5YkaRz5SyzUhLlci/WmJUmjyGDWLssbqySNIp+VLUlSQwxmSZIaYjBLktQQg1mSpIYYzJIkNcRgliSpIX5dShpzfodcaotnzJIkNcRgliSpIQazJEkNMZglSWqIwSxJUkMMZkmSGmIwS5LUEINZkqSGGMySJDXEYJYkqSEGsyRJDTGYJUlqyJyDOcmBSa5McmOSbyV5W9d+WpJ1Sa7rXkcNr1xJkkbbIL9dajNwUlV9LckewLVJLu+WfaCq/mTw8iS1aC6/kWqu/E1WGjdzDuaqWg+s76bvTXITsP+wCpMkaRwN5RpzkqXAM4Gru6a3Jrk+yblJ9hrGMSRJGgcDB3OS3YGLgbdX1T3A2cCTgeX0zqjP3MZ2K5OsTrJ606ZNg5YhSdJISFXNfeNkN+BzwN9V1VnTLF8KfK6qnr69/UxOTtbq1avnXIfasTOvPUrb47VptS7JtVU1ObV9kLuyA5wD3NQfykn261vtNcANcz2GJEnjZpC7sp8HvAH4ZpLrurZTgOOSLAcKWAO8eYBjSNKczGX0xrNstWCQu7K/AmSaRZfNvRxJ2vX4Q4CGySd/SZLUEINZkqSGGMySJDXEYJYkqSEGsyRJDRnk61KSNFJG9QE5c/1zeef4wvCMWZKkhhjMkiQ1xGCWJKkhXmOWJA2N17MHN7LB7CPyJLVsVG800+AcypYkqSEGsyRJDTGYJUlqyMheY54Lb1qQJC00g3mB7Ao/BHhziiTtfAazJGmstP6tHYN5F9P6PyhJo8NRs4VhMA+B/3glaecb1f97vStbkqSGeMY8Bkb1p0pJGkWeMUuS1JB5O2NOciTwv4BFwIer6oz5OpYkadfmyN7D5uWMOcki4M+BlwGHAsclOXQ+jiVJ0iiZr6Hs5wA3V9WtVfVT4JPA0fN0LEmSRsZ8BfP+wO1982u7NkmStB0Ldld2kpXAym72x0m+s41V9wF+sHOq2mXYJ49kf2zNPtmafbI1+2Rr0/ZJ/nBejnXQdI3zFczrgAP75g/o2n6uqlYBq2baUZLVVTU53PJ2bfbJI9kfW7NPtmafbM0+2VoLfTJfQ9n/DCxLcnCSRwPHApfO07EkSRoZ83LGXFWbk7wV+Dt6X5c6t6q+NR/HkiRplMzbNeaqugy4bAi7mnG4ewzZJ49kf2zNPtmafbI1+2RrC94nqaqFrkGSJHV8JKckSQ1pNpiTHJnkO0luTnLyQtezEJKcm2Rjkhv62vZOcnmS73bvey1kjTtbkgOTXJnkxiTfSvK2rn1s+yXJY5Nck+QbXZ/8ftd+cJKru8/Qhd2NmGMjyaIkX0/yuW5+3PtjTZJvJrkuyequbWw/NwBJ9kzyqSTfTnJTkue20CdNBrOP9Py584Ajp7SdDFxRVcuAK
7r5cbIZOKmqDgUOA07s/m2Mc788ALy4qp4BLAeOTHIY8IfAB6rqKcAPgRMWrsQF8Tbgpr75ce8PgBdV1fK+rwON8+cGer/P4QtV9cvAM+j9e1nwPmkymPGRngBU1VXAXVOajwbO76bPB169M2taaFW1vqq+1k3fS++DtD9j3C/V8+NudrfuVcCLgU917WPVJ0kOAF4OfLibD2PcH9sxtp+bJE8AXgicA1BVP62qu2mgT1oNZh/puW37VtX6bvoOYN+FLGYhJVkKPBO4mjHvl27Y9jpgI3A5cAtwd1Vt7lYZt8/QnwLvBB7q5p/IePcH9H5Y+2KSa7snL8J4f24OBjYBf9Vd8vhwksfTQJ+0GsyaherdUj+Wt9Un2R24GHh7Vd3Tv2wc+6WqHqyq5fSesvcc4JcXtqKFk+QVwMaqunaha2nM86vqWfQuEZ6Y5IX9C8fwc7MYeBZwdlU9E7iPKcPWC9UnrQbzjI/0HGMbkuwH0L1vXOB6droku9EL5Y9X1ae75rHvF4BuKO5K4LnAnkm2PKtgnD5DzwNelWQNvctgL6Z3LXFc+wOAqlrXvW8ELqH3A9w4f27WAmur6upu/lP0gnrB+6TVYPaRntt2KbCim14BfHYBa9npumuF5wA3VdVZfYvGtl+STCTZs5v+BeA36F17vxL4D91qY9MnVfWuqjqgqpbS+7/j76vqdYxpfwAkeXySPbZMA78J3MAYf26q6g7g9iRP7ZqOAG6kgT5p9gEjSY6id51oyyM9T1/Yina+JBcAh9P7bScbgFOBzwAXAUuA24BjqmrqDWIjK8nzgS8D3+Th64en0LvOPJb9kuRX6d2ksojeD9sXVdX7khxC74xxb+DrwOur6oGFq3TnS3I48I6qesU490f3Z7+km10MfKKqTk/yRMb0cwOQZDm9GwQfDdwKHE/3GWIB+6TZYJYkaRy1OpQtSdJYMpglSWqIwSxJUkMMZkmSGmIwS5LUEINZkqSGGMySJDXEYJYkqSH/HxAt2CABmtTSAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of age feature after removing outliers\n", "plot_hist(train_t, 'age')" ] }, { "cell_type": "code", "execution_count": 27, "id": "yellow-group", "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYf0lEQVR4nO3dfdCddX3n8fdHgtgWakDuZjAJhip9wM4YnBTZ0a4IVXlwG5yxFKeFyOLE3cWurnY1WFtxR3bp7iKr25VdFJbgE1LUJVVqRcCxzlQwWEQeSs1ikMRAIs9oZZv43T/OL/U05uZ+zv3j3O/XzJlzXb/r6XvONef+5Pc717mSqkKSJPXhGfNdgCRJ+gmDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLM2CJP8zyR/N0r4OT/JEkv3a/JeTvHE29t329xdJ1szW/qZw3Pcl+X6S+8dZ/q+TPNBe+3P2dX1SL+LvmKWnlmQzsATYCewC7gSuAC6pqh9PY19vrKovTWGbLwMfq6qPTOVYbdvzgBdU1e9NddvZlORw4G7geVW1fS/L9wceA46tqm/u6/qknthjlibnX1TVQcDzgAuAdwKXzvZBkiya7X124nDgwb2FcrMEeBZwx3R2PsLvmxYgg1magqp6tKo2AL8DrEnyawBJLk/yvjZ9aJLPJXkkyUNJ/irJM5J8lEFA/Xkbrn1HkhVJKsnZSb4L3DDUNhw2z09yc5LHklyT5JB2rOOSbBmuMcnmJL+Z5ETgXcDvtON9sy3/x6HxVte7k9ybZHuSK5I8uy3bXceaJN9tw9B/ON57k+TZbfsdbX/vbvv/TeA64Lmtjsv32O6XGPSmAR5JckNr/0CS+9prviXJbwxtc16Sq5N8LMljwBva8S9Nsi3J1jZ0vt/kz67UB4NZmoaquhnYAvzGXha/vS0bY9ATfNdgkzoD+C6D3veBVfWfh7Z5OfCrwKvHOeSZwL8EDmMwpP7BSdT4BeA/Ap9qx3vRXlZ7Q3u8AvhF4EDgT/dY52XALwMnAH+c5FfHOeR/B57d9vPyVvNZbdj+JOB7rY437FHn3wEvbLOLq+r4Nv11YCVwCPAJ4M+SPGto09XA1cBi4OPA5QzemxcARwOvAmbtu3lpXzGYpen7HoPQ2NM/MAjQ51XVP1TVX9XEF3OcV1U/qKq/H2f5R6vq9qr6AfBHwGmz1Bv8XeD9VXVPVT0BnAucvkdv/b1V9fftu99vAj8V8K2W04Fzq+rxqtoMXAicMd3CqupjVfVgVe2sqguBAxj8A2G3v66q/9O+5/954GTgre193A5c1GqSnlYMZmn6lgIP7aX9vwCbgC8muSfJukns674pLL8X2B84dFJVPrXntv0N73sRg57+bsNXUf+QQa96T4e2mvbc19LpFpbkD5LcleTRJI8w6I0Pv+bh9+R57fjb2lcIjwD/C/iF6R5fmi8GszQNSX6dQeh8dc9lrcf49qr6ReC3gLclOWH34nF2OVGPevnQ9OEMeuXfB34A/OxQXfsxGEKf7H6/xyDUhve9E3hggu329P1W05772jrF/QDQvk9+B3AacHBVLQYeBTK02vBruw94Eji0qha3x89X1QuRnmYMZmkKkvx8ktcAVzL4CdO39rLOa5K8IEkYhMkuYPfPqh5g8B3sVP1ekqOS/CzwH4Crq2oX8HfAs5Kc0
n5y9G4GQ767PQCsSDLeZ/2TwL9LckSSA/nJd9I7p1Jcq+Uq4PwkByV5HvA24GNT2c+Qgxj8A2EHsCjJHzMYrh7v+NuALwIXtnP0jCTPT/LyaR5fmjcGszQ5f57kcQY9sz8E3g+cNc66RwJfAp4A/hr4UFXd2Jb9J+Ddbbj1D6Zw/I8yuLjpfgY/K/q3MLhKHPg3wEcY9E5/wODCs93+rD0/mOQbe9nvZW3fXwG+A/wI+P0p1DXs99vx72EwkvCJtv/p+EvgCwz+4XFvq2ui4f4zgWcy+J35wwwuDDtsmseX5o03GJEkqSP2mCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI508T+yHHroobVixYr5LkOSpH3mlltu+X5Vje3Z3kUwr1ixgo0bN853GZIk7TNJ7t1bu0PZkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkd6eJe2bNtxbrPT2u7zRecMsuVSJI0NfaYJUnqyKSDOcl+Sf4myefa/BFJbkqyKcmnkjyztR/Q5je15SvmqHZJkkbOVHrMbwHuGpr/E+CiqnoB8DBwdms/G3i4tV/U1pMkSZMwqWBOsgw4BfhImw9wPHB1W2U9cGqbXt3mactPaOtLkqQJTLbH/N+AdwA/bvPPAR6pqp1tfguwtE0vBe4DaMsfbetLkqQJTBjMSV4DbK+qW2bzwEnWJtmYZOOOHTtmc9eSJD1tTabH/FLgt5JsBq5kMIT9AWBxkt0/t1oGbG3TW4HlAG35s4EH99xpVV1SVauqatXY2NiMXoQkSaNiwmCuqnOrallVrQBOB26oqt8FbgRe11ZbA1zTpje0edryG6qqZrVqSZJG1Ex+x/xO4G1JNjH4DvnS1n4p8JzW/jZg3cxKlCRp4ZjSnb+q6svAl9v0PcAxe1nnR8Bvz0JtkiQtON75S5KkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjoyYTAneVaSm5N8M8kdSd7b2i9P8p0kt7bHytaeJB9MsinJbUlePMevQZKkkbFoEus8CRxfVU8k2R/4apK/aMv+fVVdvcf6JwFHtsdLgIvbsyRJmsCEPeYaeKLN7t8e9RSbrAauaNt9DVic5LCZlypJ0uib1HfMSfZLciuwHbiuqm5qi85vw9UXJTmgtS0F7hvafEtrkyRJE5hUMFfVrqpaCSwDjknya8C5wK8Avw4cArxzKgdOsjbJxiQbd+zYMbWqJUkaUVO6KruqHgFuBE6sqm1tuPpJ4H8Dx7TVtgLLhzZb1tr23NclVbWqqlaNjY1Nq3hJkkbNZK7KHkuyuE3/DPBK4G93f2+cJMCpwO1tkw3Ame3q7GOBR6tq2xzULknSyJnMVdmHAeuT7McgyK+qqs8luSHJGBDgVuBftfWvBU4GNgE/BM6a9aolSRpREwZzVd0GHL2X9uPHWb+Ac2ZemiRJC493/pIkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOTBjMSZ6V5OYk30xyR5L3tvYjktyUZFOSTyV5Zms/oM1vastXzPFrkCRpZEymx/wkcHxVvQhYCZyY5FjgT4CLquoFwMPA2W39s4GHW/tFbT1JkjQJEwZzDTzRZvdvjwKOB65u7euBU9v06jZPW35CksxWwZIkjbJJfcecZL8ktwLbgeuA/ws8UlU72ypbgKVteilwH0Bb/ijwnL3sc22SjUk27tixY0YvQpKkU
TGpYK6qXVW1ElgGHAP8ykwPXFWXVNWqqlo1NjY2091JkjQSpnRVdlU9AtwI/DNgcZJFbdEyYGub3gosB2jLnw08OBvFSpI06iZzVfZYksVt+meAVwJ3MQjo17XV1gDXtOkNbZ62/IaqqlmsWZKkkbVo4lU4DFifZD8GQX5VVX0uyZ3AlUneB/wNcGlb/1Lgo0k2AQ8Bp89B3ZIkjaQJg7mqbgOO3kv7PQy+b96z/UfAb89KdZIkLTDe+UuSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRyYM5iTLk9yY5M4kdyR5S2s/L8nWJLe2x8lD25ybZFOSu5O8ei5fgCRJo2TRJNbZCby9qr6R5CDgliTXtWUXVdV/HV45yVHA6cALgecCX0ryS1W1azYLlyRpFE3YY66qbVX1jTb9OHAXsPQpNlkNXFlVT1bVd4BNwDGzUawkSaNuSt8xJ1kBHA3c1JrenOS2JJclObi1LQXuG9psC08d5JIkqZl0MCc5EPg08Naqegy4GHg+sBLYBlw4lQMnWZtkY5KNO3bsmMqmkiSNrEkFc5L9GYTyx6vqMwBV9UBV7aqqHwMf5ifD1VuB5UObL2tt/0RVXVJVq6pq1djY2ExegyRJI2MyV2UHuBS4q6reP9R+2NBqrwVub9MbgNOTHJDkCOBI4ObZK1mSpNE1mauyXwqcAXwrya2t7V3A65OsBArYDLwJoKruSHIVcCeDK7rP8YpsSZImZ8JgrqqvAtnLomufYpvzgfNnUJckSQuSd/6SJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjkwYzEmWJ7kxyZ1J7kjyltZ+SJLrkny7PR/c2pPkg0k2JbktyYvn+kVIkjQqJtNj3gm8vaqOAo4FzklyFLAOuL6qjgSub/MAJwFHtsda4OJZr1qSpBE1YTBX1baq+kabfhy4C1gKrAbWt9XWA6e26dXAFTXwNWBxksNmu3BJkkbRlL5jTrICOBq4CVhSVdvaovuBJW16KXDf0GZbWpskSZrApIM5yYHAp4G3VtVjw8uqqoCayoGTrE2yMcnGHTt2TGVTSZJG1qSCOcn+DEL541X1mdb8wO4h6va8vbVvBZYPbb6stf0TVXVJVa2qqlVjY2PTrV+SpJEymauyA1wK3FVV7x9atAFY06bXANcMtZ/Zrs4+Fnh0aMhbkiQ9hUWTWOelwBnAt5Lc2treBVwAXJXkbOBe4LS27FrgZGAT8EPgrNksWJKkUTZhMFfVV4GMs/iEvaxfwDkzrEuSpAXJO39JktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6siEwZzksiTbk9w+1HZekq1Jbm2Pk4eWnZtkU5K7k7x6rgqXJGkUTabHfDlw4l7aL6qqle1xLUCSo4DTgRe2bT6UZL/ZKlaSpFE3YTBX1VeAhya5v9XAlVX1ZFV9B9gEHDOD+iRJWlBm8h3zm5Pc1oa6D25tS4H7htbZ0tokSdIkTDeYLwaeD6wEtgEXTnUHSdYm2Zhk444dO6ZZhiRJo2VawVxVD1TVrqr6MfBhfjJcvRVYPrTqsta2t31cUlWrqmrV2NjYdMqQJGnkTCuYkxw2NPtaYPcV2xuA05MckOQI4Ejg5pmVKEnSwrFoohWSf
BI4Djg0yRbgPcBxSVYCBWwG3gRQVXckuQq4E9gJnFNVu+akckmSRtCEwVxVr99L86VPsf75wPkzKUqSpIXKO39JktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRyYM5iSXJdme5PahtkOSXJfk2+354NaeJB9MsinJbUlePJfFS5I0ahZNYp3LgT8FrhhqWwdcX1UXJFnX5t8JnAQc2R4vAS5uzyNtxbrPT3mbzRecMgeVSJKe7ibsMVfVV4CH9mheDaxv0+uBU4far6iBrwGLkxw2S7VKkjTypvsd85Kq2tam7weWtOmlwH1D621pbZIkaRJmfPFXVRVQU90uydokG5Ns3LFjx0zLkCRpJEw3mB/YPUTdnre39q3A8qH1lrW2n1JVl1TVqqpaNTY2Ns0yJEkaLdMN5g3Amja9BrhmqP3MdnX2scCjQ0PekiRpAhNelZ3kk8BxwKFJtgDvAS4ArkpyNnAvcFpb/VrgZGAT8EPgrDmoWZKkkTVhMFfV68dZdMJe1i3gnJkWJUnSQjWZ3zEvGNP5PbIkSbPJW3JKktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkji+a7gIVqxbrPT2u7zRecMsuVSJJ6MqNgTrIZeBzYBeysqlVJDgE+BawANgOnVdXDMytTkqSFYTaGsl9RVSuralWbXwdcX1VHAte3eUmSNAlz8R3zamB9m14PnDoHx5AkaSTNNJgL+GKSW5KsbW1Lqmpbm74fWDLDY0iStGDM9OKvl1XV1iS/AFyX5G+HF1ZVJam9bdiCfC3A4YcfPsMyJEkaDTPqMVfV1va8HfgscAzwQJLDANrz9nG2vaSqVlXVqrGxsZmUIUnSyJh2MCf5uSQH7Z4GXgXcDmwA1rTV1gDXzLRISZIWipkMZS8BPptk934+UVVfSPJ14KokZwP3AqfNvExJkhaGaQdzVd0DvGgv7Q8CJ8ykKEmSFipvySlJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHZvq/S2kfW7Hu81PeZvMFp8xBJZKkuWCPWZKkjhjMkiR1xKFsjcthc0na9+wxS5LUEXvMmlXT6WXDvu1pOxIgqWf2mCVJ6ojBLElSRxzK1tPWdIfNJaln9pglSeqIPWZpEvZl79wLzaSFzWBeABzylaSnD4eyJUnqiD1mdcFevcbjb+M123o/X3MWzElOBD4A7Ad8pKoumKtjSer/j42kyZmTYE6yH/A/gFcCW4CvJ9lQVXfOxfEkTc/ToTcqLTRz1WM+BthUVfcAJLkSWA0YzNIIsHcuzZ25CualwH1D81uAl8zRsSTpp4zqdQv+o2j0papmf6fJ64ATq+qNbf4M4CVV9eahddYCa9vsLwN3P8UuDwW+P+uFajo8F/3wXPTDc9GPp9O5eF5Vje3ZOFc95q3A8qH5Za3tH1XVJcAlk9lZko1VtWr2ytN0eS764bnoh+eiH6NwLubqd8xfB45MckSSZwKnAxvm6FiSJI2MOekxV9XOJG8G/pLBz6Uuq6o75uJYkiSNkjn7HXNVXQtcO0u7m9SQt/YJz0U/PBf98Fz042l/Lubk4i9JkjQ93itbkqSOdB3MSU5McneSTUnWzXc9C02SzUm+leTWJBtb2yFJrkvy7fZ88HzXOYqSXJZke5Lbh9r2+t5n4IPtc3JbkhfPX+WjZ5xzcV6Sre2zcWuSk4eWndvOxd1JXj0/VY+mJMuT3JjkziR3JHlLax+pz0a3wTx0W8+Tg
KOA1yc5an6rWpBeUVUrh35+sA64vqqOBK5v85p9lwMn7tE23nt/EnBke6wFLt5HNS4Ul/PT5wLgovbZWNmuqaH9jTodeGHb5kPtb5lmx07g7VV1FHAscE57z0fqs9FtMDN0W8+q+n/A7tt6an6tBta36fXAqfNXyuiqqq8AD+3RPN57vxq4oga+BixOctg+KXQBGOdcjGc1cGVVPVlV3wE2MfhbpllQVduq6htt+nHgLgZ3mhypz0bPwby323ounadaFqoCvpjklnanNoAlVbWtTd8PLJmf0hak8d57Pyvz481tePSyoa90PBf7SJIVwNHATYzYZ6PnYNb8e1lVvZjBcNA5Sf758MIaXNLvZf3zwPd+3l0MPB9YCWwDLpzXahaYJAcCnwbeWlWPDS8bhc9Gz8E84W09Nbeqamt73g58lsGQ3AO7h4La8/b5q3DBGe+997Oyj1XVA1W1q6p+DHyYnwxXey7mWJL9GYTyx6vqM615pD4bPQezt/WcR0l+LslBu6eBVwG3MzgHa9pqa4Br5qfCBWm8934DcGa7AvVY4NGhYT3NgT2+p3wtg88GDM7F6UkOSHIEg4uObt7X9Y2qJAEuBe6qqvcPLRqpz8ac3flrpryt57xbAnx28DlgEfCJqvpCkq8DVyU5G7gXOG0eaxxZST4JHAccmmQL8B7gAvb+3l8LnMzgQqMfAmft84JH2Djn4rgkKxkMmW4G3gRQVXckuYrB/z2/EzinqnbNQ9mj6qXAGcC3ktza2t7FiH02vPOXJEkd6XkoW5KkBcdgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSO/H9/wuc5D3DShgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of fare feature after removing outliers\n", "plot_hist(train_t, 'fare')" ] }, { "cell_type": "code", "execution_count": null, "id": "unavailable-geography", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }