{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Winsorizer\n", "The Winsorizer() finds maximum and minimum values following a Gaussian or skewed distribution as indicated. It can also cap the right, left or both ends of the distribution.\n", "\n", "The Winsorizer() caps maximum and / or minimum values of a variable.\n", "\n", "The Winsorizer() works only with numerical variables. A list of variables can\n", "be indicated. Alternatively, the Winsorizer() will select all numerical\n", "variables in the train set.\n", "\n", "The Winsorizer() first calculates the capping values at the end of the\n", "distribution. The values are determined using:\n", "\n", "- a Gaussian approximation,\n", "- the inter-quartile range proximity rule (IQR),\n", "- percentiles.\n", "\n", "\n", "### Example" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# importing libraries\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "from feature_engine.outliers import Winsorizer" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load titanic dataset from OpenML\n", "\n", "def load_titanic():\n", "    \"\"\"Download the Titanic dataset from OpenML and apply basic cleaning.\n", "\n", "    '?' placeholders become NaN, cabin is reduced to the first character of\n", "    its string form (so missing cabins become 'n'), pclass is cast to object,\n", "    missing embarked values are filled with 'C', missing fare and age values\n", "    are filled with the column median, and the name and ticket columns are\n", "    dropped.\n", "\n", "    Returns\n", "    -------\n", "    pandas.DataFrame\n", "        The cleaned Titanic data.\n", "    \"\"\"\n", "    data = pd.read_csv(\n", "        'https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n", "    data = data.replace('?', np.nan)\n", "    data['cabin'] = data['cabin'].astype(str).str[0]\n", "    data['pclass'] = data['pclass'].astype('O')\n", "    # Assign the filled column back instead of calling fillna(inplace=True)\n", "    # on a column selection: that form may act on a temporary copy and\n", "    # leave the DataFrame unchanged.\n", "    data['embarked'] = data['embarked'].fillna('C')\n", "    data['fare'] = data['fare'].astype('float')\n", "    data['fare'] = data['fare'].fillna(data['fare'].median())\n", "    data['age'] = data['age'].astype('float')\n", "    data['age'] = data['age'].fillna(data['age'].median())\n", "    data = data.drop(['name', 'ticket'], axis=1)\n", "    return data\n", "\n",
"# To plot histogram of given numerical feature\n", "\n", "\n", "def plot_hist(data, col):\n", "    \"\"\"Show a 30-bin histogram of column `col` of `data`.\"\"\"\n", "    plt.figure(figsize=(8, 5))\n", "    plt.hist(data[col], bins=30)\n", "    plt.title(\"Distribution of \"+col)\n", "    return plt.show()" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivedsexagesibspparchfarecabinembarkedboatbodyhome.dest
15710male28.00051.8625ESNaNNaNBrighton, MA
40021female34.01132.5000nS10NaNGreenport, NY
54621female28.00013.0000nS9NaNSpain
61830male35.0008.0500nSNaNNaNLower Clapton, Middlesex or Erdington, Birmingham
120830female9.03227.9000nSNaNNaNNaN
\n", "
" ], "text/plain": [ " pclass survived sex age sibsp parch fare cabin embarked \\\n", "157 1 0 male 28.0 0 0 51.8625 E S \n", "400 2 1 female 34.0 1 1 32.5000 n S \n", "546 2 1 female 28.0 0 0 13.0000 n S \n", "618 3 0 male 35.0 0 0 8.0500 n S \n", "1208 3 0 female 9.0 3 2 27.9000 n S \n", "\n", " boat body home.dest \n", "157 NaN NaN Brighton, MA \n", "400 10 NaN Greenport, NY \n", "546 9 NaN Spain \n", "618 NaN NaN Lower Clapton, Middlesex or Erdington, Birmingham \n", "1208 NaN NaN NaN " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Loading titanic dataset\n", "data = load_titanic()\n", "data.sample(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train data: (916, 11)\n", "test data: (393, 11)\n" ] } ], "source": [ "# let's separate into training and testing set\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(data.drop('survived', axis=1),\n", " data['survived'],\n", " test_size=0.3,\n", " random_state=0)\n", "\n", "print(\"train data:\", X_train.shape)\n", "print(\"test data:\", X_test.shape)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max age: 80.0\n", "Max fare: 512.3292\n" ] } ], "source": [ "# let's find out the maximum Age and maximum Fare in the titanic\n", "\n", "print(\"Max age:\", data.age.max())\n", "print(\"Max fare:\", data.fare.max())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZM0lEQVR4nO3de7SldX3f8fdHhoAC4SKndJwBBpVqiKsOdkRc2gRBI+AFs1ZicHmZWLImbTHRhCSCGsUuabGNktomtEQQvCHESyBIjIhkGdsKDgjIReIEBplxYAaQmxgi+O0f+zeyHWY49zm/vef9Wmuv8+zfc/v+ztn7fM7ze579nFQVkiSpD09Z6AIkSdLjDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrM0B5L8ryR/PEfbOiDJQ0l2as//LslvzcW22/b+JsnKudreNPb7gSR3J7lze+9bGiXxc8zSk0uyFtgPeBR4DLgJ+DhwVlX9ZAbb+q2q+so01vk74JNV9dHp7Kuteyrw7Kp603TXnUtJDgBuAQ6sqo0LWYvUO4+Ypal5TVXtARwInA68Ezh7rneSZNFcb7MTBwD3GMrS5AxmaRqq6v6quhj4DWBlkucBJDk3yQfa9L5JLklyX5J7k/x9kqck+QSDgPrrNlT9R0mWJakkJyT5HvDVobbhkH5WkquSPJDkoiT7tH0dkWTdcI1J1iZ5eZKjgXcBv9H2d12b/9Oh8VbXe5LcnmRjko8n2bPN21zHyiTfa8PQ797W9ybJnm39TW1772nbfzlwGfCMVse5W1l37/Y925TkB2166dD8g5J8LcmDSb6S5M+SfHJo/uFJ/m/7nl+X5Igp/kil7hjM0gxU1VXAOuDfbmX2SW3eBIMh8HcNVqk3A99jcPS9e1X916F1fhn4BeCV29jlW4B/ByxmMKT+kSnU+CXgPwMXtP09fyuL/WZ7vAx4JrA78D+3WOalwHOAo4D3JvmFbezyfwB7tu38cqv5rW3Y/hjg+62O39zKuk8BPsZgROIA4Edb1PFp4Crg6cCpwJs3z0iyBPgi8AFgH+APgM8lmdhGnVLXDGZp5r7PIAi29GMGAXpgVf24qv6+Jr+Y49Sq+mFV/Wgb8z9RVTdU1Q+BPwZev/nisFl6I/Dhqrq1qh4CTgGO3+Jo/f1V9aOqug64DnhCwLdajgdOqaoHq2ot8CGGAvTJVNU9VfW5qnq4qh4ETmMQ7pvPT78QeG9V/XNVfR24eGj1NwGXVtWlVfWTqroMWA0cO51vhNQLg1mauSXAvVtp/2/AGuDLSW5NcvIUtnXHNObfDuwM7DulKp/cM9r2hre9iMGR/mbDV1E/zOCoekv7tpq23NaSqRSR5GlJ/ncbAn8A+BqwVwv8ZwD3VtXDQ6sMfz8OBH69DWPfl+Q+Bkf5i6eyb6k3BrM0A0leyCB0vr7lvHbEeFJVPRN4LfD7SY7aPHsbm5zsiHr/oekDGByV3w38EHjaUF07MRhCn+p2v88g2Ia3/Shw1yTrbenuVtOW21o/xfVPYjBc/qKq+nngl1p7gA3APkmeNrT88PfjDgYjCnsNPXarqtOn2QepCwazNA1Jfj7Jq4HPMPgI07e3ssyrkzw7SYD7GXzEavPHqu5icA52ut6U5JAWTv8J+GxVPQb8A7Brklcl2Rl4D7DL0Hp3AcuSbOu9fj7we+3iqt15/Jz0o9MprtVyIXBakj2SHAj8PvDJJ1/zp/ZgcF75vnZh2/uGtn07g6HpU5P8XJIXA68ZWveTwGuSvDLJTkl2bRfFLUUaQQazNDV/neRBBkdn7wY+DLx1G8seDHwFeAj4f8CfV9UVbd5/Ad7Thlz/YBr7/wRwLoNh5V2B34XBVeLAfwQ+yuDo9IcMLjzb7C/b13uSXLOV7Z7Ttv014Dbgn4DfmUZdw36n7f9WBiMJn27bn4o/BZ7K4Mj7G8CXtpj/RuDFwD0MLvK6AHgEoKruAI5jcJHdJgY/oz/E328aUd5gRNL
ISXIB8J2qet+kC0sjxr8oJXUvyQuTPKt9LvpoBkfIf7XAZUnzYlzvMiRpvPxL4PMMPse8DvgPVfWthS1Jmh8OZUuS1BGHsiVJ6ojBLElSR7o4x7zvvvvWsmXLFroMSZK2m6uvvvruqnrCPd27COZly5axevXqhS5DkqTtJsntW2t3KFuSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSNd3Ctb2p6WnfzFaa+z9vRXzUMlkvREHjFLktSRSYM5ya5JrkpyXZIbk7y/tZ+b5LYk17bH8taeJB9JsibJ9UleMM99kCRpbExlKPsR4MiqeijJzsDXk/xNm/eHVfXZLZY/Bji4PV4EnNm+SpKkSUx6xFwDD7WnO7dHPckqxwEfb+t9A9gryeLZlypJ0vib0jnmJDsluRbYCFxWVVe2Wae14eozkuzS2pYAdwytvq61SZKkSUwpmKvqsapaDiwFDkvyPOAU4LnAC4F9gHdOZ8dJViVZnWT1pk2bple1JEljalpXZVfVfcAVwNFVtaENVz8CfAw4rC22Hth/aLWlrW3LbZ1VVSuqasXExMSMipckadxM5arsiSR7temnAq8AvrP5vHGSAK8DbmirXAy8pV2dfThwf1VtmIfaJUkaO1O5KnsxcF6SnRgE+YVVdUmSryaZAAJcC/z7tvylwLHAGuBh4K1zXrUkSWNq0mCuquuBQ7fSfuQ2li/gxNmXJknSjsc7f0mS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1ZNJgTrJrkquSXJfkxiTvb+0HJbkyyZokFyT5uda+S3u+ps1fNs99kCRpbEzliPkR4Miqej6wHDg6yeHAB4EzqurZwA+AE9ryJwA/aO1ntOUkSdIUTBrMNfBQe7pzexRwJPDZ1n4e8Lo2fVx7Tpt/VJLMVcGSJI2zKZ1jTrJTkmuBjcBlwD8C91XVo22RdcCSNr0EuAOgzb8fePoc1ixJ0tiaUjBX1WNVtRxYChwGPHe2O06yKsnqJKs3bdo0281JkjQWpnVVdlXdB1wBvBjYK8miNmspsL5Nrwf2B2jz9wTu2cq2zqqqFVW1YmJiYmbVS5I0ZqZyVfZEkr3a9FOBVwA3MwjoX2uLrQQuatMXt+e0+V+tqprDmiVJGluLJl+ExcB5SXZiEOQXVtUlSW4CPpPkA8C3gLPb8mcDn0iyBrgXOH4e6pYkaSxNGsxVdT1w6Fbab2VwvnnL9n8Cfn1OqpMkaQfjnb8kSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOjJpMCfZP8kVSW5KcmOSt7f2U5OsT3Jtexw7tM4pSdYkuSXJK+ezA5IkjZNFU1jmUeCkqromyR7A1Ukua/POqKo/GV44ySHA8cAvAs8AvpLkX1XVY3NZuCRJ42jSI+aq2lBV17TpB4GbgSVPsspxwGeq6pGqug1YAxw2F8VKkjTupnWOOcky4FDgytb0tiTXJzknyd6tbQlwx9Bq63jyIJckSc2UgznJ7sDngHdU1QPAmcCzgOXABuBD09lxklVJVidZvWnTpumsKknS2JpSMCfZmUEof6qqPg9QVXdV1WNV9RPgL3h8uHo9sP/Q6ktb28+oqrOqakVVrZiYmJhNHyRJGhtTuSo7wNnAzVX14aH2xUOL/SpwQ5u
+GDg+yS5JDgIOBq6au5IlSRpfU7kq+yXAm4FvJ7m2tb0LeEOS5UABa4HfBqiqG5NcCNzE4IruE70iW5KkqZk0mKvq60C2MuvSJ1nnNOC0WdQlSdIOyTt/SZLUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHVk0mBOsn+SK5LclOTGJG9v7fskuSzJd9vXvVt7knwkyZok1yd5wXx3QpKkcTGVI+ZHgZOq6hDgcODEJIcAJwOXV9XBwOXtOcAxwMHtsQo4c86rliRpTE0azFW1oaquadMPAjcDS4DjgPPaYucBr2vTxwEfr4FvAHslWTzXhUuSNI6mdY45yTLgUOBKYL+q2tBm3Qns16aXAHcMrbautUmSpElMOZiT7A58DnhHVT0wPK+qCqjp7DjJqiSrk6zetGnTdFaVJGlsTSmYk+zMIJQ/VVWfb813bR6ibl83tvb1wP5Dqy9tbT+jqs6qqhVVtWJiYmKm9UuSNFamclV2gLOBm6vqw0OzLgZWtumVwEVD7W9pV2cfDtw/NOQtSZKexKIpLPMS4M3At5Nc29reBZwOXJjkBOB24PVt3qXAscAa4GHgrXNZsCRJ42zSYK6qrwPZxuyjtrJ8ASfOsi5JknZI3vlLkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHZk0mJOck2RjkhuG2k5Nsj7Jte1x7NC8U5KsSXJLklfOV+GSJI2jqRwxnwscvZX2M6pqeXtcCpDkEOB44BfbOn+eZKe5KlaSpHE3aTBX1deAe6e4veOAz1TVI1V1G7AGOGwW9UmStEOZzTnmtyW5vg11793algB3DC2zrrU9QZJVSVYnWb1p06ZZlCFJ0viYaTCfCTwLWA5sAD403Q1U1VlVtaKqVkxMTMywDEmSxsuMgrmq7qqqx6rqJ8Bf8Phw9Xpg/6FFl7Y2SZI0BTMK5iSLh57+KrD5iu2LgeOT7JLkIOBg4KrZlShJ0o5j0WQLJDkfOALYN8k64H3AEUmWAwWsBX4boKpuTHIhcBPwKHBiVT02L5VLkjSGJg3mqnrDVprPfpLlTwNOm01RkiTtqLzzlyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdWbTQBUgAy07+4rTXWXv6q+ahEklaWB4xS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1JFJgznJOUk2JrlhqG2fJJcl+W77undrT5KPJFmT5PokL5jP4iVJGjdTOWI+Fzh6i7aTgcur6mDg8vYc4Bjg4PZYBZw5N2VKkrRjmDSYq+prwL1bNB8HnNemzwNeN9T+8Rr4BrBXksVzVKskSWNvpueY96uqDW36TmC/Nr0EuGNouXWt7QmSrEqyOsnqTZs2zbAMSZLGy6wv/qqqAmoG651VVSuqasXExMRsy5AkaSzMNJjv2jxE3b5ubO3rgf2Hllva2iRJ0hTMNJgvBla26ZXARUPtb2lXZx8O3D805C1JkiaxaLIFkpwPHAHsm2Qd8D7gdODCJCcAtwOvb4tfChwLrAEeBt46DzVLkjS2Jg3mqnrDNmYdtZVlCzhxtkVpdC07+YsLXYIkjTTv/CV
JUkcMZkmSOjLpULakmQ/Rrz39VXNciaRx5xGzJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjixa6AGmmlp38xYUuQZLmnMEsdWamf3CsPf1Vc1yJpIXgULYkSR3xiFnb5FCxJG1/HjFLktQRg1mSpI4YzJIkdcRgliSpI7O6+CvJWuBB4DHg0apakWQf4AJgGbAWeH1V/WB2ZUqStGOYi6uyX1ZVdw89Pxm4vKpOT3Jye/7OOdiPpCcxk6vo/eyz1J/5GMo+DjivTZ8HvG4e9iFJ0liabTAX8OUkVydZ1dr2q6oNbfpOYL9Z7kOSpB3GbIeyX1pV65P8C+CyJN8ZnllVlaS2tmIL8lUABxxwwCzLkCRpPMzqiLmq1revG4EvAIcBdyVZDNC+btzGumdV1YqqWjExMTGbMiRJGhszDuYkuyXZY/M08CvADcDFwMq22ErgotkWKUnSjmI2Q9n7AV9Isnk7n66qLyX5JnBhkhOA24HXz75MSZJ2DDMO5qq6FXj+VtrvAY6aTVGSJO2o/O9S0jzyP3RJmi5vySlJUkc8YpY0bd5lTJo/HjFLktQRg1mSpI44lL0D8AIk9WCmr0OHwLWjGctg9heAJGlUOZQtSVJHxvKIWdLUeJpD6o9HzJIkdcRgliSpIwazJEkdMZglSeqIF39J6poff9SOxmCWpMY/AtQDg3mIb0pJ0kLzHLMkSR0xmCVJ6ohD2ZLGknc106gymOeA/zRe0nR5TYu2xaFsSZI64hHzAnGYTdJMOEI3/jxiliSpIx4xS9IsOQKmueQRsyRJHTGYJUnqiMEsSVJH5u0cc5Kjgf8O7AR8tKpOn699SZL64OezZ29egjnJTsCfAa8A1gHfTHJxVd00H/uTJG2bYTla5mso+zBgTVXdWlX/DHwGOG6e9iVJ0tiYr6HsJcAdQ8/XAS+ap31JkubB9vwYWO8fOdueowcL9jnmJKuAVe3pQ0lumcPN7wvcPYfbWyjj0g+wL72yL30al76MSz/IB+elLwdurXG+gnk9sP/Q86Wt7aeq6izgrPnYeZLVVbViPra9PY1LP8C+9Mq+9Glc+jIu/YDt25f5Osf8TeDgJAcl+TngeODiedqXJEljY16OmKvq0SRvA/6WwcelzqmqG+djX5IkjZN5O8dcVZcCl87X9icxL0PkC2Bc+gH2pVf2pU/j0pdx6Qdsx76kqrbXviRJ0iS8JackSR0Zq2BOcnSSW5KsSXLyQtczHUnOSbIxyQ1DbfskuSzJd9vXvReyxqlKsn+SK5LclOTGJG9v7SPXnyS7JrkqyXWtL+9v7QclubK91i5oFzl2L8lOSb6V5JL2fFT7sTbJt5Ncm2R1axu51xdAkr2SfDbJd5LcnOTFo9iXJM9pP4/NjweSvGMU+wKQ5Pfae/6GJOe33wXb5f0yNsE8dBvQY4BDgDckOWRhq5qWc4Gjt2g7Gbi8qg4GLm/PR8GjwElVdQhwOHBi+1mMYn8eAY6squcDy4GjkxwOfBA4o6qeDfwAOGHhSpyWtwM3Dz0f1X4AvKyqlg99hGUUX18w+J8CX6qq5wLPZ/DzGbm+VNUt7eexHPg3wMPAFxjBviRZAvwusKKqnsfgIubj2V7vl6oaiwfwYuBvh56fApyy0HVNsw/LgBuGnt8CLG7Ti4FbFrrGGfbrIgb3TR/p/gBPA65hcBe7u4FFrf1nXnu9PhjcT+By4EjgEiCj2I9W61pg3y3aRu71BewJ3Ea73meU+7JF/b8C/J9R7QuP371yHwYXSV8CvHJ7vV/G5oiZrd8GdMkC1TJX9quqDW36TmC/hSxmJpIsAw4FrmRE+9OGf68FNgKXAf8I3FdVj7ZFRuW19qfAHwE/ac+fzmj2A6CALye5ut1FEEbz9XUQsAn4WDvF8NEkuzGafRl2PHB+mx65vlTVeuB
PgO8BG4D7gavZTu+XcQrmsVaDP9FG6hL6JLsDnwPeUVUPDM8bpf5U1WM1GJ5byuAftDx3YSuaviSvBjZW1dULXcsceWlVvYDBqasTk/zS8MwRen0tAl4AnFlVhwI/ZIuh3hHqCwDtvOtrgb/cct6o9KWdBz+OwR9OzwB244mnGufNOAXzpLcBHUF3JVkM0L5uXOB6pizJzgxC+VNV9fnWPLL9Aaiq+4ArGAxh7ZVk830ARuG19hLgtUnWMvhvb0cyOLc5av0AfnpEQ1VtZHAe8zBG8/W1DlhXVVe2559lENSj2JfNjgGuqaq72vNR7MvLgduqalNV/Rj4PIP30HZ5v4xTMI/jbUAvBla26ZUMztV2L0mAs4Gbq+rDQ7NGrj9JJpLs1aafyuBc+c0MAvrX2mLd96WqTqmqpVW1jMF746tV9UZGrB8ASXZLssfmaQbnM29gBF9fVXUncEeS57Smo4CbGMG+DHkDjw9jw2j25XvA4Ume1n6fbf65bJ/3y0KfZJ/jE/bHAv/A4Bzguxe6nmnWfj6Dcxk/ZvBX9AkMzgFeDnwX+Aqwz0LXOcW+vJTBcNX1wLXtcewo9gf418C3Wl9uAN7b2p8JXAWsYTBkt8tC1zqNPh0BXDKq/Wg1X9ceN25+r4/i66vVvRxY3V5jfwXsPcJ92Q24B9hzqG1U+/J+4Dvtff8JYJft9X7xzl+SJHVknIayJUkaeQazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXk/wOn1so1cVz37wAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of age feature before capping outliers\n", "plot_hist(data, 'age')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYCElEQVR4nO3dfbTdVZ3f8fdHAjLjA+HhmsUkgeCQ6jBdFWlq49KpSkbLgzX84TC4HIk0s9LVUqvVWU58GKtdtsV2KSN9oGWJNfiMzFgySh1jwOXMWgMaFJAHlSsFkxhIRAgCoyP67R9nXz3EhHvuzb3J5uT9Wuuss39779/vt88ml09++/e7J6kqJElSH55ysAcgSZJ+yWCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLcyDJ/0zyJ3N0rBOSPJzksLb95SR/OBfHbsf7v0nWzNXxZnDe9yb5QZJ799H+L5Pc1z77sQd6fFIv4u8xS08syd3AIuAx4GfA7cAVwGVV9fNZHOsPq+pLM9jny8DHqupDMzlX2/fdwMlV9Qcz3XcuJTkB+DZwYlXt3Ev74cBDwMqquvlAj0/qiVfM0mj+WVU9AzgRuAj4Y+DyuT5JkgVzfcxOnADcv7dQbhYBRwK3zebgYzxvOgQZzNIMVNXuqtoI/D6wJsnfB0jykSTvbeXjknwuyYNJfpjkr5I8JclHGQTUX7Tl2rcmWZakkqxN8j3g2qG64bD5zSRfTfJQkquTHNPO9dIk24bHmOTuJL+b5Azg7cDvt/Pd3Np/sTTexvXOJPck2ZnkiiRHtbapcaxJ8r22DP2Ofc1NkqPa/rva8d7Zjv+7wCbgN9o4PrLHfn+PwdU0wINJrm31H0yytX3mG5P8ztA+705yVZKPJXkIeH07/+VJdiTZ3pbODxv9v67UB4NZmoWq+iqwDfidvTS/pbVNMLgSfPtgl3od8D0GV99Pr6r/PLTPS4DfAv7pPk55PvDPgeMZLKlfMsIYvwD8R+DT7XzP20u317fXy4BnA08H/tsefV4MPAdYBbwryW/t45T/FTiqHeclbcwXtGX7M4Hvt3G8fo9xfgf47ba5sKpOb+WvAacCxwCfAD6T5MihXVcDVwELgY8DH2EwNycDzwdeAczZvXnpQDGYpdn7PoPQ2NNPGQToiVX106r6q5r+YY53V9UjVfW3+2j/aFXdWlWPAH8CnDtHV4OvBT5QVXdV1cPA24Dz9rhaf09V/W2793sz8CsB38ZyHvC2qvpRVd0NvB943WwHVlUfq6r7q+qxqno/8FQGf0GY8jdV9X/aff5nAmcBb2rzuBO4uI1JelIxmKXZWwz8cC/1/wWYBL6Y5K4k60c41tYZtN8DHA4cN9Ion9hvtOMNH3sBgyv9KcNPUT/K4Kp6T8e1Me15rMWzHViSP0pyR5LdSR5kcDU+/JmH5+TEdv4d7RbCg8D/Ap412/NLB4vBLM1Ckn/EIHT+es+2dsX4lqp6NvAq4M1JVk017+OQ011RLx0qn8DgqvwHwCPArw+N6zAGS+ijHvf7DEJt+NiPAfdNs9+eftDGtOexts/wOAC0+8lvBc4Fjq6qhcBuIEPdhj/bVuAnwHFVtbC9nllVv430JGMwSzOQ5JlJXgl8isGvMH1zL31emeTkJGEQJj8Dpn6t6j4G92Bn6g+SnJLk14F/D1xVVT8DvgMcmeTs9itH72Sw5DvlPmBZkn3
9rH8S+LdJTkrydH55T/qxmQyujeVK4D8keUaSE4E3Ax+byXGGPIPBXxB2AQuSvIvBcvW+zr8D+CLw/vbf6ClJfjPJS2Z5fumgMZil0fxFkh8xuDJ7B/AB4IJ99F0OfAl4GPgb4H9U1XWt7T8B72zLrX80g/N/lMHDTfcy+LWifwODp8SBfwV8iMHV6SMMHjyb8pn2fn+Sr+/luB9ux/4K8P+AHwNvmMG4hr2hnf8uBisJn2jHn42/BL7A4C8e97RxTbfcfz5wBIPfM3+AwYNhx8/y/NJB4xeMSJLUEa+YJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjnTxL7Icd9xxtWzZsoM9DEmSDogbb7zxB1U1sbe2LoJ52bJlbNmy5WAPQ5KkAyLJPftqcylbkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjXXxX9lxbtv7zs9rv7ovOnuORSJI0M14xS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqyLTBnOQ5SW4aej2U5E1JjkmyKcmd7f3o1j9JLkkymeSWJKfN/8eQJGk8TBvMVfXtqjq1qk4F/iHwKPBZYD2wuaqWA5vbNsCZwPL2WgdcOg/jliRpLM10KXsV8N2qugdYDWxo9RuAc1p5NXBFDVwPLExy/FwMVpKkcTfTYD4P+GQrL6qqHa18L7ColRcDW4f22dbqJEnSNEYO5iRHAK8CPrNnW1UVUDM5cZJ1SbYk2bJr166Z7CpJ0tiayRXzmcDXq+q+tn3f1BJ1e9/Z6rcDS4f2W9LqHqeqLquqFVW1YmJiYuYjlyRpDM0kmF/DL5exATYCa1p5DXD1UP357enslcDuoSVvSZL0BEb616WSPA14OfAvhqovAq5Msha4Bzi31V8DnAVMMniC+4I5G60kSWNupGCuqkeAY/eou5/BU9p79i3gwjkZnSRJhxi/+UuSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIyMFc5KFSa5K8q0kdyR5YZJjkmxKcmd7P7r1TZJLkkwmuSXJafP7ESRJGh+jXjF/EPhCVT0XeB5wB7Ae2FxVy4HNbRvgTGB5e60DLp3TEUuSNMamDeYkRwH/BLgcoKr+rqoeBFYDG1q3DcA5rbwauKIGrgcWJjl+jsctSdJYGuWK+SRgF/C/k3wjyYeSPA1YVFU7Wp97gUWtvBjYOrT/tlYnSZKmMUowLwBOAy6tqucDj/DLZWsAqqqAmsmJk6xLsiXJll27ds1kV0mSxtYowbwN2FZVN7TtqxgE9X1TS9TtfWdr3w4sHdp/Sat7nKq6rKpWVNWKiYmJ2Y5fkqSxMm0wV9W9wNYkz2lVq4DbgY3Amla3Bri6lTcC57ens1cCu4eWvCVJ0hNYMGK/NwAfT3IEcBdwAYNQvzLJWuAe4NzW9xrgLGASeLT1lSRJIxgpmKvqJmDFXppW7aVvARfu37AkSTo0+c1fkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjIwVzkruTfDPJTUm2tLpjkmxKcmd7P7rVJ8klSSaT3JLktPn8AJIkjZOZXDG/rKpOraoVbXs9sLmqlgOb2zbAmcDy9loHXDpXg5Ukadztz1L2amBDK28Azhmqv6IGrgcWJjl+P84jSdIhY9RgLuCLSW5
Msq7VLaqqHa18L7ColRcDW4f23dbqJEnSNBaM2O/FVbU9ybOATUm+NdxYVZWkZnLiFvDrAE444YSZ7CpJ0tga6Yq5qra3953AZ4EXAPdNLVG3952t+3Zg6dDuS1rdnse8rKpWVNWKiYmJ2X8CSZLGyLTBnORpSZ4xVQZeAdwKbATWtG5rgKtbeSNwfns6eyWwe2jJW5IkPYFRlrIXAZ9NMtX/E1X1hSRfA65Msha4Bzi39b8GOAuYBB4FLpjzUUuSNKamDeaqugt43l7q7wdW7aW+gAvnZHSSJB1i/OYvSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI6MHMxJDkvyjSSfa9snJbkhyWSSTyc5otU/tW1PtvZl8zR2SZLGzkyumN8I3DG0/T7g4qo6GXgAWNvq1wIPtPqLWz9JkjSCkYI5yRLgbOBDbTvA6cBVrcsG4JxWXt22ae2rWn9JkjSNUa+Y/xR4K/Dztn0s8GBVPda2twGLW3kxsBWgte9u/R8nybokW5Js2bVr1+xGL0nSmJk2mJO8EthZVTfO5Ymr6rKqWlFVKyYmJuby0JIkPWktGKHPi4BXJTkLOBJ4JvBBYGGSBe2qeAmwvfXfDiwFtiVZABwF3D/nI5ckaQxNe8VcVW+rqiVVtQw4D7i2ql4LXAe8unVbA1zdyhvbNq392qqqOR21JEljan9+j/mPgTcnmWRwD/nyVn85cGyrfzOwfv+GKEnSoWOUpexfqKovA19u5buAF+ylz4+B35uDsUmSdMjxm78kSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkemDeYkRyb5apKbk9yW5D2t/qQkNySZTPLpJEe0+qe27cnWvmyeP4MkSWNjlCvmnwCnV9XzgFOBM5KsBN4HXFxVJwMPAGtb/7XAA63+4tZPkiSNYNpgroGH2+bh7VXA6cBVrX4DcE4rr27btPZVSTJXA5YkaZyNdI85yWFJbgJ2ApuA7wIPVtVjrcs2YHErLwa2ArT23cCxczhmSZLG1kjBXFU/q6pTgSXAC4Dn7u+Jk6xLsiXJll27du3v4SRJGgszeiq7qh4ErgNeCCxMsqA1LQG2t/J2YClAaz8KuH8vx7qsqlZU1YqJiYnZjV6SpDEzylPZE0kWtvKvAS8H7mAQ0K9u3dYAV7fyxrZNa7+2qmoOxyxJ0thaMH0Xjgc2JDmMQZBfWVWfS3I78Kkk7wW+AVze+l8OfDTJJPBD4Lx5GLckSWNp2mCuqluA5++l/i4G95v3rP8x8HtzMjpJkg4xfvOXJEkdMZglSeqIwSxJUkcMZkmSOjLKU9mHjGXrPz/jfe6+6Ox5GIkk6VDlFbMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1JFpgznJ0iTXJbk9yW1J3tjqj0myKcmd7f3oVp8klySZTHJLktPm+0NIkjQuRrlifgx4S1WdAqwELkxyCrAe2FxVy4HNbRvgTGB5e60DLp3zUUuSNKamDeaq2lFVX2/lHwF3AIuB1cCG1m0DcE4rrwauqIHrgYVJjp/rgUuSNI5mdI85yTLg+cANwKKq2tGa7gUWtfJiYOvQbtt
anSRJmsbIwZzk6cCfAW+qqoeG26qqgJrJiZOsS7IlyZZdu3bNZFdJksbWSMGc5HAGofzxqvrzVn3f1BJ1e9/Z6rcDS4d2X9LqHqeqLquqFVW1YmJiYrbjlyRprIzyVHaAy4E7quoDQ00bgTWtvAa4eqj+/PZ09kpg99CStyRJegILRujzIuB1wDeT3NTq3g5cBFyZZC1wD3Bua7sGOAuYBB4FLpjLAUuSNM6mDeaq+msg+2hetZf+BVy4n+OSJOmQ5Dd/SZLUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHVk2mBO8uEkO5PcOlR3TJJNSe5s70e3+iS5JMlkkluSnDafg5ckadyMcsX8EeCMPerWA5urajmwuW0DnAksb691wKVzM0xJkg4N0wZzVX0F+OEe1auBDa28AThnqP6KGrgeWJjk+DkaqyRJY2+295gXVdWOVr4XWNTKi4GtQ/22tbpfkWRdki1JtuzatWuWw5Akabzs98NfVVVAzWK/y6pqRVWtmJiY2N9hSJI0FmYbzPdNLVG3952tfjuwdKjfklYnSZJGMNtg3gisaeU1wNVD9ee3p7NXAruHlrwlSdI0FkzXIckngZcCxyXZBvw74CLgyiRrgXuAc1v3a4CzgEngUeCCeRizJElja9pgrqrX7KNp1V76FnDh/g5KkqRDld/8JUlSRwxmSZI6YjBLktQRg1mSpI4YzJIkdWTap7I1P5at//wBO9fdF519wM4lSdo/XjFLktQRr5j304G88pUkjT+vmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHVlwsAeg+bds/edntd/dF53d9bkkaRwZzNqn2YbsgTrXkyHMe/9c/kVK6s+8LGUnOSPJt5NMJlk/H+eQJGkczXkwJzkM+O/AmcApwGuSnDLX55EkaRzNx1L2C4DJqroLIMmngNXA7fNwLh3CxnUZdlw/l9SL3m8xzUcwLwa2Dm1vA/7xPJxH0pAD+UyApPlz0B7+SrIOWNc2H07y7Tk8/HHAD+bweIeasZ6/vG9eDz/WczdlnubwkJi7eeLczd5IczcPf+ZP3FfDfATzdmDp0PaSVvc4VXUZcNk8nJ8kW6pqxXwc+1Dg/M2eczd7zt3sOXez1+PczcdT2V8Dlic5KckRwHnAxnk4jyRJY2fOr5ir6rEk/xr4S+Aw4MNVddtcn0eSpHE0L/eYq+oa4Jr5OPaI5mWJ/BDi/M2eczd7zt3sOXez193cpaoO9hgkSVLjP2IhSVJHxi6Y/TrQJ5bkw0l2Jrl1qO6YJJuS3Nnej271SXJJm8tbkpx28EZ+8CVZmuS6JLcnuS3JG1u98zeNJEcm+WqSm9vcvafVn5TkhjZHn24PjJLkqW17srUvO6gfoANJDkvyjSSfa9vO3YiS3J3km0luSrKl1XX7cztWwezXgY7kI8AZe9StBzZX1XJgc9uGwTwub691wKUHaIy9egx4S1WdAqwELmx/vpy/6f0EOL2qngecCpyRZCXwPuDiqjoZeABY2/qvBR5o9Re3foe6NwJ3DG07dzPzsqo6dehXo7r9uR2rYGbo60Cr6u+Aqa8DVVNVXwF+uEf1amBDK28Azhmqv6IGrgcWJjn+gAy0Q1W1o6q+3so/YvA/ycU4f9Nqc/Bw2zy8vQo4Hbiq1e85d1NzehWwKkkOzGj7k2QJcDbwobYdnLv91e3P7bgF896+DnTxQRrLk8miqtrRyvcCi1rZ+dyHtjz4fOAGnL+RtKXYm4CdwCbgu8CDVfVY6zI
8P7+Yu9a+Gzj2gA64L38KvBX4eds+FuduJgr4YpIb27dOQsc/t/57zHqcqqokPqr/BJI8Hfgz4E1V9dDwxYjzt29V9TPg1CQLgc8Czz24I3pySPJKYGdV3ZjkpQd5OE9WL66q7UmeBWxK8q3hxt5+bsftinmkrwPVr7hvaqmmve9s9c7nHpIcziCUP15Vf96qnb8ZqKoHgeuAFzJYJpy6QBien1/MXWs/Crj/wI60Gy8CXpXkbga3504HPohzN7Kq2t7edzL4S+EL6PjndtyC2a8DnZ2NwJpWXgNcPVR/fntKcSWwe2jp55DT7tNdDtxRVR8YanL+ppFkol0pk+TXgJczuEd/HfDq1m3PuZua01cD19Yh+qULVfW2qlpSVcsY/D/t2qp6Lc7dSJI8LckzpsrAK4Bb6fnntqrG6gWcBXyHwf2rdxzs8fT2Aj4J7AB+yuDeyVoG9582A3cCXwKOaX3D4Cn37wLfBFYc7PEf5Ll7MYN7VbcAN7XXWc7fSHP3D4BvtLm7FXhXq3828FVgEvgM8NRWf2Tbnmztzz7Yn6GHF/BS4HPO3Yzm7NnAze1121Qu9Pxz6zd/SZLUkXFbypYk6UnNYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjvx/csKUYAcL4/kAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of fare feature before capping outliers\n", "plot_hist(data, 'fare')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Capping : Gaussian\n", "\n", "Gaussian limits:\n", "+ right tail: mean + 3 * std\n", "+ left tail: mean - 3 * std\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Winsorizer(variables=['age', 'fare'])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''Parameters\n", "----------\n", "capping_method : str, default=gaussian\n", "\n", " Desired capping method. Can take 'gaussian', 'iqr' or 'quantiles'.\n", "\n", "tail : str, default=right\n", "\n", " Whether to cap outliers on the right, left or both tails of the distribution.\n", " Can take 'left', 'right' or 'both'.\n", "\n", "fold: int or float, default=3\n", "\n", " How far out to place the capping values. The number that will multiply\n", " the std or IQR to calculate the capping values. 
Recommended values, 2\n", " or 3 for the gaussian approximation, or 1.5 or 3 for the IQR proximity\n", " rule.\n", "\n", "variables: list, default=None\n", "\n", " The list of variables to cap. If None, the Winsorizer will select all\n", " numerical variables in the train set.\n", "\n", "missing_values: string, default='raise'\n", "\n", " Indicates whether missing values should be ignored or should raise an error.\n", "'''\n", "# capping at right tail using gaussian capping method\n", "capper = Winsorizer(\n", " capping_method='gaussian', tail='right', fold=3, variables=['age', 'fare'])\n", "\n", "# fitting winsorizer object to training data\n", "capper.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'age': 67.49048447470315, 'fare': 174.78162171790441}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# here we can find the maximum caps allowed\n", "capper.right_tail_caps_" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# this dictionary is empty, because we selected only right tail\n", "capper.left_tail_caps_" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAecAAAE/CAYAAAB8YAsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAU40lEQVR4nO3dffRlVV3H8fcHRjPFBJxpGoFhUGdZ1Eq00XBlhWKJ+ICtVYTLh4loTauotLACsyCXFNZSy0qTFMEnhHwIUjJxsoVWSoOKIkhOOMiMMDOAPAimDn774+6fXMYZfs/89u/83q+17rrn7PO097135nP3PueeX6oKSZLUj30WugKSJOm+DGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzhrM0B5L8fZI/nqN9rU7y9ST7tvl/T/Jrc7Hvtr9/SbJ+rvY3jeO+OsnNSW56oI8tLTbxd87S/UuyBVgJ7ALuAa4G3g6cXVXfmcG+fq2qPjqNbf4deGdVvWU6x2rbngE8tqpeNN1t51KS1cC1wKFVtWMh6yItBvacpal5blU9HDgUOAv4Q+Ctc32QJMvmep+dWA3cYjBLU2M4S9NQVbdX1cXALwPrk/wYQJJzk7y6TS9P8sEktyW5NcnHk+yT5B2MQuqf27D1HyRZk6SSnJTkK8C/jZWNB/Vjklye5I4kFyU5sB3rqCRbx+uYZEuSZyQ5BngF8MvteFe25d8dJm/1emWS65PsSPL2JI9oyybqsT7JV9qQ9B/t7bVJ8oi2/c62v1e2/T8DuBR4VKvHuXvY9oD2mu1M8rU2ffDY8sOSXJbkziQfTfJ3Sd45tvzIJP/ZXvMrkxw1xbdU6pLhLM1AVV0ObAV+eg+LT2nLVjAaDn/FaJN6MfAVRr3w/arqL8a2+VngR4Bn7uWQLwF+FVjFaHj9DVOo44eBPwMuaMd7/B5W+5X2eBrwaGA/4G93W+epwOOAo4E/SfIjeznk3wCPaPv52VbnE9sQ/rOAr7Z6/Moett0HeBujkYnVwDd2q8e7gcuBRwJnAC+eWJDkIOBDwKuBA4GXA+9LsmIv9ZS6ZzhLM/dVRmGwu28zCtFDq+rbVfXxmvzijjOq6q6q+sZelr+jqq6qqruAPwaOn7hgbJZeCLyuqq6rqq8DpwEn7NZr/9Oq+kZVXQlcCXxPyLe6nACcVlV3VtUW4LWMhej9qapbqup9VXV3Vd0JnMko4CfOVz8J+JOq+lZVfQK4eGzzFwGXVNUlVfWdqroU2AQcO50XQuqJ4SzN3EHArXso/0tgM/CRJNclOXUK+7phGsuvBx4ELJ9SLe/fo9r+xve9jFGPf8L41dV3M+pd7255q9Pu+zpoKpVI8tAkb27D4XcAlwH7t9B/FHBrVd09tsn463Eo8EttSPu2JLcx6u2vmsqxpR4ZztIMJHkSo+D5xO7LWs/xlKp6NPA84PeSHD2xeC+7nKxnfcjY9GpGvfObgbuAh47Va19Gw+lT3e9XGYXb+L53Adsn2W53N7c67b6vbVPc/hRGQ+c/WVU/APxMKw9wI3BgkoeOrT/+etzAaGRh/7HHw6rqrGm2QeqG4SxNQ5IfSPIc4D2Mft70+T2s85wkj00S4HZGP7+a+MnVdkbnZKfrRUkObwH1KuC9VXUP8D/AQ5I8O8mDgFcC3ze23XZgTZK9/Vs/H/jddsHVftx7jnrXdCrX6nIhcGaShyc5FPg94J33v+V3PZzReebb2sVup4/t+3pGw9RnJHlwkqcAzx3b9p3Ac5M8M8m+SR7SLpQ7GGmRMpylqfnnJHcy6qX9EfA64MS9rLsW+CjwdeC/gDdW1cfasj8HXtmGX18+jeO/AziX0RDzQ4DfgdHV48BvAm9h1Eu9i9HFaBP+sT3fkuTTe9jvOW3flwFfBv4P+O1p1Gvcb7fjX8doROHdbf9T8VfA9zPqgX8S+PBuy18IPAW4hdGFXxcA3wSoqhuA4xhdeLeT0Xv0+/j/mxYxb0IiadFJcgHwxao
6fdKVpUXIb5aSupfkSUke0343fQyjnvI/LXC1pHkz1LsRSRqWHwLez+h3zluB36iqzyxslaT547C2JEmdcVhbkqTOGM6SJHWmi3POy5cvrzVr1ix0NSRJesBcccUVN1fVHu8B30U4r1mzhk2bNi10NSRJesAkuX5vyxzWliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqTBf31paGas2pH5r2NlvOevY81ETSYmLPWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqjOEsSVJnDGdJkjozaTgnOSTJx5JcneQLSV7ayg9McmmSL7XnA1p5krwhyeYkn0vyxPluhCRJQzKVnvMu4JSqOhw4Ejg5yeHAqcDGqloLbGzzAM8C1rbHBuBNc15rSZIGbNJwrqobq+rTbfpO4BrgIOA44Ly22nnA89v0ccDba+STwP5JVs11xSVJGqppnXNOsgZ4AvApYGVV3dgW3QSsbNMHATeMbba1lUmSpCmYcjgn2Q94H/CyqrpjfFlVFVDTOXCSDUk2Jdm0c+fO6WwqSdKgTSmckzyIUTC/q6re34q3TwxXt+cdrXwbcMjY5ge3svuoqrOral1VrVuxYsVM6y9J0uBM5WrtAG8Frqmq140tuhhY36bXAxeNlb+kXbV9JHD72PC3JEmaxLIprPNTwIuBzyf5bCt7BXAWcGGSk4DrgePbskuAY4HNwN3AiXNZYUmShm7ScK6qTwDZy+Kj97B+ASfPsl6SJC1Z3iFMkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqjOEsSVJnDGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqjOEsSVJnDGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOTBrOSc5JsiPJVWNlZyTZluSz7XHs2LLTkmxOcm2SZ85XxSVJGqqp9JzPBY7ZQ/nrq+qI9rgEIMnhwAnAj7Zt3phk37mqrCRJS8Gk4VxVlwG3TnF/xwHvqapvVtWXgc3Ak2dRP0mSlpzZnHP+rSSfa8PeB7Syg4AbxtbZ2sq+R5INSTYl2bRz585ZVEOSpGGZaTi/CXgMcARwI/Da6e6gqs6uqnVVtW7FihUzrIYkScMzo3Cuqu1VdU9VfQf4B+4dut4GHDK26sGtTJIkTdGMwjnJqrHZXwAmruS+GDghyfclOQxYC1w+uypKkrS0LJtshSTnA0cBy5NsBU4HjkpyBFDAFuDXAarqC0kuBK4GdgEnV9U981JzSZIGatJwrqoX7KH4rfez/pnAmbOplCRJS5l3CJMkqTOGsyRJnTGcJUnqjOEsSVJnDGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqjOEsSVJnDGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqT
OGM6SJHVm0nBOck6SHUmuGis7MMmlSb7Ung9o5UnyhiSbk3wuyRPns/KSJA3RVHrO5wLH7FZ2KrCxqtYCG9s8wLOAte2xAXjT3FRTkqSlY9JwrqrLgFt3Kz4OOK9Nnwc8f6z87TXySWD/JKvmqK6SJC0JMz3nvLKqbmzTNwEr2/RBwA1j621tZZIkaYpmfUFYVRVQ090uyYYkm5Js2rlz52yrIUnSYMw0nLdPDFe35x2tfBtwyNh6B7ey71FVZ1fVuqpat2LFihlWQ5Kk4ZlpOF8MrG/T64GLxspf0q7aPhK4fWz4W5IkTcGyyVZIcj5wFLA8yVbgdOAs4MIkJwHXA8e31S8BjgU2A3cDJ85DnSVJGrRJw7mqXrCXRUfvYd0CTp5tpSRJWsq8Q5gkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqjOEsSVJnDGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzyxa6AtJisObUDy10FSQtIfacJUnqjOEsSVJnDGdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktQZw1mSpM4YzpIkdcZwliSpM/49Zy1aM/0by1vOevYc10SS5pY9Z0mSOmM4S5LUGcNZkqTOGM6SJHXGcJYkqTOGsyRJnTGcJUnqjOEsSVJnZnUTkiRbgDuBe4BdVbUuyYHABcAaYAtwfFV9bXbVlCRp6ZiLnvPTquqIqlrX5k8FNlbVWmBjm5ckSVM0H8PaxwHntenzgOfPwzEkSRqs2d5bu4CPJCngzVV1NrCyqm5sy28CVs7yGFpEFsP9rmdaR0l6oMw2nJ9aVduS/CBwaZIvji+sqmrB/T2SbAA2AKxevXqW1ZCGYzF8wZE0v2Y1rF1V29rzDuADwJOB7UlWAbTnHXvZ9uyqWldV61asWDGbakiSNCgz7jkneRiwT1Xd2aZ/HngVcDGwHjirPV80FxWVNPfspUt9ms2w9krgA0km9vPuqvpwkv8GLkxyEnA9cPzsqylJ0tIx43CuquuAx++h/Bbg6NlUSpKkpcw7hEmS1BnDWZKkzhjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZ2Z7b21JnfAPekjDYc9ZkqTOGM6SJHXGYW11wSFZSbqXPWdJkjpjOEuS1BnDWZKkzhjOkiR1xnCWJKkzXq0t6QEx0yvyt5z17DmuidQ/e86SJHXGnrOkQZpJT91eunphz1mSpM4YzpIkdcZwliSpM4azJEmdMZwlSeqM4SxJUmcMZ0mSOmM4S5LUGcNZkqTOGM6SJHVmkLfv9Ab7kqTFbJDhLGl+zfQLsKSpMZwldc0vAlqKDGdJWgCeftP98YIwSZI6Y89ZkmbJoXfNNcNZkhaRmXwRcCh88XFYW5KkzhjOkiR1xmFtSWo8d6xe2HOWJKkz9pwXiL9xlKR7eaHbfdlzliSpM/acx3i+6b58PSRNl/9vzA17zpIkdcaesyQNnNe4LD6GsyRpURrylw7DeZHxfI4kDZ/hLEnao6F2BhbDz7a8IEySpM7MWzgnOSbJtUk2Jzl1vo4jSdLQzEs4J9kX+DvgWcDhwAuSHD4fx5IkaWjmq+f8ZGBzVV1XVd8C3gMcN0/HkiRpUOYrnA8Cbhib39rKJEnSJBbsau0kG4ANbfbrSa6do10vB26eo30tFkutzbZ3+JZam21v5/KaWW2+t/YeurcN5iuctwGHjM0f3Mq+q6rOBs6e6wMn2VRV6+Z6vz1bam22vcO31Npse4dtJu2dr2Ht/wbWJjksyYOBE4CL5+lYkiQNyrz0nKtqV5LfAv4V2Bc4p6q+MB/HkiRpaObtnHNVXQJcMl/7vx9zPlS+CCy1Ntve4Vtqbba9wzbt9qaq5qMikiRphrx9pyR
JnRlUOC+FW4YmOSfJjiRXjZUdmOTSJF9qzwcsZB3nSpJDknwsydVJvpDkpa18kO0FSPKQJJcnubK1+U9b+WFJPtU+2xe0Cy0HI8m+ST6T5INtfrDtTbIlyeeTfDbJplY22M80QJL9k7w3yReTXJPkKUNtc5LHtfd24nFHkpdNt72DCecldMvQc4Fjdis7FdhYVWuBjW1+CHYBp1TV4cCRwMntPR1qewG+CTy9qh4PHAEck+RI4DXA66vqscDXgJMWrorz4qXANWPzQ2/v06rqiLGf1wz5Mw3w18CHq+qHgcczeq8H2eaqura9t0cAPwHcDXyA6ba3qgbxAJ4C/OvY/GnAaQtdr3lq6xrgqrH5a4FVbXoVcO1C13Ge2n0R8HNLqL0PBT4N/CSjGxgsa+X3+awv9gej+yBsBJ4OfBDIwNu7BVi+W9lgP9PAI4Av065xWgptHmvjzwP/MZP2DqbnzNK+ZejKqrqxTd8ErFzIysyHJGuAJwCfYuDtbUO8nwV2AJcC/wvcVlW72ipD+2z/FfAHwHfa/CMZdnsL+EiSK9qdEmHYn+nDgJ3A29qpi7ckeRjDbvOEE4Dz2/S02jukcBZQo69lg7oEP8l+wPuAl1XVHePLhtjeqrqnRkNiBzP6IzI/vLA1mj9JngPsqKorFrouD6CnVtUTGZ2COznJz4wvHOBnehnwROBNVfUE4C52G9IdYJtp10k8D/jH3ZdNpb1DCudJbxk6YNuTrAJozzsWuD5zJsmDGAXzu6rq/a14sO0dV1W3AR9jNKy7f5KJ+xIM6bP9U8Dzkmxh9Nfrns7o/ORQ20tVbWvPOxidi3wyw/5MbwW2VtWn2vx7GYX1kNsMoy9fn66q7W1+Wu0dUjgv5VuGXgysb9PrGZ2bXfSSBHgrcE1VvW5s0SDbC5BkRZL92/T3MzrHfg2jkP7Fttpg2lxVp1XVwVW1htG/2X+rqhcy0PYmeViSh09MMzoneRUD/kxX1U3ADUke14qOBq5mwG1uXsC9Q9owzfYO6iYkSY5ldP5q4pahZy5sjeZekvOBoxj9lZPtwOnAPwEXAquB64Hjq+rWBarinEnyVODjwOe593zkKxiddx5cewGS/DhwHqPP8D7AhVX1qiSPZtSzPBD4DPCiqvrmwtV07iU5Cnh5VT1nqO1t7fpAm10GvLuqzkzySAb6mQZIcgTwFuDBwHXAibTPNwNsc/vi9RXg0VV1eyub1ns8qHCWJGkIhjSsLUnSIBjOkiR1xnCWJKkzhrMkSZ0xnCVJ6ozhLElSZwxnSZI6YzhLktSZ/wcOz6e6KfYedQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# # Histogram of age feature after capping outliers\n", "plot_hist(capper.transform(X_train), 'age')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(67.49048447470315, 174.78162171790441)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# transforming the training and testing data\n", "train_t = capper.transform(X_train)\n", "test_t = capper.transform(X_test)\n", "\n", "# let's check the new maximum Age and maximum Fare in the titanic\n", "train_t.age.max(), train_t.fare.max()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Gaussian approximation capping, both tails" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Winsorizer(fold=2, tail='both', variables=['fare'])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Capping the outliers at both tails using gaussian capping method\n", "\n", "winsor = Winsorizer(capping_method='gaussian',\n", " tail='both', fold=2, variables='fare')\n", "winsor.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum caps : {'fare': -62.30099726608475}\n", "Maximum caps : {'fare': 127.36509792110658}\n" ] } ], "source": [ "print(\"Minimum caps :\", winsor.left_tail_caps_)\n", "\n", "print(\"Maximum caps :\", winsor.right_tail_caps_)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAXZUlEQVR4nO3dfbCedX3n8fdHglhBDZQ0g+EhqGktOiu4WYaOdrViKw+usTNdGsdqtHTS2cVWWzsuqG11p+7itsDq7kqXCkt8REQtqVIrIh3rTEWDReRBNEKAxECCyIPaWoPf/eP+pdyNJ5zHO+eX+36/Zu451/W7nr6/c51zPud6uK87VYUkSerD4xa7AEmS9CiDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLC2AJH+e5A8XaF1HJ/lekgPa+N8m+a2FWHdb318nWbdQ65vFdv8kyX1J7tnL9P+U5N7W95/e1/VJvYjvY5YeW5ItwHJgF/AIcAvwPuCiqvrxHNb1W1X12Vks87fAB6rqvbPZVlv2bcAzquo3ZrvsQkpyNHAbcExV7Zhi+oHAQ8BJVfXVfV2f1BOPmKWZ+Q9V9STgGOBc4L8AFy/0RpIsWeh1duJo4DtThXKzHHgCcPNcVj7G3zdNIINZmoWqerCqNgK/DqxL8myAJJcm+ZM2fHiSTyZ5IMn9Sf4uyeOSvJ9BQP1VO137piQrk1SSM5PcBXxuqG04bJ6e5EtJHkpyZZLD2rZemGTrcI1JtiR5cZJTgDcDv96299U2/V9Ojbe63prkziQ7krwvyVPatN11rEtyVzsN/Za9fW+SPKUtv7Ot761t/S8Grgae2uq4dI/lfpbB0TTAA0k+19rfleTu1ufrk/zi0DJvS3JFkg8keQh4Tdv+xUm2J9nWTp0fMPO9K/XBYJbmoKq+BGwFfnGKyW9s05YxOBJ882CRehVwF4Oj70Oq6n8MLfMC4OeBl+xlk68GfhM4gsEp9XfPoMZPA/8N+Ejb3nOmmO017fVLwNOAQ4D/vcc8zwd+DjgZ+KMkP7+XTf4v4CltPS9oNb+2nbY/Ffh2q+M1e9T5DeBZbXRpVb2oDX8ZOB44DPgQ8NEkTxhadA1wBbAU+CBwKYPvzTOAE4BfARbs2ry0rxjM0tx9m0Fo7OlHDAL0mKr6UVX9XU1/M8fbqur7VfWPe5n+/qq6qaq+D/whcMYCHQ2+Eji/qm6vqu8B5wBr9zhaf3tV/WO79vtV4CcCvtWyFjinqh6uqi3AecCr5lpYVX2gqr5TVbuq6jzgIAb/IOz291X1l+06/5OB04A3tO/jDuCCVpO0XzGYpblbAdw/RfufApuBzyS5PcnZM1jX3bOYfidwIHD4jKp8bE9t6xte9xIGR/q7Dd9F/QMGR9V7OrzVtOe6Vsy1sCR/kOTWJA8meYDB0fhwn4e/J8e07W9vlxAeAP4v8DNz3b60WAxmaQ6S/DsGofOFPae1I8Y3VtXTgJcBv5/k5N2T97LK6Y6ojxoaPprBUfl9wPeBJw7VdQCDU+gzXe+3GYTa8Lp3AfdOs9ye7ms17bmubbNcDwDtevKbgDOAQ6tqKfAgkKHZhvt2N/BD4PCqWtpeT66qZyHtZwxmaRaSPDnJS4HLGLyF6WtTzPPSJM9IEgZh8giw+21V9zK4Bjtbv5HkuCRPBP4rcEVVPQJ8A3hCktPbW47eyuCU7273AiuT7O13/cPA7yU5NskhPHpNetdsimu1XA68I8mTkhwD/D7wgdmsZ8iTGPyDsBNYkuSPGJyu3tv2twOfAc5r++hxSZ6e5AVz3L60aAxmaWb+KsnDDI7M3gKcD7x2L/OuAj4LfA/4e+A9VXVtm/bfgbe2061/MIvtv5/BzU33MHhb0e/C4C5x4D8D72VwdPp9Bjee7fbR9vU7Sb4yxXovaev+PHAH8E/A78yirmG/07Z/O4MzCR9q65+LvwE+zeAfjztbXdOd7n818HgG7zP/LoMbw46Y4/alReMDRiR
J6ohHzJIkdWTaYE7yhPZgg68muTnJ21v7sUmuS7I5yUeSPL61H9TGN7fpK0fcB0mSxsZMjph/CLyoPZzgeOCUJCcB7wQuqKpnMLiec2ab/0zgu639gjafJEmagWmDuQa+10YPbK8CXsTg5gqADcDL2/CaNk6bfnK7O1WSJE1jRteYkxyQ5AZgB4Nn3n4LeGDoLRVbefRBAitod0+26Q8CfoSbJEkzMKNPZGnvUTw+yVLgE8Az57vhJOuB9QAHH3zwv33mM+e9SkmS9hvXX3/9fVW1bM/2WX1UWlU9kORa4BeApUmWtKPiI3n0CT/bGDylaGt73u5TgO9Msa6LgIsAVq9eXZs2bZpNKZIk7deS3DlV+0zuyl7WjpRJ8lPALwO3AtcCv9ZmWwdc2YY3tnHa9M/N4AH+kiSJmR0xHwFsaM/gfRxweVV9MsktwGUZfAbtP/Doh8ZfDLw/yWYGD/j3010kSZqhaYO5qm5k8Nmme7bfDpw4Rfs/Af9xQaqTJGnC+OQvSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkdm9azscbfy7E/Nabkt556+wJVIkiaVR8ySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqyLTBnOSoJNcmuSXJzUle39rflmRbkhva67ShZc5JsjnJbUleMsoOSJI0TpbMYJ5dwBur6itJngRcn+TqNu2Cqvqz4ZmTHAesBZ4FPBX4bJKfrapHFrJwSZLG0bRHzFW1vaq+0oYfBm4FVjzGImuAy6rqh1V1B7AZOHEhipUkadzN6hpzkpXACcB1rel1SW5MckmSQ1vbCuDuocW28thBLkmSmhkHc5JDgI8Bb6iqh4ALgacDxwPbgfNms+Ek65NsSrJp586ds1lUkqSxNaNgTnIgg1D+YFV9HKCq7q2qR6rqx8Bf8Ojp6m3AUUOLH9na/pWquqiqVlfV6mXLls2nD5IkjY2Z3JUd4GLg1qo6f6j9iKHZfhW4qQ1vBNYmOSjJscAq4EsLV7IkSeNrJndlPw94FfC1JDe0tjcDr0hyPFDAFuC3Aarq5iSXA7cwuKP7LO/IliRpZqYN5qr6ApApJl31GMu8A3jHPOqSJGki+eQvSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHVk2mBOclSSa5PckuTmJK9v7YcluTrJN9vXQ1t7krw7yeYkNyZ57qg7IUnSuJjJEfMu4I1VdRxwEnBWkuOAs4FrqmoVcE0bBzgVWNVe64ELF7xqSZLG1LTBXFXbq+orbfhh4FZgBbAG2NBm2wC8vA2vAd5XA18EliY5YqELlyRpHM3qGnOSlcAJwHXA8qra3ibdAyxvwyuAu4cW29raJEnSNGYczEkOAT4GvKGqHhqeVlUF1Gw2nGR9kk1JNu3cuXM2i0qSNLZmFMxJDmQQyh+sqo+35nt3n6JuX3e09m3AUUOLH9na/pWquqiqVlfV6mXLls21fkmSxspM7soOcDFwa1WdPzRpI7CuDa8Drhxqf3W7O/sk4MGhU96SJOkxLJnBPM8DXgV8LckNre3NwLnA5UnOBO4EzmjTrgJOAzYDPwBeu5AFS5I0zqYN5qr6ApC9TD5
5ivkLOGuedUmSNJF88pckSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOjJtMCe5JMmOJDcNtb0tybYkN7TXaUPTzkmyOcltSV4yqsIlSRpHMzlivhQ4ZYr2C6rq+Pa6CiDJccBa4FltmfckOWChipUkadxNG8xV9Xng/hmubw1wWVX9sKruADYDJ86jPkmSJsp8rjG/LsmN7VT3oa1tBXD30DxbW5skSZqBuQbzhcDTgeOB7cB5s11BkvVJNiXZtHPnzjmWIUnSeJlTMFfVvVX1SFX9GPgLHj1dvQ04amjWI1vbVOu4qKpWV9XqZcuWzaUMSZLGzpyCOckRQ6O/Cuy+Y3sjsDbJQUmOBVYBX5pfiZIkTY4l082Q5MPAC4HDk2wF/hh4YZLjgQK2AL8NUFU3J7kcuAXYBZxVVY+MpHJJksbQtMFcVa+Yovnix5j/HcA75lOUJEmTyid/SZLUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdWTJYhcwqVae/ak5Lbfl3NMXuBJJUk88YpYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdWTaYE5ySZIdSW4aajssydVJvtm+Htrak+TdSTYnuTHJc0dZvCRJ42YmR8yXAqfs0XY2cE1VrQKuaeMApwKr2ms9cOHClClJ0mSYNpir6vPA/Xs0rwE2tOENwMuH2t9XA18EliY5YoFqlSRp7M31GvPyqtrehu8BlrfhFcDdQ/NtbW2SJGkG5n3zV1UVULNdLsn6JJuSbNq5c+d8y5AkaSzMNZjv3X2Kun3d0dq3AUcNzXdka/sJVXVRVa2uqtXLli2bYxmSJI2XuQbzRmBdG14HXDnU/up2d/ZJwINDp7wlSdI0lkw3Q5IPAy8EDk+yFfhj4Fzg8iRnAncCZ7TZrwJOAzYDPwBeO4KaJUkaW9MGc1W9Yi+TTp5i3gLOmm9RkiRNKp/8JUlSRwxmSZI6YjBLktQRg1mSpI4YzJIkdWTau7I1vZVnf2qxSxiJufRry7mnj6ASSZocHjFLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkcMZkmSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJHDGZJkjpiMEuS1BGDWZKkjhjMkiR1xGCWJKkjBrMkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI4YzJIkdcRgliSpIwazJEkdMZglSeqIwSxJUkeWzGfhJFuAh4FHgF1VtTrJYcBHgJXAFuCMqvru/MqUJGkyLMQR8y9V1fFVtbqNnw1cU1WrgGvauCRJmoFRnMpeA2xowxuAl49gG5IkjaX5BnMBn0lyfZL1rW15VW1vw/cAy+e5DUmSJsa8rjEDz6+qbUl+Brg6ydeHJ1ZVJampFmxBvh7g6KOPnmcZkiSNh3kdMVfVtvZ1B/AJ4ETg3iRHALSvO/ay7EVVtbqqVi9btmw+ZUiSNDbmHMxJDk7ypN3DwK8ANwEbgXVttnXAlfMtUpKkSTGfU9nLgU8k2b2eD1XVp5N8Gbg8yZnAncAZ8y9TkqTJMOdgrqrbgedM0f4d4OT5FCVJ0qisPPtTs15
my7mnj6CSqfnkL0mSOmIwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqyHyfla19rPf330mS5scjZkmSOmIwS5LUEYNZkqSOGMySJHXEm7+0oOZycxp4g5ok7WYwT4C5hqUkad/zVLYkSR0xmCVJ6ojBLElSRwxmSZI6YjBLktQRg1mSpI74dilphPzQEUmz5RGzJEkdMZglSeqIp7K13/Lxn5PB/axJYzBLnTGIpMk2lsHss6H3P+4zSRrwGrMkSR0xmCVJ6shYnsqWNFq+P1saHY+YJUnqiEfM0gx4c9r+x6N67a88YpYkqSMGsyRJHTGYJUnqiNeYJe0T43ydflyvZ49rv3pnMEtjwj+i0ngYWTAnOQV4F3AA8N6qOndU25KkhbAvj+rH9Zno49qvfWkkwZzkAOD/AL8MbAW+nGRjVd0yiu1JmptxPr2s8TeuP7+juvnrRGBzVd1eVf8MXAasGdG2JEkaG6MK5hXA3UPjW1ubJEl6DIt281eS9cD6Nvq9JLct4OoPB+5bwPXtb+z/Y/Q/79yHlSyOSd7/Y9/3aX5+99v+L9Dv5cj6P6K/G8dM1TiqYN4GHDU0fmRr+xdVdRFw0Sg2nmRTVa0exbr3B/bf/k9q/ye572D/x6X/ozqV/WVgVZJjkzweWAtsHNG2JEkaGyM5Yq6qXUleB/wNg7dLXVJVN49iW5IkjZORXWOuqquAq0a1/mmM5BT5fsT+T7ZJ7v8k9x3s/1j0P1W12DVIkqTGD7GQJKkjYxfMSU5JcluSzUnOXux6Ri3JUUmuTXJLkpuTvL61H5bk6iTfbF8PXexaRyXJAUn+Ickn2/ixSa5rPwMfaTcgjqUkS5NckeTrSW5N8gsTtu9/r/3c35Tkw0meMM77P8klSXYkuWmobcr9nYF3t+/DjUmeu3iVz99e+v6n7Wf/xiSfSLJ0aNo5re+3JXnJohQ9R2MVzEOPAj0VOA54RZLjFreqkdsFvLGqjgNOAs5qfT4buKaqVgHXtPFx9Xrg1qHxdwIXVNUzgO8CZy5KVfvGu4BPV9Uzgecw+D5MxL5PsgL4XWB1VT2bwY2maxnv/X8pcMoebXvb36cCq9prPXDhPqpxVC7lJ/t+NfDsqvo3wDeAcwDa38C1wLPaMu9p+bBfGKtgZgIfBVpV26vqK234YQZ/mFcw6PeGNtsG4OWLUuCIJTkSOB14bxsP8CLgijbLOPf9KcC/By4GqKp/rqoHmJB93ywBfirJEuCJwHbGeP9X1eeB+/do3tv+XgO8rwa+CCxNcsQ+KXQEpup7VX2mqna10S8yeGYGDPp+WVX9sKruADYzyIf9wrgF80Q/CjTJSuAE4DpgeVVtb5PuAZYvVl0j9j+BNwE/buM/DTww9Ms6zj8DxwI7gf/XTuW/N8nBTMi+r6ptwJ8BdzEI5AeB65mc/b/b3vb3pP09/E3gr9vwft33cQvmiZXkEOBjwBuq6qHhaTW49X7sbr9P8lJgR1Vdv9i1LJIlwHOBC6vqBOD77HHaelz3PUC7lrqGwT8oTwUO5idPdU6Ucd7fjyXJWxhc1vvgYteyEMYtmKd9FOg4SnIgg1D+YFV9vDXfu/u0Vfu6Y7HqG6HnAS9LsoXBZYsXMbjmurSd2oTx/hnYCmytquva+BUMgnoS9j3Ai4E7qmpnVf0I+DiDn4lJ2f+77W1/T8TfwySvAV4KvLIeff/vft33cQvmiXsUaLumejFwa1WdPzRpI7CuDa8DrtzXtY1aVZ1TVUdW1UoG+/pzVfVK4Frg19psY9l3gKq6B7g7yc+1ppOBW5iAfd/cBZyU5Int92B3/ydi/w/Z2/7eCLy63Z19EvDg0CnvsZDkFAaXsl5WVT8YmrQRWJvkoCTHMrgB7kuLUeOcVNVYvYDTGNyd9y3gLYtdzz7o7/MZnLq6EbihvU5jcK31GuCbwGeBwxa71hF/H14IfLINP43BL+Fm4KPAQYtd3wj7fTywqe3/vwQOnaR9D7wd+Dp
wE/B+4KBx3v/AhxlcT/8RgzMmZ+5tfwNh8C6VbwFfY3D3+qL3YYH7vpnBteTdf/v+fGj+t7S+3wacutj1z+blk78kSerIuJ3KliRpv2YwS5LUEYNZkqSOGMySJHXEYJYkqSMGsyRJHTGYJUnqiMEsSVJH/j/TlylrumqxfgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of fare feature after capping outliers\n", "plot_hist(winsor.transform(X_train), 'fare')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max fare: 127.36509792110658\n", "Min fare: 0.0\n" ] } ], "source": [ "# transforming the training and testing data\n", "train_t = winsor.transform(X_train)\n", "test_t = winsor.transform(X_test)\n", "\n", "print(\"Max fare:\", train_t.fare.max())\n", "print(\"Min fare:\", train_t.fare.min())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inter Quartile Range, both tails\n", "**IQR limits:**\n", "\n", "- right tail: 75th quantile + 3* IQR\n", "- left tail: 25th quantile - 3* IQR\n", "\n", "where IQR is the inter-quartile range: 75th quantile - 25th quantile.\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Winsorizer(capping_method='iqr', tail='both', variables=['age', 'fare'])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# capping at both tails using iqr capping method\n", "winsor = Winsorizer(capping_method='iqr', tail='both',\n", " variables=['age', 'fare'])\n", "\n", "winsor.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'age': -13.0, 'fare': -62.24179999999999}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "winsor.left_tail_caps_" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'age': 71.0, 'fare': 101.4126}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "winsor.right_tail_caps_" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Max fare: 101.4126\n", "Min fare 0.0\n" ] } ], "source": [ "# transforming the training and testing data\n", "\n", "train_t = winsor.transform(X_train)\n", "test_t = winsor.transform(X_test)\n", "\n", "print(\"Max fare:\", train_t.fare.max())\n", "print(\"Min fare\", train_t.fare.min())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### percentiles or quantiles:\n", "\n", "- right tail: 98th percentile\n", "- left tail: 2nd percentile" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Winsorizer(capping_method='quantiles', fold=0.02, tail='both',\n", " variables=['age', 'fare'])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# capping at both tails using quantiles capping method\n", "winsor = Winsorizer(capping_method='quantiles', tail='both',\n", " fold=0.02, variables=['age', 'fare'])\n", "\n", "winsor.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minimum caps : {'age': 2.0, 'fare': 6.44125}\n", "Maximum caps : {'age': 61.69999999999993, 'fare': 211.5}\n" ] } ], "source": [ "print(\"Minimum caps :\", winsor.left_tail_caps_)\n", "\n", "print(\"Maximum caps :\", winsor.right_tail_caps_)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max age: 61.69999999999993\n", "Min age 2.0\n" ] } ], "source": [ "# transforming the training and testing data\n", "train_t = winsor.transform(X_train)\n", "test_t = winsor.transform(X_test)\n", "\n", "print(\"Max age:\", train_t.age.max())\n", "print(\"Min age\", train_t.age.min())" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeYAAAE/CAYAAACTomAoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUoUlEQVR4nO3dfdCddX3n8fdHwFqFCphsFgMhqJm2bKeiE12cWovSVkQt7kyX4viQsnSys6Wt3eK2aG2hHdmlu1Pb2gdbFiioVaE+FFpZV6R00NlVGlQUQdcsDZIYkgCiPLja4Hf/ONctJzHhfs79zX3er5l7znV+19M3P3L43L/fdZ0rqSokSVIPT1jqAiRJ0mMMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGjGYpQWQ5M+T/NYCHWtNkoeSHDK8/4ckv7AQxx6O9z+SbFio483ivG9Ncm+Sew70uaWDSfwes/T4kmwBVgG7gUeB24F3ApdU1XfmcKxfqKqPzWKffwDeXVWXzuZcw74XAs+qqtfOdt+FlGQN8CXg+KrauZS1SN05YpZm5pVVdQRwPHAx8BvAZQt9kiSHLvQxm1gD3GcoS9MzmKVZqKqvV9W1wM8BG5L8CECSK5K8dVhekeTvkjyQ5P4kH0/yhCTvYhRQfztMVf96krVJKsk5Sb4C/P1Y23hIPzPJzUm+keSaJEcP5zolydbxGpNsSfKTSU4D3gz83HC+W4f1350aH+p6S5K7kuxM8s4kTx3WTdWxIclXhmno39xf3yR56rD/ruF4bxmO/5PA9cDThzqu2Me+Rw19tivJ14blY8fWn5DkpiQPJvlYkj9N8u6x9Scn+V9Dn9+a5JQZ/ieV2jGYpTmoqpuBrcCP72P1ecO6lYymwN882qVeB3yF0ej78Kr6r2P7/ATww8BL93PK1wP/DjiG0ZT622dQ40eA/wxcNZzv2fvY7OeHnxcDzwAOB/5kr21eCPwgcCrw20l+eD+n/GPgqcNxfmKo+exh2v5lwFeHOn5+H/s+AfhLRjMSa4Bv7lXHe4CbgacBFwKvm1qRZDXwYeCtwNHAG4EPJFm5nzql1gxmae6+yigI9vbPjAL0+Kr656r6eE1/M8eFVfVwVX1zP+vfVVW3VdXDwG8BZ07dHDZPrwHeVlV3VtVDwJuAs/Yarf9OVX2zqm4FbgW+J+CHWs4C3lRVD1bVFuD3GQvQx1NV91XVB6rqkap6ELiIUbhPXZ9+HvDbVfXtqvoEcO3Y7q8Frquq66rqO1V1PbAJOH02HSF1YTBLc7cauH8f7f8N2Ax8NMmdSc6fwbHunsX6u4DDgBUzqvLxPX043vixD2U00p8yfhf1I4xG1XtbMdS097FWz6SIJE9O8hfDFPg3gJuAI4fAfzpwf1U9MrbLeH8cD/zbYRr7gSQPMBrlHzOTc0vdGMzSHCR5HqPQ+cTe64YR43lV9QzgZ4BfS3Lq1Or9HHK6EfVxY8trGI3K7wUeBp48VtchjKbQZ3rcrzIKtvFj7wZ2TLPf3u4datr7WNtmuP95jKbL/3VV/QDwoqE9wHbg6CRPHtt+vD/uZjSjcOTYz1Oq6uJZ/hmkFgxmaRaS/ECSVwDvY/QVps/vY5tXJHlWkgBfZ/QVq6mvVe1gdA12tl6b5MQhnH4XeH9VPQr8H+BJSV6e5DDgLcD3je23A1ibZH+f9fcC/3G4uepwHrsmvXs2xQ21XA1clOSIJMcDvwa8+/H3/K4jGF1XfmC4se2CsWPfxWhq+sIkT0zyAuCVY/u+G3hlkpcmOSTJk4ab4o5FOggZzNLM/G2SBxmNzn4TeBtw9n62XQd8DHgI+N/An1XVjcO6/wK8ZZhyfeMszv8u4ApG08pPAn4FRneJA78IXMpodPowoxvPpvz18Hpfkk/v47iXD8e+Cfgn4P8BvzyLusb98nD+OxnNJLxnOP5M/CHw/YxG3p8EPrLX+tcALwDuY3ST11XAtwCq6m7gDEY32e1i9N/oP+H/33S
Q8gEjkg46Sa4CvlhVF0y7sXSQ8TdKSe0leV6SZw7fiz6N0Qj5b5a4LGlRLNenDElaXv4l8EFG32PeCvyHqvrM0pYkLQ6nsiVJasSpbEmSGjGYJUlqpMU15hUrVtTatWuXugxJkg6YW2655d6q+p5nurcI5rVr17Jp06alLkOSpAMmyV37ancqW5KkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWqkxbOyJT1m7fkfntN+Wy5++QJXImkpOGKWJKkRg1mSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGpk2mJMcl+TGJLcn+UKSNwztRye5PsmXh9ejhvYkeXuSzUk+l+S5i/2HkCRpuZjJiHk3cF5VnQicDJyb5ETgfOCGqloH3DC8B3gZsG742Qi8Y8GrliRpmZo2mKtqe1V9elh+ELgDWA2cAVw5bHYl8Kph+QzgnTXySeDIJMcsdOGSJC1Hs7rGnGQt8BzgU8Cqqto+rLoHWDUsrwbuHttt69AmSZKmMeNgTnI48AHgV6vqG+PrqqqAms2Jk2xMsinJpl27ds1mV0mSlq0ZBXOSwxiF8l9V1QeH5h1TU9TD686hfRtw3Njuxw5te6iqS6pqfVWtX7ly5VzrlyRpWZnJXdkBLgPuqKq3ja26FtgwLG8Arhlrf/1wd/bJwNfHprwlSdLjOHQG2/wY8Drg80k+O7S9GbgYuDrJOcBdwJnDuuuA04HNwCPA2QtZsCRJy9m0wVxVnwCyn9Wn7mP7As6dZ12SJE0kn/wlSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIj0wZzksuT7Exy21jbhUm2Jfns8HP62Lo3Jdmc5EtJXrpYhUuStBzNZMR8BXDaPtr/oKpOGn6uA0hyInAW8K+Gff4sySELVawkScvdtMFcVTcB98/weGcA76uqb1XVPwGbgefPoz5JkibKfK4x/1KSzw1T3UcNbauBu8e22Tq0fY8kG5NsSrJp165d8yhDkqTlY67B/A7gmcBJwHbg92d7gKq6pKrWV9X6lStXzrEMSZKWlzkFc1XtqKpHq+o7wH/nsenqbcBxY5seO7RJkqQZmFMwJzlm7O2/Aabu2L4WOCvJ9yU5AVgH3Dy/EiVJmhyHTrdBkvcCpwArkmwFLgBOSXISUMAW4N8DVNUXklwN3A7sBs6tqkcXpXJJkpahaYO5ql69j+bLHmf7i4CL5lOUJEmTyid/SZLUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiMEsSVIjBrM
kSY0YzJIkNTJtMCe5PMnOJLeNtR2d5PokXx5ejxrak+TtSTYn+VyS5y5m8ZIkLTczGTFfAZy2V9v5wA1VtQ64YXgP8DJg3fCzEXjHwpQpSdJkmDaYq+om4P69ms8ArhyWrwReNdb+zhr5JHBkkmMWqFZJkpa9uV5jXlVV24fle4BVw/Jq4O6x7bYObZIkaQbmffNXVRVQs90vycYkm5Js2rVr13zLkCRpWZhrMO+YmqIeXncO7duA48a2O3Zo+x5VdUlVra+q9StXrpxjGZIkLS9zDeZrgQ3D8gbgmrH21w93Z58MfH1syluSJE3j0Ok2SPJe4BRgRZKtwAXAxcDVSc4B7gLOHDa/Djgd2Aw8Apy9CDVLkrRsTRvMVfXq/aw6dR/bFnDufIuSJGlS+eQvSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWrEYJYkqRGDWZKkRg5d6gKkg8Ha8z88p/22XPzyBa5E0nLniFmSpEYMZkmSGjGYJUlqxGCWJKmRed38lWQL8CDwKLC7qtYnORq4ClgLbAHOrKqvza9MSZImw0KMmF9cVSdV1frh/fnADVW1DrhheC9JkmZgMaayzwCuHJavBF61COeQJGlZmm8wF/DRJLck2Ti0raqq7cPyPcCqeZ5DkqSJMd8HjLywqrYl+RfA9Um+OL6yqipJ7WvHIcg3AqxZs2aeZUiay0NQfACK1M+8grmqtg2vO5N8CHg+sCPJMVW1PckxwM797HsJcAnA+vXr9xnemhwHMlTm+hSv7ueStDzMeSo7yVOSHDG1DPw0cBtwLbBh2GwDcM18i5QkaVLMZ8S8CvhQkqnjvKeqPpLkH4Grk5wD3AWcOf8yJUmaDHMO5qq6E3j2PtrvA06dT1GSJE0q/3UpHbS8fitpOfKRnJIkNWIwS5LUiMEsSVIjBrMkSY0YzJIkNWIwS5LUiF+XkibYXL9y5jO2pcXjiFmSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGjGYJUlqxGCWJKkRg1mSpEYMZkmSGvGRnJIOCB//Kc2MwSxJmihz+SXxQP6CaDBLmrW5jn4lTc9rzJIkNWIwS5LUiFPZklrzpjFNGkfMkiQ1sixHzP6GvXS8KUhdHMg7b7vf5auDiyNmSZIaMZglSWrEYJYkqRGDWZKkRgxmSZIaMZglSWpkWX5dSpLmYrl+3c+vkB5cHDFLktSII2ZJOogs11G9HuOIWZKkRgxmSZIacSpbkrRgvNFs/hwxS5LUiCNmSVoC3sSl/XHELElSI46YxxzIayNeh5Ek7YvBrP1yqk2SDjynsiVJasRgliSpEaeyDzJzmV72urSkueh+Oat7fXNlMC+A5fqXQ5J04BnME8BfHCTp4OE1ZkmSGlm0EXOS04A/Ag4BLq2qixfrXJKkg5sze49ZlBFzkkOAPwVeBpwIvDrJiYtxLkmSlpPFmsp+PrC5qu6sqm8D7wPOWKRzSZK0bCxWMK8G7h57v3VokyRJj2PJ7spOshHYOLx9KMmX9rPpCuDeA1PVQcH+2JP9sSf7Y0/2x57sjz3NuD/ye4ty/uP31bhYwbwNOG7s/bFD23dV1SXAJdMdKMmmqlq/sOUdvOyPPdkfe7I/9mR/7Mn+2FPX/lisqex/BNYlOSHJE4GzgGsX6VySJC0bizJirqrdSX4J+J+Mvi51eVV9YTHOJUnScrJo15ir6jrgugU41LTT3RPG/tiT/bEn+2NP9see7I89tey
PVNVS1yBJkgY+klOSpEbaBnOS05J8KcnmJOcvdT1LIcnlSXYmuW2s7egk1yf58vB61FLWeKAkOS7JjUluT/KFJG8Y2ie1P56U5OYktw798TtD+wlJPjV8bq4abr6cGEkOSfKZJH83vJ/Y/kiyJcnnk3w2yaahbSI/LwBJjkzy/iRfTHJHkhd07Y+WwewjPb/rCuC0vdrOB26oqnXADcP7SbAbOK+qTgROBs4d/k5Man98C3hJVT0bOAk4LcnJwO8Bf1BVzwK+BpyzdCUuiTcAd4y9n/T+eHFVnTT2laBJ/bzA6N9u+EhV/RDwbEZ/T1r2R8tgxkd6AlBVNwH379V8BnDlsHwl8KoDWdNSqartVfXpYflBRh+q1Uxuf1RVPTS8PWz4KeAlwPuH9onpD4AkxwIvBy4d3ocJ7o/9mMjPS5KnAi8CLgOoqm9X1QM07Y+uwewjPfdvVVVtH5bvAVYtZTFLIcla4DnAp5jg/himbT8L7ASuB/4v8EBV7R42mbTPzR8Cvw58Z3j/NCa7Pwr4aJJbhictwuR+Xk4AdgF/OVzquDTJU2jaH12DWTNQo1vqJ+q2+iSHAx8AfrWqvjG+btL6o6oeraqTGD1Z7/nADy1tRUsnySuAnVV1y1LX0sgLq+q5jC4JnpvkReMrJ+zzcijwXOAdVfUc4GH2mrbu1B9dg3naR3pOsB1JjgEYXncucT0HTJLDGIXyX1XVB4fmie2PKcOU3I3AC4Ajk0w9n2CSPjc/BvxMki2MLn29hNE1xUntD6pq2/C6E/gQo1/eJvXzshXYWlWfGt6/n1FQt+yPrsHsIz3371pgw7C8AbhmCWs5YIbrhZcBd1TV28ZWTWp/rExy5LD8/cBPMbrufiPws8NmE9MfVfWmqjq2qtYy+v/F31fVa5jQ/kjylCRHTC0DPw3cxoR+XqrqHuDuJD84NJ0K3E7T/mj7gJEkpzO6ZjT1SM+LlraiAy/Je4FTGP0LKDuAC4C/Aa4G1gB3AWdW1d43iC07SV4IfBz4PI9dQ3wzo+vMk9gfP8roZpVDGP2CfXVV/W6SZzAaMR4NfAZ4bVV9a+kqPfCSnAK8sapeMan9Mfy5PzS8PRR4T1VdlORpTODnBSDJSYxuDHwicCdwNsNnh2b90TaYJUmaRF2nsiVJmkgGsyRJjRjMkiQ1YjBLktSIwSxJUiMGsyRJjRjMkiQ1YjBLktTI/we5s3I9U0QETQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Histogram of age feature after capping outliers\n", "plot_hist(train_t, 'age')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }