{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RareLabelEncoder\n",
    "\n",
    "The RareLabelEncoder() groups labels that show a small number of observations in the dataset into a new category called 'Rare'. This helps to avoid overfitting.\n",
    "\n",
    "The argument ' tol ' indicates the percentage of observations that the label needs to have in order not to be re-grouped into the \"Rare\" label.<br> The argument n_categories indicates the minimum number of distinct categories that a variable needs to have for any of the labels to be re-grouped into 'Rare'.<br><br>\n",
    "#### Note\n",
    "If the number of labels is smaller than n_categories, then the encoder will not group the labels for that variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from feature_engine.encoding import RareLabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load titanic dataset from OpenML\n",
    "\n",
    "def load_titanic():\n",
    "    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')\n",
    "    data = data.replace('?', np.nan)\n",
    "    data['cabin'] = data['cabin'].astype(str).str[0]\n",
    "    data['pclass'] = data['pclass'].astype('O')\n",
    "    data['age'] = data['age'].astype('float')\n",
    "    data['fare'] = data['fare'].astype('float')\n",
    "    data['embarked'].fillna('C', inplace=True)\n",
    "    data.drop(labels=['boat', 'body', 'home.dest'], axis=1, inplace=True)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>survived</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>ticket</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Allen, Miss. Elisabeth Walton</td>\n",
       "      <td>female</td>\n",
       "      <td>29.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>24160</td>\n",
       "      <td>211.3375</td>\n",
       "      <td>B</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Allison, Master. Hudson Trevor</td>\n",
       "      <td>male</td>\n",
       "      <td>0.9167</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Miss. Helen Loraine</td>\n",
       "      <td>female</td>\n",
       "      <td>2.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mr. Hudson Joshua Creighton</td>\n",
       "      <td>male</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mrs. Hudson J C (Bessie Waldo Daniels)</td>\n",
       "      <td>female</td>\n",
       "      <td>25.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pclass  survived                                             name     sex  \\\n",
       "0      1         1                    Allen, Miss. Elisabeth Walton  female   \n",
       "1      1         1                   Allison, Master. Hudson Trevor    male   \n",
       "2      1         0                     Allison, Miss. Helen Loraine  female   \n",
       "3      1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
       "4      1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
       "\n",
       "       age  sibsp  parch  ticket      fare cabin embarked  \n",
       "0  29.0000      0      0   24160  211.3375     B        S  \n",
       "1   0.9167      1      2  113781  151.5500     C        S  \n",
       "2   2.0000      1      2  113781  151.5500     C        S  \n",
       "3  30.0000      1      2  113781  151.5500     C        S  \n",
       "4  25.0000      1      2  113781  151.5500     C        S  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = load_titanic()\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data.drop(['survived', 'name', 'ticket'], axis=1)\n",
    "y = data.survived"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cabin       0\n",
       "pclass      0\n",
       "embarked    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we will encode the below variables, they have no missing values\n",
    "X[['cabin', 'pclass', 'embarked']].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "cabin       object\n",
       "pclass      object\n",
       "embarked    object\n",
       "dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "''' Make sure that the variables are type (object).\n",
    "if not, cast it as object , otherwise the transformer will either send an error (if we pass it as argument) \n",
    "or not pick it up (if we leave variables=None). '''\n",
    "\n",
    "X[['cabin', 'pclass', 'embarked']].dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((916, 8), (393, 8))"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's separate into training and testing set\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The RareLabelEncoder() groups rare / infrequent categories in\n",
    "a new category called \"Rare\", or any other name entered by the user.\n",
    "\n",
    "For example in the variable colour,<br> if the percentage of observations\n",
    "for the categories magenta, cyan and burgundy \n",
    "are < 5%, all those\n",
    "categories will be replaced by the new label \"Rare\".\n",
    "\n",
    "Note, infrequent labels can also be grouped under a user defined name, for\n",
    "example 'Other'. The name to replace infrequent categories is defined\n",
    "with the parameter replace_with.\n",
    "   \n",
    "The encoder will encode only categorical variables (type 'object'). A list\n",
    "of variables can be passed as an argument. If no variables are passed as \n",
    "argument, the encoder will find and encode all categorical variables\n",
    "(object type)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\king_ashok\\desktop\\feature_engine\\feature_engine\\encoding\\rare_label.py:139: UserWarning: The number of unique categories for variable pclass is less than that indicated in n_categories. Thus, all categories will be considered frequent\n",
      "  \"considered frequent\".format(var)\n",
      "c:\\users\\king_ashok\\desktop\\feature_engine\\feature_engine\\encoding\\rare_label.py:139: UserWarning: The number of unique categories for variable embarked is less than that indicated in n_categories. Thus, all categories will be considered frequent\n",
      "  \"considered frequent\".format(var)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RareLabelEncoder(n_categories=5, variables=['cabin', 'pclass', 'embarked'])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Rare value encoder\n",
    "'''\n",
    "Parameters\n",
    "----------\n",
    "\n",
    "tol: float, default=0.05\n",
    "    the minimum frequency a label should have to be considered frequent.\n",
    "    Categories with frequencies lower than tol will be grouped.\n",
    "\n",
    "n_categories: int, default=10\n",
    "    the minimum number of categories a variable should have for the encoder\n",
    "    to find frequent labels. If the variable contains less categories, all\n",
    "    of them will be considered frequent.\n",
    "\n",
    "max_n_categories: int, default=None\n",
    "    the maximum number of categories that should be considered frequent.\n",
    "    If None, all categories with frequency above the tolerance (tol) will be\n",
    "    considered.\n",
    "\n",
    "variables : list, default=None\n",
    "    The list of categorical variables that will be encoded. If None, the \n",
    "    encoder will find and select all object type variables.\n",
    "\n",
    "replace_with : string, default='Rare'\n",
    "    The category name that will be used to replace infrequent categories.\n",
    "'''\n",
    "\n",
    "rare_encoder = RareLabelEncoder(tol=0.05, \n",
    "                                n_categories=5,\n",
    "                                variables=['cabin', 'pclass', 'embarked'])\n",
    "rare_encoder.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cabin': Index(['n', 'C'], dtype='object'),\n",
       " 'pclass': array([2, 3, 1], dtype=object),\n",
       " 'embarked': array(['S', 'C', 'Q'], dtype=object)}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rare_encoder.encoder_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>501</th>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>13.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>19.5000</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>588</th>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>23.0000</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>402</th>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>13.8583</td>\n",
       "      <td>n</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1193</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7250</td>\n",
       "      <td>n</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>686</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7250</td>\n",
       "      <td>n</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     pclass     sex   age  sibsp  parch     fare cabin embarked\n",
       "501       2  female  13.0      0      1  19.5000     n        S\n",
       "588       2  female   4.0      1      1  23.0000     n        S\n",
       "402       2  female  30.0      1      0  13.8583     n        C\n",
       "1193      3    male   NaN      0      0   7.7250     n        Q\n",
       "686       3  female  22.0      0      0   7.7250     n        Q"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_t = rare_encoder.transform(X_train)\n",
    "test_t = rare_encoder.transform(X_train)\n",
    "\n",
    "test_t.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "n       702\n",
       "Rare    143\n",
       "C        71\n",
       "Name: cabin, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_t.cabin.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The user can change the string from 'Rare' to something else."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1059</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>28.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1227</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9.8375</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>470</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12.3500</td>\n",
       "      <td>n</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>36.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>262.3750</td>\n",
       "      <td>B</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>950</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>29.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9.4833</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     pclass     sex   age  sibsp  parch      fare cabin embarked\n",
       "1059      3    male  28.0      0      0    8.0500     n        S\n",
       "1227      3  female  22.0      0      0    9.8375     n        S\n",
       "470       2    male  35.0      0      0   12.3500     n        Q\n",
       "66        1  female  36.0      0      0  262.3750     B        C\n",
       "950       3    male  29.0      0      0    9.4833     n        S"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Rare value encoder\n",
    "\n",
    "rare_encoder = RareLabelEncoder(tol = 0.03,\n",
    "                                replace_with='Other', #replacing 'Rare' with 'Other'\n",
    "                                variables=['cabin', 'pclass', 'embarked'],\n",
    "                                n_categories=2\n",
    "                           )\n",
    "\n",
    "rare_encoder.fit(X_train)\n",
    "\n",
    "train_t = rare_encoder.transform(X_train)\n",
    "test_t = rare_encoder.transform(X_train)\n",
    "\n",
    "test_t.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cabin': Index(['n', 'C', 'B', 'E', 'D'], dtype='object'),\n",
       " 'pclass': Int64Index([3, 1, 2], dtype='int64'),\n",
       " 'embarked': Index(['S', 'C', 'Q'], dtype='object')}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rare_encoder.encoder_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "n        702\n",
       "C         71\n",
       "B         42\n",
       "Other     37\n",
       "E         32\n",
       "D         32\n",
       "Name: cabin, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_t.cabin.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The user can choose to retain only the most popular categories with the argument max_n_categories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1222</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>33.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.6625</td>\n",
       "      <td>n</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>781</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>33.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>n</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>23.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>82.2667</td>\n",
       "      <td>B</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1043</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15.5000</td>\n",
       "      <td>n</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>867</th>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>12.2875</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     pclass     sex   age  sibsp  parch     fare cabin embarked\n",
       "1222      3    male  33.0      0      0   8.6625     n        C\n",
       "781       3    male  33.0      0      0   7.8958     n        C\n",
       "272       1  female  23.0      1      0  82.2667     B        S\n",
       "1043      3  female   NaN      1      0  15.5000     n        Q\n",
       "867       3  female  22.0      1      1  12.2875     n        S"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Rare value encoder\n",
    "\n",
    "rare_encoder = RareLabelEncoder(tol = 0.03,\n",
    "                                variables=['cabin', 'pclass', 'embarked'],\n",
    "                                n_categories=2,\n",
    "                                \n",
    "                                max_n_categories=3 #keeps only the most popular 3 categories in every variable.\n",
    "                                \n",
    "                           )\n",
    "\n",
    "rare_encoder.fit(X_train)\n",
    "\n",
    "train_t = rare_encoder.transform(X_train)\n",
    "test_t = rare_encoder.transform(X_train)\n",
    "\n",
    "test_t.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cabin': Index(['n', 'C', 'B'], dtype='object'),\n",
       " 'pclass': Int64Index([3, 1, 2], dtype='int64'),\n",
       " 'embarked': Index(['S', 'C', 'Q'], dtype='object')}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rare_encoder.encoder_dict_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Automatically select all categorical variables\n",
    "\n",
    "If no variable list is passed as argument, it selects all the categorical variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\king_ashok\\desktop\\feature_engine\\feature_engine\\encoding\\rare_label.py:139: UserWarning: The number of unique categories for variable pclass is less than that indicated in n_categories. Thus, all categories will be considered frequent\n",
      "  \"considered frequent\".format(var)\n",
      "c:\\users\\king_ashok\\desktop\\feature_engine\\feature_engine\\encoding\\rare_label.py:139: UserWarning: The number of unique categories for variable sex is less than that indicated in n_categories. Thus, all categories will be considered frequent\n",
      "  \"considered frequent\".format(var)\n",
      "c:\\users\\king_ashok\\desktop\\feature_engine\\feature_engine\\encoding\\rare_label.py:139: UserWarning: The number of unique categories for variable embarked is less than that indicated in n_categories. Thus, all categories will be considered frequent\n",
      "  \"considered frequent\".format(var)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'pclass': array([2, 3, 1], dtype=object),\n",
       " 'sex': array(['female', 'male'], dtype=object),\n",
       " 'cabin': Index(['n', 'C', 'B', 'E', 'D'], dtype='object'),\n",
       " 'embarked': array(['S', 'C', 'Q'], dtype=object)}"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Rare value encoder\n",
    "\n",
    "rare_encoder = RareLabelEncoder(tol = 0.03, n_categories=3)\n",
    "\n",
    "rare_encoder.fit(X_train)\n",
    "\n",
    "rare_encoder.encoder_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>385</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>36.750</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>1</td>\n",
       "      <td>male</td>\n",
       "      <td>55.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>93.500</td>\n",
       "      <td>B</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>323</th>\n",
       "      <td>2</td>\n",
       "      <td>male</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>24.000</td>\n",
       "      <td>n</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>28.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12.650</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>809</th>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>34.375</td>\n",
       "      <td>n</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    pclass     sex   age  sibsp  parch    fare cabin embarked\n",
       "385      2    male   8.0      1      1  36.750     n        S\n",
       "154      1    male  55.0      1      1  93.500     B        S\n",
       "323      2    male  30.0      1      0  24.000     n        C\n",
       "572      2  female  28.0      0      0  12.650     n        S\n",
       "809      3    male  18.0      2      2  34.375     n        S"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_t = rare_encoder.transform(X_train)\n",
    "test_t = rare_encoder.transform(X_train)\n",
    "\n",
    "test_t.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}