{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Guide To Encoding Categorical Values in Python\n",
    "Supporting notebook for [article](http://pbpython.com/categorical-encoding.html) on Practical Business Python."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import the pandas, scikit-learn, numpy and [category_encoder](https://github.com/scikit-learn-contrib/category_encoders) libraries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n",
    "from sklearn.compose import make_column_transformer\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "import category_encoders as ce"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Need to define the headers since the data does not contain any"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = [\"symboling\", \"normalized_losses\", \"make\", \"fuel_type\", \"aspiration\", \"num_doors\", \"body_style\",\n",
    "           \"drive_wheels\", \"engine_location\", \"wheel_base\", \"length\", \"width\", \"height\", \"curb_weight\",\n",
    "           \"engine_type\", \"num_cylinders\", \"engine_size\", \"fuel_system\", \"bore\", \"stroke\", \n",
    "           \"compression_ratio\", \"horsepower\", \"peak_rpm\", \"city_mpg\", \"highway_mpg\", \"price\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read in the data from the url, add headers and convert ? to nan values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n",
    "                 header=None, names=headers, na_values=\"?\" )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>symboling</th>\n",
       "      <th>normalized_losses</th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>wheel_base</th>\n",
       "      <th>...</th>\n",
       "      <th>engine_size</th>\n",
       "      <th>fuel_system</th>\n",
       "      <th>bore</th>\n",
       "      <th>stroke</th>\n",
       "      <th>compression_ratio</th>\n",
       "      <th>horsepower</th>\n",
       "      <th>peak_rpm</th>\n",
       "      <th>city_mpg</th>\n",
       "      <th>highway_mpg</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>88.6</td>\n",
       "      <td>...</td>\n",
       "      <td>130</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.47</td>\n",
       "      <td>2.68</td>\n",
       "      <td>9.0</td>\n",
       "      <td>111.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>21</td>\n",
       "      <td>27</td>\n",
       "      <td>13495.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>88.6</td>\n",
       "      <td>...</td>\n",
       "      <td>130</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.47</td>\n",
       "      <td>2.68</td>\n",
       "      <td>9.0</td>\n",
       "      <td>111.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>21</td>\n",
       "      <td>27</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>hatchback</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>94.5</td>\n",
       "      <td>...</td>\n",
       "      <td>152</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>2.68</td>\n",
       "      <td>3.47</td>\n",
       "      <td>9.0</td>\n",
       "      <td>154.0</td>\n",
       "      <td>5000.0</td>\n",
       "      <td>19</td>\n",
       "      <td>26</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>164.0</td>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>99.8</td>\n",
       "      <td>...</td>\n",
       "      <td>109</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.40</td>\n",
       "      <td>10.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>5500.0</td>\n",
       "      <td>24</td>\n",
       "      <td>30</td>\n",
       "      <td>13950.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>164.0</td>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>4wd</td>\n",
       "      <td>front</td>\n",
       "      <td>99.4</td>\n",
       "      <td>...</td>\n",
       "      <td>136</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3.19</td>\n",
       "      <td>3.40</td>\n",
       "      <td>8.0</td>\n",
       "      <td>115.0</td>\n",
       "      <td>5500.0</td>\n",
       "      <td>18</td>\n",
       "      <td>22</td>\n",
       "      <td>17450.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 26 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   symboling  normalized_losses         make fuel_type aspiration num_doors  \\\n",
       "0          3                NaN  alfa-romero       gas        std       two   \n",
       "1          3                NaN  alfa-romero       gas        std       two   \n",
       "2          1                NaN  alfa-romero       gas        std       two   \n",
       "3          2              164.0         audi       gas        std      four   \n",
       "4          2              164.0         audi       gas        std      four   \n",
       "\n",
       "    body_style drive_wheels engine_location  wheel_base  ...  engine_size  \\\n",
       "0  convertible          rwd           front        88.6  ...          130   \n",
       "1  convertible          rwd           front        88.6  ...          130   \n",
       "2    hatchback          rwd           front        94.5  ...          152   \n",
       "3        sedan          fwd           front        99.8  ...          109   \n",
       "4        sedan          4wd           front        99.4  ...          136   \n",
       "\n",
       "   fuel_system  bore  stroke compression_ratio horsepower  peak_rpm city_mpg  \\\n",
       "0         mpfi  3.47    2.68               9.0      111.0    5000.0       21   \n",
       "1         mpfi  3.47    2.68               9.0      111.0    5000.0       21   \n",
       "2         mpfi  2.68    3.47               9.0      154.0    5000.0       19   \n",
       "3         mpfi  3.19    3.40              10.0      102.0    5500.0       24   \n",
       "4         mpfi  3.19    3.40               8.0      115.0    5500.0       18   \n",
       "\n",
       "   highway_mpg    price  \n",
       "0           27  13495.0  \n",
       "1           27  16500.0  \n",
       "2           26  16500.0  \n",
       "3           30  13950.0  \n",
       "4           22  17450.0  \n",
       "\n",
       "[5 rows x 26 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Look at the data types contained in the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "symboling              int64\n",
       "normalized_losses    float64\n",
       "make                  object\n",
       "fuel_type             object\n",
       "aspiration            object\n",
       "num_doors             object\n",
       "body_style            object\n",
       "drive_wheels          object\n",
       "engine_location       object\n",
       "wheel_base           float64\n",
       "length               float64\n",
       "width                float64\n",
       "height               float64\n",
       "curb_weight            int64\n",
       "engine_type           object\n",
       "num_cylinders         object\n",
       "engine_size            int64\n",
       "fuel_system           object\n",
       "bore                 float64\n",
       "stroke               float64\n",
       "compression_ratio    float64\n",
       "horsepower           float64\n",
       "peak_rpm             float64\n",
       "city_mpg               int64\n",
       "highway_mpg            int64\n",
       "price                float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a copy of the data with only the object columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df = df.select_dtypes(include=['object']).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>hatchback</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>six</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>4wd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>five</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          make fuel_type aspiration num_doors   body_style drive_wheels  \\\n",
       "0  alfa-romero       gas        std       two  convertible          rwd   \n",
       "1  alfa-romero       gas        std       two  convertible          rwd   \n",
       "2  alfa-romero       gas        std       two    hatchback          rwd   \n",
       "3         audi       gas        std      four        sedan          fwd   \n",
       "4         audi       gas        std      four        sedan          4wd   \n",
       "\n",
       "  engine_location engine_type num_cylinders fuel_system  \n",
       "0           front        dohc          four        mpfi  \n",
       "1           front        dohc          four        mpfi  \n",
       "2           front        ohcv           six        mpfi  \n",
       "3           front         ohc          four        mpfi  \n",
       "4           front         ohc          five        mpfi  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check for null values in the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>dodge</td>\n",
       "      <td>gas</td>\n",
       "      <td>turbo</td>\n",
       "      <td>NaN</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>mazda</td>\n",
       "      <td>diesel</td>\n",
       "      <td>std</td>\n",
       "      <td>NaN</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>four</td>\n",
       "      <td>idi</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     make fuel_type aspiration num_doors body_style drive_wheels  \\\n",
       "27  dodge       gas      turbo       NaN      sedan          fwd   \n",
       "63  mazda    diesel        std       NaN      sedan          fwd   \n",
       "\n",
       "   engine_location engine_type num_cylinders fuel_system  \n",
       "27           front         ohc          four        mpfi  \n",
       "63           front         ohc          four         idi  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[obj_df.isnull().any(axis=1)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since the num_doors column contains the null values, look at what values are current options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "four    114\n",
       "two      89\n",
       "Name: num_doors, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[\"num_doors\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will fill in the doors value with the most common element - four."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df = obj_df.fillna({\"num_doors\": \"four\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [make, fuel_type, aspiration, num_doors, body_style, drive_wheels, engine_location, engine_type, num_cylinders, fuel_system]\n",
       "Index: []"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[obj_df.isnull().any(axis=1)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encoding values using pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert the num_cylinders and num_doors values to numbers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "four      159\n",
       "six        24\n",
       "five       11\n",
       "eight       5\n",
       "two         4\n",
       "three       1\n",
       "twelve      1\n",
       "Name: num_cylinders, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[\"num_cylinders\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "cleanup_nums = {\"num_doors\":     {\"four\": 4, \"two\": 2},\n",
    "                \"num_cylinders\": {\"four\": 4, \"six\": 6, \"five\": 5, \"eight\": 8,\n",
    "                                  \"two\": 2, \"twelve\": 12, \"three\":3 }}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df = obj_df.replace(cleanup_nums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>hatchback</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>6</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>sedan</td>\n",
       "      <td>4wd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>5</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          make fuel_type aspiration  num_doors   body_style drive_wheels  \\\n",
       "0  alfa-romero       gas        std          2  convertible          rwd   \n",
       "1  alfa-romero       gas        std          2  convertible          rwd   \n",
       "2  alfa-romero       gas        std          2    hatchback          rwd   \n",
       "3         audi       gas        std          4        sedan          fwd   \n",
       "4         audi       gas        std          4        sedan          4wd   \n",
       "\n",
       "  engine_location engine_type  num_cylinders fuel_system  \n",
       "0           front        dohc              4        mpfi  \n",
       "1           front        dohc              4        mpfi  \n",
       "2           front        ohcv              6        mpfi  \n",
       "3           front         ohc              4        mpfi  \n",
       "4           front         ohc              5        mpfi  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.head()"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "Check the data types to make sure they are coming through as numbers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "make               object\n",
       "fuel_type          object\n",
       "aspiration         object\n",
       "num_doors           int64\n",
       "body_style         object\n",
       "drive_wheels       object\n",
       "engine_location    object\n",
       "engine_type        object\n",
       "num_cylinders       int64\n",
       "fuel_system        object\n",
       "dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One approach to encoding labels is to convert the values to a pandas category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sedan          96\n",
       "hatchback      70\n",
       "wagon          25\n",
       "hardtop         8\n",
       "convertible     6\n",
       "Name: body_style, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[\"body_style\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df[\"body_style\"] = obj_df[\"body_style\"].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "make                 object\n",
       "fuel_type            object\n",
       "aspiration           object\n",
       "num_doors             int64\n",
       "body_style         category\n",
       "drive_wheels         object\n",
       "engine_location      object\n",
       "engine_type          object\n",
       "num_cylinders         int64\n",
       "fuel_system          object\n",
       "dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can assign the category codes to a new column so we have a clean numeric representation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df[\"body_style_cat\"] = obj_df[\"body_style\"].cat.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "      <th>body_style_cat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>hatchback</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>6</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>sedan</td>\n",
       "      <td>4wd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>5</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          make fuel_type aspiration  num_doors   body_style drive_wheels  \\\n",
       "0  alfa-romero       gas        std          2  convertible          rwd   \n",
       "1  alfa-romero       gas        std          2  convertible          rwd   \n",
       "2  alfa-romero       gas        std          2    hatchback          rwd   \n",
       "3         audi       gas        std          4        sedan          fwd   \n",
       "4         audi       gas        std          4        sedan          4wd   \n",
       "\n",
       "  engine_location engine_type  num_cylinders fuel_system  body_style_cat  \n",
       "0           front        dohc              4        mpfi               0  \n",
       "1           front        dohc              4        mpfi               0  \n",
       "2           front        ohcv              6        mpfi               2  \n",
       "3           front         ohc              4        mpfi               3  \n",
       "4           front         ohc              5        mpfi               3  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "make                 object\n",
       "fuel_type            object\n",
       "aspiration           object\n",
       "num_doors             int64\n",
       "body_style         category\n",
       "drive_wheels         object\n",
       "engine_location      object\n",
       "engine_type          object\n",
       "num_cylinders         int64\n",
       "fuel_system          object\n",
       "body_style_cat         int8\n",
       "dtype: object"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to do one hot encoding, use pandas get_dummies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "      <th>body_style_cat</th>\n",
       "      <th>drive_wheels_4wd</th>\n",
       "      <th>drive_wheels_fwd</th>\n",
       "      <th>drive_wheels_rwd</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>convertible</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>convertible</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>hatchback</td>\n",
       "      <td>front</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>6</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>sedan</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>sedan</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>5</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          make fuel_type aspiration  num_doors   body_style engine_location  \\\n",
       "0  alfa-romero       gas        std          2  convertible           front   \n",
       "1  alfa-romero       gas        std          2  convertible           front   \n",
       "2  alfa-romero       gas        std          2    hatchback           front   \n",
       "3         audi       gas        std          4        sedan           front   \n",
       "4         audi       gas        std          4        sedan           front   \n",
       "\n",
       "  engine_type  num_cylinders fuel_system  body_style_cat  drive_wheels_4wd  \\\n",
       "0        dohc              4        mpfi               0                 0   \n",
       "1        dohc              4        mpfi               0                 0   \n",
       "2        ohcv              6        mpfi               2                 0   \n",
       "3         ohc              4        mpfi               3                 0   \n",
       "4         ohc              5        mpfi               3                 1   \n",
       "\n",
       "   drive_wheels_fwd  drive_wheels_rwd  \n",
       "0                 0                 1  \n",
       "1                 0                 1  \n",
       "2                 0                 1  \n",
       "3                 1                 0  \n",
       "4                 0                 0  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_dummies(obj_df, columns=[\"drive_wheels\"]).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "get_dummiers has options for selecting the columns and adding prefixes to make the resulting data easier to understand."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "      <th>body_style_cat</th>\n",
       "      <th>body_convertible</th>\n",
       "      <th>body_hardtop</th>\n",
       "      <th>body_hatchback</th>\n",
       "      <th>body_sedan</th>\n",
       "      <th>body_wagon</th>\n",
       "      <th>drive_4wd</th>\n",
       "      <th>drive_fwd</th>\n",
       "      <th>drive_rwd</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>2</td>\n",
       "      <td>front</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>6</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>4</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>4</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>5</td>\n",
       "      <td>mpfi</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          make fuel_type aspiration  num_doors engine_location engine_type  \\\n",
       "0  alfa-romero       gas        std          2           front        dohc   \n",
       "1  alfa-romero       gas        std          2           front        dohc   \n",
       "2  alfa-romero       gas        std          2           front        ohcv   \n",
       "3         audi       gas        std          4           front         ohc   \n",
       "4         audi       gas        std          4           front         ohc   \n",
       "\n",
       "   num_cylinders fuel_system  body_style_cat  body_convertible  body_hardtop  \\\n",
       "0              4        mpfi               0                 1             0   \n",
       "1              4        mpfi               0                 1             0   \n",
       "2              6        mpfi               2                 0             0   \n",
       "3              4        mpfi               3                 0             0   \n",
       "4              5        mpfi               3                 0             0   \n",
       "\n",
       "   body_hatchback  body_sedan  body_wagon  drive_4wd  drive_fwd  drive_rwd  \n",
       "0               0           0           0          0          0          1  \n",
       "1               0           0           0          0          0          1  \n",
       "2               1           0           0          0          0          1  \n",
       "3               0           1           0          0          1          0  \n",
       "4               0           1           0          1          0          0  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_dummies(obj_df, columns=[\"body_style\", \"drive_wheels\"], prefix=[\"body\", \"drive\"]).head()"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "Another approach to encoding values is to select an attribute and convert it to True or False.\n",
    "In this case, we can check if an engine is an OHC or not."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ohc      148\n",
       "ohcf      15\n",
       "ohcv      13\n",
       "l         12\n",
       "dohc      12\n",
       "rotor      4\n",
       "dohcv      1\n",
       "Name: engine_type, dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[\"engine_type\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use np.where and the str accessor to do this in one efficient line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df[\"OHC_Code\"] = np.where(obj_df[\"engine_type\"].str.contains(\"ohc\"), 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>OHC_Code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>dohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>dohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>audi</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>bmw</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>chevrolet</td>\n",
       "      <td>l</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>chevrolet</td>\n",
       "      <td>ohc</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           make engine_type  OHC_Code\n",
       "0   alfa-romero        dohc         1\n",
       "1   alfa-romero        dohc         1\n",
       "2   alfa-romero        ohcv         1\n",
       "3          audi         ohc         1\n",
       "4          audi         ohc         1\n",
       "5          audi         ohc         1\n",
       "6          audi         ohc         1\n",
       "7          audi         ohc         1\n",
       "8          audi         ohc         1\n",
       "9          audi         ohc         1\n",
       "10          bmw         ohc         1\n",
       "11          bmw         ohc         1\n",
       "12          bmw         ohc         1\n",
       "13          bmw         ohc         1\n",
       "14          bmw         ohc         1\n",
       "15          bmw         ohc         1\n",
       "16          bmw         ohc         1\n",
       "17          bmw         ohc         1\n",
       "18    chevrolet           l         0\n",
       "19    chevrolet         ohc         1"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[[\"make\", \"engine_type\", \"OHC_Code\"]].head(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encoding Values Using Scitkit-learn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Instantiate the LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "ord_enc = OrdinalEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_df[\"make_code\"] = ord_enc.fit_transform(obj_df[[\"make\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>make_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>audi</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>bmw</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           make  make_code\n",
       "0   alfa-romero        0.0\n",
       "1   alfa-romero        0.0\n",
       "2   alfa-romero        0.0\n",
       "3          audi        1.0\n",
       "4          audi        1.0\n",
       "5          audi        1.0\n",
       "6          audi        1.0\n",
       "7          audi        1.0\n",
       "8          audi        1.0\n",
       "9          audi        1.0\n",
       "10          bmw        2.0"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df[[\"make\", \"make_code\"]].head(11)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To accomplish something similar to pandas get_dummies, use LabelBinarizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "oe_style = OneHotEncoder()\n",
    "oe_results = oe_style.fit_transform(obj_df[[\"body_style\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The results are an array that needs to be converted to a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0., 0., 0.],\n",
       "       [1., 0., 0., 0., 0.],\n",
       "       [0., 0., 1., 0., 0.],\n",
       "       ...,\n",
       "       [0., 0., 0., 1., 0.],\n",
       "       [0., 0., 0., 1., 0.],\n",
       "       [0., 0., 0., 1., 0.]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oe_results.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>convertible</th>\n",
       "      <th>hardtop</th>\n",
       "      <th>hatchback</th>\n",
       "      <th>sedan</th>\n",
       "      <th>wagon</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  convertible hardtop hatchback sedan wagon\n",
       "0         1.0     0.0       0.0   0.0   0.0\n",
       "1         1.0     0.0       0.0   0.0   0.0\n",
       "2         0.0     0.0       1.0   0.0   0.0\n",
       "3         0.0     0.0       0.0   1.0   0.0\n",
       "4         0.0     0.0       0.0   1.0   0.0"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Advanced Encoding\n",
    "[category_encoder](https://github.com/scikit-learn-contrib/category_encoders) library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get a new clean dataframe\n",
    "obj_df = df.select_dtypes(include=['object']).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>make</th>\n",
       "      <th>fuel_type</th>\n",
       "      <th>aspiration</th>\n",
       "      <th>num_doors</th>\n",
       "      <th>body_style</th>\n",
       "      <th>drive_wheels</th>\n",
       "      <th>engine_location</th>\n",
       "      <th>engine_type</th>\n",
       "      <th>num_cylinders</th>\n",
       "      <th>fuel_system</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>convertible</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>dohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>alfa-romero</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>two</td>\n",
       "      <td>hatchback</td>\n",
       "      <td>rwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohcv</td>\n",
       "      <td>six</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>fwd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>four</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>audi</td>\n",
       "      <td>gas</td>\n",
       "      <td>std</td>\n",
       "      <td>four</td>\n",
       "      <td>sedan</td>\n",
       "      <td>4wd</td>\n",
       "      <td>front</td>\n",
       "      <td>ohc</td>\n",
       "      <td>five</td>\n",
       "      <td>mpfi</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          make fuel_type aspiration num_doors   body_style drive_wheels  \\\n",
       "0  alfa-romero       gas        std       two  convertible          rwd   \n",
       "1  alfa-romero       gas        std       two  convertible          rwd   \n",
       "2  alfa-romero       gas        std       two    hatchback          rwd   \n",
       "3         audi       gas        std      four        sedan          fwd   \n",
       "4         audi       gas        std      four        sedan          4wd   \n",
       "\n",
       "  engine_location engine_type num_cylinders fuel_system  \n",
       "0           front        dohc          four        mpfi  \n",
       "1           front        dohc          four        mpfi  \n",
       "2           front        ohcv           six        mpfi  \n",
       "3           front         ohc          four        mpfi  \n",
       "4           front         ohc          five        mpfi  "
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Try out the Backward Difference Encoder on the engine_type column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead\n",
      "  elif pd.api.types.is_categorical(cols):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "BackwardDifferenceEncoder(cols=['engine_type'],\n",
       "                          mapping=[{'col': 'engine_type',\n",
       "                                    'mapping':     engine_type_0  engine_type_1  engine_type_2  engine_type_3  engine_type_4  \\\n",
       " 1      -0.857143      -0.714286      -0.571429      -0.428571      -0.285714   \n",
       " 2       0.142857      -0.714286      -0.571429      -0.428571      -0.285714   \n",
       " 3       0.142857       0.285714      -0.571429      -0.428571      -0.285714   \n",
       " 4       0.142857       0.285714       0.428571      -0.428571      -0.285714   \n",
       " 5       0.142857       0.285714       0.428571       0.571429      -0.285714   \n",
       " 6       0.142857       0.285714       0.428571       0.571429       0.714286   \n",
       " 7       0.142857       0.285714       0.428571       0.571429       0.714286   \n",
       "-1       0.000000       0.000000       0.000000       0.000000       0.000000   \n",
       "-2       0.000000       0.000000       0.000000       0.000000       0.000000   \n",
       "\n",
       "    engine_type_5  \n",
       " 1      -0.142857  \n",
       " 2      -0.142857  \n",
       " 3      -0.142857  \n",
       " 4      -0.142857  \n",
       " 5      -0.142857  \n",
       " 6      -0.142857  \n",
       " 7       0.857143  \n",
       "-1       0.000000  \n",
       "-2       0.000000  }])"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Specify the columns to encode then fit and transform\n",
    "encoder = ce.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n",
    "encoder.fit(obj_df, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead\n",
      "  elif pd.api.types.is_categorical(cols):\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>engine_type_0</th>\n",
       "      <th>engine_type_1</th>\n",
       "      <th>engine_type_2</th>\n",
       "      <th>engine_type_3</th>\n",
       "      <th>engine_type_4</th>\n",
       "      <th>engine_type_5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.857143</td>\n",
       "      <td>-0.714286</td>\n",
       "      <td>-0.571429</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>-0.285714</td>\n",
       "      <td>-0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.857143</td>\n",
       "      <td>-0.714286</td>\n",
       "      <td>-0.571429</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>-0.285714</td>\n",
       "      <td>-0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.142857</td>\n",
       "      <td>-0.714286</td>\n",
       "      <td>-0.571429</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>-0.285714</td>\n",
       "      <td>-0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>-0.571429</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>-0.285714</td>\n",
       "      <td>-0.142857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.142857</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>-0.571429</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>-0.285714</td>\n",
       "      <td>-0.142857</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   engine_type_0  engine_type_1  engine_type_2  engine_type_3  engine_type_4  \\\n",
       "0      -0.857143      -0.714286      -0.571429      -0.428571      -0.285714   \n",
       "1      -0.857143      -0.714286      -0.571429      -0.428571      -0.285714   \n",
       "2       0.142857      -0.714286      -0.571429      -0.428571      -0.285714   \n",
       "3       0.142857       0.285714      -0.571429      -0.428571      -0.285714   \n",
       "4       0.142857       0.285714      -0.571429      -0.428571      -0.285714   \n",
       "\n",
       "   engine_type_5  \n",
       "0      -0.142857  \n",
       "1      -0.142857  \n",
       "2      -0.142857  \n",
       "3      -0.142857  \n",
       "4      -0.142857  "
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.fit_transform(obj_df).iloc[:,8:14].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another approach is to use a polynomial encoding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead\n",
      "  elif pd.api.types.is_categorical(cols):\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>engine_type_0</th>\n",
       "      <th>engine_type_1</th>\n",
       "      <th>engine_type_2</th>\n",
       "      <th>engine_type_3</th>\n",
       "      <th>engine_type_4</th>\n",
       "      <th>engine_type_5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.566947</td>\n",
       "      <td>0.545545</td>\n",
       "      <td>-0.408248</td>\n",
       "      <td>0.241747</td>\n",
       "      <td>-0.109109</td>\n",
       "      <td>0.032898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.566947</td>\n",
       "      <td>0.545545</td>\n",
       "      <td>-0.408248</td>\n",
       "      <td>0.241747</td>\n",
       "      <td>-0.109109</td>\n",
       "      <td>0.032898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.377964</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>-0.564076</td>\n",
       "      <td>0.436436</td>\n",
       "      <td>-0.197386</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.188982</td>\n",
       "      <td>-0.327327</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.080582</td>\n",
       "      <td>-0.545545</td>\n",
       "      <td>0.493464</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.188982</td>\n",
       "      <td>-0.327327</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.080582</td>\n",
       "      <td>-0.545545</td>\n",
       "      <td>0.493464</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   engine_type_0  engine_type_1  engine_type_2  engine_type_3  engine_type_4  \\\n",
       "0      -0.566947       0.545545      -0.408248       0.241747      -0.109109   \n",
       "1      -0.566947       0.545545      -0.408248       0.241747      -0.109109   \n",
       "2      -0.377964       0.000000       0.408248      -0.564076       0.436436   \n",
       "3      -0.188982      -0.327327       0.408248       0.080582      -0.545545   \n",
       "4      -0.188982      -0.327327       0.408248       0.080582      -0.545545   \n",
       "\n",
       "   engine_type_5  \n",
       "0       0.032898  \n",
       "1       0.032898  \n",
       "2      -0.197386  \n",
       "3       0.493464  \n",
       "4       0.493464  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n",
    "encoder.fit_transform(obj_df, verbose=1).iloc[:,8:14].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scikit-learn pipeline\n",
    "Show an example of how to incorporate the encoding strategies into a scikit-learn pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for the purposes of this analysis, only use a small subset of features\n",
    "feature_cols = [\n",
    "    'fuel_type', 'make', 'aspiration', 'highway_mpg', 'city_mpg',\n",
    "    'curb_weight', 'drive_wheels'\n",
    "]\n",
    "\n",
    "# Remove the empty price rows\n",
    "df_ml = df.dropna(subset=['price'])\n",
    "\n",
    "X = df_ml[feature_cols]\n",
    "y = df_ml['price']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),\n",
    "                                        ['fuel_type', 'make', 'drive_wheels']),\n",
    "                                      (OrdinalEncoder(), ['aspiration']),\n",
    "                                      remainder='passthrough')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "linreg = LinearRegression()\n",
    "pipe = make_pipeline(column_trans, linreg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-4476.0937653 , -1014.54842052, -4227.68553953, -4936.79899194,\n",
       "       -1591.8291911 , -3716.06617255, -4293.79197464, -1390.00486495,\n",
       "       -1600.57946369, -2124.30041954])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-2937.17"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get the average of the errors after 10 iterations\n",
    "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error').mean().round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}