{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Guide To Encoding Categorical Values in Python\n", "Supporting notebook for [article](http://pbpython.com/categorical-encoding.html) on Practical Business Python." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import the pandas, scikit-learn, numpy and [category_encoder](https://github.com/scikit-learn-contrib/category_encoders) libraries." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n", "from sklearn.compose import make_column_transformer\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.model_selection import cross_val_score\n", "\n", "import category_encoders as ce" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Need to define the headers since the data does not contain any" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "headers = [\"symboling\", \"normalized_losses\", \"make\", \"fuel_type\", \"aspiration\", \"num_doors\", \"body_style\",\n", " \"drive_wheels\", \"engine_location\", \"wheel_base\", \"length\", \"width\", \"height\", \"curb_weight\",\n", " \"engine_type\", \"num_cylinders\", \"engine_size\", \"fuel_system\", \"bore\", \"stroke\", \n", " \"compression_ratio\", \"horsepower\", \"peak_rpm\", \"city_mpg\", \"highway_mpg\", \"price\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the data from the url, add headers and convert ? 
to nan values" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", " header=None, names=headers, na_values=\"?\" )" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
symbolingnormalized_lossesmakefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationwheel_base...engine_sizefuel_systemborestrokecompression_ratiohorsepowerpeak_rpmcity_mpghighway_mpgprice
03NaNalfa-romerogasstdtwoconvertiblerwdfront88.6...130mpfi3.472.689.0111.05000.0212713495.0
13NaNalfa-romerogasstdtwoconvertiblerwdfront88.6...130mpfi3.472.689.0111.05000.0212716500.0
21NaNalfa-romerogasstdtwohatchbackrwdfront94.5...152mpfi2.683.479.0154.05000.0192616500.0
32164.0audigasstdfoursedanfwdfront99.8...109mpfi3.193.4010.0102.05500.0243013950.0
42164.0audigasstdfoursedan4wdfront99.4...136mpfi3.193.408.0115.05500.0182217450.0
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " symboling normalized_losses make fuel_type aspiration num_doors \\\n", "0 3 NaN alfa-romero gas std two \n", "1 3 NaN alfa-romero gas std two \n", "2 1 NaN alfa-romero gas std two \n", "3 2 164.0 audi gas std four \n", "4 2 164.0 audi gas std four \n", "\n", " body_style drive_wheels engine_location wheel_base ... engine_size \\\n", "0 convertible rwd front 88.6 ... 130 \n", "1 convertible rwd front 88.6 ... 130 \n", "2 hatchback rwd front 94.5 ... 152 \n", "3 sedan fwd front 99.8 ... 109 \n", "4 sedan 4wd front 99.4 ... 136 \n", "\n", " fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n", "0 mpfi 3.47 2.68 9.0 111.0 5000.0 21 \n", "1 mpfi 3.47 2.68 9.0 111.0 5000.0 21 \n", "2 mpfi 2.68 3.47 9.0 154.0 5000.0 19 \n", "3 mpfi 3.19 3.40 10.0 102.0 5500.0 24 \n", "4 mpfi 3.19 3.40 8.0 115.0 5500.0 18 \n", "\n", " highway_mpg price \n", "0 27 13495.0 \n", "1 27 16500.0 \n", "2 26 16500.0 \n", "3 30 13950.0 \n", "4 22 17450.0 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Look at the data types contained in the dataframe" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "symboling int64\n", "normalized_losses float64\n", "make object\n", "fuel_type object\n", "aspiration object\n", "num_doors object\n", "body_style object\n", "drive_wheels object\n", "engine_location object\n", "wheel_base float64\n", "length float64\n", "width float64\n", "height float64\n", "curb_weight int64\n", "engine_type object\n", "num_cylinders object\n", "engine_size int64\n", "fuel_system object\n", "bore float64\n", "stroke float64\n", "compression_ratio float64\n", "horsepower float64\n", "peak_rpm float64\n", "city_mpg int64\n", "highway_mpg int64\n", "price float64\n", "dtype: object" ] }, "execution_count": 5, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a copy of the data with only the object columns." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "obj_df = df.select_dtypes(include=['object']).copy()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationengine_typenum_cylindersfuel_system
0alfa-romerogasstdtwoconvertiblerwdfrontdohcfourmpfi
1alfa-romerogasstdtwoconvertiblerwdfrontdohcfourmpfi
2alfa-romerogasstdtwohatchbackrwdfrontohcvsixmpfi
3audigasstdfoursedanfwdfrontohcfourmpfi
4audigasstdfoursedan4wdfrontohcfivempfi
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors body_style drive_wheels \\\n", "0 alfa-romero gas std two convertible rwd \n", "1 alfa-romero gas std two convertible rwd \n", "2 alfa-romero gas std two hatchback rwd \n", "3 audi gas std four sedan fwd \n", "4 audi gas std four sedan 4wd \n", "\n", " engine_location engine_type num_cylinders fuel_system \n", "0 front dohc four mpfi \n", "1 front dohc four mpfi \n", "2 front ohcv six mpfi \n", "3 front ohc four mpfi \n", "4 front ohc five mpfi " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check for null values in the data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationengine_typenum_cylindersfuel_system
27dodgegasturboNaNsedanfwdfrontohcfourmpfi
63mazdadieselstdNaNsedanfwdfrontohcfouridi
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors body_style drive_wheels \\\n", "27 dodge gas turbo NaN sedan fwd \n", "63 mazda diesel std NaN sedan fwd \n", "\n", " engine_location engine_type num_cylinders fuel_system \n", "27 front ohc four mpfi \n", "63 front ohc four idi " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[obj_df.isnull().any(axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since the num_doors column contains the null values, look at what values are current options" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "four 114\n", "two 89\n", "Name: num_doors, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[\"num_doors\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will fill in the doors value with the most common element - four." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "obj_df = obj_df.fillna({\"num_doors\": \"four\"})" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationengine_typenum_cylindersfuel_system
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [make, fuel_type, aspiration, num_doors, body_style, drive_wheels, engine_location, engine_type, num_cylinders, fuel_system]\n", "Index: []" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[obj_df.isnull().any(axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Encoding values using pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert the num_cylinders and num_doors values to numbers" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "four 159\n", "six 24\n", "five 11\n", "eight 5\n", "two 4\n", "three 1\n", "twelve 1\n", "Name: num_cylinders, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[\"num_cylinders\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "cleanup_nums = {\"num_doors\": {\"four\": 4, \"two\": 2},\n", " \"num_cylinders\": {\"four\": 4, \"six\": 6, \"five\": 5, \"eight\": 8,\n", " \"two\": 2, \"twelve\": 12, \"three\":3 }}" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "obj_df = obj_df.replace(cleanup_nums)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationengine_typenum_cylindersfuel_system
0alfa-romerogasstd2convertiblerwdfrontdohc4mpfi
1alfa-romerogasstd2convertiblerwdfrontdohc4mpfi
2alfa-romerogasstd2hatchbackrwdfrontohcv6mpfi
3audigasstd4sedanfwdfrontohc4mpfi
4audigasstd4sedan4wdfrontohc5mpfi
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors body_style drive_wheels \\\n", "0 alfa-romero gas std 2 convertible rwd \n", "1 alfa-romero gas std 2 convertible rwd \n", "2 alfa-romero gas std 2 hatchback rwd \n", "3 audi gas std 4 sedan fwd \n", "4 audi gas std 4 sedan 4wd \n", "\n", " engine_location engine_type num_cylinders fuel_system \n", "0 front dohc 4 mpfi \n", "1 front dohc 4 mpfi \n", "2 front ohcv 6 mpfi \n", "3 front ohc 4 mpfi \n", "4 front ohc 5 mpfi " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.head()" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Check the data types to make sure they are coming through as numbers" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "make object\n", "fuel_type object\n", "aspiration object\n", "num_doors int64\n", "body_style object\n", "drive_wheels object\n", "engine_location object\n", "engine_type object\n", "num_cylinders int64\n", "fuel_system object\n", "dtype: object" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "One approach to encoding labels is to convert the values to a pandas category" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sedan 96\n", "hatchback 70\n", "wagon 25\n", "hardtop 8\n", "convertible 6\n", "Name: body_style, dtype: int64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[\"body_style\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "obj_df[\"body_style\"] = obj_df[\"body_style\"].astype('category')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "make object\n", "fuel_type object\n", "aspiration 
object\n", "num_doors int64\n", "body_style category\n", "drive_wheels object\n", "engine_location object\n", "engine_type object\n", "num_cylinders int64\n", "fuel_system object\n", "dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can assign the category codes to a new column so we have a clean numeric representation" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "obj_df[\"body_style_cat\"] = obj_df[\"body_style\"].cat.codes" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationengine_typenum_cylindersfuel_systembody_style_cat
0alfa-romerogasstd2convertiblerwdfrontdohc4mpfi0
1alfa-romerogasstd2convertiblerwdfrontdohc4mpfi0
2alfa-romerogasstd2hatchbackrwdfrontohcv6mpfi2
3audigasstd4sedanfwdfrontohc4mpfi3
4audigasstd4sedan4wdfrontohc5mpfi3
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors body_style drive_wheels \\\n", "0 alfa-romero gas std 2 convertible rwd \n", "1 alfa-romero gas std 2 convertible rwd \n", "2 alfa-romero gas std 2 hatchback rwd \n", "3 audi gas std 4 sedan fwd \n", "4 audi gas std 4 sedan 4wd \n", "\n", " engine_location engine_type num_cylinders fuel_system body_style_cat \n", "0 front dohc 4 mpfi 0 \n", "1 front dohc 4 mpfi 0 \n", "2 front ohcv 6 mpfi 2 \n", "3 front ohc 4 mpfi 3 \n", "4 front ohc 5 mpfi 3 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "make object\n", "fuel_type object\n", "aspiration object\n", "num_doors int64\n", "body_style category\n", "drive_wheels object\n", "engine_location object\n", "engine_type object\n", "num_cylinders int64\n", "fuel_system object\n", "body_style_cat int8\n", "dtype: object" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In order to do one hot encoding, use pandas get_dummies" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styleengine_locationengine_typenum_cylindersfuel_systembody_style_catdrive_wheels_4wddrive_wheels_fwddrive_wheels_rwd
0alfa-romerogasstd2convertiblefrontdohc4mpfi0001
1alfa-romerogasstd2convertiblefrontdohc4mpfi0001
2alfa-romerogasstd2hatchbackfrontohcv6mpfi2001
3audigasstd4sedanfrontohc4mpfi3010
4audigasstd4sedanfrontohc5mpfi3100
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors body_style engine_location \\\n", "0 alfa-romero gas std 2 convertible front \n", "1 alfa-romero gas std 2 convertible front \n", "2 alfa-romero gas std 2 hatchback front \n", "3 audi gas std 4 sedan front \n", "4 audi gas std 4 sedan front \n", "\n", " engine_type num_cylinders fuel_system body_style_cat drive_wheels_4wd \\\n", "0 dohc 4 mpfi 0 0 \n", "1 dohc 4 mpfi 0 0 \n", "2 ohcv 6 mpfi 2 0 \n", "3 ohc 4 mpfi 3 0 \n", "4 ohc 5 mpfi 3 1 \n", "\n", " drive_wheels_fwd drive_wheels_rwd \n", "0 0 1 \n", "1 0 1 \n", "2 0 1 \n", "3 1 0 \n", "4 0 0 " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.get_dummies(obj_df, columns=[\"drive_wheels\"]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "get_dummies has options for selecting the columns and adding prefixes to make the resulting data easier to understand." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsengine_locationengine_typenum_cylindersfuel_systembody_style_catbody_convertiblebody_hardtopbody_hatchbackbody_sedanbody_wagondrive_4wddrive_fwddrive_rwd
0alfa-romerogasstd2frontdohc4mpfi010000001
1alfa-romerogasstd2frontdohc4mpfi010000001
2alfa-romerogasstd2frontohcv6mpfi200100001
3audigasstd4frontohc4mpfi300010010
4audigasstd4frontohc5mpfi300010100
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors engine_location engine_type \\\n", "0 alfa-romero gas std 2 front dohc \n", "1 alfa-romero gas std 2 front dohc \n", "2 alfa-romero gas std 2 front ohcv \n", "3 audi gas std 4 front ohc \n", "4 audi gas std 4 front ohc \n", "\n", " num_cylinders fuel_system body_style_cat body_convertible body_hardtop \\\n", "0 4 mpfi 0 1 0 \n", "1 4 mpfi 0 1 0 \n", "2 6 mpfi 2 0 0 \n", "3 4 mpfi 3 0 0 \n", "4 5 mpfi 3 0 0 \n", "\n", " body_hatchback body_sedan body_wagon drive_4wd drive_fwd drive_rwd \n", "0 0 0 0 0 0 1 \n", "1 0 0 0 0 0 1 \n", "2 1 0 0 0 0 1 \n", "3 0 1 0 0 1 0 \n", "4 0 1 0 1 0 0 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.get_dummies(obj_df, columns=[\"body_style\", \"drive_wheels\"], prefix=[\"body\", \"drive\"]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another approach to encoding values is to select an attribute and convert it to True or False.\n", "In this case, we can check if an engine is an OHC or not." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ohc 148\n", "ohcf 15\n", "ohcv 13\n", "l 12\n", "dohc 12\n", "rotor 4\n", "dohcv 1\n", "Name: engine_type, dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[\"engine_type\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use np.where and the str accessor to do this in one efficient line" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "obj_df[\"OHC_Code\"] = np.where(obj_df[\"engine_type\"].str.contains(\"ohc\"), 1, 0)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makeengine_typeOHC_Code
0alfa-romerodohc1
1alfa-romerodohc1
2alfa-romeroohcv1
3audiohc1
4audiohc1
5audiohc1
6audiohc1
7audiohc1
8audiohc1
9audiohc1
10bmwohc1
11bmwohc1
12bmwohc1
13bmwohc1
14bmwohc1
15bmwohc1
16bmwohc1
17bmwohc1
18chevroletl0
19chevroletohc1
\n", "
" ], "text/plain": [ " make engine_type OHC_Code\n", "0 alfa-romero dohc 1\n", "1 alfa-romero dohc 1\n", "2 alfa-romero ohcv 1\n", "3 audi ohc 1\n", "4 audi ohc 1\n", "5 audi ohc 1\n", "6 audi ohc 1\n", "7 audi ohc 1\n", "8 audi ohc 1\n", "9 audi ohc 1\n", "10 bmw ohc 1\n", "11 bmw ohc 1\n", "12 bmw ohc 1\n", "13 bmw ohc 1\n", "14 bmw ohc 1\n", "15 bmw ohc 1\n", "16 bmw ohc 1\n", "17 bmw ohc 1\n", "18 chevrolet l 0\n", "19 chevrolet ohc 1" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[[\"make\", \"engine_type\", \"OHC_Code\"]].head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Encoding Values Using Scikit-learn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Instantiate the OrdinalEncoder" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "ord_enc = OrdinalEncoder()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "obj_df[\"make_code\"] = ord_enc.fit_transform(obj_df[[\"make\"]])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makemake_code
0alfa-romero0.0
1alfa-romero0.0
2alfa-romero0.0
3audi1.0
4audi1.0
5audi1.0
6audi1.0
7audi1.0
8audi1.0
9audi1.0
10bmw2.0
\n", "
" ], "text/plain": [ " make make_code\n", "0 alfa-romero 0.0\n", "1 alfa-romero 0.0\n", "2 alfa-romero 0.0\n", "3 audi 1.0\n", "4 audi 1.0\n", "5 audi 1.0\n", "6 audi 1.0\n", "7 audi 1.0\n", "8 audi 1.0\n", "9 audi 1.0\n", "10 bmw 2.0" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df[[\"make\", \"make_code\"]].head(11)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To accomplish something similar to pandas get_dummies, use OneHotEncoder" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "oe_style = OneHotEncoder()\n", "oe_results = oe_style.fit_transform(obj_df[[\"body_style\"]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results are an array that needs to be converted to a DataFrame" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1., 0., 0., 0., 0.],\n", " [1., 0., 0., 0., 0.],\n", " [0., 0., 1., 0., 0.],\n", " ...,\n", " [0., 0., 0., 1., 0.],\n", " [0., 0., 0., 1., 0.],\n", " [0., 0., 0., 1., 0.]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "oe_results.toarray()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
convertiblehardtophatchbacksedanwagon
01.00.00.00.00.0
11.00.00.00.00.0
20.00.01.00.00.0
30.00.00.01.00.0
40.00.00.01.00.0
\n", "
" ], "text/plain": [ " convertible hardtop hatchback sedan wagon\n", "0 1.0 0.0 0.0 0.0 0.0\n", "1 1.0 0.0 0.0 0.0 0.0\n", "2 0.0 0.0 1.0 0.0 0.0\n", "3 0.0 0.0 0.0 1.0 0.0\n", "4 0.0 0.0 0.0 1.0 0.0" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Advanced Encoding\n", "[category_encoder](https://github.com/scikit-learn-contrib/category_encoders) library" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Get a new clean dataframe\n", "obj_df = df.select_dtypes(include=['object']).copy()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
makefuel_typeaspirationnum_doorsbody_styledrive_wheelsengine_locationengine_typenum_cylindersfuel_system
0alfa-romerogasstdtwoconvertiblerwdfrontdohcfourmpfi
1alfa-romerogasstdtwoconvertiblerwdfrontdohcfourmpfi
2alfa-romerogasstdtwohatchbackrwdfrontohcvsixmpfi
3audigasstdfoursedanfwdfrontohcfourmpfi
4audigasstdfoursedan4wdfrontohcfivempfi
\n", "
" ], "text/plain": [ " make fuel_type aspiration num_doors body_style drive_wheels \\\n", "0 alfa-romero gas std two convertible rwd \n", "1 alfa-romero gas std two convertible rwd \n", "2 alfa-romero gas std two hatchback rwd \n", "3 audi gas std four sedan fwd \n", "4 audi gas std four sedan 4wd \n", "\n", " engine_location engine_type num_cylinders fuel_system \n", "0 front dohc four mpfi \n", "1 front dohc four mpfi \n", "2 front ohcv six mpfi \n", "3 front ohc four mpfi \n", "4 front ohc five mpfi " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try out the Backward Difference Encoder on the engine_type column" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. 
Use is_categorical_dtype instead\n", " elif pd.api.types.is_categorical(cols):\n" ] }, { "data": { "text/plain": [ "BackwardDifferenceEncoder(cols=['engine_type'],\n", " mapping=[{'col': 'engine_type',\n", " 'mapping': engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", " 1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", " 2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n", " 3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", " 4 0.142857 0.285714 0.428571 -0.428571 -0.285714 \n", " 5 0.142857 0.285714 0.428571 0.571429 -0.285714 \n", " 6 0.142857 0.285714 0.428571 0.571429 0.714286 \n", " 7 0.142857 0.285714 0.428571 0.571429 0.714286 \n", "-1 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "-2 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "\n", " engine_type_5 \n", " 1 -0.142857 \n", " 2 -0.142857 \n", " 3 -0.142857 \n", " 4 -0.142857 \n", " 5 -0.142857 \n", " 6 -0.142857 \n", " 7 0.857143 \n", "-1 0.000000 \n", "-2 0.000000 }])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Specify the columns to encode then fit and transform\n", "encoder = ce.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n", "encoder.fit(obj_df, verbose=1)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", " elif pd.api.types.is_categorical(cols):\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
engine_type_0engine_type_1engine_type_2engine_type_3engine_type_4engine_type_5
0-0.857143-0.714286-0.571429-0.428571-0.285714-0.142857
1-0.857143-0.714286-0.571429-0.428571-0.285714-0.142857
20.142857-0.714286-0.571429-0.428571-0.285714-0.142857
30.1428570.285714-0.571429-0.428571-0.285714-0.142857
40.1428570.285714-0.571429-0.428571-0.285714-0.142857
\n", "
" ], "text/plain": [ " engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", "0 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", "1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", "2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n", "3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", "4 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", "\n", " engine_type_5 \n", "0 -0.142857 \n", "1 -0.142857 \n", "2 -0.142857 \n", "3 -0.142857 \n", "4 -0.142857 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder.fit_transform(obj_df).iloc[:,8:14].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another approach is to use a polynomial encoding." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", " elif pd.api.types.is_categorical(cols):\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
engine_type_0engine_type_1engine_type_2engine_type_3engine_type_4engine_type_5
0-0.5669470.545545-0.4082480.241747-0.1091090.032898
1-0.5669470.545545-0.4082480.241747-0.1091090.032898
2-0.3779640.0000000.408248-0.5640760.436436-0.197386
3-0.188982-0.3273270.4082480.080582-0.5455450.493464
4-0.188982-0.3273270.4082480.080582-0.5455450.493464
\n", "
" ], "text/plain": [ " engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", "0 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n", "1 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n", "2 -0.377964 0.000000 0.408248 -0.564076 0.436436 \n", "3 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n", "4 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n", "\n", " engine_type_5 \n", "0 0.032898 \n", "1 0.032898 \n", "2 -0.197386 \n", "3 0.493464 \n", "4 0.493464 " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n", "encoder.fit_transform(obj_df, verbose=1).iloc[:,8:14].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Scikit-learn pipeline\n", "Show an example of how to incorporate the encoding strategies into a scikit-learn pipeline" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# for the purposes of this analysis, only use a small subset of features\n", "feature_cols = [\n", " 'fuel_type', 'make', 'aspiration', 'highway_mpg', 'city_mpg',\n", " 'curb_weight', 'drive_wheels'\n", "]\n", "\n", "# Remove the empty price rows\n", "df_ml = df.dropna(subset=['price'])\n", "\n", "X = df_ml[feature_cols]\n", "y = df_ml['price']" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),\n", " ['fuel_type', 'make', 'drive_wheels']),\n", " (OrdinalEncoder(), ['aspiration']),\n", " remainder='passthrough')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "linreg = LinearRegression()\n", "pipe = make_pipeline(column_trans, linreg)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-4476.0937653 , -1014.54842052, -4227.68553953, -4936.79899194,\n", " 
-1591.8291911 , -3716.06617255, -4293.79197464, -1390.00486495,\n", " -1600.57946369, -2124.30041954])" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-2937.17" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Average the negative mean absolute error across the 10 cross-validation folds\n", "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error').mean().round(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }