{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Guide To Encoding Categorical Values in Python\n",
"Supporting notebook for [article](http://pbpython.com/categorical-encoding.html) on Practical Business Python."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import the pandas, scikit-learn, numpy and [category_encoders](https://github.com/scikit-learn-contrib/category_encoders) libraries."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n",
"from sklearn.compose import make_column_transformer\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"import category_encoders as ce"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define the column headers, since the data file does not contain a header row."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"headers = [\"symboling\", \"normalized_losses\", \"make\", \"fuel_type\", \"aspiration\", \"num_doors\", \"body_style\",\n",
" \"drive_wheels\", \"engine_location\", \"wheel_base\", \"length\", \"width\", \"height\", \"curb_weight\",\n",
" \"engine_type\", \"num_cylinders\", \"engine_size\", \"fuel_system\", \"bore\", \"stroke\", \n",
" \"compression_ratio\", \"horsepower\", \"peak_rpm\", \"city_mpg\", \"highway_mpg\", \"price\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read in the data from the URL, add the headers and convert `?` to NaN values."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n",
" header=None, names=headers, na_values=\"?\" )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" symboling | \n",
" normalized_losses | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" wheel_base | \n",
" ... | \n",
" engine_size | \n",
" fuel_system | \n",
" bore | \n",
" stroke | \n",
" compression_ratio | \n",
" horsepower | \n",
" peak_rpm | \n",
" city_mpg | \n",
" highway_mpg | \n",
" price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3 | \n",
" NaN | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" 88.6 | \n",
" ... | \n",
" 130 | \n",
" mpfi | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111.0 | \n",
" 5000.0 | \n",
" 21 | \n",
" 27 | \n",
" 13495.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" NaN | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" 88.6 | \n",
" ... | \n",
" 130 | \n",
" mpfi | \n",
" 3.47 | \n",
" 2.68 | \n",
" 9.0 | \n",
" 111.0 | \n",
" 5000.0 | \n",
" 21 | \n",
" 27 | \n",
" 16500.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" NaN | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" hatchback | \n",
" rwd | \n",
" front | \n",
" 94.5 | \n",
" ... | \n",
" 152 | \n",
" mpfi | \n",
" 2.68 | \n",
" 3.47 | \n",
" 9.0 | \n",
" 154.0 | \n",
" 5000.0 | \n",
" 19 | \n",
" 26 | \n",
" 16500.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" 164.0 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" 99.8 | \n",
" ... | \n",
" 109 | \n",
" mpfi | \n",
" 3.19 | \n",
" 3.40 | \n",
" 10.0 | \n",
" 102.0 | \n",
" 5500.0 | \n",
" 24 | \n",
" 30 | \n",
" 13950.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2 | \n",
" 164.0 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" 4wd | \n",
" front | \n",
" 99.4 | \n",
" ... | \n",
" 136 | \n",
" mpfi | \n",
" 3.19 | \n",
" 3.40 | \n",
" 8.0 | \n",
" 115.0 | \n",
" 5500.0 | \n",
" 18 | \n",
" 22 | \n",
" 17450.0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 26 columns
\n",
"
"
],
"text/plain": [
" symboling normalized_losses make fuel_type aspiration num_doors \\\n",
"0 3 NaN alfa-romero gas std two \n",
"1 3 NaN alfa-romero gas std two \n",
"2 1 NaN alfa-romero gas std two \n",
"3 2 164.0 audi gas std four \n",
"4 2 164.0 audi gas std four \n",
"\n",
" body_style drive_wheels engine_location wheel_base ... engine_size \\\n",
"0 convertible rwd front 88.6 ... 130 \n",
"1 convertible rwd front 88.6 ... 130 \n",
"2 hatchback rwd front 94.5 ... 152 \n",
"3 sedan fwd front 99.8 ... 109 \n",
"4 sedan 4wd front 99.4 ... 136 \n",
"\n",
" fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n",
"0 mpfi 3.47 2.68 9.0 111.0 5000.0 21 \n",
"1 mpfi 3.47 2.68 9.0 111.0 5000.0 21 \n",
"2 mpfi 2.68 3.47 9.0 154.0 5000.0 19 \n",
"3 mpfi 3.19 3.40 10.0 102.0 5500.0 24 \n",
"4 mpfi 3.19 3.40 8.0 115.0 5500.0 18 \n",
"\n",
" highway_mpg price \n",
"0 27 13495.0 \n",
"1 27 16500.0 \n",
"2 26 16500.0 \n",
"3 30 13950.0 \n",
"4 22 17450.0 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the data types contained in the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"symboling int64\n",
"normalized_losses float64\n",
"make object\n",
"fuel_type object\n",
"aspiration object\n",
"num_doors object\n",
"body_style object\n",
"drive_wheels object\n",
"engine_location object\n",
"wheel_base float64\n",
"length float64\n",
"width float64\n",
"height float64\n",
"curb_weight int64\n",
"engine_type object\n",
"num_cylinders object\n",
"engine_size int64\n",
"fuel_system object\n",
"bore float64\n",
"stroke float64\n",
"compression_ratio float64\n",
"horsepower float64\n",
"peak_rpm float64\n",
"city_mpg int64\n",
"highway_mpg int64\n",
"price float64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a copy of the data with only the object columns."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"obj_df = df.select_dtypes(include=['object']).copy()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" hatchback | \n",
" rwd | \n",
" front | \n",
" ohcv | \n",
" six | \n",
" mpfi | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" ohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" 4wd | \n",
" front | \n",
" ohc | \n",
" five | \n",
" mpfi | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors body_style drive_wheels \\\n",
"0 alfa-romero gas std two convertible rwd \n",
"1 alfa-romero gas std two convertible rwd \n",
"2 alfa-romero gas std two hatchback rwd \n",
"3 audi gas std four sedan fwd \n",
"4 audi gas std four sedan 4wd \n",
"\n",
" engine_location engine_type num_cylinders fuel_system \n",
"0 front dohc four mpfi \n",
"1 front dohc four mpfi \n",
"2 front ohcv six mpfi \n",
"3 front ohc four mpfi \n",
"4 front ohc five mpfi "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check for null values in the data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
"
\n",
" \n",
" \n",
" \n",
" 27 | \n",
" dodge | \n",
" gas | \n",
" turbo | \n",
" NaN | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" ohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 63 | \n",
" mazda | \n",
" diesel | \n",
" std | \n",
" NaN | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" ohc | \n",
" four | \n",
" idi | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors body_style drive_wheels \\\n",
"27 dodge gas turbo NaN sedan fwd \n",
"63 mazda diesel std NaN sedan fwd \n",
"\n",
" engine_location engine_type num_cylinders fuel_system \n",
"27 front ohc four mpfi \n",
"63 front ohc four idi "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[obj_df.isnull().any(axis=1)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the num_doors column contains the null values, look at which values it currently contains."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"four 114\n",
"two 89\n",
"Name: num_doors, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[\"num_doors\"].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will fill in the doors value with the most common element - four."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"obj_df = obj_df.fillna({\"num_doors\": \"four\"})"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [make, fuel_type, aspiration, num_doors, body_style, drive_wheels, engine_location, engine_type, num_cylinders, fuel_system]\n",
"Index: []"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[obj_df.isnull().any(axis=1)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Encoding values using pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert the num_cylinders and num_doors values to numbers"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"four 159\n",
"six 24\n",
"five 11\n",
"eight 5\n",
"two 4\n",
"three 1\n",
"twelve 1\n",
"Name: num_cylinders, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[\"num_cylinders\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"cleanup_nums = {\"num_doors\": {\"four\": 4, \"two\": 2},\n",
" \"num_cylinders\": {\"four\": 4, \"six\": 6, \"five\": 5, \"eight\": 8,\n",
" \"two\": 2, \"twelve\": 12, \"three\":3 }}"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"obj_df = obj_df.replace(cleanup_nums)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" hatchback | \n",
" rwd | \n",
" front | \n",
" ohcv | \n",
" 6 | \n",
" mpfi | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" ohc | \n",
" 4 | \n",
" mpfi | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" sedan | \n",
" 4wd | \n",
" front | \n",
" ohc | \n",
" 5 | \n",
" mpfi | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors body_style drive_wheels \\\n",
"0 alfa-romero gas std 2 convertible rwd \n",
"1 alfa-romero gas std 2 convertible rwd \n",
"2 alfa-romero gas std 2 hatchback rwd \n",
"3 audi gas std 4 sedan fwd \n",
"4 audi gas std 4 sedan 4wd \n",
"\n",
" engine_location engine_type num_cylinders fuel_system \n",
"0 front dohc 4 mpfi \n",
"1 front dohc 4 mpfi \n",
"2 front ohcv 6 mpfi \n",
"3 front ohc 4 mpfi \n",
"4 front ohc 5 mpfi "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check the data types to make sure they are coming through as numbers"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make object\n",
"fuel_type object\n",
"aspiration object\n",
"num_doors int64\n",
"body_style object\n",
"drive_wheels object\n",
"engine_location object\n",
"engine_type object\n",
"num_cylinders int64\n",
"fuel_system object\n",
"dtype: object"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One approach to encoding labels is to convert the values to a pandas category"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sedan 96\n",
"hatchback 70\n",
"wagon 25\n",
"hardtop 8\n",
"convertible 6\n",
"Name: body_style, dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[\"body_style\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"obj_df[\"body_style\"] = obj_df[\"body_style\"].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make object\n",
"fuel_type object\n",
"aspiration object\n",
"num_doors int64\n",
"body_style category\n",
"drive_wheels object\n",
"engine_location object\n",
"engine_type object\n",
"num_cylinders int64\n",
"fuel_system object\n",
"dtype: object"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can assign the category codes to a new column so we have a clean numeric representation"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"obj_df[\"body_style_cat\"] = obj_df[\"body_style\"].cat.codes"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
" body_style_cat | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" hatchback | \n",
" rwd | \n",
" front | \n",
" ohcv | \n",
" 6 | \n",
" mpfi | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" ohc | \n",
" 4 | \n",
" mpfi | \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" sedan | \n",
" 4wd | \n",
" front | \n",
" ohc | \n",
" 5 | \n",
" mpfi | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors body_style drive_wheels \\\n",
"0 alfa-romero gas std 2 convertible rwd \n",
"1 alfa-romero gas std 2 convertible rwd \n",
"2 alfa-romero gas std 2 hatchback rwd \n",
"3 audi gas std 4 sedan fwd \n",
"4 audi gas std 4 sedan 4wd \n",
"\n",
" engine_location engine_type num_cylinders fuel_system body_style_cat \n",
"0 front dohc 4 mpfi 0 \n",
"1 front dohc 4 mpfi 0 \n",
"2 front ohcv 6 mpfi 2 \n",
"3 front ohc 4 mpfi 3 \n",
"4 front ohc 5 mpfi 3 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make object\n",
"fuel_type object\n",
"aspiration object\n",
"num_doors int64\n",
"body_style category\n",
"drive_wheels object\n",
"engine_location object\n",
"engine_type object\n",
"num_cylinders int64\n",
"fuel_system object\n",
"body_style_cat int8\n",
"dtype: object"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to do one hot encoding, use pandas get_dummies"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
" body_style_cat | \n",
" drive_wheels_4wd | \n",
" drive_wheels_fwd | \n",
" drive_wheels_rwd | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" convertible | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" convertible | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" hatchback | \n",
" front | \n",
" ohcv | \n",
" 6 | \n",
" mpfi | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" sedan | \n",
" front | \n",
" ohc | \n",
" 4 | \n",
" mpfi | \n",
" 3 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" sedan | \n",
" front | \n",
" ohc | \n",
" 5 | \n",
" mpfi | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors body_style engine_location \\\n",
"0 alfa-romero gas std 2 convertible front \n",
"1 alfa-romero gas std 2 convertible front \n",
"2 alfa-romero gas std 2 hatchback front \n",
"3 audi gas std 4 sedan front \n",
"4 audi gas std 4 sedan front \n",
"\n",
" engine_type num_cylinders fuel_system body_style_cat drive_wheels_4wd \\\n",
"0 dohc 4 mpfi 0 0 \n",
"1 dohc 4 mpfi 0 0 \n",
"2 ohcv 6 mpfi 2 0 \n",
"3 ohc 4 mpfi 3 0 \n",
"4 ohc 5 mpfi 3 1 \n",
"\n",
" drive_wheels_fwd drive_wheels_rwd \n",
"0 0 1 \n",
"1 0 1 \n",
"2 0 1 \n",
"3 1 0 \n",
"4 0 0 "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.get_dummies(obj_df, columns=[\"drive_wheels\"]).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"get_dummies has options for selecting the columns and adding prefixes to make the resulting data easier to understand."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
" body_style_cat | \n",
" body_convertible | \n",
" body_hardtop | \n",
" body_hatchback | \n",
" body_sedan | \n",
" body_wagon | \n",
" drive_4wd | \n",
" drive_fwd | \n",
" drive_rwd | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" front | \n",
" dohc | \n",
" 4 | \n",
" mpfi | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" 2 | \n",
" front | \n",
" ohcv | \n",
" 6 | \n",
" mpfi | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" front | \n",
" ohc | \n",
" 4 | \n",
" mpfi | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" 4 | \n",
" front | \n",
" ohc | \n",
" 5 | \n",
" mpfi | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors engine_location engine_type \\\n",
"0 alfa-romero gas std 2 front dohc \n",
"1 alfa-romero gas std 2 front dohc \n",
"2 alfa-romero gas std 2 front ohcv \n",
"3 audi gas std 4 front ohc \n",
"4 audi gas std 4 front ohc \n",
"\n",
" num_cylinders fuel_system body_style_cat body_convertible body_hardtop \\\n",
"0 4 mpfi 0 1 0 \n",
"1 4 mpfi 0 1 0 \n",
"2 6 mpfi 2 0 0 \n",
"3 4 mpfi 3 0 0 \n",
"4 5 mpfi 3 0 0 \n",
"\n",
" body_hatchback body_sedan body_wagon drive_4wd drive_fwd drive_rwd \n",
"0 0 0 0 0 0 1 \n",
"1 0 0 0 0 0 1 \n",
"2 1 0 0 0 0 1 \n",
"3 0 1 0 0 1 0 \n",
"4 0 1 0 1 0 0 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.get_dummies(obj_df, columns=[\"body_style\", \"drive_wheels\"], prefix=[\"body\", \"drive\"]).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Another approach to encoding values is to select an attribute and convert it to True or False.\n",
"In this case, we can check if an engine is an OHC or not."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ohc 148\n",
"ohcf 15\n",
"ohcv 13\n",
"l 12\n",
"dohc 12\n",
"rotor 4\n",
"dohcv 1\n",
"Name: engine_type, dtype: int64"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[\"engine_type\"].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use np.where and the str accessor to do this in one efficient line"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"obj_df[\"OHC_Code\"] = np.where(obj_df[\"engine_type\"].str.contains(\"ohc\"), 1, 0)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" engine_type | \n",
" OHC_Code | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" dohc | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" dohc | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" ohcv | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" audi | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 10 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 11 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 12 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 13 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 14 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 15 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 16 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 17 | \n",
" bmw | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
" 18 | \n",
" chevrolet | \n",
" l | \n",
" 0 | \n",
"
\n",
" \n",
" 19 | \n",
" chevrolet | \n",
" ohc | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make engine_type OHC_Code\n",
"0 alfa-romero dohc 1\n",
"1 alfa-romero dohc 1\n",
"2 alfa-romero ohcv 1\n",
"3 audi ohc 1\n",
"4 audi ohc 1\n",
"5 audi ohc 1\n",
"6 audi ohc 1\n",
"7 audi ohc 1\n",
"8 audi ohc 1\n",
"9 audi ohc 1\n",
"10 bmw ohc 1\n",
"11 bmw ohc 1\n",
"12 bmw ohc 1\n",
"13 bmw ohc 1\n",
"14 bmw ohc 1\n",
"15 bmw ohc 1\n",
"16 bmw ohc 1\n",
"17 bmw ohc 1\n",
"18 chevrolet l 0\n",
"19 chevrolet ohc 1"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[[\"make\", \"engine_type\", \"OHC_Code\"]].head(20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Encoding Values Using Scikit-learn"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instantiate the OrdinalEncoder"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"ord_enc = OrdinalEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"obj_df[\"make_code\"] = ord_enc.fit_transform(obj_df[[\"make\"]])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" make_code | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 6 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 7 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 8 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 9 | \n",
" audi | \n",
" 1.0 | \n",
"
\n",
" \n",
" 10 | \n",
" bmw | \n",
" 2.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make make_code\n",
"0 alfa-romero 0.0\n",
"1 alfa-romero 0.0\n",
"2 alfa-romero 0.0\n",
"3 audi 1.0\n",
"4 audi 1.0\n",
"5 audi 1.0\n",
"6 audi 1.0\n",
"7 audi 1.0\n",
"8 audi 1.0\n",
"9 audi 1.0\n",
"10 bmw 2.0"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df[[\"make\", \"make_code\"]].head(11)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To accomplish something similar to pandas get_dummies, use OneHotEncoder"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"oe_style = OneHotEncoder()\n",
"oe_results = oe_style.fit_transform(obj_df[[\"body_style\"]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The result is a sparse matrix, so it needs to be converted to a dense array with `toarray()` and then to a DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 0., 0., 0., 0.],\n",
" [1., 0., 0., 0., 0.],\n",
" [0., 0., 1., 0., 0.],\n",
" ...,\n",
" [0., 0., 0., 1., 0.],\n",
" [0., 0., 0., 1., 0.],\n",
" [0., 0., 0., 1., 0.]])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oe_results.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" convertible | \n",
" hardtop | \n",
" hatchback | \n",
" sedan | \n",
" wagon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" convertible hardtop hatchback sedan wagon\n",
"0 1.0 0.0 0.0 0.0 0.0\n",
"1 1.0 0.0 0.0 0.0 0.0\n",
"2 0.0 0.0 1.0 0.0 0.0\n",
"3 0.0 0.0 0.0 1.0 0.0\n",
"4 0.0 0.0 0.0 1.0 0.0"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Advanced Encoding\n",
"[category_encoders](https://github.com/scikit-learn-contrib/category_encoders) library"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Get a new clean dataframe\n",
"obj_df = df.select_dtypes(include=['object']).copy()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" make | \n",
" fuel_type | \n",
" aspiration | \n",
" num_doors | \n",
" body_style | \n",
" drive_wheels | \n",
" engine_location | \n",
" engine_type | \n",
" num_cylinders | \n",
" fuel_system | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 1 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" convertible | \n",
" rwd | \n",
" front | \n",
" dohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 2 | \n",
" alfa-romero | \n",
" gas | \n",
" std | \n",
" two | \n",
" hatchback | \n",
" rwd | \n",
" front | \n",
" ohcv | \n",
" six | \n",
" mpfi | \n",
"
\n",
" \n",
" 3 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" fwd | \n",
" front | \n",
" ohc | \n",
" four | \n",
" mpfi | \n",
"
\n",
" \n",
" 4 | \n",
" audi | \n",
" gas | \n",
" std | \n",
" four | \n",
" sedan | \n",
" 4wd | \n",
" front | \n",
" ohc | \n",
" five | \n",
" mpfi | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" make fuel_type aspiration num_doors body_style drive_wheels \\\n",
"0 alfa-romero gas std two convertible rwd \n",
"1 alfa-romero gas std two convertible rwd \n",
"2 alfa-romero gas std two hatchback rwd \n",
"3 audi gas std four sedan fwd \n",
"4 audi gas std four sedan 4wd \n",
"\n",
" engine_location engine_type num_cylinders fuel_system \n",
"0 front dohc four mpfi \n",
"1 front dohc four mpfi \n",
"2 front ohcv six mpfi \n",
"3 front ohc four mpfi \n",
"4 front ohc five mpfi "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Try out the Backward Difference Encoder on the engine_type column"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n",
" elif pd.api.types.is_categorical(cols):\n"
]
},
{
"data": {
"text/plain": [
"BackwardDifferenceEncoder(cols=['engine_type'],\n",
" mapping=[{'col': 'engine_type',\n",
" 'mapping': engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n",
" 1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n",
" 2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n",
" 3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n",
" 4 0.142857 0.285714 0.428571 -0.428571 -0.285714 \n",
" 5 0.142857 0.285714 0.428571 0.571429 -0.285714 \n",
" 6 0.142857 0.285714 0.428571 0.571429 0.714286 \n",
" 7 0.142857 0.285714 0.428571 0.571429 0.714286 \n",
"-1 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"-2 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"\n",
" engine_type_5 \n",
" 1 -0.142857 \n",
" 2 -0.142857 \n",
" 3 -0.142857 \n",
" 4 -0.142857 \n",
" 5 -0.142857 \n",
" 6 -0.142857 \n",
" 7 0.857143 \n",
"-1 0.000000 \n",
"-2 0.000000 }])"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Specify the columns to encode then fit and transform\n",
"encoder = ce.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n",
"encoder.fit(obj_df, verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n",
" elif pd.api.types.is_categorical(cols):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" engine_type_0 | \n",
" engine_type_1 | \n",
" engine_type_2 | \n",
" engine_type_3 | \n",
" engine_type_4 | \n",
" engine_type_5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.857143 | \n",
" -0.714286 | \n",
" -0.571429 | \n",
" -0.428571 | \n",
" -0.285714 | \n",
" -0.142857 | \n",
"
\n",
" \n",
" 1 | \n",
" -0.857143 | \n",
" -0.714286 | \n",
" -0.571429 | \n",
" -0.428571 | \n",
" -0.285714 | \n",
" -0.142857 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.142857 | \n",
" -0.714286 | \n",
" -0.571429 | \n",
" -0.428571 | \n",
" -0.285714 | \n",
" -0.142857 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.142857 | \n",
" 0.285714 | \n",
" -0.571429 | \n",
" -0.428571 | \n",
" -0.285714 | \n",
" -0.142857 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.142857 | \n",
" 0.285714 | \n",
" -0.571429 | \n",
" -0.428571 | \n",
" -0.285714 | \n",
" -0.142857 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n",
"0 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n",
"1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n",
"2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n",
"3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n",
"4 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n",
"\n",
" engine_type_5 \n",
"0 -0.142857 \n",
"1 -0.142857 \n",
"2 -0.142857 \n",
"3 -0.142857 \n",
"4 -0.142857 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder.fit_transform(obj_df).iloc[:,8:14].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Another approach is to use a polynomial encoding."
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n",
" elif pd.api.types.is_categorical(cols):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" engine_type_0 | \n",
" engine_type_1 | \n",
" engine_type_2 | \n",
" engine_type_3 | \n",
" engine_type_4 | \n",
" engine_type_5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.566947 | \n",
" 0.545545 | \n",
" -0.408248 | \n",
" 0.241747 | \n",
" -0.109109 | \n",
" 0.032898 | \n",
"
\n",
" \n",
" 1 | \n",
" -0.566947 | \n",
" 0.545545 | \n",
" -0.408248 | \n",
" 0.241747 | \n",
" -0.109109 | \n",
" 0.032898 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.377964 | \n",
" 0.000000 | \n",
" 0.408248 | \n",
" -0.564076 | \n",
" 0.436436 | \n",
" -0.197386 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.188982 | \n",
" -0.327327 | \n",
" 0.408248 | \n",
" 0.080582 | \n",
" -0.545545 | \n",
" 0.493464 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.188982 | \n",
" -0.327327 | \n",
" 0.408248 | \n",
" 0.080582 | \n",
" -0.545545 | \n",
" 0.493464 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n",
"0 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n",
"1 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n",
"2 -0.377964 0.000000 0.408248 -0.564076 0.436436 \n",
"3 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n",
"4 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n",
"\n",
" engine_type_5 \n",
"0 0.032898 \n",
"1 0.032898 \n",
"2 -0.197386 \n",
"3 0.493464 \n",
"4 0.493464 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n",
"encoder.fit_transform(obj_df, verbose=1).iloc[:,8:14].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Scikit-learn pipeline\n",
"Show an example of how to incorporate the encoding strategies into a scikit-learn pipeline"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# for the purposes of this analysis, only use a small subset of features\n",
"feature_cols = [\n",
" 'fuel_type', 'make', 'aspiration', 'highway_mpg', 'city_mpg',\n",
" 'curb_weight', 'drive_wheels'\n",
"]\n",
"\n",
"# Remove the empty price rows\n",
"df_ml = df.dropna(subset=['price'])\n",
"\n",
"X = df_ml[feature_cols]\n",
"y = df_ml['price']"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),\n",
" ['fuel_type', 'make', 'drive_wheels']),\n",
" (OrdinalEncoder(), ['aspiration']),\n",
" remainder='passthrough')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"linreg = LinearRegression()\n",
"pipe = make_pipeline(column_trans, linreg)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-4476.0937653 , -1014.54842052, -4227.68553953, -4936.79899194,\n",
" -1591.8291911 , -3716.06617255, -4293.79197464, -1390.00486495,\n",
" -1600.57946369, -2124.30041954])"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-2937.17"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the average of the errors after 10 iterations\n",
"cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error').mean().round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}