{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "
symboling | normalized_losses | make | fuel_type | aspiration | doors | body_style | drive_wheels | engine_location | wheel_base | length | width | height | curb_weight | engine_type | num_cylinders | engine_size | fuel_system | bore | stroke | compression_ratio | horsepower | peak_rpm | city_mpg | highway_mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "3 | \n", "? | \n", "alfa-romero | \n", "gas | \n", "std | \n", "two | \n", "convertible | \n", "rwd | \n", "front | \n", "88.6 | \n", "168.8 | \n", "64.1 | \n", "48.8 | \n", "2548 | \n", "dohc | \n", "four | \n", "130 | \n", "mpfi | \n", "3.47 | \n", "2.68 | \n", "9 | \n", "111 | \n", "5000 | \n", "21 | \n", "27 | \n", "13495 | \n", "
1 | \n", "3 | \n", "? | \n", "alfa-romero | \n", "gas | \n", "std | \n", "two | \n", "convertible | \n", "rwd | \n", "front | \n", "88.6 | \n", "168.8 | \n", "64.1 | \n", "48.8 | \n", "2548 | \n", "dohc | \n", "four | \n", "130 | \n", "mpfi | \n", "3.47 | \n", "2.68 | \n", "9 | \n", "111 | \n", "5000 | \n", "21 | \n", "27 | \n", "16500 | \n", "
2 | \n", "1 | \n", "? | \n", "alfa-romero | \n", "gas | \n", "std | \n", "two | \n", "hatchback | \n", "rwd | \n", "front | \n", "94.5 | \n", "171.2 | \n", "65.5 | \n", "52.4 | \n", "2823 | \n", "ohcv | \n", "six | \n", "152 | \n", "mpfi | \n", "2.68 | \n", "3.47 | \n", "9 | \n", "154 | \n", "5000 | \n", "19 | \n", "26 | \n", "16500 | \n", "
3 | \n", "2 | \n", "164 | \n", "audi | \n", "gas | \n", "std | \n", "four | \n", "sedan | \n", "fwd | \n", "front | \n", "99.8 | \n", "176.6 | \n", "66.2 | \n", "54.3 | \n", "2337 | \n", "ohc | \n", "four | \n", "109 | \n", "mpfi | \n", "3.19 | \n", "3.40 | \n", "10 | \n", "102 | \n", "5500 | \n", "24 | \n", "30 | \n", "13950 | \n", "
4 | \n", "2 | \n", "164 | \n", "audi | \n", "gas | \n", "std | \n", "four | \n", "sedan | \n", "4wd | \n", "front | \n", "99.4 | \n", "176.6 | \n", "66.4 | \n", "54.3 | \n", "2824 | \n", "ohc | \n", "five | \n", "136 | \n", "mpfi | \n", "3.19 | \n", "3.40 | \n", "8 | \n", "115 | \n", "5500 | \n", "18 | \n", "22 | \n", "17450 | \n", "
np.nan
values.\n",
"3. Change the data type to float.\n",
"4. Remove the 4 observations with an unknown price since this attribute is our target.\n",
"5. Replace the remaining missing values by the mean of the attribute."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def remove_attribute(attribute, df):\n",
"    \"\"\"\n",
"    Drop a column from a DataFrame, modifying the DataFrame in place.\n",
"\n",
"    attribute (string): name of the column to drop\n",
"    df (DataFrame): the data; it is mutated and nothing is returned\n",
"    \"\"\"\n",
"    # `columns=` is the keyword spelling of a label drop along axis=1.\n",
"    df.drop(columns=attribute, inplace=True)\n",
" \n",
"\n",
"def clean_continuous_attributes(mv_str, cols_index, df):\n",
"    \"\"\"\n",
"    Replace the missing-value marker with np.nan and cast the selected columns to float64.\n",
"\n",
"    mv_str (string): the string denoting a missing value\n",
"    cols_index (list of integers): positions of the columns to clean\n",
"    df (DataFrame): the data, modified in place\n",
"\n",
"    Returns the same DataFrame object that was passed in.\n",
"    \"\"\"\n",
"    # Resolve positions to column labels first; positions that df does not have are ignored.\n",
"    selected = [name for position, name in enumerate(df.columns) if position in cols_index]\n",
"    for name in selected:\n",
"        df[name] = df[name].replace(mv_str, np.nan)\n",
"        df[name] = df[name].astype(np.float64)\n",
"    return df\n",
"\n",
"def remove_rows(target, df):\n",
"    \"\"\"\n",
"    Remove the rows holding np.nan in the given columns and renumber the index, in place.\n",
"\n",
"    target (array-like of strings): the column names in which np.nan values are not allowed\n",
"    df (DataFrame): the data; it is mutated and nothing is returned\n",
"    \"\"\"\n",
"    df.dropna(subset=target, inplace=True)\n",
"    # drop=True discards the old index instead of inserting it as a column, so this also\n",
"    # works when the index is named or when df already has a column called \"index\".\n",
"    df.reset_index(drop=True, inplace=True)\n",
" \n",
"\n",
"def impute_missing_values(cols_index, df):\n",
"    \"\"\"\n",
"    Fill the np.nan values of the selected columns with the mean of each column.\n",
"\n",
"    cols_index (list of integers): positions of the columns to impute\n",
"    df (DataFrame): the data, modified in place\n",
"\n",
"    Returns the same DataFrame object that was passed in.\n",
"    \"\"\"\n",
"    # Resolve positions to column labels first; positions that df does not have are ignored.\n",
"    selected = [name for position, name in enumerate(df.columns) if position in cols_index]\n",
"    for name in selected:\n",
"        # Series.mean() skips NaN by default, so only the observed values contribute.\n",
"        df[name] = df[name].fillna(df[name].mean())\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n", " | symboling | \n", "make | \n", "fuel_type | \n", "aspiration | \n", "doors | \n", "body_style | \n", "drive_wheels | \n", "engine_location | \n", "wheel_base | \n", "length | \n", "... | \n", "engine_size | \n", "fuel_system | \n", "bore | \n", "stroke | \n", "compression_ratio | \n", "horsepower | \n", "peak_rpm | \n", "city_mpg | \n", "highway_mpg | \n", "price | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "3 | \n", "alfa-romero | \n", "gas | \n", "std | \n", "two | \n", "convertible | \n", "rwd | \n", "front | \n", "88.6 | \n", "168.8 | \n", "... | \n", "130 | \n", "mpfi | \n", "3.47 | \n", "2.68 | \n", "9.0 | \n", "111.0 | \n", "5000.0 | \n", "21 | \n", "27 | \n", "13495.0 | \n", "
1 | \n", "3 | \n", "alfa-romero | \n", "gas | \n", "std | \n", "two | \n", "convertible | \n", "rwd | \n", "front | \n", "88.6 | \n", "168.8 | \n", "... | \n", "130 | \n", "mpfi | \n", "3.47 | \n", "2.68 | \n", "9.0 | \n", "111.0 | \n", "5000.0 | \n", "21 | \n", "27 | \n", "16500.0 | \n", "
2 | \n", "1 | \n", "alfa-romero | \n", "gas | \n", "std | \n", "two | \n", "hatchback | \n", "rwd | \n", "front | \n", "94.5 | \n", "171.2 | \n", "... | \n", "152 | \n", "mpfi | \n", "2.68 | \n", "3.47 | \n", "9.0 | \n", "154.0 | \n", "5000.0 | \n", "19 | \n", "26 | \n", "16500.0 | \n", "
3 | \n", "2 | \n", "audi | \n", "gas | \n", "std | \n", "four | \n", "sedan | \n", "fwd | \n", "front | \n", "99.8 | \n", "176.6 | \n", "... | \n", "109 | \n", "mpfi | \n", "3.19 | \n", "3.40 | \n", "10.0 | \n", "102.0 | \n", "5500.0 | \n", "24 | \n", "30 | \n", "13950.0 | \n", "
4 | \n", "2 | \n", "audi | \n", "gas | \n", "std | \n", "four | \n", "sedan | \n", "4wd | \n", "front | \n", "99.4 | \n", "176.6 | \n", "... | \n", "136 | \n", "mpfi | \n", "3.19 | \n", "3.40 | \n", "8.0 | \n", "115.0 | \n", "5500.0 | \n", "18 | \n", "22 | \n", "17450.0 | \n", "
5 rows × 25 columns
\n", "\n", " | symboling_B | \n", "symboling_C | \n", "symboling_D | \n", "symboling_E | \n", "symboling_F | \n", "symboling_G | \n", "make_alfa-romero | \n", "make_audi | \n", "make_bmw | \n", "make_chevrolet | \n", "... | \n", "num_cylinders_twelve | \n", "num_cylinders_two | \n", "fuel_system_1bbl | \n", "fuel_system_2bbl | \n", "fuel_system_4bbl | \n", "fuel_system_idi | \n", "fuel_system_mfi | \n", "fuel_system_mpfi | \n", "fuel_system_spdi | \n", "fuel_system_spfi | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
1 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
2 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
3 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
4 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
5 rows × 66 columns
\n", "