{
  "cells": [
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# ML/Random Forest&SGD Regressor: Vehicle Mileage Prediction\n",
        "[Click here to Interact with this code on nbViewer](https://nbviewer.org/github/ujwalnk/MachineLearning101/blob/main/docs/examples/Machine_Learning_03_Fuel_Efficiency_Regression.ipynb)\n",
        "## Data Preprocessing"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1YQT6g6JAGOB"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import sklearn"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "33Dfq5kKBK4F"
      },
      "source": [
        "> Pandas is most likey used when missing data / fixing data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ihknFY4j1tHH"
      },
      "outputs": [],
      "source": [
        "url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'\n",
        "column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',\n",
        "                'Acceleration', 'Model Year', 'Origin']\n",
        "\n",
        "df = pd.read_csv(url, names=column_names,\n",
        "                          na_values='?', comment='\\t',\n",
        "                          sep=' ', skipinitialspace=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xmnOga7ZBFgz",
        "outputId": "3ae9597e-b006-4e13-bf3c-1c889f6b48eb"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "MPG             0\n",
              "Cylinders       0\n",
              "Displacement    0\n",
              "Horsepower      6\n",
              "Weight          0\n",
              "Acceleration    0\n",
              "Model Year      0\n",
              "Origin          0\n",
              "dtype: int64"
            ]
          },
          "execution_count": 3,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "df.isnull().sum()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 423
        },
        "id": "OlPwpc262WYn",
        "outputId": "e291352b-1e15-4ac3-b6c5-f4f825a8a676"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-f5f5de0d-6941-46eb-84bf-193740797443\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>MPG</th>\n",
              "      <th>Cylinders</th>\n",
              "      <th>Displacement</th>\n",
              "      <th>Horsepower</th>\n",
              "      <th>Weight</th>\n",
              "      <th>Acceleration</th>\n",
              "      <th>Model Year</th>\n",
              "      <th>Origin</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>18.0</td>\n",
              "      <td>8</td>\n",
              "      <td>307.0</td>\n",
              "      <td>130.0</td>\n",
              "      <td>3504.0</td>\n",
              "      <td>12.0</td>\n",
              "      <td>70</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>15.0</td>\n",
              "      <td>8</td>\n",
              "      <td>350.0</td>\n",
              "      <td>165.0</td>\n",
              "      <td>3693.0</td>\n",
              "      <td>11.5</td>\n",
              "      <td>70</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>18.0</td>\n",
              "      <td>8</td>\n",
              "      <td>318.0</td>\n",
              "      <td>150.0</td>\n",
              "      <td>3436.0</td>\n",
              "      <td>11.0</td>\n",
              "      <td>70</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>16.0</td>\n",
              "      <td>8</td>\n",
              "      <td>304.0</td>\n",
              "      <td>150.0</td>\n",
              "      <td>3433.0</td>\n",
              "      <td>12.0</td>\n",
              "      <td>70</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>17.0</td>\n",
              "      <td>8</td>\n",
              "      <td>302.0</td>\n",
              "      <td>140.0</td>\n",
              "      <td>3449.0</td>\n",
              "      <td>10.5</td>\n",
              "      <td>70</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>393</th>\n",
              "      <td>27.0</td>\n",
              "      <td>4</td>\n",
              "      <td>140.0</td>\n",
              "      <td>86.0</td>\n",
              "      <td>2790.0</td>\n",
              "      <td>15.6</td>\n",
              "      <td>82</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>394</th>\n",
              "      <td>44.0</td>\n",
              "      <td>4</td>\n",
              "      <td>97.0</td>\n",
              "      <td>52.0</td>\n",
              "      <td>2130.0</td>\n",
              "      <td>24.6</td>\n",
              "      <td>82</td>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>395</th>\n",
              "      <td>32.0</td>\n",
              "      <td>4</td>\n",
              "      <td>135.0</td>\n",
              "      <td>84.0</td>\n",
              "      <td>2295.0</td>\n",
              "      <td>11.6</td>\n",
              "      <td>82</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>396</th>\n",
              "      <td>28.0</td>\n",
              "      <td>4</td>\n",
              "      <td>120.0</td>\n",
              "      <td>79.0</td>\n",
              "      <td>2625.0</td>\n",
              "      <td>18.6</td>\n",
              "      <td>82</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>397</th>\n",
              "      <td>31.0</td>\n",
              "      <td>4</td>\n",
              "      <td>119.0</td>\n",
              "      <td>82.0</td>\n",
              "      <td>2720.0</td>\n",
              "      <td>19.4</td>\n",
              "      <td>82</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>398 rows × 8 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f5f5de0d-6941-46eb-84bf-193740797443')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-f5f5de0d-6941-46eb-84bf-193740797443 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-f5f5de0d-6941-46eb-84bf-193740797443');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "      MPG  Cylinders  Displacement  Horsepower  Weight  Acceleration  \\\n",
              "0    18.0          8         307.0       130.0  3504.0          12.0   \n",
              "1    15.0          8         350.0       165.0  3693.0          11.5   \n",
              "2    18.0          8         318.0       150.0  3436.0          11.0   \n",
              "3    16.0          8         304.0       150.0  3433.0          12.0   \n",
              "4    17.0          8         302.0       140.0  3449.0          10.5   \n",
              "..    ...        ...           ...         ...     ...           ...   \n",
              "393  27.0          4         140.0        86.0  2790.0          15.6   \n",
              "394  44.0          4          97.0        52.0  2130.0          24.6   \n",
              "395  32.0          4         135.0        84.0  2295.0          11.6   \n",
              "396  28.0          4         120.0        79.0  2625.0          18.6   \n",
              "397  31.0          4         119.0        82.0  2720.0          19.4   \n",
              "\n",
              "     Model Year  Origin  \n",
              "0            70       1  \n",
              "1            70       1  \n",
              "2            70       1  \n",
              "3            70       1  \n",
              "4            70       1  \n",
              "..          ...     ...  \n",
              "393          82       1  \n",
              "394          82       2  \n",
              "395          82       1  \n",
              "396          82       1  \n",
              "397          82       1  \n",
              "\n",
              "[398 rows x 8 columns]"
            ]
          },
          "execution_count": 4,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0O46wXpH-M5_",
        "outputId": "759ff00f-afd8-4e3e-ca33-da4d62be7524"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "23.514572864321607"
            ]
          },
          "execution_count": 5,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "df[\"MPG\"].mean()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UDS2C2E24zaT"
      },
      "outputs": [],
      "source": [
        "df = df.dropna()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OSORjQfb4R5m"
      },
      "outputs": [],
      "source": [
        "# columns = [\"Cylinders\", \"Model Year\", \"Origin\"]\n",
        "df = df.astype(float)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "5UyJwm-IGo4r"
      },
      "source": [
        "### Split data into 3 parts\n",
        "- Train\n",
        "- Validation\n",
        "- Test\n",
        "\n",
        "We are here merging the validation and test due to lack to data entries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RM1fXnmnF-E1",
        "outputId": "2159e918-9dfb-4c05-8da4-47421191ba99"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "pandas.core.series.Series"
            ]
          },
          "execution_count": 16,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "from  sklearn.model_selection import train_test_split\n",
        "import numpy as np\n",
        "\n",
        "X = df.drop('MPG', axis=1) # Capital because that is a matrix -> Input Matrix\n",
        "y = df['MPG'] # Lowercase since the data is an array -> Output Array\n",
        "\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)\n",
        "# 0.2 -> Split ration between train and test\n",
        "\n",
        "X_train.shape, X_test.shape, y_train.shape, y_test.shape\n",
        "type(y_train)\n"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Model Training"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Random Forest Regressor"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9yPTktQ_7G6B",
        "outputId": "b6867f5e-600a-422d-eb89-91a9169936da"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "(2.460025316455698, 0.7769961840589887)"
            ]
          },
          "execution_count": 13,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "from sklearn.ensemble import RandomForestRegressor\n",
        "from sklearn.metrics import mean_absolute_error as mae\n",
        "\n",
        "model2 = RandomForestRegressor()\n",
        "model2.fit(X_train, y_train)\n",
        "\n",
        "y_preds = model2.predict(X_test)\n",
        "mae(y_test, y_preds), model2.score(X_test, y_test)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### SGD Regressor"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "T26WGMBg7WTN",
        "outputId": "0741a792-97cd-4221-f60c-68322bbd2345"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "7688061220403291.0"
            ]
          },
          "execution_count": 14,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "from sklearn.linear_model import SGDRegressor\n",
        "model3 = SGDRegressor()\n",
        "model3.fit(X_train, y_train)\n",
        "model3.score(X_test, y_test)\n",
        "mae(y_test, model3.predict(X_test))"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}