{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# ML/Random Forest&SGD Regressor: Vehicle Mileage Prediction\n", "[Click here to Interact with this code on nbViewer](https://nbviewer.org/github/ujwalnk/MachineLearning101/blob/main/docs/examples/Machine_Learning_03_Fuel_Efficiency_Regression.ipynb)\n", "## Data Preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1YQT6g6JAGOB" }, "outputs": [], "source": [ "import pandas as pd\n", "import sklearn" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "33Dfq5kKBK4F" }, "source": [ "> Pandas is most likely to be used when handling missing data / fixing data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ihknFY4j1tHH" }, "outputs": [], "source": [ "url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'\n", "column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',\n", " 'Acceleration', 'Model Year', 'Origin']\n", "\n", "df = pd.read_csv(url, names=column_names,\n", " na_values='?', comment='\\t',\n", " sep=' ', skipinitialspace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xmnOga7ZBFgz", "outputId": "3ae9597e-b006-4e13-bf3c-1c889f6b48eb" }, "outputs": [ { "data": { "text/plain": [ "MPG 0\n", "Cylinders 0\n", "Displacement 0\n", "Horsepower 6\n", "Weight 0\n", "Acceleration 0\n", "Model Year 0\n", "Origin 0\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "OlPwpc262WYn", "outputId": "e291352b-1e15-4ac3-b6c5-f4f825a8a676" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MPGCylindersDisplacementHorsepowerWeightAccelerationModel YearOrigin
018.08307.0130.03504.012.0701
115.08350.0165.03693.011.5701
218.08318.0150.03436.011.0701
316.08304.0150.03433.012.0701
417.08302.0140.03449.010.5701
...........................
39327.04140.086.02790.015.6821
39444.0497.052.02130.024.6822
39532.04135.084.02295.011.6821
39628.04120.079.02625.018.6821
39731.04119.082.02720.019.4821
\n", "

398 rows × 8 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ], "text/plain": [ " MPG Cylinders Displacement Horsepower Weight Acceleration \\\n", "0 18.0 8 307.0 130.0 3504.0 12.0 \n", "1 15.0 8 350.0 165.0 3693.0 11.5 \n", "2 18.0 8 318.0 150.0 3436.0 11.0 \n", "3 16.0 8 304.0 150.0 3433.0 12.0 \n", "4 17.0 8 302.0 140.0 3449.0 10.5 \n", ".. ... ... ... ... ... ... \n", "393 27.0 4 140.0 86.0 2790.0 15.6 \n", "394 44.0 4 97.0 52.0 2130.0 24.6 \n", "395 32.0 4 135.0 84.0 2295.0 11.6 \n", "396 28.0 4 120.0 79.0 2625.0 18.6 \n", "397 31.0 4 119.0 82.0 2720.0 19.4 \n", "\n", " Model Year Origin \n", "0 70 1 \n", "1 70 1 \n", "2 70 1 \n", "3 70 1 \n", "4 70 1 \n", ".. ... ... \n", "393 82 1 \n", "394 82 2 \n", "395 82 1 \n", "396 82 1 \n", "397 82 1 \n", "\n", "[398 rows x 8 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0O46wXpH-M5_", "outputId": "759ff00f-afd8-4e3e-ca33-da4d62be7524" }, "outputs": [ { "data": { "text/plain": [ "23.514572864321607" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"MPG\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UDS2C2E24zaT" }, "outputs": [], "source": [ "df = df.dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OSORjQfb4R5m" }, "outputs": [], "source": [ "# columns = [\"Cylinders\", \"Model Year\", \"Origin\"]\n", "df = df.astype(float)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "5UyJwm-IGo4r" }, "source": [ "### Split data into 3 parts\n", "- Train\n", "- Validation\n", "- Test\n", "\n", "Here we merge the validation and test sets due to the lack of data entries" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RM1fXnmnF-E1", "outputId": "2159e918-9dfb-4c05-8da4-47421191ba99" }, 
"outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "\n", "# Fixed seed so the split and both models give the same numbers on every Restart & Run All\n", "RANDOM_STATE = 42\n", "\n", "X = df.drop('MPG', axis=1) # Capital because that is a matrix -> Input Matrix\n", "y = df['MPG'] # Lowercase since the data is an array -> Output Array\n", "\n", "# test_size=0.2 -> 20% of the rows are held out for testing, the remaining 80% are used for training\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X, y, test_size=0.2, random_state=RANDOM_STATE\n", ")\n", "\n", "# Sanity check: every row ends up in exactly one of the two partitions\n", "assert len(X_train) + len(X_test) == len(X)\n", "assert len(y_train) + len(y_test) == len(y)\n", "\n", "type(y_train)\n" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Model Training" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Random Forest Regressor" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9yPTktQ_7G6B", "outputId": "b6867f5e-600a-422d-eb89-91a9169936da" }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.metrics import mean_absolute_error as mae\n", "\n", "# Seeded so the reported metrics are stable between runs\n", "model2 = RandomForestRegressor(random_state=RANDOM_STATE)\n", "model2.fit(X_train, y_train)\n", "\n", "y_preds = model2.predict(X_test)\n", "\n", "# (mean absolute error in MPG, R^2 on the test set)\n", "mae(y_test, y_preds), model2.score(X_test, y_test)" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### SGD Regressor" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "T26WGMBg7WTN", "outputId": "0741a792-97cd-4221-f60c-68322bbd2345" }, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Without scaling, SGD diverges on this data (Weight is in the thousands while\n", "# Cylinders is a single digit) and the error explodes, so standardise the features first.\n", "# The scaler lives inside the pipeline, so it is fitted on the training rows only.\n", "model3 = make_pipeline(StandardScaler(), SGDRegressor(random_state=RANDOM_STATE))\n", "model3.fit(X_train, y_train)\n", "\n", "# (mean absolute error in MPG, R^2 on the test set) - same layout as the Random Forest cell\n", "mae(y_test, model3.predict(X_test)), model3.score(X_test, y_test)" ] }
], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }