{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# ML/Random Forest&SGD Regressor: Vehicle Mileage Prediction\n", "[Click here to Interact with this code on nbViewer](https://nbviewer.org/github/ujwalnk/MachineLearning101/blob/main/docs/examples/Machine_Learning_03_Fuel_Efficiency_Regression.ipynb)\n", "## Data Preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1YQT6g6JAGOB" }, "outputs": [], "source": [ "import pandas as pd\n", "import sklearn" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "33Dfq5kKBK4F" }, "source": [ "> Pandas is most likely used for handling missing data / fixing data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ihknFY4j1tHH" }, "outputs": [], "source": [ "url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'\n", "column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',\n", " 'Acceleration', 'Model Year', 'Origin']\n", "\n", "df = pd.read_csv(url, names=column_names,\n", " na_values='?', comment='\\t',\n", " sep=' ', skipinitialspace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xmnOga7ZBFgz", "outputId": "3ae9597e-b006-4e13-bf3c-1c889f6b48eb" }, "outputs": [ { "data": { "text/plain": [ "MPG 0\n", "Cylinders 0\n", "Displacement 0\n", "Horsepower 6\n", "Weight 0\n", "Acceleration 0\n", "Model Year 0\n", "Origin 0\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "OlPwpc262WYn", "outputId": "e291352b-1e15-4ac3-b6c5-f4f825a8a676" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " | MPG | \n", "Cylinders | \n", "Displacement | \n", "Horsepower | \n", "Weight | \n", "Acceleration | \n", "Model Year | \n", "Origin | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "18.0 | \n", "8 | \n", "307.0 | \n", "130.0 | \n", "3504.0 | \n", "12.0 | \n", "70 | \n", "1 | \n", "
1 | \n", "15.0 | \n", "8 | \n", "350.0 | \n", "165.0 | \n", "3693.0 | \n", "11.5 | \n", "70 | \n", "1 | \n", "
2 | \n", "18.0 | \n", "8 | \n", "318.0 | \n", "150.0 | \n", "3436.0 | \n", "11.0 | \n", "70 | \n", "1 | \n", "
3 | \n", "16.0 | \n", "8 | \n", "304.0 | \n", "150.0 | \n", "3433.0 | \n", "12.0 | \n", "70 | \n", "1 | \n", "
4 | \n", "17.0 | \n", "8 | \n", "302.0 | \n", "140.0 | \n", "3449.0 | \n", "10.5 | \n", "70 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
393 | \n", "27.0 | \n", "4 | \n", "140.0 | \n", "86.0 | \n", "2790.0 | \n", "15.6 | \n", "82 | \n", "1 | \n", "
394 | \n", "44.0 | \n", "4 | \n", "97.0 | \n", "52.0 | \n", "2130.0 | \n", "24.6 | \n", "82 | \n", "2 | \n", "
395 | \n", "32.0 | \n", "4 | \n", "135.0 | \n", "84.0 | \n", "2295.0 | \n", "11.6 | \n", "82 | \n", "1 | \n", "
396 | \n", "28.0 | \n", "4 | \n", "120.0 | \n", "79.0 | \n", "2625.0 | \n", "18.6 | \n", "82 | \n", "1 | \n", "
397 | \n", "31.0 | \n", "4 | \n", "119.0 | \n", "82.0 | \n", "2720.0 | \n", "19.4 | \n", "82 | \n", "1 | \n", "
398 rows × 8 columns
\n", "