{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# ML/SGD Classifier: Weather Prediction\n", "[Click here to Interact with this code on nbViewer](https://nbviewer.org/github/ujwalnk/MachineLearning101/blob/main/docs/examples/Machine_Learning_01_Weather_Classfication.ipynb)\n", "## Data Preprocessing" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "5BZOlBpsIrkw" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "k3cQdermJkB9" }, "source": [ "Get the `csv` from github" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 386 }, "id": "DuU9kyr-JlT6", "outputId": "12acf768-802d-40e2-a0a5-b2441f728318" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateLocationMinTempMaxTempRainfallEvaporationSunshineWindGustDirWindGustSpeedWindDir9am...Humidity9amHumidity3pmPressure9amPressure3pmCloud9amCloud3pmTemp9amTemp3pmRainTodayRainTomorrow
02008-12-01Albury13.422.90.6NaNNaNW44.0W...71.022.01007.71007.18.0NaN16.921.8NoNo
12008-12-02Albury7.425.10.0NaNNaNWNW44.0NNW...44.025.01010.61007.8NaNNaN17.224.3NoNo
22008-12-03Albury12.925.70.0NaNNaNWSW46.0W...38.030.01007.61008.7NaN2.021.023.2NoNo
32008-12-04Albury9.228.00.0NaNNaNNE24.0SE...45.016.01017.61012.8NaNNaN18.126.5NoNo
42008-12-05Albury17.532.31.0NaNNaNW41.0ENE...82.033.01010.81006.07.08.017.829.7NoNo
\n", "

5 rows × 23 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ], "text/plain": [ " Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine \\\n", "0 2008-12-01 Albury 13.4 22.9 0.6 NaN NaN \n", "1 2008-12-02 Albury 7.4 25.1 0.0 NaN NaN \n", "2 2008-12-03 Albury 12.9 25.7 0.0 NaN NaN \n", "3 2008-12-04 Albury 9.2 28.0 0.0 NaN NaN \n", "4 2008-12-05 Albury 17.5 32.3 1.0 NaN NaN \n", "\n", " WindGustDir WindGustSpeed WindDir9am ... Humidity9am Humidity3pm \\\n", "0 W 44.0 W ... 71.0 22.0 \n", "1 WNW 44.0 NNW ... 44.0 25.0 \n", "2 WSW 46.0 W ... 38.0 30.0 \n", "3 NE 24.0 SE ... 45.0 16.0 \n", "4 W 41.0 ENE ... 82.0 33.0 \n", "\n", " Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday \\\n", "0 1007.7 1007.1 8.0 NaN 16.9 21.8 No \n", "1 1010.6 1007.8 NaN NaN 17.2 24.3 No \n", "2 1007.6 1008.7 NaN 2.0 21.0 23.2 No \n", "3 1017.6 1012.8 NaN NaN 18.1 26.5 No \n", "4 1010.8 1006.0 7.0 8.0 17.8 29.7 No \n", "\n", " RainTomorrow \n", "0 No \n", "1 No \n", "2 No \n", "3 No \n", "4 No \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataframe = pd.read_csv(\"https://raw.githubusercontent.com/ujwalnk/MachineLearning101/main/data/01%20Weather%20Data.csv\")\n", "dataframe.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "fUJjOrOTJyX0" }, "source": [ "### Check for missing data & remove any na data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "II0rlaRxJ7a7", "outputId": "1b7db804-1ce6-46d7-aae4-a867c9e4d854" }, "outputs": [ { "data": { "text/plain": [ "(Date 0\n", " Location 0\n", " MinTemp 0\n", " MaxTemp 0\n", " Rainfall 0\n", " Evaporation 0\n", " Sunshine 0\n", " WindGustDir 0\n", " WindGustSpeed 0\n", " WindDir9am 0\n", " WindDir3pm 0\n", " WindSpeed9am 0\n", " WindSpeed3pm 0\n", " Humidity9am 0\n", " Humidity3pm 0\n", " Pressure9am 0\n", " Pressure3pm 0\n", " Cloud9am 0\n", " Cloud3pm 0\n", " Temp9am 0\n", " Temp3pm 0\n", " 
RainToday 0\n", " RainTomorrow 0\n", " dtype: int64,\n", " Date 56420\n", " Location 56420\n", " MinTemp 56420\n", " MaxTemp 56420\n", " Rainfall 56420\n", " Evaporation 56420\n", " Sunshine 56420\n", " WindGustDir 56420\n", " WindGustSpeed 56420\n", " WindDir9am 56420\n", " WindDir3pm 56420\n", " WindSpeed9am 56420\n", " WindSpeed3pm 56420\n", " Humidity9am 56420\n", " Humidity3pm 56420\n", " Pressure9am 56420\n", " Pressure3pm 56420\n", " Cloud9am 56420\n", " Cloud3pm 56420\n", " Temp9am 56420\n", " Temp3pm 56420\n", " RainToday 56420\n", " RainTomorrow 56420\n", " dtype: int64)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataframe = dataframe.dropna()\n", "dataframe.isnull().sum(), dataframe.count()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "K5jNK3leYp5u" }, "source": [ "Drop Unnecessary Columns" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "0X4WxiVEYyKy" }, "outputs": [], "source": [ "dataframe = dataframe.drop(\"Date\", axis=1)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "DGuVKMCCTkvb" }, "source": [ "Sort and check for datapoints" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gDaL9J2gO9J7", "outputId": "5bdce59a-c35b-4b02-d10c-b6267f887ad4" }, "outputs": [ { "data": { "text/plain": [ "No 43993\n", "Yes 12427\n", "Name: RainTomorrow, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataframe = dataframe.drop_duplicates()\n", "dataframe.sort_values(\"RainTomorrow\", axis=0, ascending=True, inplace=True)\n", "dataframe[\"RainTomorrow\"].value_counts()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "R1psB7EhTxBI" }, "source": [ "### Data Splitting" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "yE0Jsc2OS6K0" }, "outputs": [], "source": [ "from 
sklearn.model_selection import train_test_split\n", "\n", "# Import label encoder\n", "from sklearn import preprocessing\n", "\n", "label_encoder = preprocessing.LabelEncoder()\n", "dataframe[\"RainTomorrow\"] = label_encoder.fit_transform(dataframe[\"RainTomorrow\"])\n", "\n", "y = dataframe[\"RainTomorrow\"]\n", "X = dataframe = pd.get_dummies(dataframe.drop(\"RainTomorrow\", axis=1))\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "_UCvF_MYUfhg" }, "source": [ "## Test for shape matching:\n", "Expected shapes of `X_train` and `X_test` (first row) and of `y_train` and `y_test` (second row) for the 80% / 20% split made above:\n", "\n", "$$\n", "\\begin{aligned}\n", "&(\\approx 80\\% \\times data,\\ \\#\\,vars) &&(\\approx 20\\% \\times data,\\ \\#\\,vars) \\\\\n", "&(\\approx 80\\% \\times data,) &&(\\approx 20\\% \\times data,)\n", "\\end{aligned}\n", "$$" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "JgEvDu2lU3Bt" }, "source": [ "All of the data is labelled, so the evaluation data has to be held out from it: `train_test_split` above keeps 20% as the test set and trains on the remaining 80%. No separate validation set is created in this notebook." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rHJHAcJdUrig", "outputId": "8e2e3069-19f9-4b61-a351-e22324c2d293" }, "outputs": [ { "data": { "text/plain": [ "((45136, 92), (11284, 92), (45136,), (11284,))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "MmgWO7m1U_nq" }, "source": [ "The shapes match, so start training the model" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "JLwNKoCDUxEq" }, "source": [ "## Model Training" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 74 }, "id": "Um9wSdvZVLY2", "outputId": "841bdb63-7b55-4b89-862e-9b8269a3cd2f" },
"outputs": [ { "data": { "text/html": [ "
SGDClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "SGDClassifier()" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import SGDClassifier as clf\n", "\n", "sgd_model = clf()\n", "sgd_model.fit(X_train, y_train)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "GF9ZZjfdVht6" }, "source": [ "### Testing accuracy score of model" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "czPBz9Hddkur", "outputId": "6a07b8f4-1d87-43b8-cdca-67c34c042085" }, "outputs": [ { "data": { "text/plain": [ "0.8290499822757887" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sgd_model.score(X_test, y_test)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "id": "LmllnC5SdpzV" }, "source": [ "### Weights of each data column" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0iCP1h8idv-x", "outputId": "67b1c3e7-ee04-45bd-a162-6df11d743b96" }, "outputs": [ { "data": { "text/plain": [ "array([[-13.94129179, 18.26175469, 10.5008666 , -1.86834394,\n", " -66.24311611, 33.02988746, -5.71274478, -13.38599022,\n", " 1.76674237, 26.90531948, 65.68816178, -69.91987087,\n", " -7.36906574, 54.10162366, 14.16866264, -1.30859774,\n", " -0.95906887, 20.3071201 , 5.75649665, -0.0888927 ,\n", " 0.47154798, 2.907069 , -7.01696732, -8.65384302,\n", " -0.40279504, -21.44189095, 1.52645429, 2.55149821,\n", " 4.77312118, -14.39783913, 2.07092207, 28.33801976,\n", " 15.94026633, 10.62267737, -15.19578999, 2.63691854,\n", " -7.81491818, -24.71842023, 10.43655829, -6.39471844,\n", " 1.33894626, -0.79239506, -13.03597522, -7.77880552,\n", " -11.29701182, -2.94040377, -3.40222911, 2.91470822,\n", " 11.67827816, 11.32826316, -13.05889287, -2.50913529,\n", " -6.4037466 , -5.88219648, 1.97439015, 11.5803573 ,\n", " 19.23832431, 9.39415251, -7.21767037, 
5.76899719,\n", " -12.28872098, 11.44771272, 13.61377776, 28.21509783,\n", " 1.14866033, 2.78692497, -14.34783699, -16.69168741,\n", " -14.76591046, -13.51794032, -1.02573839, 11.31298473,\n", " 2.7778968 , 4.58352972, -10.14904597, -9.11775178,\n", " -8.16076633, 8.22604691, -6.20026566, 3.65918456,\n", " 14.85341421, 19.45013894, -15.02494933, -3.1063831 ,\n", " -11.05186243, -7.18016876, -12.0081534 , 12.90749749,\n", " 23.61767862, 1.08546318, -50.80912147, 52.6091986 ]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sgd_model.coef_" ] } ], "metadata": { "colab": { "authorship_tag": "ABX9TyMxZhaFaDHf2tb71IOWYbOG", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }