{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "logistic_regression.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4GgPS2DCsFaX", "outputId": "ef023bce-51e7-42c8-8ec8-23fdd8cfc9b7" }, "source": [ "# This mounts your Google Drive to the Colab VM.\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# TODO: Enter the foldername in your Drive where you have saved the unzipped\n", "# assignment folder, e.g. 'cs231n/assignments/assignment1/'\n", "FOLDERNAME = \"ml/logistic_regression\"\n", "assert FOLDERNAME is not None, \"[!] Enter the foldername.\"\n", "\n", "# Now that we've mounted your Drive, this ensures that\n", "# the Python interpreter of the Colab VM can load\n", "# python files from within it.\n", "import sys\n", "sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))\n", "\n", "# This downloads the CIFAR-10 dataset to your Drive\n", "# if it doesn't already exist.\n", "#%cd /content/drive/My\\ Drive/$FOLDERNAME/cs231n/datasets/\n", "#!bash get_datasets.sh\n", "#%cd /content/drive/My\\ Drive/$FOLDERNAME" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "uEC0pnScTtdb" }, "source": [ "import numpy as np\n", "import pandas as pd\n", "import math" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "O_VCvc39Gdxt" }, "source": [ "#Dataset" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "A0x-AOqtJ2lY", "outputId": "61410361-203d-4d73-8ada-b8141e2611c0" }, "source": [ "pima = pd.read_csv(\"diabetes.csv\")\n", "print(pima.columns)\n", "columns = ['Glucose','SkinThickness','Outcome']\n", "df = pd.DataFrame(pima, columns=columns)\n", "print(df.head())" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n", " 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n", " dtype='object')\n", " Glucose SkinThickness Outcome\n", "0 148 35 1\n", "1 85 29 0\n", "2 183 0 1\n", "3 89 23 0\n", "4 137 35 1\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "NDc2R1hDj9-D" }, "source": [ "# Splitting data set into train and test \n", "\n", "dfx = df.to_numpy()\n", "\n", "train_data = dfx[0:500,:]\n", "test_data = dfx[500:,:]\n", "\n", "# train data \n", "train_X = train_data[:,0:2]\n", "train_y = train_data[:,2]\n", "\n", "#test data\n", "test_X = test_data[:,0:2]\n", "test_y = test_data[:,2]" ], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "vTqhsxcPTEXa" }, "source": [ "def sigmoid(x):\n", " \"\"\"A numerically stable version of the logistic sigmoid function.\"\"\"\n", " pos_mask = x >= 0\n", " neg_mask = x < 0\n", " z = np.zeros_like(x)\n", " z[pos_mask] = np.exp(-x[pos_mask])\n", " z[neg_mask] = np.exp(x[neg_mask])\n", " top = np.ones_like(x)\n", " top[neg_mask] = z[neg_mask]\n", " return top / (1 + z)" ], "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "U3pAavI7jR-l" }, "source": [ "def print_every(loss,i):\n", " if i%50 == 0:\n", " print(\"Iteration:\",i,\"loss:\",loss)" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OIbrUg3m3K_2" }, "source": [ "def normalize(x):\n", " mu = np.mean(x,axis=0)\n", " var = np.mean((x-mu)**2,axis=0)\n", " x = (x - mu) / np.sqrt(var)\n", " ones = np.ones([x.shape[0],1])\n", " x = np.append(ones,x,axis=1)\n", " return x,mu,var" ], "execution_count": 8, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "KhegA7KrGcDg" }, "source": [ "#Train" ] }, { "cell_type": "code", "metadata": { "id": "gwjmYX_ZKFbH" }, "source": [ "def logistic_reg_train(x,w,y,iter,lr):\n", "\n", " # linear function same as Linear Regression\n", " n = x.shape[0]\n", "\n", " loss_hist = []\n", "\n", " for i in range(iter):\n", "\n", " h = np.dot(x,w)\n", " #Sigmoid Function \n", " s = sigmoid(h)\n", " \n", " # loss function \n", " loss = -(1/n) * ((np.dot(y,np.log(s))) + np.dot((1-y),np.log(1-s)))\n", " print_every(loss,i)\n", " \n", " # Partial derivatives w.r.t w\n", " \n", " dw = (1/n) * np.dot((y-s),x)\n", "\n", " #updating parameters \n", " w = w + lr*dw\n", "\n", " loss_hist.append(loss) \n", "\n", " return w,loss_hist" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oQYYpi3HjhOl" }, "source": [ "# Normalize the inputs\n", "x_train_norm,mu_train,var_train = normalize(train_X)\n", "\n", "# weights initialization \n", "w = np.random.randn(train_X.shape[1],1) \n", "w = np.append(0,w)\n" ], "execution_count": 9, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Thyn__rguUoX", "outputId": "57b2c7a8-0f97-4ac2-a419-66b784ce2c9a" }, "source": [ "iter = 2000\n", "lr = 0.003\n", "optim_w, loss_hist = logistic_reg_train(x_train_norm,w,train_y,iter,lr)" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Iteration: 0 loss: 0.670044768724279\n", "Iteration: 50 loss: 0.6626523557924836\n", "Iteration: 100 loss: 0.6556799978061332\n", "Iteration: 150 loss: 0.6491072494003616\n", "Iteration: 200 loss: 0.6429141523824431\n", "Iteration: 250 loss: 0.6370812981205708\n", "Iteration: 300 loss: 0.6315898779543568\n", "Iteration: 350 loss: 0.6264217225187003\n", "Iteration: 400 loss: 0.6215593309558392\n", "Iteration: 450 loss: 0.6169858910308043\n", "Iteration: 500 loss: 0.612685291171128\n", "Iteration: 550 loss: 0.6086421254300672\n", "Iteration: 600 loss: 0.6048416923305238\n", "Iteration: 650 loss: 0.6012699884901916\n", "Iteration: 700 loss: 0.5979136978621907\n", "Iteration: 750 loss: 0.594760177353653\n", "Iteration: 800 loss: 0.5917974395106935\n", "Iteration: 850 loss: 0.5890141328844253\n", "Iteration: 900 loss: 0.5863995206210857\n", "Iteration: 950 loss: 0.5839434577512648\n", "Iteration: 1000 loss: 0.5816363675895684\n", "Iteration: 1050 loss: 0.5794692175973817\n", "Iteration: 1100 loss: 0.5774334950079463\n", "Iteration: 1150 loss: 0.5755211824648213\n", "Iteration: 1200 loss: 0.5737247338818299\n", "Iteration: 1250 loss: 0.5720370506945847\n", "Iteration: 1300 loss: 0.5704514586403434\n", "Iteration: 1350 loss: 0.5689616851739256\n", "Iteration: 1400 loss: 0.5675618376023547\n", "Iteration: 1450 loss: 0.5662463819993975\n", "Iteration: 1500 loss: 0.565010122942904\n", "Iteration: 1550 loss: 0.5638481841024198\n", "Iteration: 1600 loss: 0.5627559896916551\n", "Iteration: 1650 loss: 0.5617292467897003\n", "Iteration: 1700 loss: 0.5607639285261273\n", "Iteration: 1750 loss: 0.5598562581180195\n", "Iteration: 1800 loss: 0.5590026937413304\n", "Iteration: 1850 loss: 0.5581999142145451\n", "Iteration: 1900 loss: 0.5574448054692442\n", "Iteration: 1950 loss: 0.5567344477796855\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "bD_jQkYq_5AB" }, "source": [ "## Predict" ] }, { "cell_type": "code", "metadata": { "id": "JQBVIF062daS" }, "source": [ "def predict(x,y,w):\n", "\n", " h = np.dot(x,w)\n", " s = sigmoid(h)\n", " y_pred = []\n", " for i in range(len(s)):\n", " if s[i] >= 0.5:\n", " y_pred.append(1)\n", " else:\n", " y_pred.append(0)\n", " \n", " return y_pred" ], "execution_count": 14, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mjGgYuBT3Ilb", "outputId": "23e17d51-16f9-4827-b1f1-d7ee781a0621" }, "source": [ "x_test_norm,mu,var = normalize(test_X)\n", "\n", "pred_y = predict(x_test_norm,test_y,optim_w)\n", "pred_y = np.array(pred_y)\n", "\n", "#Calculating the accuracy \n", "count = 0\n", "for i in range(len(test_y)):\n", " if test_y[i]==pred_y[i]:\n", " count+=1\n", "\n", "print(\"Prediction Accuracy:\",(count/len(test_y)*100))" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Prediction Accuracy: 77.61194029850746\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "2zPx6Kz__zLQ" }, "source": [ "# Plotting the decision boundary " ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 533 }, "id": "bkiPAH_Bu3hJ", "outputId": "9efa9ccb-33a1-4747-ed71-3e41df304905" }, "source": [ "# we will try to plot with different colors \n", "import matplotlib.pyplot as plt\n", "import seaborn as sbn\n", "plt.figure(figsize=(8, 8), dpi=80)\n", "sbn.scatterplot(x=test_X[:, 0][test_y==0], y=test_X[:, 1][test_y==0],color='green')\n", "sbn.scatterplot(x=test_X[:, 0][test_y==1], y=test_X[:, 1][test_y==1],color='blue')\n", "\n", "#plt.margins(x=-0.25, y=-0.25)\n", "decision_boundary = (-(optim_w[0] + (optim_w[1]) *x_train_norm[:,1])) /(optim_w[2]/10)\n", "plt.plot(train_X[:,0],decision_boundary,color='red')\n", "plt.grid()\n", "plt.show()" ], "execution_count": 91, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "TXXeFOL_AMU-" }, "source": [ "Try out different set of data" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "73w_goIUAVC9", "outputId": "7fc6ffc9-dd49-4238-c07a-00aa1fe3cf96" }, "source": [ "pima = pd.read_csv(\"diabetes.csv\")\n", "print(pima.columns)\n", "columns = ['Glucose','Age','Outcome']\n", "df = pd.DataFrame(pima, columns=columns)\n", "print(df.head())" ], "execution_count": 32, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n", " 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n", " dtype='object')\n", " Glucose Age Outcome\n", "0 148 50 1\n", "1 85 31 0\n", "2 183 32 1\n", "3 89 21 0\n", "4 137 33 1\n" ] } ] }, { "cell_type": "code", "metadata": { "id": "bU17YW18AVJr" }, "source": [ "# Splitting data set into train and test \n", "\n", "dfx = df.to_numpy()\n", "\n", "train_data = dfx[0:500,:]\n", "test_data = dfx[500:,:]\n", "\n", "# train data \n", "train_X = train_data[:,0:2]\n", "train_y = train_data[:,2]\n", "\n", "#test data\n", "test_X = test_data[:,0:2]\n", "test_y = test_data[:,2]" ], "execution_count": 33, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "BazPrEzSAVNz" }, "source": [ "# Normalize the inputs\n", "x_train_norm,mu_train,var_train = normalize(train_X)\n", "\n", "# weights initialization \n", "w = np.random.randn(train_X.shape[1],1) \n", "w = np.append(0,w)\n" ], "execution_count": 34, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wiVTOU5qAVRw", "outputId": "2c3e1cb1-213b-4e4b-be12-770d142a7975" }, "source": [ "iter = 2000\n", "lr = 0.003\n", "optim_w, loss_hist = logistic_reg_train(x_train_norm,w,train_y,iter,lr)" ], "execution_count": 35, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Iteration: 0 loss: 0.6536393076313141\n", "Iteration: 50 loss: 0.6492563106039907\n", "Iteration: 100 loss: 0.6450014258419662\n", "Iteration: 150 loss: 0.6408730876433092\n", "Iteration: 200 loss: 0.6368696791982097\n", "Iteration: 250 loss: 0.632989532059511\n", "Iteration: 300 loss: 0.629230925912058\n", "Iteration: 350 loss: 0.6255920886469993\n", "Iteration: 400 loss: 0.6220711967451137\n", "Iteration: 450 loss: 0.61866637597101\n", "Iteration: 500 loss: 0.6153757023777348\n", "Iteration: 550 loss: 0.6121972036189006\n", "Iteration: 600 loss: 0.6091288605629931\n", "Iteration: 650 loss: 0.6061686092020252\n", "Iteration: 700 loss: 0.603314342844242\n", "Iteration: 750 loss: 0.6005639145781453\n", "Iteration: 800 loss: 0.5979151399927768\n", "Iteration: 850 loss: 0.5953658001369553\n", "Iteration: 900 loss: 0.5929136446980983\n", "Iteration: 950 loss: 0.5905563953793473\n", "Iteration: 1000 loss: 0.5882917494520337\n", "Iteration: 1050 loss: 0.5861173834590634\n", "Iteration: 1100 loss: 0.5840309570436123\n", "Iteration: 1150 loss: 0.582030116876602\n", "Iteration: 1200 loss: 0.580112500655817\n", "Iteration: 1250 loss: 0.5782757411492054\n", "Iteration: 1300 loss: 0.5765174702549023\n", "Iteration: 1350 loss: 0.574835323050825\n", "Iteration: 1400 loss: 0.5732269418072925\n", "Iteration: 1450 loss: 0.5716899799370264\n", "Iteration: 1500 loss: 0.5702221058580564\n", "Iteration: 1550 loss: 0.5688210067464866\n", "Iteration: 1600 loss: 0.5674843921577114\n", "Iteration: 1650 loss: 0.5662099974965321\n", "Iteration: 1700 loss: 0.5649955873186046\n", "Iteration: 1750 loss: 0.5638389584477951\n", "Iteration: 1800 loss: 0.5627379428962258\n", "Iteration: 1850 loss: 0.5616904105760655\n", "Iteration: 1900 loss: 0.560694271794407\n", "Iteration: 1950 loss: 0.5597474795248379\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "nFIoeSzPAjJ2" }, "source": [ "##Predict" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7jyPE4EfAVUd", "outputId": "6f00454d-1200-4747-a436-fa7648c669ca" }, "source": [ "x_test_norm,mu,var = normalize(test_X)\n", "\n", "pred_y = predict(x_test_norm,test_y,optim_w)\n", "pred_y = np.array(pred_y)\n", "\n", "#Calculating the accuracy \n", "count = 0\n", "for i in range(len(test_y)):\n", " if test_y[i]==pred_y[i]:\n", " count+=1\n", "\n", "print(\"Prediction Accuracy:\",(count/len(test_y)*100))" ], "execution_count": 36, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Prediction Accuracy: 79.8507462686567\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 533 }, "id": "CN7b0McYA5d0", "outputId": "7921ec42-3dfd-4e63-8331-34aad6ece9d2" }, "source": [ "# we will try to plot with different colors \n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sbn\n", "\n", "plt.figure(figsize=(8, 8), dpi=80)\n", "decision_boundary = (-(optim_w[0] + (optim_w[1]) *x_train_norm[:,1])) /(optim_w[2]/10)\n", "plt.plot(train_X[:,0],decision_boundary,color='blue')\n", "sbn.scatterplot(x=test_X[:, 0][test_y==0],y= test_X[:, 1][test_y==0],color='green')\n", "sbn.scatterplot(x=test_X[:, 0][test_y==1],y= test_X[:, 1][test_y==1],color='orange')\n", "\n", "plt.grid()\n", "plt.show()" ], "execution_count": 93, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" } } ] } ] }