{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Simple Naive Bayes Example\n", "\n", "Example based on Lesson 2, from Intro to Machine Learning, Sebastian Thrun and Katie Malone:\n", "\n", "https://www.udacity.com/course/intro-to-machine-learning--ud120" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "import numpy as np\n", "import pylab as pl\n", "import matplotlib.pyplot as plt\n", "%matplotlib notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare our data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def makeTerrainData(n_points=1000):\n", "###############################################################################\n", "### make the toy dataset\n", " random.seed(42)\n", " grade = [random.random() for ii in range(0,n_points)]\n", " bumpy = [random.random() for ii in range(0,n_points)]\n", " error = [random.random() for ii in range(0,n_points)]\n", " y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]\n", " for ii in range(0, len(y)):\n", " if grade[ii]>0.8 or bumpy[ii]>0.8:\n", " y[ii] = 1.0\n", "\n", "### split into train/test sets\n", " X = [[gg, ss] for gg, ss in zip(grade, bumpy)]\n", " split = int(0.75*n_points)\n", " X_train = X[0:split]\n", " X_test = X[split:]\n", " y_train = y[0:split]\n", " y_test = y[split:]\n", " return X_train, y_train, X_test, y_test " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have defined a function that generates the data ('makeTerrainData'), lets call it." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "features_train, labels_train, features_test, labels_test = makeTerrainData()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Define a function for training the classifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.naive_bayes import GaussianNB\n", "\n", "def classify(features_train, labels_train): \n", " ### import the sklearn module for GaussianNB\n", " ### create classifier\n", " ### fit the classifier on the training features and labels\n", " ### return the fit classifier\n", " classifier = GaussianNB()\n", " classifier.fit(features_train, labels_train)\n", " return classifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have defined a function that trains the classifier, lets call it." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "classifier = classify(features_train, labels_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Show results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def prettyPicture(classifier, X_test, y_test):\n", " x_min = 0.0; x_max = 1.0\n", " y_min = 0.0; y_max = 1.0\n", "\n", " # Plot the decision boundary. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Show results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def prettyPicture(classifier, X_test, y_test):\n", "    x_min = 0.0; x_max = 1.0\n", "    y_min = 0.0; y_max = 1.0\n", "\n", "    # Plot the decision boundary. For that, we will assign a color to each\n", "    # point in the mesh [x_min, x_max] x [y_min, y_max].\n", "    h = 0.01  # step size in the mesh\n", "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", "    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])\n", "\n", "    # Put the result into a color plot\n", "    Z = Z.reshape(xx.shape)\n", "    plt.xlim(xx.min(), xx.max())\n", "    plt.ylim(yy.min(), yy.max())\n", "\n", "    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.seismic)\n", "\n", "    # Plot also the test points\n", "    grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii] == 0]\n", "    bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii] == 0]\n", "    grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii] == 1]\n", "    bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii] == 1]\n", "\n", "    plt.scatter(grade_sig, bumpy_sig, color=\"b\", label=\"fast\")\n", "    plt.scatter(grade_bkg, bumpy_bkg, color=\"r\", label=\"slow\")\n", "    plt.legend()\n", "    plt.xlabel(\"grade\")\n", "    plt.ylabel(\"bumpiness\")\n", "    plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "prettyPicture(classifier, features_test, labels_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate Accuracy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "\n", "labels_predicted = classifier.predict(features_test)\n", "accuracy_score(labels_test, labels_predicted)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with normalize=False, accuracy_score returns the number of correctly classified test points\n", "accuracy_score(labels_test, labels_predicted, normalize=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(labels_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# manual cross-check of the accuracy, using the counts from the two cells above\n", "print(221/250)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }