{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Support Vector Machine\n",
    "\n",
    "using uci breast cancer dataset (again)\n",
    "\n",
    "way better than k nearest neighbors but algo is much more complex\n",
    "\n",
    "@TODO: learn linear algebra lol\n",
    "\n",
    "ISO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import preprocessing, svm\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# configuration: everything a reader might want to tune\n",
    "DATA_PATH = 'breast-cancer-wisconsin.data.txt'\n",
    "# stand-in for unknown ('?') entries so they are treated as outliers\n",
    "MISSING_SENTINEL = -99999\n",
    "# fixed seed so the train/test split (and therefore the accuracy) is reproducible\n",
    "SEED = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "%load breast-cancer-wisconsin.data.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# '?' marks unknown data in the file: parse it as NaN, then replace it with an\n",
    "# outlier value so the affected column stays numeric instead of mixing str and int\n",
    "df = pd.read_csv(DATA_PATH, na_values='?').fillna(MISSING_SENTINEL)\n",
    "# irrelevant feature (keyword form: positional axis was removed in pandas 2.0)\n",
    "df = df.drop(columns=['id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# feature data\n",
    "X = np.array(df.drop(columns=['class']))\n",
    "# class / label data\n",
    "y = np.array(df['class'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# separate training and testing chunks (seeded so a rerun gives the same split)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define classifier\n",
    "clf = svm.SVC()\n",
    "\n",
    "# train classifier\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test\n",
    "accuracy = clf.score(X_test, y_test)\n",
    "print('accuracy:', accuracy)\n",
    "# the exact score depends on the train/test split; without any tweaks an earlier\n",
    "# unseeded run printed 0.9428571428571428 (roughly 94-96% overall)\n",
    "# If you want to save the classifier you'd pickle it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one row per sample, nine feature values per row\n",
    "example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,1,1,2,3,2,1], [1,0,6,1,5,1,2,4,2]])\n",
    "# keep the (n_samples, n_features) 2-D layout that predict() expects\n",
    "example_measures = example_measures.reshape(len(example_measures), -1)\n",
    "prediction = clf.predict(example_measures)\n",
    "# an earlier run printed: example class outputs:  [2 2 4]\n",
    "print('example class outputs: ', prediction)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}