{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Support Vector Machine\n",
    "\n",
    "using uci breast cancer dataset (again)\n",
    "\n",
    "way better than k nearest neigbors but algo is much more complex\n",
    "\n",
    "@TODO: learn linear algebra lol\n",
    "\n",
    "ISO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import preprocessing, svm\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "%load breast-cancer-wisconsin.data.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('breast-cancer-wisconsin.data.txt')\n",
    "# replace unkown data with outliers\n",
    "df.replace('?',-99999, inplace=True)\n",
    "# irrelevent feature\n",
    "df.drop(['id'], 1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# feature data\n",
    "X = np.array(df.drop(['class'], 1))\n",
    "# class / label data\n",
    "y = np.array(df['class'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# separate training and testing chunks\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# define classifier\n",
    "clf = svm.SVC()\n",
    "\n",
    "# train classifier\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy: 0.9428571428571428\n"
     ]
    }
   ],
   "source": [
    "# test\n",
    "accuracy = clf.score(X_test, y_test)\n",
    "print('accuracy:', accuracy)\n",
    "# about 96% accuracy without any tweaks\n",
    "# If you want to save the classier you'd pickle it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "example class outputs:  [2 2 4]\n"
     ]
    }
   ],
   "source": [
    "example_measures = np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,1,1,2,3,2,1], [1,0,6,1,5,1,2,4,2]])\n",
    "example_measures = example_measures.reshape(len(example_measures), -1)\n",
    "prediction = clf.predict(example_measures)\n",
    "print('example class outputs: ', prediction)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}