{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification with Vowpal Wabbit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pre-split Iris data from CSV files in the working directory\n",
    "# (the last cell of this notebook shows how they were generated).\n",
    "training_data, testing_data = (\n",
    "    pd.read_csv(csv_name)\n",
    "    for csv_name in ('iris-training.csv', 'iris-testing.csv')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display pandas' summary statistics for the training frame as this cell's output.\n",
    "training_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_vw_format(line):\n",
    "    chars = re.escape(string.punctuation)\n",
    "    res = f'{int(line.y)} |'\n",
    "    for idx, value in line.drop(['y']).iteritems():\n",
    "        feature_name = re.sub(r'(['+chars+']|\\s)+', '_', idx)\n",
    "        res += f' {feature_name}:{value}'\n",
    "    return res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vowpal Wabbit input format\n",
    "\n",
    "Vowpal Wabbit has its own input format we can use. Let's see what it looks like."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Format the first ten training rows and print one VW example per line.\n",
    "vw_preview = training_data.head(10).apply(to_vw_format, axis=1)\n",
    "for vw_line in vw_preview:\n",
    "    print(vw_line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from vowpalwabbit import pyvw\n",
    "\n",
    "# One-against-all multiclass learner configured for 3 classes.\n",
    "vw = pyvw.vw(\"--oaa 3\")\n",
    "\n",
    "# Training pass: feed every formatted training row to the learner.\n",
    "train_examples = training_data.apply(to_vw_format, axis=1)\n",
    "for vw_line in train_examples:\n",
    "    vw.learn(vw_line)\n",
    "\n",
    "# Collect one predicted class per formatted test row, in row order.\n",
    "test_examples = testing_data.apply(to_vw_format, axis=1)\n",
    "predictions = [vw.predict(vw_line) for vw_line in test_examples]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fraction of test rows whose predicted class equals the true label.\n",
    "is_correct = testing_data.y == predictions\n",
    "accuracy = int(is_correct.sum()) / len(testing_data)\n",
    "\n",
    "f'Model accuracy {accuracy}'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How was this data set generated?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "from sklearn.model_selection import train_test_split\n",
    "import os\n",
    "\n",
    "# Assemble the Iris measurements into a DataFrame with one column per feature.\n",
    "ds = load_iris()\n",
    "df = pd.DataFrame(data=ds.data, columns=ds.feature_names)\n",
    "# Shift the targets up by one: VW expects multiclass labels starting at 1.\n",
    "df[\"y\"] = ds.target + 1\n",
    "\n",
    "# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.\n",
    "training_data, testing_data = train_test_split(df, random_state=2019, test_size=0.2)\n",
    "\n",
    "# Write both splits, without the index column, into the current working directory.\n",
    "for split, csv_name in ((training_data, 'iris-training.csv'), (testing_data, 'iris-testing.csv')):\n",
    "    split.to_csv(os.path.join(os.getcwd(), csv_name), index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}