{ "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [
 "# Classification with Vowpal Wabbit"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import re\n",
 "import pandas as pd\n",
 "import string"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "training_data = pd.read_csv('iris-training.csv')\n",
 "testing_data = pd.read_csv('iris-testing.csv')"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "training_data.describe()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Runs of punctuation and/or whitespace in a column name collapse into a\n",
 "# single '_', so a feature name can never contain ' ', ':' or '|', which\n",
 "# are delimiters in VW's text input format.\n",
 "_UNSAFE_NAME_CHARS = re.compile(r'([' + re.escape(string.punctuation) + r']|\\s)+')\n",
 "\n",
 "\n",
 "def to_vw_format(line):\n",
 "    \"\"\"Render one row as a Vowpal Wabbit example: '<label> | name:value ...'.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    line : pandas.Series\n",
 "        A single row holding the class label under 'y'; every other entry\n",
 "        is emitted as a feature.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    str\n",
 "        The label cast to int, a '|' separator, then space-separated\n",
 "        'name:value' pairs in the row's own order.\n",
 "    \"\"\"\n",
 "    parts = [f'{int(line.y)} |']\n",
 "    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.\n",
 "    for idx, value in line.drop(['y']).items():\n",
 "        # str() keeps non-string column labels from breaking the substitution.\n",
 "        feature_name = _UNSAFE_NAME_CHARS.sub('_', str(idx))\n",
 "        parts.append(f'{feature_name}:{value}')\n",
 "    return ' '.join(parts)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "# Vowpal Wabbit input format\n",
 "\n",
 "Vowpal Wabbit has its own input format we can use. Let's see what it looks like."
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Show the first ten training rows exactly as they will be fed to VW.\n",
 "for ex in training_data.head(10).apply(to_vw_format, axis=1):\n",
 "    print(ex)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from vowpalwabbit import pyvw\n",
 "\n",
 "# --oaa 3: one-against-all over 3 classes; the labels in column y are\n",
 "# 1-based (see the data-generation cell at the end of the notebook).\n",
 "vw = pyvw.vw(\"--oaa 3\")\n",
 "\n",
 "# learn from training set\n",
 "for example in training_data.apply(to_vw_format, axis=1):\n",
 "    vw.learn(example)\n",
 "\n",
 "# predict from the testing set, keeping the row order of testing_data\n",
 "predictions = [\n",
 "    vw.predict(example)\n",
 "    for example in testing_data.apply(to_vw_format, axis=1)\n",
 "]"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Fraction of test rows whose predicted class equals the true label;\n",
 "# predictions is compared to testing_data.y position by position.\n",
 "accuracy = len(testing_data[testing_data.y == predictions]) / len(testing_data)\n",
 "\n",
 "f'Model accuracy {accuracy}'"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "# How was this data set generated?"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "\n",
 "from sklearn.datasets import load_iris\n",
 "from sklearn.model_selection import train_test_split\n",
 "\n",
 "ds = load_iris()\n",
 "df = pd.DataFrame(data=ds.data, columns=ds.feature_names)\n",
 "df[\"y\"] = ds.target + 1  # vw expects labels starting at 1\n",
 "\n",
 "# Use distinct names: rebinding training_data / testing_data here would\n",
 "# silently replace the frames loaded from CSV at the top of the notebook.\n",
 "generated_training, generated_testing = train_test_split(df, random_state=2019, test_size=0.2)\n",
 "\n",
 "generated_training.to_csv(os.path.join(os.getcwd(), 'iris-training.csv'), index=False)\n",
 "generated_testing.to_csv(os.path.join(os.getcwd(), 'iris-testing.csv'), index=False)"
] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } },
"nbformat": 4, "nbformat_minor": 2 }