{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9f0cafc0-1993-411c-9247-d75ac091280b",
   "metadata": {},
   "source": [
    "# Performance Baselines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "67ce4a31-1781-4f46-a7ce-e2b1f6cedc8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scikit-learn: 1.0\n",
      "mlxtend : 0.19.0\n",
      "xgboost : 1.5.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the installed versions of the listed packages (needs the `watermark` extension).\n",
    "%load_ext watermark\n",
    "%watermark -p scikit-learn,mlxtend,xgboost"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "381c55c2-1ec2-43fc-8c66-4a2acbc4b857",
   "metadata": {},
   "source": [
    "## Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38f4522c-3671-4ce5-acff-bd29143e5392",
   "metadata": {},
   "source": [
    "Source: https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bf2e92fa-1bf4-4435-a1f3-4e9613ec83d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train.shape: (9119, 16)\n",
      "y_train.shape: (9119,)\n",
      "X_test.shape: (4492, 16)\n",
      "y_test.shape: (4492,)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "# The CSV files are read without a header row. Features are kept as 2-D arrays;\n",
    "# labels are flattened to 1-D integer arrays.\n",
    "X_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_train.csv', header=None).values\n",
    "y_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_train.csv', header=None).values.ravel().astype(int)\n",
    "\n",
    "X_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_test.csv', header=None).values\n",
    "y_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_test.csv', header=None).values.ravel().astype(int)\n",
    "\n",
    "print('X_train.shape:', X_train.shape)\n",
    "print('y_train.shape:', y_train.shape)\n",
    "print('X_test.shape:', X_test.shape)\n",
    "print('y_test.shape:', y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dfd1ee6f-7163-48ba-ba58-542191985c84",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train/Valid/Test sizes: 7295 1824 4492\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "# Hold out 20% of the training data (stratified by label) as a validation set\n",
    "# for comparing hyperparameters; the test set is not touched until the final cell.\n",
    "X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(\n",
    "    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train\n",
    ")\n",
    "\n",
    "# Report the size of the reduced training subset (not the full training set),\n",
    "# so the three counts describe disjoint splits.\n",
    "print('Train/Valid/Test sizes:', y_train_sub.shape[0], y_valid.shape[0], y_test.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3285747b-edfb-4381-9b90-8212a04f6d85",
   "metadata": {},
   "source": [
    "## Baselines"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1224925f-c4a2-4e28-9693-ff49c43ad694",
   "metadata": {},
   "source": [
    "Compare hyperparameter settings (`n_neighbors`) on the validation set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2a49887e-6bd5-4d1f-bb5f-e833613c0f2f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train Accuracy: 79.657%\n",
      "Valid Accuracy: 71.162%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "\n",
    "# k=5: fit on the training subset only, score on the subset and the validation set.\n",
    "knn = KNeighborsClassifier(n_neighbors=5)\n",
    "knn.fit(X_train_sub, y_train_sub)\n",
    "print(f\"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%\")\n",
    "print(f\"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c2c6e00-15cf-4b39-80db-2ddeb46409ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train Accuracy: 84.003%\n",
      "Valid Accuracy: 71.930%\n"
     ]
    }
   ],
   "source": [
    "# k=3: same procedure as above with a smaller neighborhood.\n",
    "knn = KNeighborsClassifier(n_neighbors=3)\n",
    "knn.fit(X_train_sub, y_train_sub)\n",
    "print(f\"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%\")\n",
    "print(f\"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "66df75b8-6358-480b-8b0e-914259a27aea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train Accuracy: 77.478%\n",
      "Valid Accuracy: 69.518%\n"
     ]
    }
   ],
   "source": [
    "# k=7: same procedure as above with a larger neighborhood.\n",
    "knn = KNeighborsClassifier(n_neighbors=7)\n",
    "knn.fit(X_train_sub, y_train_sub)\n",
    "print(f\"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%\")\n",
    "print(f\"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e13b5e5-18fe-4acc-8b60-6c6be54ac460",
   "metadata": {},
   "source": [
    "Choose the best setting (`n_neighbors=3`, which has the highest validation accuracy above) and retrain on the whole training set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "abc7a3c0-33f8-414c-9572-875bc657c919",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train Accuracy: 84.965%\n",
      "Test Accuracy: 71.305%\n"
     ]
    }
   ],
   "source": [
    "# Refit with the selected k on the full training set (training subset + validation),\n",
    "# then evaluate on the held-out test set.\n",
    "model = KNeighborsClassifier(n_neighbors=3)\n",
    "model.fit(X_train, y_train)\n",
    "print(f\"Train Accuracy: {model.score(X_train, y_train)*100:0.3f}%\")\n",
    "print(f\"Test Accuracy: {model.score(X_test, y_test)*100:0.3f}%\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}