{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import random\n", "import numpy as np\n", "from sklearn import datasets\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import svm\n", "from sklearn.naive_bayes import GaussianNB" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(150, 4) (150,)\n" ] } ], "source": [ "iris = datasets.load_iris()\n", "X, y = iris.data, iris.target\n", "print(X.shape, y.shape)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def simulate(X, y, n=100):\n", " X_new, y_new = [], []\n", " Xmax = np.max(X, axis=0)\n", " Xmin = np.min(X, axis=0)\n", " for t in range(n):\n", " tmp = []\n", " for i in range(X.shape[1]):\n", " rg = np.linspace(Xmin[i], Xmax[i], 30)\n", " choice = round(np.random.choice(rg), 1)\n", " tmp.append(choice)\n", " X_new.append(tmp)\n", " y_new.append(random.choice(list(set(y))))\n", " return np.array(X_new), np.array(y_new) " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "X_new, y_new = simulate(X, y, 1000)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1150, 4) (1150,)\n" ] } ], "source": [ "X_all = np.concatenate((X, X_new), axis=0)\n", "y_all = np.concatenate((y, y_new), axis=0)\n", "print(X_all.shape, y_all.shape)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((920, 4), (920,), (230, 4), (230,))" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)\n", "X_train.shape, y_train.shape, X_test.shape, y_test.shape" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.3869565217391304, 0.30434782608695654)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m1 = svm.SVC(gamma='auto')\n", "m2 = GaussianNB()\n", "m1.fit(X_train, y_train);\n", "m2.fit(X_train, y_train);\n", "m1.score(X_test, y_test), m2.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def boostrap(test, m1, m2, b=10, n=10, delta=0.05):\n", " ds = []\n", " X_test, y_test = test\n", " for i in range(b):\n", " choices = random.choices(range(X_test.shape[0]), k=n)\n", " _X_test = X_test[choices, :]\n", " _y_test = y_test[choices]\n", " dm1 = m1.score(_X_test, _y_test)\n", " dm2 = m2.score(_X_test, _y_test)\n", " d = dm1 - dm2\n", " ds.append(d)\n", " p = sum([_ for _ in ds if _ > 2*delta])/b\n", " return p" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "0.1166666666666667" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "boostrap((X_test, y_test), m1, m2, 30, 10, 0.025)\n", "# 不能拒绝原假设:m1 在双边置信度 0.05 下不比 m2 好,所以 m1 不比 m2 好" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.08666666666666666" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "boostrap((X_test, y_test), m1, m2, 30, 10, 0.05)\n", "# 拒绝原假设:m1 在双边置信度 0.1 下不比 m2 好,所以 m1 比 m2 好" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }