{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score\n", "from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier\n", "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", "import numpy as np\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Naive bayes" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BernoulliNB()" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.datasets import load_iris\n", "iris = load_iris()\n", "X_train, X_test, y_train, y_test = train_test_split(iris.data,\n", " iris.target,\n", " test_size=0.2,\n", " random_state=0)\n", "\n", "gauss_clf = GaussianNB()\n", "multi_clf = MultinomialNB()\n", "bernl_clf = BernoulliNB()\n", "\n", "gauss_clf.fit(X_train, y_train)\n", "multi_clf.fit(X_train, y_train)\n", "bernl_clf.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "y_pred_gauss = gauss_clf.predict(X_test)\n", "y_pred_multi = multi_clf.predict(X_test)\n", "y_pred_bernl = bernl_clf.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 11\n", " 1 0.93 1.00 0.96 13\n", " 2 1.00 0.83 0.91 6\n", "\n", " accuracy 0.97 30\n", " macro avg 0.98 0.94 0.96 30\n", "weighted avg 0.97 0.97 0.97 30\n", "\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 11\n", " 1 0.00 0.00 0.00 13\n", " 
2 0.32 1.00 0.48 6\n", "\n", " accuracy 0.57 30\n", " macro avg 0.44 0.67 0.49 30\n", "weighted avg 0.43 0.57 0.46 30\n", "\n", " precision recall f1-score support\n", "\n", " 0 0.00 0.00 0.00 11\n", " 1 0.00 0.00 0.00 13\n", " 2 0.20 1.00 0.33 6\n", "\n", " accuracy 0.20 30\n", " macro avg 0.07 0.33 0.11 30\n", "weighted avg 0.04 0.20 0.07 30\n", "\n" ] } ], "source": [
  "# One report per Naive Bayes variant, in the order Gaussian, Multinomial, Bernoulli.\n",
  "for y_pred in (y_pred_gauss, y_pred_multi, y_pred_bernl):\n",
  "    print(classification_report(y_test, y_pred))" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## SVM" ] },
 { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [
  "from sklearn.datasets import load_svmlight_file\n",
  "\n",
  "# NOTE: X_train / y_train held the iris split until here; from this cell on they hold the ijcnn1 data.\n",
  "X_train, y_train = load_svmlight_file('data_set/ijcnn1.bz2')\n",
  "svc = SVC(kernel='rbf', random_state=101)" ] },
 { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SVC with rbf kernel -> cross validation accuracy: mean = 0.9625, std = 0.0185\n", "Wall time: 31.3 s\n" ] } ], "source": [
  "%%time\n",
  "# 5-fold accuracy of the untuned RBF SVC; n_jobs=-1 spreads the folds over all cores.\n",
  "scores = cross_val_score(\n",
  "    svc, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)\n",
  "print('SVC with rbf kernel -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n",
  "      .format(scores.mean(), scores.std()))" ] },
 { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best parameters {'gamma': 0.1, 'C': 100}\n", "Cross validation accuracy: mean= 0.9625\n", "Wall time: 9min 7s\n" ] } ], "source": [ "%%time\n", "svc_new = SVC(kernel='rbf', random_state=101)\n", "search_dict = {\n", " 'C': [0.01, 0.1, 1, 10, 100],\n", " 'gamma': [0.1, 0.01, 0.001, 0.0001]\n", "}\n", "search_func = RandomizedSearchCV(estimator=svc_new,\n", " param_distributions=search_dict,\n", " n_iter=10,\n", " scoring='accuracy',\n", " n_jobs=-1,\n", " 
iid=True,\n", " refit=True,\n", " cv=5,\n", " random_state=101)\n", "search_func.fit(X_train, y_train)\n", "print('Best parameters {}'.format(search_func.best_params_))\n", "print('Cross validation accuracy: mean= {:.4f}'.format(search_func.best_score_))" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SVC with rbf kernel -> cross validation accuracy: mean = 0.9625, std = 0.0185\n", "Wall time: 41 s\n" ] } ], "source": [ "%%time\n", "svc_best = SVC(C=100, gamma=0.1, kernel='rbf', random_state=101)\n", "svc_best.fit(X_train, y_train)\n", "print(\n", " 'SVC with rbf kernel -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n", " .format(np.mean(scores), np.std(scores)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RandomForest ExtraTrees" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(581012, 54)" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.datasets import fetch_covtype\n", "covertype = fetch_covtype()\n", "covertype.data.shape" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "covertype_x = covertype.data\n", "covertype_y = covertype.target\n", "covertype_x_train, covertype_x_test_val, covertype_y_train, covertype_y_test_val = train_test_split(\n", " covertype_x, covertype_y, test_size=0.4, random_state=42)\n", "covertype_x_test, covertype_x_val, covertype_y_test, covertype_y_val = train_test_split(\n", " covertype_x_test_val, covertype_y_test_val, test_size=0.5, random_state=42)\n", "covertypes = [\n", " 'Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine', 'Cottonwood/Willow',\n", " 'Aspen', 'Douglas-fir', 'Krummholz'\n", "]" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(348607, 54)\n", "(116203, 
54)\n", "(116202, 54)\n" ] } ], "source": [
  "print(covertype_x_train.shape)\n",
  "print(covertype_x_val.shape)\n",
  "print(covertype_x_test.shape)" ] },
 { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RandomForestClassifier -> cross validation accuracy: mean = 0.9431, std = 0.0007\n", "Wall time: 2min 41s\n" ] } ], "source": [
  "%%time\n",
  "# 5-fold accuracy of a 100-tree random forest on the covertype training split.\n",
  "rfc = RandomForestClassifier(n_estimators=100, random_state=101)\n",
  "scores = cross_val_score(rfc,\n",
  "                         covertype_x_train,\n",
  "                         covertype_y_train,\n",
  "                         cv=5,\n",
  "                         scoring='accuracy',\n",
  "                         n_jobs=-1)\n",
  "print(\n",
  "    'RandomForestClassifier -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n",
  "    .format(np.mean(scores), np.std(scores)))" ] },
 { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.94251456, 0.94254324, 0.94264282, 0.94429225, 0.94356076])" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "scores" ] },
 { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ExtraTreesClassifier -> cross validation accuracy: mean = 0.9426, std = 0.0008\n", "Wall time: 2min 49s\n" ] } ], "source": [
  "%%time\n",
  "# Same protocol for extremely randomized trees; note that `scores` is overwritten here.\n",
  "etc = ExtraTreesClassifier(n_estimators=100, random_state=101)\n",
  "scores = cross_val_score(etc,\n",
  "                         covertype_x_train,\n",
  "                         covertype_y_train,\n",
  "                         cv=5,\n",
  "                         scoring='accuracy',\n",
  "                         n_jobs=-1)\n",
  "print(\n",
  "    'ExtraTreesClassifier -> cross validation accuracy: mean = {:.4f}, std = {:.4f}'\n",
  "    .format(np.mean(scores), np.std(scores)))" ] },
 { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.94211296, 0.94245719, 0.94202608, 0.94426356, 0.94219819])" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "scores" ] },
 { "cell_type": "markdown", 
"metadata": {}, "source": [ "## CalibrationClassifierCV" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.calibration import CalibratedClassifierCV\n", "from sklearn.calibration import calibration_curve" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "rfc = RandomForestClassifier(n_estimators=100, random_state=101)\n", "calibration = CalibratedClassifierCV(rfc, method='sigmoid', cv=5)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "rfc.fit(covertype_x_train, covertype_y_train)\n", "calibration.fit(covertype_x_train, covertype_y_train)\n", "prob_raw = rfc.predict_proba(covertype_x_test)\n", "prob_cal = calibration.predict_proba(covertype_x_test)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | raw | \n", "calibrated | \n", "
---|---|---|
0 | \n", "0.00 | \n", "0.001132 | \n", "
1 | \n", "0.42 | \n", "0.356083 | \n", "
2 | \n", "0.00 | \n", "0.001131 | \n", "
3 | \n", "0.00 | \n", "0.001130 | \n", "
4 | \n", "0.00 | \n", "0.001130 | \n", "
... | \n", "... | \n", "... | \n", "
116197 | \n", "0.00 | \n", "0.001131 | \n", "
116198 | \n", "0.00 | \n", "0.001128 | \n", "
116199 | \n", "0.32 | \n", "0.079379 | \n", "
116200 | \n", "0.00 | \n", "0.001134 | \n", "
116201 | \n", "0.00 | \n", "0.001130 | \n", "
116202 rows × 2 columns
\n", "