{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: liac-arff>=2.4.0 in /Users/kakateus.ibm.com/anaconda3/lib/python3.7/site-packages (2.4.0)\n", "\u001b[33mWARNING: You are using pip version 20.1.1; however, version 20.2.2 is available.\n", "You should consider upgrading via the '/Users/kakateus.ibm.com/anaconda3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "# Required for accessing OpenML datasets from Lale.\n", "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset with class imbalance" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import lale.datasets.openml\n", "import pandas as pd\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", " 'breast-cancer', 'classification', preprocess=True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([140, 51])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "np.bincount(train_y)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kakateus.ibm.com/anaconda3/lib/python3.7/site-packages/pyparsing.py:3174: FutureWarning: Possible set intersection at position 3\n", " self.re = re.compile(self.reString)\n" ] } ], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.decomposition import PCA\n", "from sklearn.kernel_approximation import Nystroem as Nys\n", "from lale.lib.lale import NoOp\n", "from sklearn.linear_model import LogisticRegression as LR\n", "# Public import path: the private sklearn.ensemble.forest module was deprecated in scikit-learn 0.22 and removed in 0.24.\n", "from sklearn.ensemble import RandomForestClassifier\n", "from xgboost import XGBClassifier\n", "from lale.lib.lale import Hyperopt\n", "from 
lale.lib.imblearn import SMOTE, CondensedNearestNeighbour, SMOTEENN\n", "from sklearn.metrics import accuracy_score\n", "import lale\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A pipeline without any imbalance correction" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pipeline_without_correction = MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", "pipeline_without_correction.visualize()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [00:18<00:00, 1.90s/trial, best loss: -0.6403571428571428]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00NaNNaNNaNfail
p11-0.6403570.3570250.566292ok
p22-0.5824030.3622110.609279ok
p33-0.5516880.3093760.599048ok
p44-0.6316560.3501060.576526ok
p55NaNNaNNaNfail
p66-0.5000000.4973620.580290ok
p77NaNNaNNaNfail
p88-0.5000000.3863820.580199ok
p99-0.5943180.4435110.602166ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 NaN NaN NaN fail\n", "p1 1 -0.640357 0.357025 0.566292 ok\n", "p2 2 -0.582403 0.362211 0.609279 ok\n", "p3 3 -0.551688 0.309376 0.599048 ok\n", "p4 4 -0.631656 0.350106 0.576526 ok\n", "p5 5 NaN NaN NaN fail\n", "p6 6 -0.500000 0.497362 0.580290 ok\n", "p7 7 NaN NaN NaN fail\n", "p8 8 -0.500000 0.386382 0.580199 ok\n", "p9 9 -0.594318 0.443511 0.602166 ok" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tune the uncorrected pipeline: 10 Hyperopt trials, scored by ROC AUC.\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_without_correction,\n", "    max_evals=10,\n", "    scoring='roc_auc',\n", ")\n", "# Fit on ndarray copies of the training split, then predict the test split.\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "predictions = trained_optimizer.predict(test_X)\n", "# Per-trial summary table is the cell output.\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6421052631578947\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the test split, then a diagram of the pipeline returned by get_pipeline().\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Over-sampling from imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [00:22<00:00, 2.23s/trial, best loss: 
-0.6823150983857325]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00NaNNaNNaNfail
p11-0.6122150.4763610.661132ok
p22-0.6733100.4941340.640531ok
p33-0.6823150.3690890.642915ok
p44-0.6479390.4815000.687333ok
p55NaNNaNNaNfail
p66-0.5000000.5088200.693272ok
p77NaNNaNNaNfail
p88-0.5000000.5125380.698014ok
p99-0.6387700.4576680.669854ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 NaN NaN NaN fail\n", "p1 1 -0.612215 0.476361 0.661132 ok\n", "p2 2 -0.673310 0.494134 0.640531 ok\n", "p3 3 -0.682315 0.369089 0.642915 ok\n", "p4 4 -0.647939 0.481500 0.687333 ok\n", "p5 5 NaN NaN NaN fail\n", "p6 6 -0.500000 0.508820 0.693272 ok\n", "p7 7 NaN NaN NaN fail\n", "p8 8 -0.500000 0.512538 0.698014 ok\n", "p9 9 -0.638770 0.457668 0.669854 ok" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Wrap the scaler >> PCA >> random-forest pipeline in SMOTE, then run 10 Hyperopt trials scored by ROC AUC.\n", "pipeline_with_correction = SMOTE(\n", "    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_with_correction,\n", "    max_evals=10,\n", "    scoring='roc_auc',\n", ")\n", "# Fit on ndarray copies of the training split, then predict the test split.\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "predictions = trained_optimizer.predict(test_X)\n", "# Per-trial summary table is the cell output.\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "SMOTE\n", "\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the test split, then a diagram of the pipeline returned by get_pipeline().\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Under-sampling from imbalanced-learn" ] }, { "cell_type": 
"code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [00:28<00:00, 2.87s/trial, best loss: -0.665297478599759]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00NaNNaNNaNfail
p11-0.6201970.6187750.637794ok
p22-0.6183100.6514790.634385ok
p33-0.6297340.5798520.627842ok
p44-0.6652970.7381030.644552ok
p55NaNNaNNaNfail
p66-0.5000000.7904570.650936ok
p77NaNNaNNaNfail
p88-0.5000000.6009540.651208ok
p99-0.6268700.5954590.699177ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 NaN NaN NaN fail\n", "p1 1 -0.620197 0.618775 0.637794 ok\n", "p2 2 -0.618310 0.651479 0.634385 ok\n", "p3 3 -0.629734 0.579852 0.627842 ok\n", "p4 4 -0.665297 0.738103 0.644552 ok\n", "p5 5 NaN NaN NaN fail\n", "p6 6 -0.500000 0.790457 0.650936 ok\n", "p7 7 NaN NaN NaN fail\n", "p8 8 -0.500000 0.600954 0.651208 ok\n", "p9 9 -0.626870 0.595459 0.699177 ok" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Wrap the scaler >> PCA >> random-forest pipeline in CondensedNearestNeighbour, then run 10 Hyperopt trials scored by ROC AUC.\n", "pipeline_with_correction = CondensedNearestNeighbour(\n", "    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_with_correction,\n", "    max_evals=10,\n", "    scoring='roc_auc',\n", ")\n", "# Fit on ndarray copies of the training split, then predict the test split.\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "predictions = trained_optimizer.predict(test_X)\n", "# Per-trial summary table is the cell output.\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6421052631578947\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "CondensedNearestNeighbour\n", "\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the test split, then a diagram of the pipeline returned by get_pipeline().\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 
Combined over- and under-sampling from imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████| 10/10 [00:20<00:00, 2.04s/trial, best loss: -0.6697127399852649]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00NaNNaNNaNfail
p11-0.6221210.4100150.890017ok
p22-0.6171320.4070553.463371ok
p33-0.6359590.3749803.379093ok
p44-0.5683820.5067850.993301ok
p55NaNNaNNaNfail
p66-0.5000000.5406390.986296ok
p77NaNNaNNaNfail
p88-0.5000000.5393461.009724ok
p99-0.6697130.4538105.687978ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 NaN NaN NaN fail\n", "p1 1 -0.622121 0.410015 0.890017 ok\n", "p2 2 -0.617132 0.407055 3.463371 ok\n", "p3 3 -0.635959 0.374980 3.379093 ok\n", "p4 4 -0.568382 0.506785 0.993301 ok\n", "p5 5 NaN NaN NaN fail\n", "p6 6 -0.500000 0.540639 0.986296 ok\n", "p7 7 NaN NaN NaN fail\n", "p8 8 -0.500000 0.539346 1.009724 ok\n", "p9 9 -0.669713 0.453810 5.687978 ok" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Wrap the scaler >> PCA >> random-forest pipeline in SMOTEENN, then run 10 Hyperopt trials scored by ROC AUC.\n", "pipeline_with_correction = SMOTEENN(\n", "    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_with_correction,\n", "    max_evals=10,\n", "    scoring='roc_auc',\n", ")\n", "# Fit on ndarray copies of the training split, then predict the test split.\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "predictions = trained_optimizer.predict(test_X)\n", "# Per-trial summary table is the cell output.\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.4842105263157895\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "SMOTEENN\n", "\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the test split, then a diagram of the pipeline returned by get_pipeline().\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" 
}, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }