{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Required for accessing OpenML datasets from Lale.\n", "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install 'liac-arff>=2.4.0'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dataset with class imbalance" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import lale.datasets.openml\n", "import numpy as np\n", "import pandas as pd\n", "\n", "# Fetch the preprocessed OpenML 'breast-cancer' classification task as train/test splits.\n", "(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n", "    'breast-cancer', 'classification', preprocess=True\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([140, 51])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of training examples per class label; the counts are uneven.\n", "np.bincount(train_y)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.decomposition import PCA\n", "from sklearn.kernel_approximation import Nystroem as Nys\n", "from sklearn.linear_model import LogisticRegression as LR\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from xgboost import XGBClassifier\n", "\n", "import lale\n", "from lale.lib.lale import NoOp\n", "from lale.lib.lale import Hyperopt\n", "from lale.lib.imblearn import SMOTE, CondensedNearestNeighbour, SMOTEENN\n", "\n", "# Wrap the imported operator classes so they can be combined with >> and tuned by Hyperopt.\n", "lale.wrap_imported_operators()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A pipeline without any imbalance correction" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {},
"outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Baseline: scaler -> PCA -> random forest, with no resampling step.\n", "pipeline_without_correction = (\n", "    MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "pipeline_without_correction.visualize()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00, 1.41s/trial, best loss: -0.6583766233766233]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00-0.6238640.2038960.567272ok
p11-0.6316560.2378080.566987ok
p22-0.6583770.2296310.558715ok
p33-0.5793830.2122690.628262ok
p44-0.6167860.2520340.567862ok
p55-0.6188640.2814580.585125ok
p66-0.6500320.3426330.556056ok
p77-0.6528250.2700160.558804ok
p88-0.6336040.2144080.578119ok
p99-0.6325000.2783360.576655ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 -0.623864 0.203896 0.567272 ok\n", "p1 1 -0.631656 0.237808 0.566987 ok\n", "p2 2 -0.658377 0.229631 0.558715 ok\n", "p3 3 -0.579383 0.212269 0.628262 ok\n", "p4 4 -0.616786 0.252034 0.567862 ok\n", "p5 5 -0.618864 0.281458 0.585125 ok\n", "p6 6 -0.650032 0.342633 0.556056 ok\n", "p7 7 -0.652825 0.270016 0.558804 ok\n", "p8 8 -0.633604 0.214408 0.578119 ok\n", "p9 9 -0.632500 0.278336 0.576655 ok" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tune the uncorrected pipeline for ROC AUC over 10 Hyperopt trials.\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_without_correction, max_evals=10, scoring='roc_auc'\n", ")\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "# Give predict the same array type that fit received.\n", "predictions = trained_optimizer.predict(np.array(test_X))\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6421052631578947\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the held-out split; note that the search above optimised ROC AUC, not accuracy.\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Over-sampling from imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00, 1.40s/trial, best loss: -0.6835011143534503]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00-0.6249540.1909580.616522ok
p11-0.6501570.2428150.655870ok
p22-0.6835010.2114130.666853ok
p33-0.6041010.1859560.599714ok
p44-0.5175290.2658530.655569ok
p55-0.5561870.3632490.635348ok
p66-0.5684770.2987470.648213ok
p77-0.5796160.2287330.658655ok
p88-0.6302380.2222520.617986ok
p99-0.5906780.3286170.641440ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 -0.624954 0.190958 0.616522 ok\n", "p1 1 -0.650157 0.242815 0.655870 ok\n", "p2 2 -0.683501 0.211413 0.666853 ok\n", "p3 3 -0.604101 0.185956 0.599714 ok\n", "p4 4 -0.517529 0.265853 0.655569 ok\n", "p5 5 -0.556187 0.363249 0.635348 ok\n", "p6 6 -0.568477 0.298747 0.648213 ok\n", "p7 7 -0.579616 0.228733 0.658655 ok\n", "p8 8 -0.630238 0.222252 0.617986 ok\n", "p9 9 -0.590678 0.328617 0.641440 ok" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same scaler -> PCA -> random forest pipeline, wrapped in the SMOTE over-sampler.\n", "pipeline_with_correction = SMOTE(\n", "    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_with_correction, max_evals=10, scoring='roc_auc'\n", ")\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "# Give predict the same array type that fit received.\n", "predictions = trained_optimizer.predict(np.array(test_X))\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.631578947368421\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "SMOTE\n", "\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the held-out split; note that the search above optimised ROC AUC, not accuracy.\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "###
Under-sampling from imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00, 2.26s/trial, best loss: -0.6813611664876958]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00-0.6424470.3539580.604473ok
p11-0.5982460.4248640.626900ok
p22-0.6795200.4525100.607253ok
p33-0.5813290.3541750.648877ok
p44-0.6262330.4686220.619565ok
p55-0.6442040.4168040.613639ok
p66-0.6491060.4457630.618898ok
p77-0.5991450.5051830.624448ok
p88-0.5842140.3688970.622633ok
p99-0.6813610.4443880.582320ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 -0.642447 0.353958 0.604473 ok\n", "p1 1 -0.598246 0.424864 0.626900 ok\n", "p2 2 -0.679520 0.452510 0.607253 ok\n", "p3 3 -0.581329 0.354175 0.648877 ok\n", "p4 4 -0.626233 0.468622 0.619565 ok\n", "p5 5 -0.644204 0.416804 0.613639 ok\n", "p6 6 -0.649106 0.445763 0.618898 ok\n", "p7 7 -0.599145 0.505183 0.624448 ok\n", "p8 8 -0.584214 0.368897 0.622633 ok\n", "p9 9 -0.681361 0.444388 0.582320 ok" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same scaler -> PCA -> random forest pipeline, wrapped in the CondensedNearestNeighbour under-sampler.\n", "pipeline_with_correction = CondensedNearestNeighbour(\n", "    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_with_correction, max_evals=10, scoring='roc_auc'\n", ")\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "# Give predict the same array type that fit received.\n", "predictions = trained_optimizer.predict(np.array(test_X))\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.6947368421052632\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "CondensedNearestNeighbour\n", "\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the held-out split; note that the search above optimised ROC AUC, not accuracy.\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] }, { "cell_type":
"markdown", "metadata": {}, "source": [ "### Combined over- and under-sampling from imbalanced-learn" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00, 1.26s/trial, best loss: -0.664063068312234]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tidlosstimelog_lossstatus
name
p00-0.5741950.1974060.874854ok
p11-0.5815620.2118100.968561ok
p22-0.6640630.2003661.025431ok
p33-0.6616210.1837950.721990ok
p44-0.5542060.2930480.857730ok
p55-0.5934520.2406700.826638ok
p66-0.6365240.2641100.809327ok
p77-0.6372880.2194590.894342ok
p88-0.6570680.1855661.327299ok
p99-0.6267700.2414341.025155ok
\n", "
" ], "text/plain": [ " tid loss time log_loss status\n", "name \n", "p0 0 -0.574195 0.197406 0.874854 ok\n", "p1 1 -0.581562 0.211810 0.968561 ok\n", "p2 2 -0.664063 0.200366 1.025431 ok\n", "p3 3 -0.661621 0.183795 0.721990 ok\n", "p4 4 -0.554206 0.293048 0.857730 ok\n", "p5 5 -0.593452 0.240670 0.826638 ok\n", "p6 6 -0.636524 0.264110 0.809327 ok\n", "p7 7 -0.637288 0.219459 0.894342 ok\n", "p8 8 -0.657068 0.185566 1.327299 ok\n", "p9 9 -0.626770 0.241434 1.025155 ok" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same scaler -> PCA -> random forest pipeline, wrapped in the SMOTEENN combined resampler.\n", "pipeline_with_correction = SMOTEENN(\n", "    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n", ")\n", "optimizer = Hyperopt(\n", "    estimator=pipeline_with_correction, max_evals=10, scoring='roc_auc'\n", ")\n", "trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n", "# Give predict the same array type that fit received.\n", "predictions = trained_optimizer.predict(np.array(test_X))\n", "trained_optimizer.summary()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.35789473684210527\n" ] }, { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "cluster:(root)\n", "\n", "\n", "SMOTEENN\n", "\n", "\n", "\n", "\n", "cluster:pipeline\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler\n", "\n", "\n", "Min-\n", "Max-\n", "Scaler\n", "\n", "\n", "\n", "\n", "\n", "pca\n", "\n", "\n", "PCA\n", "\n", "\n", "\n", "\n", "\n", "min_max_scaler->pca\n", "\n", "\n", "\n", "\n", "\n", "random_forest_classifier\n", "\n", "\n", "Random-\n", "Forest-\n", "Classifier\n", "\n", "\n", "\n", "\n", "\n", "pca->random_forest_classifier\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Accuracy on the held-out split; note that the search above optimised ROC AUC, not accuracy.\n", "# NOTE(review): the stored accuracy for this variant (~0.36) is far below the other runs in this\n", "# notebook (~0.63-0.69); worth inspecting a confusion matrix before drawing conclusions -- TODO confirm.\n", "print(accuracy_score(test_y, predictions))\n", "best_estimator = trained_optimizer.get_pipeline()\n", "best_estimator.visualize()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3
(ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.1" } }, "nbformat": 4, "nbformat_minor": 2 }