{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: liac-arff>=2.4.0 in /Users/kakateus.ibm.com/anaconda3/lib/python3.7/site-packages (2.4.0)\n",
"\u001b[33mWARNING: You are using pip version 20.1.1; however, version 20.2.2 is available.\n",
"You should consider upgrading via the '/Users/kakateus.ibm.com/anaconda3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"# Required for accessing OpenML datasets from Lale.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel, and the\n",
"# double quotes keep '>=' from being read as a shell redirect on any platform.\n",
"%pip install \"liac-arff>=2.4.0\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dataset with class imbalance"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import lale.datasets.openml\n",
"import pandas as pd\n",
"# Load the 'breast-cancer' classification dataset through Lale's OpenML helper; the\n",
"# result is unpacked into a train split and a test split of features and labels.\n",
"# preprocess=True asks fetch for preprocessed features (presumably numeric\n",
"# encodings -- confirm against the Lale documentation).\n",
"(train_X, train_y), (test_X, test_y) = lale.datasets.openml.fetch(\n",
" 'breast-cancer', 'classification', preprocess=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([140, 51])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"# Count the training samples per integer class label; the recorded output\n",
"# (140 vs 51) shows the class imbalance this notebook is about.\n",
"np.bincount(train_y)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/kakateus.ibm.com/anaconda3/lib/python3.7/site-packages/pyparsing.py:3174: FutureWarning: Possible set intersection at position 3\n",
" self.re = re.compile(self.reString)\n"
]
}
],
"source": [
"# Operators used to build the pipelines below. wrap_imported_operators() at the end\n",
"# of this cell swaps the imported classes for their Lale wrappers, so every operator\n",
"# import has to appear before that call.\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.kernel_approximation import Nystroem as Nys\n",
"from lale.lib.lale import NoOp\n",
"from sklearn.linear_model import LogisticRegression as LR\n",
"# Import from the public package: the private sklearn.ensemble.forest module was\n",
"# deprecated in scikit-learn 0.22 and removed in 0.24.\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from xgboost import XGBClassifier\n",
"from lale.lib.lale import Hyperopt\n",
"from lale.lib.imblearn import SMOTE, CondensedNearestNeighbour, SMOTEENN\n",
"from sklearn.metrics import accuracy_score\n",
"import lale\n",
"lale.wrap_imported_operators()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### A pipeline without any imbalance correction"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Baseline pipeline: scale, project with PCA, then classify -- no resampling step.\n",
"pipeline_without_correction = (\n",
"    MinMaxScaler()\n",
"    >> PCA()\n",
"    >> RandomForestClassifier()\n",
")\n",
"pipeline_without_correction.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:18<00:00, 1.90s/trial, best loss: -0.6403571428571428]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tid | \n",
" loss | \n",
" time | \n",
" log_loss | \n",
" status | \n",
"
\n",
" \n",
" name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" p0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p1 | \n",
" 1 | \n",
" -0.640357 | \n",
" 0.357025 | \n",
" 0.566292 | \n",
" ok | \n",
"
\n",
" \n",
" p2 | \n",
" 2 | \n",
" -0.582403 | \n",
" 0.362211 | \n",
" 0.609279 | \n",
" ok | \n",
"
\n",
" \n",
" p3 | \n",
" 3 | \n",
" -0.551688 | \n",
" 0.309376 | \n",
" 0.599048 | \n",
" ok | \n",
"
\n",
" \n",
" p4 | \n",
" 4 | \n",
" -0.631656 | \n",
" 0.350106 | \n",
" 0.576526 | \n",
" ok | \n",
"
\n",
" \n",
" p5 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p6 | \n",
" 6 | \n",
" -0.500000 | \n",
" 0.497362 | \n",
" 0.580290 | \n",
" ok | \n",
"
\n",
" \n",
" p7 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p8 | \n",
" 8 | \n",
" -0.500000 | \n",
" 0.386382 | \n",
" 0.580199 | \n",
" ok | \n",
"
\n",
" \n",
" p9 | \n",
" 9 | \n",
" -0.594318 | \n",
" 0.443511 | \n",
" 0.602166 | \n",
" ok | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 NaN NaN NaN fail\n",
"p1 1 -0.640357 0.357025 0.566292 ok\n",
"p2 2 -0.582403 0.362211 0.609279 ok\n",
"p3 3 -0.551688 0.309376 0.599048 ok\n",
"p4 4 -0.631656 0.350106 0.576526 ok\n",
"p5 5 NaN NaN NaN fail\n",
"p6 6 -0.500000 0.497362 0.580290 ok\n",
"p7 7 NaN NaN NaN fail\n",
"p8 8 -0.500000 0.386382 0.580199 ok\n",
"p9 9 -0.594318 0.443511 0.602166 ok"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tune the baseline pipeline with Hyperopt: 10 trials, scored by ROC AUC.\n",
"optimizer = Hyperopt(\n",
"    estimator=pipeline_without_correction,\n",
"    max_evals=10,\n",
"    scoring='roc_auc',\n",
")\n",
"trained_optimizer = optimizer.fit(\n",
"    np.array(train_X),\n",
"    np.array(train_y),\n",
")\n",
"# Test-split predictions are kept for the accuracy check in the next cell.\n",
"predictions = trained_optimizer.predict(test_X)\n",
"# Per-trial table (loss, time, log_loss, status).\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6421052631578947\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Test-set accuracy of the predictions produced by the search in the previous cell.\n",
"# NOTE(review): the search optimizes roc_auc while this reports plain accuracy, which\n",
"# can be misleading on imbalanced classes -- consider also reporting a balanced metric.\n",
"print(accuracy_score(test_y, predictions))\n",
"# Retrieve the pipeline held by the trained optimizer (presumably the best trial --\n",
"# confirm against the Lale Hyperopt documentation) and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Over-sampling from imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:22<00:00, 2.23s/trial, best loss: -0.6823150983857325]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tid | \n",
" loss | \n",
" time | \n",
" log_loss | \n",
" status | \n",
"
\n",
" \n",
" name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" p0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p1 | \n",
" 1 | \n",
" -0.612215 | \n",
" 0.476361 | \n",
" 0.661132 | \n",
" ok | \n",
"
\n",
" \n",
" p2 | \n",
" 2 | \n",
" -0.673310 | \n",
" 0.494134 | \n",
" 0.640531 | \n",
" ok | \n",
"
\n",
" \n",
" p3 | \n",
" 3 | \n",
" -0.682315 | \n",
" 0.369089 | \n",
" 0.642915 | \n",
" ok | \n",
"
\n",
" \n",
" p4 | \n",
" 4 | \n",
" -0.647939 | \n",
" 0.481500 | \n",
" 0.687333 | \n",
" ok | \n",
"
\n",
" \n",
" p5 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p6 | \n",
" 6 | \n",
" -0.500000 | \n",
" 0.508820 | \n",
" 0.693272 | \n",
" ok | \n",
"
\n",
" \n",
" p7 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p8 | \n",
" 8 | \n",
" -0.500000 | \n",
" 0.512538 | \n",
" 0.698014 | \n",
" ok | \n",
"
\n",
" \n",
" p9 | \n",
" 9 | \n",
" -0.638770 | \n",
" 0.457668 | \n",
" 0.669854 | \n",
" ok | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 NaN NaN NaN fail\n",
"p1 1 -0.612215 0.476361 0.661132 ok\n",
"p2 2 -0.673310 0.494134 0.640531 ok\n",
"p3 3 -0.682315 0.369089 0.642915 ok\n",
"p4 4 -0.647939 0.481500 0.687333 ok\n",
"p5 5 NaN NaN NaN fail\n",
"p6 6 -0.500000 0.508820 0.693272 ok\n",
"p7 7 NaN NaN NaN fail\n",
"p8 8 -0.500000 0.512538 0.698014 ok\n",
"p9 9 -0.638770 0.457668 0.669854 ok"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Over-sampling: SMOTE wraps the same scale -> PCA -> random-forest pipeline through\n",
"# its operator argument.\n",
"pipeline_with_correction = SMOTE(\n",
"    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n",
")\n",
"# Same search budget and metric as the baseline: 10 trials, scored by ROC AUC.\n",
"optimizer = Hyperopt(\n",
"    estimator=pipeline_with_correction,\n",
"    max_evals=10,\n",
"    scoring='roc_auc',\n",
")\n",
"trained_optimizer = optimizer.fit(\n",
"    np.array(train_X),\n",
"    np.array(train_y),\n",
")\n",
"# Test-split predictions are kept for the accuracy check in the next cell.\n",
"predictions = trained_optimizer.predict(test_X)\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Test-set accuracy of the predictions from the SMOTE-wrapped search above.\n",
"# NOTE(review): accuracy is reported although the search optimized roc_auc; on\n",
"# imbalanced classes accuracy alone can hide minority-class behaviour.\n",
"print(accuracy_score(test_y, predictions))\n",
"# Retrieve the pipeline held by the trained optimizer (presumably the best trial --\n",
"# confirm against the Lale Hyperopt documentation) and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Under-sampling from imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:28<00:00, 2.87s/trial, best loss: -0.665297478599759]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tid | \n",
" loss | \n",
" time | \n",
" log_loss | \n",
" status | \n",
"
\n",
" \n",
" name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" p0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p1 | \n",
" 1 | \n",
" -0.620197 | \n",
" 0.618775 | \n",
" 0.637794 | \n",
" ok | \n",
"
\n",
" \n",
" p2 | \n",
" 2 | \n",
" -0.618310 | \n",
" 0.651479 | \n",
" 0.634385 | \n",
" ok | \n",
"
\n",
" \n",
" p3 | \n",
" 3 | \n",
" -0.629734 | \n",
" 0.579852 | \n",
" 0.627842 | \n",
" ok | \n",
"
\n",
" \n",
" p4 | \n",
" 4 | \n",
" -0.665297 | \n",
" 0.738103 | \n",
" 0.644552 | \n",
" ok | \n",
"
\n",
" \n",
" p5 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p6 | \n",
" 6 | \n",
" -0.500000 | \n",
" 0.790457 | \n",
" 0.650936 | \n",
" ok | \n",
"
\n",
" \n",
" p7 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p8 | \n",
" 8 | \n",
" -0.500000 | \n",
" 0.600954 | \n",
" 0.651208 | \n",
" ok | \n",
"
\n",
" \n",
" p9 | \n",
" 9 | \n",
" -0.626870 | \n",
" 0.595459 | \n",
" 0.699177 | \n",
" ok | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 NaN NaN NaN fail\n",
"p1 1 -0.620197 0.618775 0.637794 ok\n",
"p2 2 -0.618310 0.651479 0.634385 ok\n",
"p3 3 -0.629734 0.579852 0.627842 ok\n",
"p4 4 -0.665297 0.738103 0.644552 ok\n",
"p5 5 NaN NaN NaN fail\n",
"p6 6 -0.500000 0.790457 0.650936 ok\n",
"p7 7 NaN NaN NaN fail\n",
"p8 8 -0.500000 0.600954 0.651208 ok\n",
"p9 9 -0.626870 0.595459 0.699177 ok"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Under-sampling: CondensedNearestNeighbour wraps the same scale -> PCA ->\n",
"# random-forest pipeline through its operator argument.\n",
"pipeline_with_correction = CondensedNearestNeighbour(\n",
"    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n",
")\n",
"# Same search budget and metric as the baseline: 10 trials, scored by ROC AUC.\n",
"optimizer = Hyperopt(\n",
"    estimator=pipeline_with_correction,\n",
"    max_evals=10,\n",
"    scoring='roc_auc',\n",
")\n",
"trained_optimizer = optimizer.fit(\n",
"    np.array(train_X),\n",
"    np.array(train_y),\n",
")\n",
"# Test-split predictions are kept for the accuracy check in the next cell.\n",
"predictions = trained_optimizer.predict(test_X)\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6421052631578947\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Test-set accuracy of the predictions from the CondensedNearestNeighbour-wrapped\n",
"# search above.\n",
"# NOTE(review): accuracy is reported although the search optimized roc_auc; on\n",
"# imbalanced classes accuracy alone can hide minority-class behaviour.\n",
"print(accuracy_score(test_y, predictions))\n",
"# Retrieve the pipeline held by the trained optimizer (presumably the best trial --\n",
"# confirm against the Lale Hyperopt documentation) and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combined over- and under-sampling from imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:20<00:00, 2.04s/trial, best loss: -0.6697127399852649]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tid | \n",
" loss | \n",
" time | \n",
" log_loss | \n",
" status | \n",
"
\n",
" \n",
" name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" p0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p1 | \n",
" 1 | \n",
" -0.622121 | \n",
" 0.410015 | \n",
" 0.890017 | \n",
" ok | \n",
"
\n",
" \n",
" p2 | \n",
" 2 | \n",
" -0.617132 | \n",
" 0.407055 | \n",
" 3.463371 | \n",
" ok | \n",
"
\n",
" \n",
" p3 | \n",
" 3 | \n",
" -0.635959 | \n",
" 0.374980 | \n",
" 3.379093 | \n",
" ok | \n",
"
\n",
" \n",
" p4 | \n",
" 4 | \n",
" -0.568382 | \n",
" 0.506785 | \n",
" 0.993301 | \n",
" ok | \n",
"
\n",
" \n",
" p5 | \n",
" 5 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p6 | \n",
" 6 | \n",
" -0.500000 | \n",
" 0.540639 | \n",
" 0.986296 | \n",
" ok | \n",
"
\n",
" \n",
" p7 | \n",
" 7 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" fail | \n",
"
\n",
" \n",
" p8 | \n",
" 8 | \n",
" -0.500000 | \n",
" 0.539346 | \n",
" 1.009724 | \n",
" ok | \n",
"
\n",
" \n",
" p9 | \n",
" 9 | \n",
" -0.669713 | \n",
" 0.453810 | \n",
" 5.687978 | \n",
" ok | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 NaN NaN NaN fail\n",
"p1 1 -0.622121 0.410015 0.890017 ok\n",
"p2 2 -0.617132 0.407055 3.463371 ok\n",
"p3 3 -0.635959 0.374980 3.379093 ok\n",
"p4 4 -0.568382 0.506785 0.993301 ok\n",
"p5 5 NaN NaN NaN fail\n",
"p6 6 -0.500000 0.540639 0.986296 ok\n",
"p7 7 NaN NaN NaN fail\n",
"p8 8 -0.500000 0.539346 1.009724 ok\n",
"p9 9 -0.669713 0.453810 5.687978 ok"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combined over- and under-sampling: SMOTEENN wraps the same scale -> PCA ->\n",
"# random-forest pipeline through its operator argument.\n",
"pipeline_with_correction = SMOTEENN(\n",
"    operator=MinMaxScaler() >> PCA() >> RandomForestClassifier()\n",
")\n",
"# Same search budget and metric as the baseline: 10 trials, scored by ROC AUC.\n",
"optimizer = Hyperopt(\n",
"    estimator=pipeline_with_correction,\n",
"    max_evals=10,\n",
"    scoring='roc_auc',\n",
")\n",
"trained_optimizer = optimizer.fit(\n",
"    np.array(train_X),\n",
"    np.array(train_y),\n",
")\n",
"# Test-split predictions are kept for the accuracy check in the next cell.\n",
"predictions = trained_optimizer.predict(test_X)\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.4842105263157895\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Test-set accuracy of the predictions from the SMOTEENN-wrapped search above.\n",
"# NOTE(review): accuracy is reported although the search optimized roc_auc; on\n",
"# imbalanced classes accuracy alone can hide minority-class behaviour.\n",
"print(accuracy_score(test_y, predictions))\n",
"# Retrieve the pipeline held by the trained optimizer (presumably the best trial --\n",
"# confirm against the Lale Hyperopt documentation) and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}