{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: https://REDACTED_USER:****@na.artifactory.swg-devops.com/artifactory/api/pypi/wcp-nlp-pypi-virtual/simple\r\n",
"Requirement already satisfied: liac-arff>=2.4.0 in /Users/REDACTED_USER/venv/lale39/lib/python3.9/site-packages (2.5.0)\r\n"
]
}
],
"source": [
"# liac-arff is required for accessing OpenML datasets from Lale.\n",
"# Use the %pip magic (not !pip) so the package is installed into the\n",
"# environment of the running kernel rather than whichever pip is on PATH.\n",
"%pip install 'liac-arff>=2.4.0'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dataset with class imbalance"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import lale.datasets.openml\n",
"import pandas as pd\n",
"# Fetch the OpenML 'breast-cancer' classification dataset through Lale.\n",
"# The call yields a (train, test) pair, each holding (features, labels).\n",
"# NOTE(review): preprocess=True presumably returns already-encoded features;\n",
"# confirm against the lale.datasets.openml.fetch documentation.\n",
"train_split, test_split = lale.datasets.openml.fetch(\n",
"    'breast-cancer', 'classification', preprocess=True\n",
")\n",
"train_X, train_y = train_split\n",
"test_X, test_y = test_split"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([140, 51])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"# Per-class sample counts of the training labels; the recorded output\n",
"# (140 vs. 51) shows the class imbalance this notebook addresses.\n",
"class_counts = np.bincount(train_y)\n",
"class_counts"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.kernel_approximation import Nystroem as Nys\n",
"from lale.lib.lale import NoOp\n",
"from sklearn.linear_model import LogisticRegression as LR\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from xgboost import XGBClassifier\n",
"from lale.lib.lale import Hyperopt\n",
"from lale.lib.imblearn import SMOTE, CondensedNearestNeighbour, SMOTEENN\n",
"from sklearn.metrics import accuracy_score\n",
"import lale\n",
"lale.wrap_imported_operators()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### A pipeline without any imbalance correction"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Baseline: scale the features, project them with PCA, then classify with a\n",
"# random forest. No resampling step is applied to counter the imbalance.\n",
"pipeline_without_correction = (\n",
"    MinMaxScaler()\n",
"    >> PCA()\n",
"    >> RandomForestClassifier()\n",
")\n",
"pipeline_without_correction.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00, 1.41s/trial, best loss: -0.6583766233766233]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\"><th></th><th>tid</th><th>loss</th><th>time</th><th>log_loss</th><th>status</th></tr>\n",
"    <tr><th>name</th><th></th><th></th><th></th><th></th><th></th></tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr><th>p0</th><td>0</td><td>-0.623864</td><td>0.203896</td><td>0.567272</td><td>ok</td></tr>\n",
"    <tr><th>p1</th><td>1</td><td>-0.631656</td><td>0.237808</td><td>0.566987</td><td>ok</td></tr>\n",
"    <tr><th>p2</th><td>2</td><td>-0.658377</td><td>0.229631</td><td>0.558715</td><td>ok</td></tr>\n",
"    <tr><th>p3</th><td>3</td><td>-0.579383</td><td>0.212269</td><td>0.628262</td><td>ok</td></tr>\n",
"    <tr><th>p4</th><td>4</td><td>-0.616786</td><td>0.252034</td><td>0.567862</td><td>ok</td></tr>\n",
"    <tr><th>p5</th><td>5</td><td>-0.618864</td><td>0.281458</td><td>0.585125</td><td>ok</td></tr>\n",
"    <tr><th>p6</th><td>6</td><td>-0.650032</td><td>0.342633</td><td>0.556056</td><td>ok</td></tr>\n",
"    <tr><th>p7</th><td>7</td><td>-0.652825</td><td>0.270016</td><td>0.558804</td><td>ok</td></tr>\n",
"    <tr><th>p8</th><td>8</td><td>-0.633604</td><td>0.214408</td><td>0.578119</td><td>ok</td></tr>\n",
"    <tr><th>p9</th><td>9</td><td>-0.632500</td><td>0.278336</td><td>0.576655</td><td>ok</td></tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 -0.623864 0.203896 0.567272 ok\n",
"p1 1 -0.631656 0.237808 0.566987 ok\n",
"p2 2 -0.658377 0.229631 0.558715 ok\n",
"p3 3 -0.579383 0.212269 0.628262 ok\n",
"p4 4 -0.616786 0.252034 0.567862 ok\n",
"p5 5 -0.618864 0.281458 0.585125 ok\n",
"p6 6 -0.650032 0.342633 0.556056 ok\n",
"p7 7 -0.652825 0.270016 0.558804 ok\n",
"p8 8 -0.633604 0.214408 0.578119 ok\n",
"p9 9 -0.632500 0.278336 0.576655 ok"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tune the baseline pipeline with Hyperopt: 10 trials, scored by ROC AUC.\n",
"optimizer = Hyperopt(estimator=pipeline_without_correction, max_evals=10, scoring='roc_auc')\n",
"trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n",
"# Predict on the same container type that was used for fitting (NumPy array),\n",
"# so training and test inputs reach the pipeline in the same format.\n",
"predictions = trained_optimizer.predict(np.array(test_X))\n",
"# Tabulate every trial (tid, loss, time, log_loss, status).\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6421052631578947\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Accuracy of the tuned pipeline's predictions on the held-out test labels.\n",
"test_accuracy = accuracy_score(test_y, predictions)\n",
"print(test_accuracy)\n",
"# Retrieve the pipeline selected by the search and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Over-sampling from imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00, 1.40s/trial, best loss: -0.6835011143534503]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\"><th></th><th>tid</th><th>loss</th><th>time</th><th>log_loss</th><th>status</th></tr>\n",
"    <tr><th>name</th><th></th><th></th><th></th><th></th><th></th></tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr><th>p0</th><td>0</td><td>-0.624954</td><td>0.190958</td><td>0.616522</td><td>ok</td></tr>\n",
"    <tr><th>p1</th><td>1</td><td>-0.650157</td><td>0.242815</td><td>0.655870</td><td>ok</td></tr>\n",
"    <tr><th>p2</th><td>2</td><td>-0.683501</td><td>0.211413</td><td>0.666853</td><td>ok</td></tr>\n",
"    <tr><th>p3</th><td>3</td><td>-0.604101</td><td>0.185956</td><td>0.599714</td><td>ok</td></tr>\n",
"    <tr><th>p4</th><td>4</td><td>-0.517529</td><td>0.265853</td><td>0.655569</td><td>ok</td></tr>\n",
"    <tr><th>p5</th><td>5</td><td>-0.556187</td><td>0.363249</td><td>0.635348</td><td>ok</td></tr>\n",
"    <tr><th>p6</th><td>6</td><td>-0.568477</td><td>0.298747</td><td>0.648213</td><td>ok</td></tr>\n",
"    <tr><th>p7</th><td>7</td><td>-0.579616</td><td>0.228733</td><td>0.658655</td><td>ok</td></tr>\n",
"    <tr><th>p8</th><td>8</td><td>-0.630238</td><td>0.222252</td><td>0.617986</td><td>ok</td></tr>\n",
"    <tr><th>p9</th><td>9</td><td>-0.590678</td><td>0.328617</td><td>0.641440</td><td>ok</td></tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 -0.624954 0.190958 0.616522 ok\n",
"p1 1 -0.650157 0.242815 0.655870 ok\n",
"p2 2 -0.683501 0.211413 0.666853 ok\n",
"p3 3 -0.604101 0.185956 0.599714 ok\n",
"p4 4 -0.517529 0.265853 0.655569 ok\n",
"p5 5 -0.556187 0.363249 0.635348 ok\n",
"p6 6 -0.568477 0.298747 0.648213 ok\n",
"p7 7 -0.579616 0.228733 0.658655 ok\n",
"p8 8 -0.630238 0.222252 0.617986 ok\n",
"p9 9 -0.590678 0.328617 0.641440 ok"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Over-sampling: SMOTE wraps the whole pipeline through its `operator`\n",
"# argument, so the inner pipeline is trained on the resampled data.\n",
"pipeline_with_correction = SMOTE(operator=MinMaxScaler() >> PCA() >> RandomForestClassifier())\n",
"# Same search budget and metric as the baseline: 10 trials, ROC AUC.\n",
"optimizer = Hyperopt(estimator=pipeline_with_correction, max_evals=10, scoring='roc_auc')\n",
"trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n",
"# Predict on the same container type that was used for fitting (NumPy array),\n",
"# so training and test inputs reach the pipeline in the same format.\n",
"predictions = trained_optimizer.predict(np.array(test_X))\n",
"# Tabulate every trial (tid, loss, time, log_loss, status).\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.631578947368421\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Accuracy of the tuned SMOTE pipeline's predictions on the test labels.\n",
"test_accuracy = accuracy_score(test_y, predictions)\n",
"print(test_accuracy)\n",
"# Retrieve the pipeline selected by the search and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Under-sampling from imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00, 2.26s/trial, best loss: -0.6813611664876958]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\"><th></th><th>tid</th><th>loss</th><th>time</th><th>log_loss</th><th>status</th></tr>\n",
"    <tr><th>name</th><th></th><th></th><th></th><th></th><th></th></tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr><th>p0</th><td>0</td><td>-0.642447</td><td>0.353958</td><td>0.604473</td><td>ok</td></tr>\n",
"    <tr><th>p1</th><td>1</td><td>-0.598246</td><td>0.424864</td><td>0.626900</td><td>ok</td></tr>\n",
"    <tr><th>p2</th><td>2</td><td>-0.679520</td><td>0.452510</td><td>0.607253</td><td>ok</td></tr>\n",
"    <tr><th>p3</th><td>3</td><td>-0.581329</td><td>0.354175</td><td>0.648877</td><td>ok</td></tr>\n",
"    <tr><th>p4</th><td>4</td><td>-0.626233</td><td>0.468622</td><td>0.619565</td><td>ok</td></tr>\n",
"    <tr><th>p5</th><td>5</td><td>-0.644204</td><td>0.416804</td><td>0.613639</td><td>ok</td></tr>\n",
"    <tr><th>p6</th><td>6</td><td>-0.649106</td><td>0.445763</td><td>0.618898</td><td>ok</td></tr>\n",
"    <tr><th>p7</th><td>7</td><td>-0.599145</td><td>0.505183</td><td>0.624448</td><td>ok</td></tr>\n",
"    <tr><th>p8</th><td>8</td><td>-0.584214</td><td>0.368897</td><td>0.622633</td><td>ok</td></tr>\n",
"    <tr><th>p9</th><td>9</td><td>-0.681361</td><td>0.444388</td><td>0.582320</td><td>ok</td></tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 -0.642447 0.353958 0.604473 ok\n",
"p1 1 -0.598246 0.424864 0.626900 ok\n",
"p2 2 -0.679520 0.452510 0.607253 ok\n",
"p3 3 -0.581329 0.354175 0.648877 ok\n",
"p4 4 -0.626233 0.468622 0.619565 ok\n",
"p5 5 -0.644204 0.416804 0.613639 ok\n",
"p6 6 -0.649106 0.445763 0.618898 ok\n",
"p7 7 -0.599145 0.505183 0.624448 ok\n",
"p8 8 -0.584214 0.368897 0.622633 ok\n",
"p9 9 -0.681361 0.444388 0.582320 ok"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Under-sampling: CondensedNearestNeighbour wraps the whole pipeline through\n",
"# its `operator` argument, so the inner pipeline is trained on resampled data.\n",
"pipeline_with_correction = CondensedNearestNeighbour(operator=MinMaxScaler() >> PCA() >> RandomForestClassifier())\n",
"# Same search budget and metric as the baseline: 10 trials, ROC AUC.\n",
"optimizer = Hyperopt(estimator=pipeline_with_correction, max_evals=10, scoring='roc_auc')\n",
"trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n",
"# Predict on the same container type that was used for fitting (NumPy array),\n",
"# so training and test inputs reach the pipeline in the same format.\n",
"predictions = trained_optimizer.predict(np.array(test_X))\n",
"# Tabulate every trial (tid, loss, time, log_loss, status).\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6947368421052632\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Accuracy of the tuned under-sampling pipeline's predictions on the test labels.\n",
"test_accuracy = accuracy_score(test_y, predictions)\n",
"print(test_accuracy)\n",
"# Retrieve the pipeline selected by the search and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combined over- and under-sampling from imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00, 1.26s/trial, best loss: -0.664063068312234]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\"><th></th><th>tid</th><th>loss</th><th>time</th><th>log_loss</th><th>status</th></tr>\n",
"    <tr><th>name</th><th></th><th></th><th></th><th></th><th></th></tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr><th>p0</th><td>0</td><td>-0.574195</td><td>0.197406</td><td>0.874854</td><td>ok</td></tr>\n",
"    <tr><th>p1</th><td>1</td><td>-0.581562</td><td>0.211810</td><td>0.968561</td><td>ok</td></tr>\n",
"    <tr><th>p2</th><td>2</td><td>-0.664063</td><td>0.200366</td><td>1.025431</td><td>ok</td></tr>\n",
"    <tr><th>p3</th><td>3</td><td>-0.661621</td><td>0.183795</td><td>0.721990</td><td>ok</td></tr>\n",
"    <tr><th>p4</th><td>4</td><td>-0.554206</td><td>0.293048</td><td>0.857730</td><td>ok</td></tr>\n",
"    <tr><th>p5</th><td>5</td><td>-0.593452</td><td>0.240670</td><td>0.826638</td><td>ok</td></tr>\n",
"    <tr><th>p6</th><td>6</td><td>-0.636524</td><td>0.264110</td><td>0.809327</td><td>ok</td></tr>\n",
"    <tr><th>p7</th><td>7</td><td>-0.637288</td><td>0.219459</td><td>0.894342</td><td>ok</td></tr>\n",
"    <tr><th>p8</th><td>8</td><td>-0.657068</td><td>0.185566</td><td>1.327299</td><td>ok</td></tr>\n",
"    <tr><th>p9</th><td>9</td><td>-0.626770</td><td>0.241434</td><td>1.025155</td><td>ok</td></tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tid loss time log_loss status\n",
"name \n",
"p0 0 -0.574195 0.197406 0.874854 ok\n",
"p1 1 -0.581562 0.211810 0.968561 ok\n",
"p2 2 -0.664063 0.200366 1.025431 ok\n",
"p3 3 -0.661621 0.183795 0.721990 ok\n",
"p4 4 -0.554206 0.293048 0.857730 ok\n",
"p5 5 -0.593452 0.240670 0.826638 ok\n",
"p6 6 -0.636524 0.264110 0.809327 ok\n",
"p7 7 -0.637288 0.219459 0.894342 ok\n",
"p8 8 -0.657068 0.185566 1.327299 ok\n",
"p9 9 -0.626770 0.241434 1.025155 ok"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combined over- and under-sampling: SMOTEENN wraps the whole pipeline through\n",
"# its `operator` argument, so the inner pipeline is trained on resampled data.\n",
"pipeline_with_correction = SMOTEENN(operator=MinMaxScaler() >> PCA() >> RandomForestClassifier())\n",
"# Same search budget and metric as the baseline: 10 trials, ROC AUC.\n",
"optimizer = Hyperopt(estimator=pipeline_with_correction, max_evals=10, scoring='roc_auc')\n",
"trained_optimizer = optimizer.fit(np.array(train_X), np.array(train_y))\n",
"# Predict on the same container type that was used for fitting (NumPy array),\n",
"# so training and test inputs reach the pipeline in the same format.\n",
"predictions = trained_optimizer.predict(np.array(test_X))\n",
"# Tabulate every trial (tid, loss, time, log_loss, status).\n",
"trained_optimizer.summary()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.35789473684210527\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Accuracy of the tuned SMOTEENN pipeline's predictions on the test labels.\n",
"test_accuracy = accuracy_score(test_y, predictions)\n",
"print(test_accuracy)\n",
"# Retrieve the pipeline selected by the search and draw it.\n",
"best_estimator = trained_optimizer.get_pipeline()\n",
"best_estimator.visualize()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}