{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from lale.grammar import Grammar\n",
"from lale.operators import make_choice\n",
"from lale import wrap_imported_operators\n",
"from lale.settings import set_disable_hyperparams_schema_validation\n",
"set_disable_hyperparams_schema_validation(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simple: First example"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression as LR\n",
"from sklearn.neighbors import KNeighborsClassifier as KNN\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler as Scaler\n",
"from lale.lib.lale import NoOp\n",
"wrap_imported_operators()\n",
"\n",
"g = Grammar()\n",
"\n",
"g.start = g.estimator\n",
"g.estimator = (NoOp | g.transformer) >> g.prim_est\n",
"g.transformer = (NoOp | g.transformer) >> g.prim_tfm\n",
"\n",
"g.prim_est = LR | KNN\n",
"g.prim_tfm = PCA | Scaler\n",
"\n",
"generated = g.unfold(6)\n",
"generated.visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 6/6 [00:04<00:00, 1.40trial/s, best loss: -0.9261575551782683]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from lale.lib.lale import Hyperopt\n",
"import lale.datasets\n",
"(train_X, train_y), (test_X, test_y) = lale.datasets.load_iris_df()\n",
"\n",
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=6)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pool = {g.sample(10) for _ in range(5)}\n",
"for tree in pool:\n",
" tree.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 6/6 [00:03<00:00, 1.93trial/s, best loss: -0.9383575551782682]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"generated = make_choice(*pool)\n",
"\n",
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=6)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Grammar that exercises all combinators"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression as LR\n",
"from sklearn.neighbors import KNeighborsClassifier as KNN\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler as Scaler\n",
"from sklearn.ensemble import AdaBoostClassifier as Boost\n",
"from lale.lib.lale import ConcatFeatures as Concat\n",
"wrap_imported_operators()\n",
"\n",
"g = Grammar()\n",
"\n",
"g.start = g.estimator\n",
"g.estimator = g.term_est | g.transformer >> g.term_est\n",
"g.term_est = g.prim_est #| g.ensemble\n",
"#g.ensemble = Boost ( base_estimator = LR )\n",
"g.transformer = g.union_tfm | g.union_tfm >> g.transformer\n",
"g.union_tfm = g.prim_tfm | g.union_body >> Concat\n",
"g.union_body = g.transformer | g.transformer & g.union_body\n",
"\n",
"g.prim_est = LR | KNN\n",
"g.prim_tfm = PCA | Scaler\n",
"g.ensemble = Boost\n",
"\n",
"generated = g.unfold(7)\n",
"generated.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 3/3 [00:01<00:00, 1.58trial/s, best loss: -0.8889151103565365]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=3)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pool = {g.sample(20) for _ in range(5)}\n",
"for tree in pool:\n",
" tree.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 6/6 [00:02<00:00, 2.80trial/s, best loss: -0.9505575551782683]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"generated = make_choice(*pool)\n",
"\n",
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=6)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# recipe: grammar from this [paper](https://link.springer.com/chapter/10.1007/978-3-319-55696-3_16)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.cluster import FeatureAgglomeration as FeatAggl\n",
"from sklearn.preprocessing import PolynomialFeatures as PolyFeat\n",
"from sklearn.naive_bayes import GaussianNB as GaussNB\n",
"from sklearn.naive_bayes import MultinomialNB as MultinNB\n",
"from sklearn.naive_bayes import BernoulliNB as BernouNB\n",
"from sklearn.impute import SimpleImputer as Imputer\n",
"from sklearn.ensemble import RandomForestClassifier as Forest\n",
"from sklearn.tree import DecisionTreeClassifier as Tree\n",
"from sklearn.feature_selection import SelectKBest\n",
"wrap_imported_operators()\n",
"\n",
"g = Grammar()\n",
"\n",
"g.start = g.algorithm | g.preprocessing >> g.algorithm\n",
"g.preprocessing = g.imputation >> g.dimensionality_definition | g.dimensionality_definition\n",
"g.dimensionality_definition = g.feature_selection >> g.feature_construction | g.feature_selection | g.feature_construction\n",
"g.feature_selection = g.unsupervised | g.supervised \n",
"g.algorithm = g.naive_bayes | g.trees\n",
"\n",
"g.imputation = Imputer\n",
"g.supervised = SelectKBest\n",
"g.unsupervised = PCA | FeatAggl\n",
"g.feature_construction = PolyFeat\n",
"g.naive_bayes = GaussNB | MultinNB | BernouNB\n",
"g.trees = Tree | Forest\n",
"\n",
"\n",
"generated = g.unfold(5)\n",
"generated.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 3/3 [00:02<00:00, 1.37trial/s, best loss: -0.9139575551782683]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=3)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import sys\n",
"pool = {g.sample(30) for _ in range(5)}\n",
"for tree in pool:\n",
" if tree is None:\n",
" print('None', file=sys.stderr)\n",
" else:\n",
" tree.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 6/6 [00:02<00:00, 2.24trial/s, best loss: -0.9139575551782683]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"generated = make_choice(*pool)\n",
"\n",
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=6)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# alphad3m: Grammar from this [paper](https://www.automl.org/wp-content/uploads/2019/06/automlws2019_Paper34.pdf)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.impute import SimpleImputer as Imputer\n",
"from sklearn.impute import MissingIndicator\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.naive_bayes import GaussianNB as GaussNB\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.preprocessing import OneHotEncoder as OneHot\n",
"from sklearn.preprocessing import OrdinalEncoder\n",
"wrap_imported_operators()\n",
"\n",
"g = Grammar()\n",
"\n",
"g.start = g.est | g.clean >> g.est | g.tfm >> g.est | g.clean >> g.tfm >> g.est\n",
"g.clean = g.clean1 >> g.clean | g.clean1\n",
"g.tfm = g.tfm1 >> g.tfm | g.tfm1\n",
"\n",
"g.clean1 = Imputer | MissingIndicator\n",
"g.tfm1 = PCA | OrdinalEncoder | OneHot(handle_unknown='ignore')\n",
"g.est = GaussNB | Ridge | LinearSVC | SGDClassifier\n",
"\n",
"generated = g.unfold(6)\n",
"generated.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 3/3 [00:04<00:00, 1.66s/trial, best loss: -0.45581188455008487]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=3)\n",
"trained = trainer.fit(train_X, train_y)\n",
"if trained.get_pipeline() is None:\n",
" print('None', file=sys.stderr)\n",
"else:\n",
" trained.get_pipeline().visualize()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pool = {g.sample(10) for _ in range(10)}\n",
"for tree in pool:\n",
" tree.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:09<00:00, 1.04trial/s, best loss: -0.9045959105402828]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"generated = make_choice(*pool)\n",
"\n",
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=10)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TPOT: Grammar inferred from this [paper](https://dl.acm.org/doi/pdf/10.1145/2908812.2908918)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# from sklearn.preprocessing import StandardScaler as SScaler\n",
"# from sklearn.preprocessing import RobustScaler as RScaler\n",
"# from sklearn.preprocessing import PolynomialFeatures as PolyFeat\n",
"# from sklearn.decomposition import PCA\n",
"# from sklearn.feature_selection import SelectKBest\n",
"# from sklearn.feature_selection import RFE\n",
"# from sklearn.feature_selection import SelectPercentile\n",
"# from sklearn.feature_selection import VarianceThreshold\n",
"# from sklearn.tree import DecisionTreeClassifier as Tree\n",
"# from sklearn.ensemble import RandomForestClassifier as Forest\n",
"# from sklearn.ensemble import GradientBoostingClassifier as Gradient\n",
"# from sklearn.svm import SVC as SVM\n",
"# from sklearn.linear_model import LogisticRegression as LR\n",
"# from sklearn.neighbors import KNeighborsClassifier as KNN\n",
"# from lale.lib.lale import ConcatFeatures as Concat\n",
"\n",
"from sklearn.preprocessing import StandardScaler as PreProcess\n",
"from sklearn.decomposition import PCA as Decomposition\n",
"from sklearn.feature_selection import SelectKBest as Feature\n",
"from sklearn.tree import DecisionTreeClassifier as Model\n",
"from lale.lib.lale import ConcatFeatures as Concat\n",
"from lale.lib.lale import NoOp\n",
"\n",
"wrap_imported_operators()\n",
"\n",
"g = Grammar()\n",
"\n",
"g.start = g.tree >> g.model\n",
"g.tree = g.node | g.tree >> g.node | (g.tree & g.tree) >> Concat\n",
"g.node = g.preprocessing | g.decomposition | g.feature_selection | g.model\n",
"# | (g.node & g.node) >> Concat\n",
"# | g.node >> g.node\n",
"\n",
"\n",
"g.preprocessing = PreProcess\n",
"g.decomposition = Decomposition\n",
"g.feature_selection = Feature\n",
"g.model = Model\n",
"\n",
"# g.preprocessing = SScaler | RScaler | PolyFeat\n",
"# g.decomposition = PCA\n",
"# g.feature_selection = SelectKBest | RFE | SelectPercentile | VarianceThreshold\n",
"# g.model = Tree | Forest | Gradient | SVM | LR | KNN\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pool = {g.sample(15) for _ in range(10)}\n",
"for tree in pool:\n",
" tree.visualize()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100%|██████████| 6/6 [00:10<00:00, 1.82s/trial, best loss: 1.4095161290322582]\n"
]
},
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"generated = make_choice(*pool)\n",
"\n",
"trainer = Hyperopt(estimator=generated, cv=2, max_evals=6)\n",
"trained = trainer.fit(train_X, train_y)\n",
"trained.get_pipeline().visualize()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}