{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![KTS logo](https://raw.githubusercontent.com/konodyuk/kts/master/docs/static/banner_alpha.png)\n", "# Stacking Guide" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
DASHBOARD
\n", "
features
\n", "
\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
simple_feature
\n", "
source
\n", "
@feature\n",
       "def simple_feature(df):\n",
       "    res = stl.empty_like(df)\n",
       "    res['is_male'] = (df.Sex == 'male') + 0\n",
       "    return res\n",
       "
\n", "
columns
\n", "
is_male
\n", "
\n", "\n", "
\n", "
\n", "
GENERIC FEATURE
\n", " \n", "
\n", "
name
\n", "
interactions
\n", "
source
\n", "
@feature\n",
       "@generic(left="Pclass", right="SibSp")\n",
       "def interactions(df):\n",
       "    res = stl.empty_like(df)\n",
       "    res[f"{left}_add_{right}"] = df[left] + df[right]\n",
       "    res[f"{left}_sub_{right}"] = df[left] - df[right]\n",
       "    res[f"{left}_mul_{right}"] = df[left] * df[right]\n",
       "    return res\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
GENERIC FEATURE
\n", " \n", "
\n", "
name
\n", "
num_aggs
\n", "
description
\n", "
Descriptions are also supported.
\n", "
source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
GENERIC FEATURE
\n", " \n", "
\n", "
name
\n", "
tfidf
\n", "
source
\n", "
@feature\n",
       "@generic(col='Name')\n",
       "def tfidf(df):\n",
       "    if df.train:\n",
       "        enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n",
       "        res = enc.fit_transform(df[col])\n",
       "        df.state['enc'] = enc\n",
       "    else:\n",
       "        enc = df.state['enc']\n",
       "        res = enc.transform(df[col])\n",
       "    return res.todense()\n",
       "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "
helpers
\n", "
You've got no helpers so far.
\n", "
\n", "\n", "
\n", "\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "np.random.seed(0)\n", "\n", "import kts\n", "from kts import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "train = kts.load('train')\n", "test = kts.load('test')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## stl.stack\n", "To stack models, `stl.stack` is used:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
STACK DOCS
\n", "
signature
\n", "
stack(experiment_id, noise_level, random_state)\n",
       "
\n", "
description
\n", "
Returns predictions of specified experiment as features

For indices used for fitting the experiment returns OOF predictions.
For unseen indices returns predictions obtained via experiment.predict().
\n", "
params
\n", "
experiment_id
id of the experiment at the leaderboard
\n", "
noise_level
range of noise added to predictions during train stage.
If specified, then uniformly distributed value from range [-noise_level/2, noise_level/2]
is added to each prediction.
\n", "
random_state
random state for random noise generator
\n", "
\n", "
returns
\n", "
A feature constructor returning predictions of the experiment.
\n", "
examples
\n", "
>>> stl.stack('ABCDEF')\n",
       ">>> stl.stack('ABCDEF', noise_level=0.3, random_state=42)\n",
       ">>> stl.concat([stl.stack('ABCDEF'), stl.stack('GHIJKL')])\n",
       "
" ], "text/plain": [ " kts.stl.backend.Stacker>" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stl.stack" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we pass a slice of the train set, it simply returns OOF predictions:\n", "\n", "*Note that it cannot be used in parallel features.*" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING FEATURES
feature
progress
\n", "
preview_stack
\n", "
1s
\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KPBVAI
PassengerId
10.109578
20.985013
30.635773
40.906962
50.132054
60.124416
70.322831
80.599516
90.520683
100.922890
\n", "
" ], "text/plain": [ " KPBVAI\n", "PassengerId \n", "1 0.109578\n", "2 0.985013\n", "3 0.635773\n", "4 0.906962\n", "5 0.132054\n", "6 0.124416\n", "7 0.322831\n", "8 0.599516\n", "9 0.520683\n", "10 0.922890" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "@preview(train, 10, parallel=False)\n", "def preview_stack(df):\n", " return stl.stack('KPBVAI')(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But for the test set, inference is run:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING FEATURES
feature
progress
\n", "
preview_stack
\n", "
\n", "
\n", "
num_aggs__Fare
\n", "
0s
\n", "
\n", "
simple_feature
\n", "
0s
\n", "
\n", "
interactions__Pclass_Age
\n", "
0s
\n", "
\n", "
tfidf__Name
\n", "
0s
\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFERENCE
\n", "
id
\n", "
KPBVAI
\n", "
progress
\n", "
\n", "
took
\n", "
2s
\n", "
eta
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KPBVAI
PassengerId
8920.150248
8930.589290
8940.107736
8950.229384
8960.756447
8970.191558
8980.758887
8990.271479
9000.785848
9010.282044
\n", "
" ], "text/plain": [ " KPBVAI\n", "PassengerId \n", "892 0.150248\n", "893 0.589290\n", "894 0.107736\n", "895 0.229384\n", "896 0.756447\n", "897 0.191558\n", "898 0.758887\n", "899 0.271479\n", "900 0.785848\n", "901 0.282044" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "@preview(test, 10, parallel=False)\n", "def preview_stack(df):\n", " return stl.stack('KPBVAI')(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Anti-overfitting\n", "\n", "KTS provides two basic ways to prevent overfitting during stacking. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Noise\n", "\n", "The first is adding uniform random noise to first-level model predictions during the training stage:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING FEATURES
feature
progress
\n", "
preview_stack
\n", "
0s
\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KPBVAI
PassengerId
10.112647
20.977061
30.683239
40.921687
\n", "
" ], "text/plain": [ " KPBVAI\n", "PassengerId \n", "1 0.112647\n", "2 0.977061\n", "3 0.683239\n", "4 0.921687" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KPBVAI
PassengerId
10.077990
21.005335
30.609979
40.933971
\n", "
" ], "text/plain": [ " KPBVAI\n", "PassengerId \n", "1 0.077990\n", "2 1.005335\n", "3 0.609979\n", "4 0.933971" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "@preview(train, 4, 4, parallel=False)\n", "def preview_stack(df):\n", " return stl.stack('KPBVAI', noise_level=0.1, random_state=None)(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Refiner\n", "\n", "The second available option is a special splitter called Refiner, which splits each fold of an outer splitter using an inner splitter. It allows training a second-level model without even indirect leaks, as in this case each second-level model is trained using the validation set of the corresponding first-level model.\n", "\n", "
\n", "\n", "
" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from kts.validation.split import Refiner\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.metrics import roc_auc_score\n", "\n", "outer_skf = StratifiedKFold(5, True, 42) # splitter used to train the first-level model\n", "inner_skf = StratifiedKFold(3, True, 42) # splitter to be used to split its folds\n", "\n", "refiner = Refiner(outer_skf, inner_skf)\n", "val_stack = Validator(refiner, roc_auc_score)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "fs = FeatureSet([stl.stack('KPBVAI'), stl.stack('FYCMDA'), tfidf()], \n", " train_frame=train,\n", " targets='Survived')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from kts.models.binary import *" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
FITTING
\n", "
progress
\n", "
\n", "
train
\n", "
\n", "
valid
\n", "
\n", "
metric
\n", "
0.927
\n", "
took
\n", "
0s
\n", "
eta
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.791
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.872
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.791
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.833
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.856
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.780
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.857
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.828
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.867
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.894
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.818
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.774
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.859
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.774
\n", "
1s
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'score': 0.8348066445892532, 'id': 'CBUQLG'}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = LogisticRegression(solver='lbfgs', C=11)\n", "\n", "val_stack.score(model, fs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Stacked experiments behave exactly like ordinary experiments:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING FEATURES
feature
progress
\n", "
simple_feature
\n", "
0s
\n", "
\n", "
interactions__Pclass_Age
\n", "
0s
\n", "
\n", "
num_aggs__Fare
\n", "
0s
\n", "
\n", "
tfidf__Name
\n", "
0s
\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFERENCE
\n", "
id
\n", "
KPBVAI
\n", "
progress
\n", "
\n", "
took
\n", "
2s
\n", "
eta
\n", "
\n", "
FYCMDA
\n", "
\n", "
1s
\n", "
\n", "
CBUQLG
\n", "
\n", "
2s
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([0.09516588, 0.46953291, 0.09342506, 0.15601523, 0.7111576 ,\n", " 0.11536574, 0.56980286, 0.16907218, 0.66821078, 0.13896083])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lb.CBUQLG.predict(test)[:10]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING IMPORTANCES
\n", "
progress
\n", "
\n", "
Computing tfidf__Name_4
\n", "
took
\n", "
6s
\n", "
eta
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FEATURE IMPORTANCES
feature
mean
importance
\n", "
KPBVAI
0.281
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
FYCMDA
0.056
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_1
0.015
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_0
0.014
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_2
0.013
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_3
4.01e-03
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_4
1.79e-03
\n", "
\n", "
\n", "
\n", "
\n", "
\n", "
\n", " \n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "lb.CBUQLG.feature_importances(estimator=Permutation(train, n_iters=10))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING IMPORTANCES
\n", "
progress
\n", "
\n", "
Computing tfidf__Name_4
\n", "
took
\n", "
3s
\n", "
eta
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FEATURE IMPORTANCES
feature
mean
importance
\n", "
KPBVAI
0.364
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
FYCMDA
0.137
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_2
0.101
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_1
0.091
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_0
0.067
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_4
0.066
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_3
0.048
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "lb.CBUQLG.feature_importances(estimator=PermutationBlind(test, n_iters=20))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deep Stacking\n", "\n", "As `stl.stack` is no more than a usual feature constructor, you can build as complex stackings as you want just by adding it to feature sets. \n", "\n", "Let's write a five-level stacking with residual connections. In this demo we don't care about overfitting and model performance and just show that:\n", "1. Stacking is as easy as adding `stl.stack(id)` to a feature set\n", "2. Stacking inference is no different from ordinary experiments\n", "3. If two or more next-level models need predictions from model A, model A will still be run only once" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "skf = StratifiedKFold(5, True, 42)\n", "val = Validator(skf, roc_auc_score)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING FEATURES
feature
progress
\n", "
num_aggs__SibSp
\n", "
0s
\n", "
\n", "
num_aggs__Parch
\n", "
0s
\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FITTING
\n", "
progress
\n", "
\n", "
train
\n", "
\n", "
valid
\n", "
\n", "
metric
\n", "
0.673
\n", "
took
\n", "
0s
\n", "
eta
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.638
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.654
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.707
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.614
\n", "
0s
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FITTING
\n", "
progress
\n", "
\n", "
train
\n", "
\n", "
valid
\n", "
\n", "
metric
\n", "
0.774
\n", "
took
\n", "
0s
\n", "
eta
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.682
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.757
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.785
\n", "
0s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.703
\n", "
0s
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FITTING
\n", "
progress
\n", "
\n", "
train
\n", "
\n", "
valid
\n", "
\n", "
metric
\n", "
0.751
\n", "
took
\n", "
0s
\n", "
eta
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.696
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.735
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.782
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.706
\n", "
0s
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FITTING
\n", "
progress
\n", "
\n", "
train
\n", "
\n", "
valid
\n", "
\n", "
metric
\n", "
0.745
\n", "
took
\n", "
1s
\n", "
eta
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.682
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.723
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.773
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.698
\n", "
1s
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
FITTING
\n", "
progress
\n", "
\n", "
train
\n", "
\n", "
valid
\n", "
\n", "
metric
\n", "
0.753
\n", "
took
\n", "
1s
\n", "
eta
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.704
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.693
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.777
\n", "
1s
\n", "
0s
\n", "
\n", "
\n", "
\n", "
0.698
\n", "
1s
\n", "
0s
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "current_features = [tfidf(), num_aggs('SibSp'), num_aggs('Parch')]\n", "\n", "for i in range(5):\n", " model = RandomForestClassifier(n_estimators=50)\n", " fs = FeatureSet(current_features, train_frame=train, targets='Survived')\n", "\n", " summary = val.score(model, fs, leaderboard='deepstack')\n", " current_features.append(stl.stack(summary['id']))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LEADERBOARD
\n", "
#
id
score
model
# features
date
took
\n", "\n", "
\n", "
\n", "
EXPERIMENT
\n", " \n", "
\n", "
ID
\n", "
EKLVJO
\n", "
score
\n", "
0.7401506654806826
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
features
\n", "
\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
tfidf('Name')
\n", "
description
\n", "
An instance of generic feature constructor tfidf
\n", "
source
\n", "
tfidf('Name')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col='Name')\n",
       "def tfidf(df):\n",
       "    if df.train:\n",
       "        enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n",
       "        res = enc.fit_transform(df[col])\n",
       "        df.state['enc'] = enc\n",
       "    else:\n",
       "        enc = df.state['enc']\n",
       "        res = enc.transform(df[col])\n",
       "    return res.todense()\n",
       "
\n", "
columns
\n", "
tfidf__Name_3, tfidf__Name_4, tfidf__Name_1, tfidf__Name_0, tfidf__Name_2
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('SibSp')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('SibSp')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
SibSp_div_mean, SibSp_sub_div_mean, SibSp_div_std
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('Parch')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('Parch')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
Parch_div_std, Parch_sub_div_mean, Parch_div_mean
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('BESHSV')
\n", "
source
\n", "
stl.stack('BESHSV')\n",
       "
\n", "
columns
\n", "
BESHSV
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "
details
\n", "
\n", "
\n", "
\n", "
FEATURE SET
\n", " \n", "
\n", "
name
\n", "
FSESGYEK
\n", "
source
\n", "
FeatureSet([tfidf('Name'),\n",
       "            num_aggs('SibSp'),\n",
       "            num_aggs('Parch'),\n",
       "            stl.stack('BESHSV')],\n",
       "           [],\n",
       "           targets=['Survived'],\n",
       "           auxiliary=[])\n",
       "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
MODEL
\n", " \n", "
\n", "
name
\n", "
RandomForestClassifierGCO
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
source
\n", "
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50, oob_score=False, random_state=None, warm_start=False)\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
VALIDATOR
\n", " \n", "
\n", "
splitter
\n", "
StratifiedKFold(n_splits=5, random_state=42, shuffle=True)\n",
       "
\n", "
metric
\n", "
roc_auc_score\n",
       "
\n", "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
EXPERIMENT
\n", " \n", "
\n", "
ID
\n", "
TWVZZK
\n", "
score
\n", "
0.7341934170774593
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
features
\n", "
\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
tfidf('Name')
\n", "
description
\n", "
An instance of generic feature constructor tfidf
\n", "
source
\n", "
tfidf('Name')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col='Name')\n",
       "def tfidf(df):\n",
       "    if df.train:\n",
       "        enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n",
       "        res = enc.fit_transform(df[col])\n",
       "        df.state['enc'] = enc\n",
       "    else:\n",
       "        enc = df.state['enc']\n",
       "        res = enc.transform(df[col])\n",
       "    return res.todense()\n",
       "
\n", "
columns
\n", "
tfidf__Name_3, tfidf__Name_4, tfidf__Name_1, tfidf__Name_0, tfidf__Name_2
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('SibSp')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('SibSp')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
SibSp_div_mean, SibSp_sub_div_mean, SibSp_div_std
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('Parch')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('Parch')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
Parch_div_std, Parch_sub_div_mean, Parch_div_mean
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('BESHSV')
\n", "
source
\n", "
stl.stack('BESHSV')\n",
       "
\n", "
columns
\n", "
BESHSV
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('EKLVJO')
\n", "
source
\n", "
stl.stack('EKLVJO')\n",
       "
\n", "
columns
\n", "
EKLVJO
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "
details
\n", "
\n", "
\n", "
\n", "
FEATURE SET
\n", " \n", "
\n", "
name
\n", "
FSEVGYEK
\n", "
source
\n", "
FeatureSet([tfidf('Name'),\n",
       "            num_aggs('SibSp'),\n",
       "            num_aggs('Parch'),\n",
       "            stl.stack('BESHSV'),\n",
       "            stl.stack('EKLVJO')],\n",
       "           [],\n",
       "           targets=['Survived'],\n",
       "           auxiliary=[])\n",
       "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
MODEL
\n", " \n", "
\n", "
name
\n", "
RandomForestClassifierGCO
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
source
\n", "
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50, oob_score=False, random_state=None, warm_start=False)\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
VALIDATOR
\n", " \n", "
\n", "
splitter
\n", "
StratifiedKFold(n_splits=5, random_state=42, shuffle=True)\n",
       "
\n", "
metric
\n", "
roc_auc_score\n",
       "
\n", "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
EXPERIMENT
\n", " \n", "
\n", "
ID
\n", "
BBFXPT
\n", "
score
\n", "
0.7250829531145235
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
features
\n", "
\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
tfidf('Name')
\n", "
description
\n", "
An instance of generic feature constructor tfidf
\n", "
source
\n", "
tfidf('Name')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col='Name')\n",
       "def tfidf(df):\n",
       "    if df.train:\n",
       "        enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n",
       "        res = enc.fit_transform(df[col])\n",
       "        df.state['enc'] = enc\n",
       "    else:\n",
       "        enc = df.state['enc']\n",
       "        res = enc.transform(df[col])\n",
       "    return res.todense()\n",
       "
\n", "
columns
\n", "
tfidf__Name_3, tfidf__Name_4, tfidf__Name_1, tfidf__Name_0, tfidf__Name_2
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('SibSp')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('SibSp')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
SibSp_div_mean, SibSp_sub_div_mean, SibSp_div_std
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('Parch')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('Parch')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
Parch_div_std, Parch_sub_div_mean, Parch_div_mean
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('BESHSV')
\n", "
source
\n", "
stl.stack('BESHSV')\n",
       "
\n", "
columns
\n", "
BESHSV
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('EKLVJO')
\n", "
source
\n", "
stl.stack('EKLVJO')\n",
       "
\n", "
columns
\n", "
EKLVJO
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('TWVZZK')
\n", "
source
\n", "
stl.stack('TWVZZK')\n",
       "
\n", "
columns
\n", "
TWVZZK
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('HJMHZG')
\n", "
source
\n", "
stl.stack('HJMHZG')\n",
       "
\n", "
columns
\n", "
HJMHZG
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "
details
\n", "
\n", "
\n", "
\n", "
FEATURE SET
\n", " \n", "
\n", "
name
\n", "
FSGMGYEK
\n", "
source
\n", "
FeatureSet([tfidf('Name'),\n",
       "            num_aggs('SibSp'),\n",
       "            num_aggs('Parch'),\n",
       "            stl.stack('BESHSV'),\n",
       "            stl.stack('EKLVJO'),\n",
       "            stl.stack('TWVZZK'),\n",
       "            stl.stack('HJMHZG')],\n",
       "           [],\n",
       "           targets=['Survived'],\n",
       "           auxiliary=[])\n",
       "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
MODEL
\n", " \n", "
\n", "
name
\n", "
RandomForestClassifierGCO
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
source
\n", "
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50, oob_score=False, random_state=None, warm_start=False)\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
VALIDATOR
\n", " \n", "
\n", "
splitter
\n", "
StratifiedKFold(n_splits=5, random_state=42, shuffle=True)\n",
       "
\n", "
metric
\n", "
roc_auc_score\n",
       "
\n", "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
EXPERIMENT
\n", " \n", "
\n", "
ID
\n", "
HJMHZG
\n", "
score
\n", "
0.7240638813816856
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
features
\n", "
\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
tfidf('Name')
\n", "
description
\n", "
An instance of generic feature constructor tfidf
\n", "
source
\n", "
tfidf('Name')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col='Name')\n",
       "def tfidf(df):\n",
       "    if df.train:\n",
       "        enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n",
       "        res = enc.fit_transform(df[col])\n",
       "        df.state['enc'] = enc\n",
       "    else:\n",
       "        enc = df.state['enc']\n",
       "        res = enc.transform(df[col])\n",
       "    return res.todense()\n",
       "
\n", "
columns
\n", "
tfidf__Name_3, tfidf__Name_4, tfidf__Name_1, tfidf__Name_0, tfidf__Name_2
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('SibSp')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('SibSp')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
SibSp_div_mean, SibSp_sub_div_mean, SibSp_div_std
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('Parch')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('Parch')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
Parch_div_std, Parch_sub_div_mean, Parch_div_mean
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('BESHSV')
\n", "
source
\n", "
stl.stack('BESHSV')\n",
       "
\n", "
columns
\n", "
BESHSV
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('EKLVJO')
\n", "
source
\n", "
stl.stack('EKLVJO')\n",
       "
\n", "
columns
\n", "
EKLVJO
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
stl.stack('TWVZZK')
\n", "
source
\n", "
stl.stack('TWVZZK')\n",
       "
\n", "
columns
\n", "
TWVZZK
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "
details
\n", "
\n", "
\n", "
\n", "
FEATURE SET
\n", " \n", "
\n", "
name
\n", "
FSCKGYEK
\n", "
source
\n", "
FeatureSet([tfidf('Name'),\n",
       "            num_aggs('SibSp'),\n",
       "            num_aggs('Parch'),\n",
       "            stl.stack('BESHSV'),\n",
       "            stl.stack('EKLVJO'),\n",
       "            stl.stack('TWVZZK')],\n",
       "           [],\n",
       "           targets=['Survived'],\n",
       "           auxiliary=[])\n",
       "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
MODEL
\n", " \n", "
\n", "
name
\n", "
RandomForestClassifierGCO
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
source
\n", "
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50, oob_score=False, random_state=None, warm_start=False)\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
VALIDATOR
\n", " \n", "
\n", "
splitter
\n", "
StratifiedKFold(n_splits=5, random_state=42, shuffle=True)\n",
       "
\n", "
metric
\n", "
roc_auc_score\n",
       "
\n", "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
EXPERIMENT
\n", " \n", "
\n", "
ID
\n", "
BESHSV
\n", "
score
\n", "
0.6570725640221641
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
features
\n", "
\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
tfidf('Name')
\n", "
description
\n", "
An instance of generic feature constructor tfidf
\n", "
source
\n", "
tfidf('Name')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col='Name')\n",
       "def tfidf(df):\n",
       "    if df.train:\n",
       "        enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n",
       "        res = enc.fit_transform(df[col])\n",
       "        df.state['enc'] = enc\n",
       "    else:\n",
       "        enc = df.state['enc']\n",
       "        res = enc.transform(df[col])\n",
       "    return res.todense()\n",
       "
\n", "
columns
\n", "
tfidf__Name_3, tfidf__Name_4, tfidf__Name_1, tfidf__Name_0, tfidf__Name_2
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('SibSp')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('SibSp')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
SibSp_div_mean, SibSp_sub_div_mean, SibSp_div_std
\n", "
\n", "\n", "
\n", "
\n", "
FEATURE CONSTRUCTOR
\n", " \n", "
\n", "
name
\n", "
num_aggs('Parch')
\n", "
description
\n", "
An instance of generic feature constructor num_aggs
\n", "
source
\n", "
num_aggs('Parch')\n",
       "
\n", "
additional source
\n", "
@feature\n",
       "@generic(col="Parch")\n",
       "def num_aggs(df):\n",
       "    """Descriptions are also supported."""\n",
       "    res = pd.DataFrame(index=df.index)\n",
       "    mean = df[col].mean()\n",
       "    std = df[col].std()\n",
       "    res[f"{col}_div_mean"] = df[col] / mean\n",
       "    res[f"{col}_sub_div_mean"] = (df[col] - mean) / mean\n",
       "    res[f"{col}_div_std"] = df[col] / std\n",
       "    return res\n",
       "
\n", "
columns
\n", "
Parch_div_std, Parch_sub_div_mean, Parch_div_mean
\n", "
\n", "
details
\n", "
\n", "
\n", "
\n", "
FEATURE SET
\n", " \n", "
\n", "
name
\n", "
FSSIGYEK
\n", "
source
\n", "
FeatureSet([tfidf('Name'),\n",
       "            num_aggs('SibSp'),\n",
       "            num_aggs('Parch')],\n",
       "           [],\n",
       "           targets=['Survived'],\n",
       "           auxiliary=[])\n",
       "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
\n", "\n", "
\n", "
\n", "
MODEL
\n", " \n", "
\n", "
name
\n", "
RandomForestClassifierGCO
\n", "
model
\n", "
RandomForestClassifier
\n", "
params
\n", "
               bootstrap = True\n",
       "            class_weight = None\n",
       "               criterion = 'gini'\n",
       "               max_depth = None\n",
       "            max_features = 'auto'\n",
       "          max_leaf_nodes = None\n",
       "   min_impurity_decrease = 0.0\n",
       "      min_impurity_split = None\n",
       "        min_samples_leaf = 1\n",
       "       min_samples_split = 2\n",
       "min_weight_fraction_leaf = 0.0\n",
       "            n_estimators = 50\n",
       "               oob_score = False\n",
       "            random_state = None\n",
       "              warm_start = False\n",
       "
\n", "
source
\n", "
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50, oob_score=False, random_state=None, warm_start=False)\n",
       "
\n", "
\n", "\n", "
\n", "
\n", "
VALIDATOR
\n", " \n", "
\n", "
splitter
\n", "
StratifiedKFold(n_splits=5, random_state=42, shuffle=True)\n",
       "
\n", "
metric
\n", "
roc_auc_score\n",
       "
\n", "
\n", "
requirements
\n", "
sklearn==0.20.2
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lbs.deepstack" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So, `BBFXPT` is a fifth-level model." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
COMPUTING FEATURES
feature
progress
\n", "
tfidf__Name
\n", "
0s
\n", "
\n", "
num_aggs__SibSp
\n", "
0s
\n", "
\n", "
num_aggs__Parch
\n", "
0s
\n", "
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
INFERENCE
\n", "
id
\n", "
BESHSV
\n", "
progress
\n", "
\n", "
took
\n", "
0s
\n", "
eta
\n", "
\n", "
EKLVJO
\n", "
\n", "
1s
\n", "
\n", "
TWVZZK
\n", "
\n", "
1s
\n", "
\n", "
HJMHZG
\n", "
\n", "
1s
\n", "
\n", "
BBFXPT
\n", "
\n", "
1s
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([0.07853333, 0.848 , 0.065 , 0.11357619, 0.508 ,\n", " 0.008 , 0.68731746, 0.37766667, 0.42224762, 0.24766667,\n", " 0.14748889, 0.14460952, 0.58071429, 0.33666667, 0.632 ,\n", " 0.768 , 0.542 , 0.0744 , 0.50438095, 0.46020952,\n", " 0.27 ])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lb.BBFXPT.predict(test.head(21))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
FEATURE IMPORTANCES
feature
mean
importance
\n", "
EKLVJO
0.141
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
TWVZZK
0.129
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
HJMHZG
0.120
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
BESHSV
0.103
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_4
0.095
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_3
0.082
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_0
0.080
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_2
0.080
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
tfidf__Name_1
0.078
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
SibSp_sub_div_mean
0.018
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
SibSp_div_std
0.016
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
Parch_sub_div_mean
0.015
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
SibSp_div_mean
0.015
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
Parch_div_std
0.014
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
Parch_div_mean
0.013
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "
" ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lb.BBFXPT.feature_importances()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 4 }