{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "![KTS logo](https://raw.githubusercontent.com/konodyuk/kts/master/docs/static/banner_alpha.png)\n", "# Feature Engineering Guide" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Survived | \n", "Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "0 | \n", "3 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "22.0 | \n", "1 | \n", "0 | \n", "A/5 21171 | \n", "7.2500 | \n", "NaN | \n", "S | \n", "
2 | \n", "1 | \n", "1 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "38.0 | \n", "1 | \n", "0 | \n", "PC 17599 | \n", "71.2833 | \n", "C85 | \n", "C | \n", "
3 | \n", "1 | \n", "3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "26.0 | \n", "0 | \n", "0 | \n", "STON/O2. 3101282 | \n", "7.9250 | \n", "NaN | \n", "S | \n", "
4 | \n", "1 | \n", "1 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "35.0 | \n", "1 | \n", "0 | \n", "113803 | \n", "53.1000 | \n", "C123 | \n", "S | \n", "
5 | \n", "0 | \n", "3 | \n", "Allen, Mr. William Henry | \n", "male | \n", "35.0 | \n", "0 | \n", "0 | \n", "373450 | \n", "8.0500 | \n", "NaN | \n", "S | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "a | \n", "
2 | \n", "a | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
3 | \n", "a | \n", "
4 | \n", "a | \n", "
5 | \n", "a | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "a | \n", "
2 | \n", "a | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "a | \n", "
2 | \n", "a | \n", "
3 | \n", "a | \n", "
4 | \n", "a | \n", "
5 | \n", "a | \n", "
\n", " | Age | \n", "mean | \n", "
---|---|---|
PassengerId | \n", "\n", " | \n", " |
1 | \n", "22.0 | \n", "28.666667 | \n", "
2 | \n", "38.0 | \n", "28.666667 | \n", "
3 | \n", "26.0 | \n", "28.666667 | \n", "
\n", " | Age | \n", "mean | \n", "
---|---|---|
PassengerId | \n", "\n", " | \n", " |
1 | \n", "22.0 | \n", "31.2 | \n", "
2 | \n", "38.0 | \n", "31.2 | \n", "
3 | \n", "26.0 | \n", "31.2 | \n", "
4 | \n", "35.0 | \n", "31.2 | \n", "
5 | \n", "35.0 | \n", "31.2 | \n", "
6 | \n", "NaN | \n", "31.2 | \n", "
\n", " | a | \n", "b | \n", "c | \n", "d | \n", "mean | \n", "
---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " |
31 | \n", "a | \n", "b | \n", "c | \n", "d | \n", "44.666667 | \n", "
32 | \n", "a | \n", "b | \n", "c | \n", "d | \n", "44.666667 | \n", "
33 | \n", "a | \n", "b | \n", "c | \n", "d | \n", "44.666667 | \n", "
34 | \n", "a | \n", "b | \n", "c | \n", "d | \n", "44.666667 | \n", "
35 | \n", "a | \n", "b | \n", "c | \n", "d | \n", "44.666667 | \n", "
preview(frame, sizes, parallel, train)\n", "
>>> @preview(train, 2, 3, parallel=False)\n", "... def some_feature(df):\n", "... res = stl.empty_like(df)\n", "... res['col'] = ...\n", "... return res\n", "\n", ">>> @preview(train, 200)\n", "... def some_feature(df):\n", "... return stl.mean_encode(['Parch', 'Embarked'], 'Survived')(df)\n", "\n", ">>> @preview(train, 100)\n", "... @generic(left="Age", right="SibSp")\n", "... def numeric_interactions(df):\n", "... res = stl.empty_like(df)\n", "... res[f"{left}_add_{right}"] = df[left] + df[right]\n", "... res[f"{left}_sub_{right}"] = df[left] - df[right]\n", "... res[f"{left}_mul_{right}"] = df[left] * df[right]\n", "... return res\n", "
feature(args, cache, parallel, verbose)\n", "
>>> @feature(parallel=False, verbose=False)\n", "... def some_feature(df):\n", "... ...\n", "\n", ">>> @feature\n", "... def some_feature(df):\n", "... ...\n", "\n", ">>> @feature\n", "... @generic(param='default')\n", "... def generic_feature(df):\n", "... ...\n", "
generic(kwargs)\n", "
>>> @feature\n", "... @generic(left="Age", right="SibSp")\n", "... def numeric_interactions(df):\n", "... res = stl.empty_like(df)\n", "... res[f"{left}_add_{right}"] = df[left] + df[right]\n", "... res[f"{left}_sub_{right}"] = df[left] - df[right]\n", "... res[f"{left}_mul_{right}"] = df[left] * df[right]\n", "... return res\n", "\n", ">>> from itertools import combinations\n", ">>> fs = FeatureSet([\n", "... numeric_interactions(left, right)\n", "... for left, right in combinations(['Parch', 'SibSp', 'Age'], r=2)\n", "... ], ...)\n", "
delete(feature_or_helper, force)\n", "
>>> delete(incorrect_feature)\n", ">>> delete(old_helper)\n", ">>> delete(stl.mean_encode('Embarked', 'Survived'))\n", ">>> delete(generic_feature)\n", "
\n", " | is_male | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "1 | \n", "
2 | \n", "0 | \n", "
3 | \n", "0 | \n", "
4 | \n", "0 | \n", "
5 | \n", "1 | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "a | \n", "
2 | \n", "a | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "a | \n", "
2 | \n", "a | \n", "
\n", " | a | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "a | \n", "
2 | \n", "a | \n", "
\n", " | Pclass | \n", "somefeat | \n", "
---|---|---|
PassengerId | \n", "\n", " | \n", " |
1 | \n", "3 | \n", "6 | \n", "
2 | \n", "1 | \n", "4 | \n", "
3 | \n", "3 | \n", "6 | \n", "
4 | \n", "1 | \n", "4 | \n", "
5 | \n", "3 | \n", "6 | \n", "
6 | \n", "3 | \n", "6 | \n", "
7 | \n", "1 | \n", "4 | \n", "
\n", " | Age | \n", "age_std | \n", "
---|---|---|
PassengerId | \n", "\n", " | \n", " |
1 | \n", "22.0 | \n", "-0.294872 | \n", "
2 | \n", "38.0 | \n", "0.217949 | \n", "
3 | \n", "26.0 | \n", "-0.166667 | \n", "
4 | \n", "35.0 | \n", "0.121795 | \n", "
5 | \n", "35.0 | \n", "0.121795 | \n", "
\n", " | Pclass_add_SibSp | \n", "Pclass_sub_SibSp | \n", "Pclass_mul_SibSp | \n", "
---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " |
1 | \n", "4 | \n", "2 | \n", "3 | \n", "
2 | \n", "2 | \n", "0 | \n", "1 | \n", "
3 | \n", "3 | \n", "3 | \n", "0 | \n", "
4 | \n", "2 | \n", "0 | \n", "1 | \n", "
5 | \n", "3 | \n", "3 | \n", "0 | \n", "
\n", " | tfidf__Name_0 | \n", "tfidf__Name_1 | \n", "tfidf__Name_2 | \n", "tfidf__Name_3 | \n", "tfidf__Name_4 | \n", "
---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "0.508281 | \n", "0.338854 | \n", "0.185575 | \n", "0.742300 | \n", "0.203426 | \n", "
2 | \n", "0.593616 | \n", "0.197872 | \n", "0.433463 | \n", "0.541828 | \n", "0.356369 | \n", "
3 | \n", "0.464173 | \n", "0.464173 | \n", "0.508413 | \n", "0.000000 | \n", "0.557318 | \n", "
4 | \n", "0.603771 | \n", "0.301886 | \n", "0.661317 | \n", "0.220439 | \n", "0.241644 | \n", "
5 | \n", "0.631088 | \n", "0.420725 | \n", "0.460825 | \n", "0.460825 | \n", "0.000000 | \n", "
6 | \n", "0.508984 | \n", "0.508984 | \n", "0.278748 | \n", "0.557496 | \n", "0.305561 | \n", "
7 | \n", "0.779844 | \n", "0.259948 | \n", "0.000000 | \n", "0.569447 | \n", "0.000000 | \n", "
8 | \n", "0.395067 | \n", "0.526756 | \n", "0.288481 | \n", "0.288481 | \n", "0.632461 | \n", "
9 | \n", "0.605911 | \n", "0.302956 | \n", "0.442440 | \n", "0.331830 | \n", "0.485000 | \n", "
10 | \n", "0.449865 | \n", "0.449865 | \n", "0.492741 | \n", "0.246371 | \n", "0.540139 | \n", "
@feature\n", "@generic(col='Name')\n", "def tfidf(df):\n", " if df.train:\n", " enc = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5)\n", " res = enc.fit_transform(df[col])\n", " df.state['enc'] = enc\n", " else:\n", " enc = df.state['enc']\n", " res = enc.transform(df[col])\n", " return res.todense()\n", "
>>> @feature\n", "... def some_feature(df):\n", "... res = stl.empty_like(df)\n", "... res['col'] = ...\n", "... return res\n", "
\n", " |
---|
PassengerId | \n", "
1 | \n", "
2 | \n", "
3 | \n", "
4 | \n", "
5 | \n", "
>>> fs = FeatureSet([stl.identity, one_feature, another_feature], ...)\n", ">>> assert all((stl.identity & ['a', 'b'])(df) == stl.select(['a', 'b'])(df))\n", ">>> assert all((stl.identity - ['a', 'b'])(df) == stl.drop(['a', 'b'])(df))\n", "
\n", " | Survived | \n", "Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "0 | \n", "3 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "22.0 | \n", "1 | \n", "0 | \n", "A/5 21171 | \n", "7.2500 | \n", "NaN | \n", "S | \n", "
2 | \n", "1 | \n", "1 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "38.0 | \n", "1 | \n", "0 | \n", "PC 17599 | \n", "71.2833 | \n", "C85 | \n", "C | \n", "
3 | \n", "1 | \n", "3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "26.0 | \n", "0 | \n", "0 | \n", "STON/O2. 3101282 | \n", "7.9250 | \n", "NaN | \n", "S | \n", "
4 | \n", "1 | \n", "1 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "35.0 | \n", "1 | \n", "0 | \n", "113803 | \n", "53.1000 | \n", "C123 | \n", "S | \n", "
5 | \n", "0 | \n", "3 | \n", "Allen, Mr. William Henry | \n", "male | \n", "35.0 | \n", "0 | \n", "0 | \n", "373450 | \n", "8.0500 | \n", "NaN | \n", "S | \n", "
select(columns)\n", "
>>> assert all(stl.select(['a', 'b'])(df) == df[['a', 'b']])\n", "
\n", " | Name | \n", "Sex | \n", "
---|---|---|
PassengerId | \n", "\n", " | \n", " |
1 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "
2 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "
3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "
4 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "
5 | \n", "Allen, Mr. William Henry | \n", "male | \n", "
drop(columns)\n", "
>>> assert all(stl.drop(['a', 'b'])(df) == df.drop(['a', 'b'], axis=1))\n", "
\n", " | Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
---|---|---|---|---|---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "3 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "22.0 | \n", "1 | \n", "0 | \n", "A/5 21171 | \n", "7.2500 | \n", "NaN | \n", "S | \n", "
2 | \n", "1 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "38.0 | \n", "1 | \n", "0 | \n", "PC 17599 | \n", "71.2833 | \n", "C85 | \n", "C | \n", "
3 | \n", "3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "26.0 | \n", "0 | \n", "0 | \n", "STON/O2. 3101282 | \n", "7.9250 | \n", "NaN | \n", "S | \n", "
4 | \n", "1 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "35.0 | \n", "1 | \n", "0 | \n", "113803 | \n", "53.1000 | \n", "C123 | \n", "S | \n", "
5 | \n", "3 | \n", "Allen, Mr. William Henry | \n", "male | \n", "35.0 | \n", "0 | \n", "0 | \n", "373450 | \n", "8.0500 | \n", "NaN | \n", "S | \n", "
concat(feature_constructors)\n", "
>>> from category_encoders import WOEEncoder, CatBoostEncoder\n", ">>> stl.concat([\n", "... stl.select(['Age']),\n", "... stl.category_encode(WOEEncoder(), ['Sex', 'Embarked'], 'Survived'),\n", "... stl.category_encode(CatBoostEncoder(), ['Sex', 'Embarked'], 'Survived'),\n", "... ])\n", "
\n", " | Name | \n", "Sex | \n", "is_male | \n", "tfidf__Name_0 | \n", "tfidf__Name_1 | \n", "tfidf__Name_2 | \n", "tfidf__Name_3 | \n", "tfidf__Name_4 | \n", "
---|---|---|---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "1 | \n", "0.497477 | \n", "0.331651 | \n", "0.165826 | \n", "0.000000 | \n", "0.784236 | \n", "
2 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "0 | \n", "0.610662 | \n", "0.203554 | \n", "0.407108 | \n", "0.240666 | \n", "0.601665 | \n", "
3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "0 | \n", "0.546402 | \n", "0.546402 | \n", "0.546402 | \n", "0.323011 | \n", "0.000000 | \n", "
4 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "0 | \n", "0.544245 | \n", "0.272122 | \n", "0.544245 | \n", "0.536227 | \n", "0.214491 | \n", "
5 | \n", "Allen, Mr. William Henry | \n", "male | \n", "1 | \n", "0.447424 | \n", "0.298283 | \n", "0.298283 | \n", "0.705332 | \n", "0.352666 | \n", "
apply(df, func, parts, optimize, verbose)\n", "
>>> def func(row):\n", "... if row.Embarked == 'S':\n", "... return row.SibSp\n", "... return row.Age\n", ">>> stl.apply(df, func, parts=7, verbose=True)\n", "
\n", " | col | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "1.0 | \n", "
2 | \n", "38.0 | \n", "
3 | \n", "0.0 | \n", "
4 | \n", "1.0 | \n", "
5 | \n", "0.0 | \n", "
... | \n", "... | \n", "
696 | \n", "0.0 | \n", "
697 | \n", "0.0 | \n", "
698 | \n", "NaN | \n", "
699 | \n", "49.0 | \n", "
700 | \n", "0.0 | \n", "
700 rows × 1 columns
\n", "category_encode(encoder, columns, targets)\n", "
>>> from category_encoders import WOEEncoder, TargetEncoder\n", ">>> stl.category_encode(WOEEncoder(), ['Sex', 'Embarked'], 'Survived')\n", ">>> stl.category_encode(TargetEncoder(smoothing=3), ['Sex', 'Embarked'], ['Survived', 'Age'])\n", ">>> stl.category_encode(WOEEncoder(sigma=0.1, regularization=0.5), 'Sex', 'Survived')\n", "
\n", " | Cabin_ce_Survived_CatBoostEncoder_random_state_0_sigma_3 | \n", "Embarked_ce_Survived_CatBoostEncoder_random_state_0_sigma_3 | \n", "
---|---|---|
PassengerId | \n", "\n", " | \n", " |
1 | \n", "2.579784 | \n", "2.579784 | \n", "
2 | \n", "0.902193 | \n", "0.902193 | \n", "
3 | \n", "0.806924 | \n", "0.806924 | \n", "
4 | \n", "3.166299 | \n", "3.629659 | \n", "
5 | \n", "3.103257 | \n", "3.978111 | \n", "
... | \n", "... | \n", "... | \n", "
96 | \n", "1.096301 | \n", "1.090039 | \n", "
97 | \n", "0.422915 | \n", "0.485321 | \n", "
98 | \n", "2.606621 | \n", "2.848815 | \n", "
99 | \n", "0.479063 | \n", "0.475339 | \n", "
100 | \n", "0.783394 | \n", "0.780401 | \n", "
100 rows × 2 columns
\n", "\n", " | Survived | \n", "Cabin | \n", "Cabin_ce_Survived_CatBoostEncoder_random_state_0 | \n", "Cabin_ce_Survived_WOEEncoder | \n", "Cabin_ce_Survived_TargetEncoder | \n", "
---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "0 | \n", "NaN | \n", "0.410000 | \n", "-0.253322 | \n", "0.35 | \n", "
2 | \n", "1 | \n", "C85 | \n", "0.410000 | \n", "0.000000 | \n", "0.41 | \n", "
3 | \n", "1 | \n", "NaN | \n", "0.205000 | \n", "-0.253322 | \n", "0.35 | \n", "
4 | \n", "1 | \n", "C123 | \n", "0.410000 | \n", "0.000000 | \n", "0.41 | \n", "
5 | \n", "0 | \n", "NaN | \n", "0.470000 | \n", "-0.253322 | \n", "0.35 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
96 | \n", "0 | \n", "NaN | \n", "0.351410 | \n", "-0.253322 | \n", "0.35 | \n", "
97 | \n", "0 | \n", "A5 | \n", "0.410000 | \n", "0.000000 | \n", "0.41 | \n", "
98 | \n", "1 | \n", "D10 D12 | \n", "0.410000 | \n", "0.000000 | \n", "0.41 | \n", "
99 | \n", "1 | \n", "NaN | \n", "0.346962 | \n", "-0.253322 | \n", "0.35 | \n", "
100 | \n", "0 | \n", "NaN | \n", "0.355125 | \n", "-0.253322 | \n", "0.35 | \n", "
100 rows × 5 columns
\n", "mean_encode(columns, targets, smoothing, min_samples_leaf)\n", "
>>> stl.mean_encode(['Sex', 'Embarked'], ['Survived', 'Age'])\n", ">>> stl.mean_encode(['Sex', 'Embarked'], 'Survived', smoothing=1.5, min_samples_leaf=5)\n", "
\n", " | Cabin_ce_Survived_TargetEncoder_smoothing_3.0 | \n", "
---|---|
PassengerId | \n", "\n", " |
1 | \n", "0.35 | \n", "
2 | \n", "0.41 | \n", "
3 | \n", "0.35 | \n", "
4 | \n", "0.41 | \n", "
5 | \n", "0.35 | \n", "
... | \n", "... | \n", "
96 | \n", "0.35 | \n", "
97 | \n", "0.41 | \n", "
98 | \n", "0.41 | \n", "
99 | \n", "0.35 | \n", "
100 | \n", "0.35 | \n", "
100 rows × 1 columns
\n", "one_hot_encode(columns)\n", "
>>> stl.one_hot_encode(['Sex', 'Embarked'])\n", ">>> stl.one_hot_encode('Embarked')\n", "
\n", " | Embarked_ce_OneHotEncoder_0 | \n", "Embarked_ce_OneHotEncoder_1 | \n", "Embarked_ce_OneHotEncoder_2 | \n", "Embarked_ce_OneHotEncoder_3 | \n", "
---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " |
1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
3 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
96 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
97 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
98 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
99 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
100 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "
100 rows × 4 columns
\n", "FeatureSet(before_split, after_split, train_frame, test_frame, targets, auxiliary, description)\n", "
>>> fs = FeatureSet([feature_1, feature_2], [single_validation_feature],\n", "... train_frame=train, targets='Survived')\n", "\n", ">>> fs = FeatureSet([feature_1, feature_2], [single_validation_feature],\n", "... train_frame=train,\n", "... targets=['Target1', 'Target2'], auxiliary=['date', 'metric_group'])\n", "\n", ">>> fs = FeatureSet([stl.select(['Age', 'Fare'])], [stl.mean_encode(['Embarked', 'Parch'], 'Survived')],\n", "... train_frame=train, targets='Survived')\n", "
FeatureSet([simple_feature,\n", " interactions('Pclass', 'Age'),\n", " num_aggs('Fare'),\n", " tfidf('Name')],\n", " [stl.category_encode(TargetEncoder(), ['Embarked'], ['Survived']),\n", " stl.category_encode(WOEEncoder(), ['Embarked'], ['Survived'])],\n", " targets=['Survived'],\n", " auxiliary=[])\n", "
\n", " | is_male | \n", "Pclass_add_Age | \n", "Pclass_sub_Age | \n", "Pclass_mul_Age | \n", "Fare_div_mean | \n", "Fare_sub_div_mean | \n", "Fare_div_std | \n", "tfidf__Name_0 | \n", "tfidf__Name_1 | \n", "tfidf__Name_2 | \n", "tfidf__Name_3 | \n", "tfidf__Name_4 | \n", "Embarked_ce_Survived_TargetEncoder | \n", "Embarked_ce_Survived_WOEEncoder | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PassengerId | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "1 | \n", "25.0 | \n", "-19.0 | \n", "66.0 | \n", "0.268312 | \n", "-0.731688 | \n", "0.307178 | \n", "0.508281 | \n", "0.338854 | \n", "0.185575 | \n", "0.742300 | \n", "0.203426 | \n", "0.428748 | \n", "-0.223144 | \n", "
2 | \n", "0 | \n", "39.0 | \n", "-37.0 | \n", "38.0 | \n", "2.638088 | \n", "1.638088 | \n", "3.020231 | \n", "0.593616 | \n", "0.197872 | \n", "0.433463 | \n", "0.541828 | \n", "0.356369 | \n", "0.865529 | \n", "1.098612 | \n", "
3 | \n", "0 | \n", "29.0 | \n", "-23.0 | \n", "78.0 | \n", "0.293292 | \n", "-0.706708 | \n", "0.335778 | \n", "0.464173 | \n", "0.464173 | \n", "0.508413 | \n", "0.000000 | \n", "0.557318 | \n", "0.428748 | \n", "-0.223144 | \n", "
4 | \n", "0 | \n", "36.0 | \n", "-34.0 | \n", "35.0 | \n", "1.965151 | \n", "0.965151 | \n", "2.249815 | \n", "0.603771 | \n", "0.301886 | \n", "0.661317 | \n", "0.220439 | \n", "0.241644 | \n", "0.428748 | \n", "-0.223144 | \n", "
5 | \n", "1 | \n", "38.0 | \n", "-32.0 | \n", "105.0 | \n", "0.297918 | \n", "-0.702082 | \n", "0.341074 | \n", "0.631088 | \n", "0.420725 | \n", "0.460825 | \n", "0.460825 | \n", "0.000000 | \n", "0.428748 | \n", "-0.223144 | \n", "
6 | \n", "1 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.313029 | \n", "-0.686971 | \n", "0.358373 | \n", "0.508984 | \n", "0.508984 | \n", "0.278748 | \n", "0.557496 | \n", "0.305561 | \n", "0.500000 | \n", "0.000000 | \n", "
7 | \n", "1 | \n", "55.0 | \n", "-53.0 | \n", "54.0 | \n", "1.919353 | \n", "0.919353 | \n", "2.197383 | \n", "0.779844 | \n", "0.259948 | \n", "0.000000 | \n", "0.569447 | \n", "0.000000 | \n", "0.428748 | \n", "-0.223144 | \n", "
8 | \n", "1 | \n", "5.0 | \n", "1.0 | \n", "6.0 | \n", "0.779954 | \n", "-0.220046 | \n", "0.892935 | \n", "0.395067 | \n", "0.526756 | \n", "0.288481 | \n", "0.288481 | \n", "0.632461 | \n", "0.428748 | \n", "-0.223144 | \n", "
9 | \n", "0 | \n", "30.0 | \n", "-24.0 | \n", "81.0 | \n", "0.412027 | \n", "-0.587973 | \n", "0.471711 | \n", "0.605911 | \n", "0.302956 | \n", "0.442440 | \n", "0.331830 | \n", "0.485000 | \n", "0.428748 | \n", "-0.223144 | \n", "
10 | \n", "0 | \n", "16.0 | \n", "-12.0 | \n", "28.0 | \n", "1.112875 | \n", "0.112875 | \n", "1.274082 | \n", "0.449865 | \n", "0.449865 | \n", "0.492741 | \n", "0.246371 | \n", "0.540139 | \n", "0.865529 | \n", "1.098612 | \n", "