{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "## Feature Creation: Combine with reference feature\n", "\n", "The CombineWithReferenceFeature() applies combines a group of variables with a group of reference variables utilising mathematical operations ['sub', 'div','add','mul'], returning one or more additional features as a result.\n", "\n", "For this demonstration, we use the UCI Wine Quality Dataset.\n", "\n", "The data is publicly available on [UCI repository](https://archive.ics.uci.edu/ml/datasets/Wine+Quality)\n", "\n", "P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import (\n", " accuracy_score,\n", " roc_curve,\n", " roc_auc_score,\n", " classification_report,\n", " confusion_matrix,\n", ")\n", "from sklearn.pipeline import Pipeline as pipe\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "from feature_engine.creation import CombineWithReferenceFeature\n", "from feature_engine.imputation import MeanMedianImputer\n", "\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.700.001.90.07611.034.00.99783.510.569.45
17.80.880.002.60.09825.067.00.99683.200.689.85
27.80.760.042.30.09215.054.00.99703.260.659.85
311.20.280.561.90.07517.060.00.99803.160.589.86
47.40.700.001.90.07611.034.00.99783.510.569.45
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality \n", "0 9.4 5 \n", "1 9.8 5 \n", "2 9.8 5 \n", "3 9.8 6 \n", "4 9.4 5 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read data\n", "data = pd.read_csv('winequality-red.csv', sep=';')\n", "\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**This Data contains 11 features, all numerical, with no missing values.**" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quality_rangequality
005
105
205
316
405
\n", "
" ], "text/plain": [ " quality_range quality\n", "0 0 5\n", "1 0 5\n", "2 0 5\n", "3 1 6\n", "4 0 5" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's transform the Target, i.e Wine Quality into a binary classification problem:\n", "\n", "bins = [0,5,10]\n", "\n", "labels = [0, 1] # 'low'=0, 'high'=1\n", "\n", "data['quality_range']= pd.cut(x=data['quality'], bins=bins, labels=labels)\n", "\n", "data[['quality_range','quality']].head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# drop original target\n", "\n", "data.drop('quality', axis=1, inplace = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sub and Div Combinators:\n", "\n", "Let's create two new variables:\n", "\n", "- non_free_sulfur_dioxide = total sulfur dioxide - free sulfur dioxide\n", "- percentage_free_sulfur = free sulfur dioxide / total sulfur dioxide" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Create the Combinators\n", "\n", "# this transformer substracts free sulfur from total sulfur\n", "sub_with_reference_feature = CombineWithReferenceFeature(\n", " variables_to_combine=['total sulfur dioxide'],\n", " reference_variables=['free sulfur dioxide'],\n", " operations=['sub'],\n", " new_variables_names=['non_free_sulfur_dioxide']\n", ")\n", "\n", "# this transformer divides free sulfur by total sulfur\n", "div_with_reference_feature = CombineWithReferenceFeature(\n", " variables_to_combine=['free sulfur dioxide'],\n", " reference_variables=['total sulfur dioxide'],\n", " operations=['div'],\n", " new_variables_names=['percentage_free_sulfur']\n", ")\n", "\n", "# Fit the Sub Combinator on training data\n", "sub_with_reference_feature.fit(data)\n", "\n", "# perform the substraction\n", "data_t = sub_with_reference_feature.transform(data)\n", "\n", "# perform division\n", "# We can combine both steps in a single call with 
\".fit_transform()\" method\n", "data_t = div_with_reference_feature.fit_transform(data_t)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality_rangenon_free_sulfur_dioxidepercentage_free_sulfur
07.40.700.001.90.07611.034.00.99783.510.569.4023.00.323529
17.80.880.002.60.09825.067.00.99683.200.689.8042.00.373134
27.80.760.042.30.09215.054.00.99703.260.659.8039.00.277778
311.20.280.561.90.07517.060.00.99803.160.589.8143.00.283333
47.40.700.001.90.07611.034.00.99783.510.569.4023.00.323529
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality_range non_free_sulfur_dioxide percentage_free_sulfur \n", "0 9.4 0 23.0 0.323529 \n", "1 9.8 0 42.0 0.373134 \n", "2 9.8 0 39.0 0.277778 \n", "3 9.8 1 43.0 0.283333 \n", "4 9.4 0 23.0 0.323529 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note the additional variables at the end of the dataframe\n", "\n", "data_t.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Combine with more than 1 operation\n", "\n", "We can also combine the variables with more than 1 mathematical operation. 
The transformer can also create the variable names automatically.\n", "\n", "Here we will create the following variables:\n", "\n", "- ratio_fixed_to_volatile = fixed acidity / volatile acidity\n", "- total_acidity = fixed acidity + volatile acidity" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Create the Combinator\n", "\n", "multiple_combinator = CombineWithReferenceFeature(\n", " variables_to_combine=['fixed acidity'],\n", " reference_variables=['volatile acidity'],\n", " operations=['div', 'add'],\n", " new_variables_names=['ratio_fixed_to_volatile', 'total_acidity']\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "CombineWithReferenceFeature(new_variables_names=['ratio_fixed_to_volatile',\n", " 'total_acidity'],\n", " operations=['div', 'add'],\n", " reference_variables=['volatile acidity'],\n", " variables_to_combine=['fixed acidity'])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the Combinator to the data\n", "\n", "multiple_combinator.fit(data_t)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Transform the data\n", "\n", "data_t = multiple_combinator.transform(data_t)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality_rangenon_free_sulfur_dioxidepercentage_free_sulfurratio_fixed_to_volatiletotal_acidity
07.40.700.001.90.07611.034.00.99783.510.569.4023.00.32352910.5714298.10
17.80.880.002.60.09825.067.00.99683.200.689.8042.00.3731348.8636368.68
27.80.760.042.30.09215.054.00.99703.260.659.8039.00.27777810.2631588.56
311.20.280.561.90.07517.060.00.99803.160.589.8143.00.28333340.00000011.48
47.40.700.001.90.07611.034.00.99783.510.569.4023.00.32352910.5714298.10
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality_range non_free_sulfur_dioxide percentage_free_sulfur \\\n", "0 9.4 0 23.0 0.323529 \n", "1 9.8 0 42.0 0.373134 \n", "2 9.8 0 39.0 0.277778 \n", "3 9.8 1 43.0 0.283333 \n", "4 9.4 0 23.0 0.323529 \n", "\n", " ratio_fixed_to_volatile total_acidity \n", "0 10.571429 8.10 \n", "1 8.863636 8.68 \n", "2 10.263158 8.56 \n", "3 40.000000 11.48 \n", "4 10.571429 8.10 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note the additional variables at the end of the dataframe\n", "\n", "data_t.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pipeline Example\n", "\n", "We can put all these transformations into single pipeline:\n", "\n", "Create new variables scale features and train a Logistic Regression model to predict the wine quality range.\n", "\n", "See more on how to use Feature-engine within Scikit-learn Pipelines in these [examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((1439, 11), (160, 11))" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = data.drop(['quality_range'], axis=1)\n", "\n", "y = data.quality_range\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,\n", " y,\n", " test_size=0.1,\n", " random_state=0,\n", " shuffle=True,\n", " stratify=y\n", " )\n", 
"X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "value_pipe = pipe([\n", " \n", " # Create new features\n", " ('subtraction', CombineWithReferenceFeature(\n", " variables_to_combine=['total sulfur dioxide'],\n", " reference_variables=['free sulfur dioxide'],\n", " operations=['sub'],\n", " new_variables_names=['non_free_sulfur_dioxide']\n", " )\n", " ),\n", "\n", " ('ratio', CombineWithReferenceFeature(\n", " variables_to_combine=['free sulfur dioxide'],\n", " reference_variables=['total sulfur dioxide'],\n", " operations=['div'],\n", " new_variables_names=['percentage_free_sulfur']\n", " )\n", " ),\n", "\n", " ('acidity', CombineWithReferenceFeature(\n", " variables_to_combine=['fixed acidity'],\n", " reference_variables=['volatile acidity'],\n", " operations=['div', 'add'],\n", " new_variables_names=['ratio_fixed_to_volatile', 'total_acidity']\n", " )\n", " ),\n", "\n", " # scale features\n", " ('scaler', StandardScaler()),\n", "\n", " # Logistic Regression\n", " ('LogisticRegression', LogisticRegression())\n", "])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('subtraction',\n", " CombineWithReferenceFeature(new_variables_names=['non_free_sulfur_dioxide'],\n", " reference_variables=['free sulfur '\n", " 'dioxide'],\n", " variables_to_combine=['total '\n", " 'sulfur '\n", " 'dioxide'])),\n", " ('ratio',\n", " CombineWithReferenceFeature(new_variables_names=['percentage_free_sulfur'],\n", " operations=['div'],\n", " reference_variables=['total '\n", " 'sulfur '\n", " 'dioxide'],\n", " variables_to_combine=['free '\n", " 'sulfur '\n", " 'dioxide'])),\n", " ('acidity',\n", " CombineWithReferenceFeature(new_variables_names=['ratio_fixed_to_volatile',\n", " 'total_acidity'],\n", " operations=['div', 'add'],\n", " reference_variables=['volatile '\n", " 'acidity'],\n", " variables_to_combine=['fixed '\n", " 
'acidity'])),\n", " ('scaler', StandardScaler()),\n", " ('LogisticRegression', LogisticRegression())])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "value_pipe.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "pred_train = value_pipe.predict(X_train)\n", "pred_test = value_pipe.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logistic Regression Model train accuracy score: 0.7477414871438499\n", "\n", "Logistic Regression Model test accuracy score: 0.75\n" ] } ], "source": [ "print('Logistic Regression Model train accuracy score: {}'.format(\n", " accuracy_score(y_train, pred_train)))\n", "\n", "print()\n", "\n", "print('Logistic Regression Model test accuracy score: {}'.format(\n", " accuracy_score(y_test, pred_test)))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logistic Regression Model test classification report: \n", "\n", " precision recall f1-score support\n", "\n", " 0 0.73 0.73 0.73 74\n", " 1 0.77 0.77 0.77 86\n", "\n", " accuracy 0.75 160\n", " macro avg 0.75 0.75 0.75 160\n", "weighted avg 0.75 0.75 0.75 160\n", "\n" ] } ], "source": [ "print('Logistic Regression Model test classification report: \\n\\n {}'.format(\n", " classification_report(y_test, pred_test)))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "score = round(accuracy_score(y_test, pred_test), 3)\n", "cm = confusion_matrix(y_test, pred_test)\n", "\n", "sns.heatmap(cm, annot=True, fmt=\".0f\")\n", "plt.xlabel('Predicted Values')\n", "plt.ylabel('Actual Values')\n", "plt.title('Accuracy Score: {0}'.format(score), size=15)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Predict probabilities for the test data\n", "\n", "probs = value_pipe.predict_proba(X_test)[:, 1]\n", "\n", "# Get the ROC Curve\n", "fpr, tpr, thresholds = roc_curve(y_test, probs)\n", "\n", "# Plot ROC curve\n", "plt.figure(figsize=(8, 5))\n", "plt.plot([0, 1], [0, 1], 'k--')\n", "plt.plot(fpr, tpr)\n", "plt.xlabel('False Positive Rate = 1 - Specificity Score')\n", "plt.ylabel('True Positive Rate = Recall Score')\n", "plt.title('ROC Curve')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "fenotebook", "language": "python", "name": "fenotebook" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }