{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Feature Creation: MathematicalCombination\n", "The MathematicalCombination() applies basic mathematical operations **[‘sum’, ‘prod’, ‘mean’, ‘std’, ‘max’, ‘min’]** to multiple features, returning one or more additional features as a result.\n", "\n", "For this demonstration, we use the UCI Wine Quality Dataset.\n", "\n", "The data is publicly available on **[UCI repository](https://archive.ics.uci.edu/ml/datasets/Wine+Quality)**\n", "\n", "P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.\n", "Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import (\n", " accuracy_score,\n", " roc_curve,\n", " roc_auc_score,\n", " classification_report,\n", " confusion_matrix,\n", ")\n", "from sklearn.pipeline import Pipeline as pipe\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "from feature_engine.creation import MathematicalCombination\n", "from feature_engine.imputation import MeanMedianImputer\n", "\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality
07.40.700.001.90.07611.034.00.99783.510.569.45
17.80.880.002.60.09825.067.00.99683.200.689.85
27.80.760.042.30.09215.054.00.99703.260.659.85
311.20.280.561.90.07517.060.00.99803.160.589.86
47.40.700.001.90.07611.034.00.99783.510.569.45
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality \n", "0 9.4 5 \n", "1 9.8 5 \n", "2 9.8 5 \n", "3 9.8 6 \n", "4 9.4 5 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read data\n", "data = pd.read_csv('winequality-red.csv', sep=';')\n", "\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**This Data contains 11 features, all numerical, with no missing values.**" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quality_rangequality
005
105
205
316
405
\n", "
" ], "text/plain": [ " quality_range quality\n", "0 0 5\n", "1 0 5\n", "2 0 5\n", "3 1 6\n", "4 0 5" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's transform the Target, i.e Wine Quality into a binary classification problem:\n", "\n", "bins = [0,5,10]\n", "\n", "labels = [0, 1] # 'low'=0, 'high'=1\n", "\n", "data['quality_range']= pd.cut(x=data['quality'], bins=bins, labels=labels)\n", "\n", "data[['quality_range','quality']].head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# drop original target\n", "\n", "data.drop('quality', axis=1, inplace = True) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sum and Mean Combinators:\n", "Let's create two new variables:\n", "- avg_acidity = mean(fixed acidity, volatile acidity)\n", "- total_minerals = sum(total sulfur dioxide, sulphates)\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Create the Combinators\n", "\n", "math_combinator_mean = MathematicalCombination(\n", " variables_to_combine=['fixed acidity', 'volatile acidity'],\n", " math_operations = ['mean'],\n", " new_variables_names = ['avg_acidity']\n", ")\n", "\n", "math_combinator_sum = MathematicalCombination(\n", " variables_to_combine=['total sulfur dioxide', 'sulphates'],\n", " math_operations = ['sum'],\n", " new_variables_names = ['total_minerals']\n", ")\n", "\n", "# Fit the Mean Combinator on training data\n", "math_combinator_mean.fit(data)\n", "\n", "# Transform the data\n", "data_t = math_combinator_mean.transform(data)\n", "\n", "# We can combine both steps in a single call with \".fit_transform()\" method\n", "data_t = math_combinator_sum.fit_transform(data_t)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality_rangeavg_aciditytotal_minerals
07.40.700.001.90.07611.034.00.99783.510.569.404.0534.56
17.80.880.002.60.09825.067.00.99683.200.689.804.3467.68
27.80.760.042.30.09215.054.00.99703.260.659.804.2854.65
311.20.280.561.90.07517.060.00.99803.160.589.815.7460.58
47.40.700.001.90.07611.034.00.99783.510.569.404.0534.56
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality_range avg_acidity total_minerals \n", "0 9.4 0 4.05 34.56 \n", "1 9.8 0 4.34 67.68 \n", "2 9.8 0 4.28 54.65 \n", "3 9.8 1 5.74 60.58 \n", "4 9.4 0 4.05 34.56 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_t.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can check the mappings between each new variable and the operation it's created with in the **combination_dict_**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'avg_acidity': 'mean'}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "math_combinator_mean.combination_dict_" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['fixed acidity', 'volatile acidity']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "math_combinator_mean.variables_to_combine" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combine with more than 1 operation\n", "\n", "We can also combine the variables with more than 1 mathematical operation. And the transformer has the option to create variable names automatically." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Create the Combinators\n", "\n", "multiple_combinator = MathematicalCombination(\n", " variables_to_combine=['fixed acidity', 'volatile acidity'],\n", " math_operations = ['mean', 'sum'],\n", " new_variables_names = None\n", ")\n", "\n", "\n", "# Fit the Combinator to the training data\n", "multiple_combinator.fit(data)\n", "\n", "# Transform the data\n", "data_t = multiple_combinator.transform(data)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fixed acidityvolatile aciditycitric acidresidual sugarchloridesfree sulfur dioxidetotal sulfur dioxidedensitypHsulphatesalcoholquality_rangemean(fixed acidity-volatile acidity)sum(fixed acidity-volatile acidity)
07.40.700.001.90.07611.034.00.99783.510.569.404.058.10
17.80.880.002.60.09825.067.00.99683.200.689.804.348.68
27.80.760.042.30.09215.054.00.99703.260.659.804.288.56
311.20.280.561.90.07517.060.00.99803.160.589.815.7411.48
47.40.700.001.90.07611.034.00.99783.510.569.404.058.10
\n", "
" ], "text/plain": [ " fixed acidity volatile acidity citric acid residual sugar chlorides \\\n", "0 7.4 0.70 0.00 1.9 0.076 \n", "1 7.8 0.88 0.00 2.6 0.098 \n", "2 7.8 0.76 0.04 2.3 0.092 \n", "3 11.2 0.28 0.56 1.9 0.075 \n", "4 7.4 0.70 0.00 1.9 0.076 \n", "\n", " free sulfur dioxide total sulfur dioxide density pH sulphates \\\n", "0 11.0 34.0 0.9978 3.51 0.56 \n", "1 25.0 67.0 0.9968 3.20 0.68 \n", "2 15.0 54.0 0.9970 3.26 0.65 \n", "3 17.0 60.0 0.9980 3.16 0.58 \n", "4 11.0 34.0 0.9978 3.51 0.56 \n", "\n", " alcohol quality_range mean(fixed acidity-volatile acidity) \\\n", "0 9.4 0 4.05 \n", "1 9.8 0 4.34 \n", "2 9.8 0 4.28 \n", "3 9.8 1 5.74 \n", "4 9.4 0 4.05 \n", "\n", " sum(fixed acidity-volatile acidity) \n", "0 8.10 \n", "1 8.68 \n", "2 8.56 \n", "3 11.48 \n", "4 8.10 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note the 2 additional variables at the end of the dataframe\n", "data_t.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mean(fixed acidity-volatile acidity)': 'mean',\n", " 'sum(fixed acidity-volatile acidity)': 'sum'}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# and here the variable names and the operation that was\n", "# applied to create that variable\n", "\n", "multiple_combinator.combination_dict_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pipeline Example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can put all these transformations into a single pipeline:\n", "\n", "1. Create new variables\n", "2. Scale features\n", "3. 
Train a Logistic Regression model to predict wine quality\n", "\n", "See more on how to use Feature-engine within Scikit-learn Pipelines in these **[examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)**" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((1439, 11), (160, 11))" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = data.drop(['quality_range'], axis=1)\n", "\n", "y = data.quality_range\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X,\n", " y,\n", " test_size=0.1,\n", " random_state=0,\n", " shuffle=True,\n", " stratify=y\n", " )\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "value_pipe = pipe([\n", "\n", " # Create the new features\n", " ('math_combinator_mean', MathematicalCombination(variables_to_combine=['fixed acidity', 'volatile acidity'],\n", " math_operations=['mean'],\n", " new_variables_names=['avg_acidity'])),\n", "\n", " ('math_combinator_sum', MathematicalCombination(variables_to_combine=['total sulfur dioxide', 'sulphates'],\n", " math_operations=['sum'],\n", " new_variables_names=['total_minerals'])),\n", "\n", " # scale features\n", " ('scaler', StandardScaler()),\n", "\n", " # LogisticRegression\n", " ('LogisticRegression', LogisticRegression())\n", "])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('math_combinator_mean',\n", " MathematicalCombination(math_operations=['mean'],\n", " new_variables_names=['avg_acidity'],\n", " variables_to_combine=['fixed acidity',\n", " 'volatile '\n", " 'acidity'])),\n", " ('math_combinator_sum',\n", " MathematicalCombination(math_operations=['sum'],\n", " new_variables_names=['total_minerals'],\n", " variables_to_combine=['total sulfur '\n", " 'dioxide',\n", " 'sulphates'])),\n", " 
('scaler', StandardScaler()),\n", " ('LogisticRegression', LogisticRegression())])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "value_pipe.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "pred_train = value_pipe.predict(X_train)\n", "pred_test = value_pipe.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LogisticRegression Model train accuracy score: 0.744266851980542\n", "\n", "LogisticRegression Model test accuracy score: 0.75\n" ] } ], "source": [ "print('Logistic Regression Model train accuracy score: {}'.format(\n", " accuracy_score(y_train, pred_train)))\n", "print()\n", "print('Logistic Regression Model test accuracy score: {}'.format(\n", " accuracy_score(y_test, pred_test)))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LogisticRegression Model test classification report: \n", "\n", " precision recall f1-score support\n", "\n", " 0 0.73 0.73 0.73 74\n", " 1 0.77 0.77 0.77 86\n", "\n", " accuracy 0.75 160\n", " macro avg 0.75 0.75 0.75 160\n", "weighted avg 0.75 0.75 0.75 160\n", "\n" ] } ], "source": [ "print('Logistic Regression Model test classification report: \\n\\n {}'.format(\n", " classification_report(y_test, pred_test)))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "score = round(accuracy_score(y_test, pred_test), 3)\n", "cm = confusion_matrix(y_test, pred_test)\n", "\n", "sns.heatmap(cm, annot=True, fmt=\".0f\")\n", "plt.xlabel('Predicted Values')\n", "plt.ylabel('Actual Values')\n", "plt.title('Accuracy Score: {0}'.format(score), size=15)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Predict probabilities for the test data\n", "probs = value_pipe.predict_proba(X_test)[:, 1]\n", "\n", "# Get the ROC Curve\n", "fpr, tpr, thresholds = roc_curve(y_test, probs)\n", "\n", "# Plot ROC curve\n", "plt.figure(figsize=(8, 5))\n", "plt.plot([0, 1], [0, 1], 'k--')\n", "plt.plot(fpr, tpr)\n", "plt.xlabel('False Positive Rate = 1 - Specificity Score')\n", "plt.ylabel('True Positive Rate = Recall Score')\n", "plt.title('ROC Curve')\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "fenotebook", "language": "python", "name": "fenotebook" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "197.6px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 4 }