{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\users\\anatoly\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n", "c:\\users\\anatoly\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\numpy\\.libs\\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll\n", "c:\\users\\anatoly\\appdata\\local\\programs\\python\\python39\\lib\\site-packages\\numpy\\.libs\\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll\n", " warnings.warn(\"loaded more than 1 DLL from .libs:\"\n" ] } ], "source": [ "import sys\n", "from matplotlib import pyplot as plt\n", "from sklift.metrics import uplift_at_k\n", "import seaborn as sns\n", "import numpy as np\n", "import pandas as pd\n", "# install uplift library scikit-uplift and other libraries \n", "#!{sys.executable} -m pip install scikit-uplift dill catboost\n", "\n", "%config InlineBackend.figure_format = 'svg'\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We are going to use a Hillstrom dataset from the MineThatData [hosted](https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html) in march 2008 by the president of this company Kevin Hillstrom.\n", "\n", "MineThatData is a consulting company that helps CEO understand the complex relationship between Customers, Advertising, Products, Brands, and Channels." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data description" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset can be loaded from sklift.datasets module using fetch_hillstrom function.\n", "\n", "Read more about dataset in the [api docs](https://www.uplift-modeling.com/en/latest/api/datasets/fetch_hillstrom.html).\n", "\n", "This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Major columns**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- **visit** (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks.\n", "\n", "- **conversion** (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.\n", "\n", "- **spend** (float): target. Actual dollars spent in the following two weeks.\n", "\n", "- **segment** (str): treatment. The e-mail campaign the customer received\n", "\n", "Read more in the [docs](https://www.uplift-modeling.com/en/latest/api/datasets/fetch_hillstrom.html#hillstrom)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklift.datasets import fetch_hillstrom\n", "\n", "# returns sklearn Bunch object\n", "# with data, target, treatment keys\n", "# data features (pd.DataFrame), target (pd.Series), treatment (pd.Series) values \n", "dataset = fetch_hillstrom()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset type: \n", "\n", "Dataset features shape: (64000, 8)\n", "Dataset target shape: (64000,)\n", "Dataset treatment shape: (64000,)\n" ] } ], "source": [ "print(f\"Dataset type: {type(dataset)}\\n\")\n", "print(f\"Dataset features shape: {dataset.data.shape}\")\n", "print(f\"Dataset target shape: {dataset.target.shape}\")\n", "print(f\"Dataset treatment shape: {dataset.treatment.shape}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## EDA " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
recencyhistory_segmenthistorymenswomenszip_codenewbiechannel
0102) $100 - $200142.4410Surburban0Phone
163) $200 - $350329.0811Rural1Web
272) $100 - $200180.6501Surburban1Web
395) $500 - $750675.8310Rural1Web
421) $0 - $10045.3410Urban0Web
63995102) $100 - $200105.5410Urban0Web
6399651) $0 - $10038.9101Urban1Phone
6399761) $0 - $10029.9910Urban1Phone
6399815) $500 - $750552.9410Surburban1Multichannel
6399914) $350 - $500472.8201Surburban0Web
\n", "
" ], "text/plain": [ " recency history_segment history mens womens zip_code newbie \\\n", "0 10 2) $100 - $200 142.44 1 0 Surburban 0 \n", "1 6 3) $200 - $350 329.08 1 1 Rural 1 \n", "2 7 2) $100 - $200 180.65 0 1 Surburban 1 \n", "3 9 5) $500 - $750 675.83 1 0 Rural 1 \n", "4 2 1) $0 - $100 45.34 1 0 Urban 0 \n", "63995 10 2) $100 - $200 105.54 1 0 Urban 0 \n", "63996 5 1) $0 - $100 38.91 0 1 Urban 1 \n", "63997 6 1) $0 - $100 29.99 1 0 Urban 1 \n", "63998 1 5) $500 - $750 552.94 1 0 Surburban 1 \n", "63999 1 4) $350 - $500 472.82 0 1 Surburban 0 \n", "\n", " channel \n", "0 Phone \n", "1 Web \n", "2 Web \n", "3 Web \n", "4 Web \n", "63995 Web \n", "63996 Phone \n", "63997 Phone \n", "63998 Multichannel \n", "63999 Web " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.head().append(dataset.data.tail())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 64000 entries, 0 to 63999\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 recency 64000 non-null int64 \n", " 1 history_segment 64000 non-null object \n", " 2 history 64000 non-null float64\n", " 3 mens 64000 non-null int64 \n", " 4 womens 64000 non-null int64 \n", " 5 zip_code 64000 non-null object \n", " 6 newbie 64000 non-null int64 \n", " 7 channel 64000 non-null object \n", "dtypes: float64(1), int64(4), object(3)\n", "memory usage: 3.9+ MB\n" ] } ], "source": [ "#info about types and null cells in dataset\n", "dataset.data.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is no missing data in the cells! " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Categorical data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "cat_features = ['channel', 'zip_code', 'history_segment', 'newbie']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Phone', 'Web', 'Multichannel'], dtype=object)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.channel.unique()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Surburban', 'Rural', 'Urban'], dtype=object)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.zip_code.unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['2) $100 - $200', '3) $200 - $350', '5) $500 - $750',\n", " '1) $0 - $100', '6) $750 - $1,000', '4) $350 - $500',\n", " '7) $1,000 +'], dtype=object)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.history_segment.unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Zip code**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:07.691428\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dataset.data.zip_code.value_counts().plot(kind = 'bar', grid=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Channel**" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:08.889244\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dataset.data.channel.value_counts().plot(kind = 'bar', grid=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**History segment**" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:17.519366\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dataset.data.history_segment.value_counts().plot(kind = 'bar', grid=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "#As option I propose to apply following function for transformation data in column \"historic_segment\"\n", "def historic_segment_transform(dataset):\n", " for payment in dataset.data['history_segment'].unique(): \n", " if payment =='1) $0 - $100':\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 50\n", " elif payment =='2) $100 - $200':\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 150\n", " elif payment =='3) $200 - $350':\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 275\n", " elif payment =='4) $350 - $500':\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 425\n", " elif payment =='5) $500 - $750':\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 575\n", " elif payment =='5) $750 - $1000':\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 825\n", " else:\n", " dataset.data.loc[dataset.data['history_segment'] == payment, 'history_segment'] = 1000\n", " return dataset.data.history_segment" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1) $0 - $100 22970\n", "2) $100 - $200 14254\n", "3) $200 - $350 12289\n", "4) $350 - $500 6409\n", "5) $500 - $750 4911\n", "6) $750 - $1,000 1859\n", "7) $1,000 + 1308\n", "Name: history_segment, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.history_segment.value_counts()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:21.215294\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dataset.data.history_segment.value_counts().plot(kind = 'bar', grid=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "#It's better to proceed from categorical to numeric data\n", "# For example, we could replace 0-100 on average value 50 " ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:25.379790\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dataset.data.womens.value_counts().plot(kind = 'bar', grid=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:29.204404\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dataset.data.mens.value_counts().plot(kind = 'bar', grid=True)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 35182\n", "0 28818\n", "Name: womens, dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.womens.value_counts()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "womens\n", "0 45.028125\n", "1 54.971875\n", "dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.groupby('womens').size()/dataset.data['womens'].count()*100" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "#55% - womens purchases\n", "#44% - mens purchases" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 35266\n", "0 28734\n", "Name: mens, dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.mens.value_counts()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "mens\n", "0 44.896875\n", "1 55.103125\n", "dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.groupby('mens').size()/dataset.data['mens'].count()*100" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "#55% - mens purchases\n", "#44% - womens purchases" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:42.157360\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize = (14,8))\n", "\n", "sns.set(font_scale=0.75)\n", "sns.heatmap(dataset.data.corr().round(3), annot=True, square = True, linewidths=.75, cmap='RdPu', fmt = '.2f',annot_kws = {\"size\": 10} )\n", "\n", "plt.title('Correlation matrix')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# womens and mens are in inverse correlation. I propose to make 1 column \"gender\" and merge." ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# As we can see, there is high correlation between 'history_segment' and 'history'. Could we merge it also and transform columns to numeric data? " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Numeric data" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:48.970419\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.data.loc[:, 'recency'].hist(figsize=(8, 4), bins=12, grid=True)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "29.99 7947\n", "81.20 9\n", "53.79 9\n", "60.51 8\n", "88.09 8\n", " ... \n", "200.53 1\n", "234.76 1\n", "239.91 1\n", "99.85 1\n", "738.50 1\n", "Name: history, Length: 34833, dtype: int64" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.data.history.value_counts()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:49.833633\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.data.loc[:, 'history'].hist(figsize=(8, 4), bins=20, grid=True);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Target data" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Womens E-Mail\n", "1 No E-Mail\n", "2 Womens E-Mail\n", "3 Mens E-Mail\n", "4 Womens E-Mail\n", "Name: segment, dtype: object" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#dataset_segment\n", "dataset.treatment.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Womens E-Mail', 'No E-Mail', 'Mens E-Mail'], dtype=object)" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.treatment.unique()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:08:57.565502\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.treatment.value_counts().plot(kind = 'bar', grid = 'True')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", "Name: visit, dtype: int64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#dataset_visit\n", "dataset.target.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:09:06.525209\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.target.value_counts().plot(kind = 'bar')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "#Target is disbalanced" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
visit01
segment
Mens E-Mail0.8172430.182757
No E-Mail0.8938330.106167
Womens E-Mail0.8486000.151400
\n", "
" ], "text/plain": [ "visit 0 1\n", "segment \n", "Mens E-Mail 0.817243 0.182757\n", "No E-Mail 0.893833 0.106167\n", "Womens E-Mail 0.848600 0.151400" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd \n", "\n", "pd.crosstab(dataset.treatment, dataset.target, normalize='index')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1], dtype=int64)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.target.unique()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Treatment & Target')" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:09:18.995398\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "crosstab = pd.crosstab(dataset.treatment, dataset.target, normalize='index')\n", " \n", "sns.heatmap(crosstab, annot=True, fmt=\".2f\", linewidths=1, square = True, cmap = 'RdPu')\n", "plt.xlabel('Target')\n", "plt.title(\"Treatment & Target\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "#Let's consider two cases: \n", "#1) Womens E-mail - No E-mail\n", "#2) Mens E-mail - No E-mail" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Womens E-mail - No E-mail" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# make treatment binary\n", "treat_dict_womens = {\n", " 'Womens E-Mail': 1,\n", " 'No E-Mail': 0, \n", " 'Mens E-Mail': 0\n", " }\n", "dataset.treatment_womens = dataset.treatment.map(treat_dict_womens)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:10:54.968145\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.treatment_womens.value_counts().plot(kind = 'bar', grid = 'True')" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "stratify_cols = pd.concat([dataset.treatment_womens, dataset.target], axis=1)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
segmentvisit
010
100
210
300
410
\n", "
" ], "text/plain": [ " segment visit\n", "0 1 0\n", "1 0 0\n", "2 1 0\n", "3 0 0\n", "4 1 0" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stratify_cols.head(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train shape: (44800, 8)\n", "Validation shape: (19200, 8)\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "stratify_cols = pd.concat([dataset.treatment_womens, dataset.target], axis=1)\n", "\n", "X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(\n", " dataset.data,\n", " dataset.treatment_womens,\n", " dataset.target,\n", " stratify=stratify_cols,\n", " test_size=0.3,\n", " random_state=42\n", ")\n", "\n", "print(f\"Train shape: {X_train.shape}\")\n", "print(f\"Validation shape: {X_val.shape}\")" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "from sklift.models import ClassTransformation\n", "from catboost import CatBoostClassifier\n", "\n", "estimator = CatBoostClassifier(verbose=100, \n", " cat_features=['womens', 'mens','channel', 'zip_code', 'history_segment', 'newbie'],\n", " random_state=42,\n", " thread_count=1\n", " )\n", "\n", "ct_model = ClassTransformation(estimator=estimator)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", " ct_model.fit(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.052243\n", "0:\tlearn: 0.6900041\ttotal: 307ms\tremaining: 5m 7s\n", "100:\tlearn: 0.6610229\ttotal: 13.5s\tremaining: 1m 59s\n", "200:\tlearn: 0.6594380\ttotal: 26.7s\tremaining: 1m 45s\n", "300:\tlearn: 0.6569047\ttotal: 42.9s\tremaining: 1m 39s\n", "400:\tlearn: 0.6546442\ttotal: 1m\tremaining: 1m 30s\n", "500:\tlearn: 0.6523825\ttotal: 1m 17s\tremaining: 1m 17s\n", "600:\tlearn: 0.6504138\ttotal: 1m 38s\tremaining: 1m 5s\n", "700:\tlearn: 0.6482230\ttotal: 1m 57s\tremaining: 50.2s\n", "800:\tlearn: 0.6463583\ttotal: 2m 13s\tremaining: 33.3s\n", "900:\tlearn: 0.6445439\ttotal: 2m 31s\tremaining: 16.6s\n", "999:\tlearn: 0.6427664\ttotal: 2m 46s\tremaining: 0us\n" ] }, { "data": { "text/plain": [ "ClassTransformation(estimator=)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ct_model.fit(\n", " X=X_train, \n", " y=y_train, \n", " treatment=trmnt_train\n", ")" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "#in progress with multiclassclassifier for Catboost. " ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "from sklift.metrics import uplift_at_k" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "uplift_predictions = ct_model.predict(X_val)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "uplift@10%: 0.0314 (sort groups by uplift together)\n", "uplift@10%: 0.0333 (sort groups by uplift separately)\n" ] } ], "source": [ "# k = 10%\n", "k = 0.1 \n", "\n", "# strategy='overall' sort by uplift treatment and control together\n", "uplift_overall = uplift_at_k(y_val, uplift_predictions, trmnt_val, strategy='overall', k=k)\n", "\n", "# strategy='by_group' sort by uplift treatment and control separately\n", "uplift_bygroup = uplift_at_k(y_val, uplift_predictions, trmnt_val, strategy='by_group', k=k)\n", "\n", "\n", "print(f\"uplift@{k * 100:.0f}%: {uplift_overall:.4f} (sort groups by uplift together)\")\n", "print(f\"uplift@{k * 100:.0f}%: {uplift_bygroup:.4f} (sort groups by uplift separately)\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mens E-mail - No E-mail" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "treat_dict_mens = {\n", " 'Mens E-Mail': 1,\n", " 'No E-Mail': 0, \n", " 'Womens E-Mail': 0\n", " }\n", "dataset.treatment_mens = dataset.treatment.map(treat_dict_mens)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "dataset.treatment_mens = dataset.treatment.map(treat_dict_mens)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/svg+xml": [ "\r\n", "\r\n", "\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " 2021-07-30T19:20:17.529811\r\n", " image/svg+xml\r\n", " \r\n", " \r\n", " Matplotlib v3.4.1, https://matplotlib.org/\r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", " \r\n", "\r\n" ], "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.treatment_mens.value_counts().plot(kind = 'bar', grid = 'True')" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "stratify_cols = pd.concat([dataset.treatment_mens, dataset.target], axis=1)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train shape: (44800, 8)\n", "Validation shape: (19200, 8)\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "stratify_cols = pd.concat([dataset.treatment_mens, dataset.target], axis=1)\n", "\n", "X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(\n", " dataset.data,\n", " dataset.treatment_mens,\n", " dataset.target,\n", " stratify=stratify_cols,\n", " test_size=0.3,\n", " random_state=42\n", ")\n", "\n", "print(f\"Train shape: {X_train.shape}\")\n", "print(f\"Validation shape: {X_val.shape}\")" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "from sklift.models import ClassTransformation\n", "from catboost import CatBoostClassifier\n", "\n", "estimator = CatBoostClassifier(verbose=100, \n", " cat_features=['womens', 'mens','channel', 'zip_code', 'history_segment', 'newbie'],\n", " random_state=42,\n", " thread_count=1\n", " )\n", "\n", "ct_model_mens = ClassTransformation(estimator=estimator)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: UserWarning: It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.\n", " ct_model_mens.fit(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Learning rate set to 0.052243\n", "0:\tlearn: 0.6888651\ttotal: 196ms\tremaining: 3m 16s\n", "100:\tlearn: 0.6507044\ttotal: 13.2s\tremaining: 1m 57s\n", "200:\tlearn: 0.6493217\ttotal: 26.4s\tremaining: 1m 44s\n", "300:\tlearn: 0.6466936\ttotal: 42.3s\tremaining: 1m 38s\n", "400:\tlearn: 0.6437030\ttotal: 58.1s\tremaining: 1m 26s\n", "500:\tlearn: 0.6413201\ttotal: 1m 14s\tremaining: 1m 14s\n", "600:\tlearn: 0.6390236\ttotal: 1m 30s\tremaining: 1m\n", "700:\tlearn: 0.6366626\ttotal: 1m 47s\tremaining: 45.7s\n", "800:\tlearn: 0.6342987\ttotal: 2m 3s\tremaining: 30.8s\n", "900:\tlearn: 0.6322623\ttotal: 2m 21s\tremaining: 15.5s\n", "999:\tlearn: 0.6303012\ttotal: 2m 37s\tremaining: 0us\n" ] }, { "data": { "text/plain": [ "ClassTransformation(estimator=)" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ct_model_mens.fit(\n", " X=X_train, \n", " y=y_train, \n", " treatment=trmnt_train\n", ")" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "uplift_predictions_mens = ct_model_mens.predict(X_val)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "uplift@10%: 0.0654 (sort groups by uplift_mens together)\n", "uplift@10%: 0.0635 (sort groups by uplift_mens separately)\n" ] } ], "source": [ "# k = 10%\n", "k = 0.1 \n", "\n", "# strategy='overall' sort by uplift treatment and control together\n", "uplift_overall_mens = uplift_at_k(y_val, uplift_predictions_mens, trmnt_val, strategy='overall', k=k)\n", "\n", "# strategy='by_group' sort by uplift treatment and control separately\n", "uplift_bygroup_mens = uplift_at_k(y_val, uplift_predictions_mens, trmnt_val, strategy='by_group', k=k)\n", "\n", "\n", "print(f\"uplift@{k * 100:.0f}%: {uplift_overall_mens:.4f} (sort groups by uplift_mens together)\")\n", "print(f\"uplift@{k * 100:.0f}%: {uplift_bygroup_mens:.4f} (sort groups by uplift_mens separately)\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 4 }