{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## collections_email"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(24)\n",
    "n = 5000\n",
    "email = np.random.binomial(1, 0.5, n)\n",
    "\n",
    "credit_limit = np.random.gamma(6, 200, n)\n",
    "risk_score = np.random.beta(credit_limit, credit_limit.mean(), n)\n",
    "\n",
    "opened = np.random.normal(5 + 0.001*credit_limit - 4*risk_score, 2)\n",
    "opened = (opened > 4).astype(float) * email\n",
    "\n",
    "\n",
    "agreement = np.random.normal(30 +(-0.003*credit_limit - 10*risk_score), 7) * 2 * opened\n",
    "agreement = (agreement > 40).astype(float)\n",
    "\n",
    "payments = (np.random.normal(500 + 0.16*credit_limit - 40*risk_score + 11*agreement + email, 75).astype(int) // 10) * 10\n",
    "\n",
    "data = pd.DataFrame(dict(payments=payments,\n",
    "                         email=email,\n",
    "                         opened=opened,\n",
    "                         agreement=agreement,\n",
    "                         credit_limit=credit_limit,\n",
    "                         risk_score=risk_score))\n",
    "\n",
    "data.to_csv(\"collections_email.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## hospital_treatment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(24)\n",
    "n = 80\n",
    "\n",
    "hospital = np.random.binomial(1, 0.5, n)\n",
    "\n",
    "treatment = np.where(hospital.astype(bool),\n",
    "                     np.random.binomial(1, 0.9, n),\n",
    "                     np.random.binomial(1, 0.1, n))\n",
    "\n",
    "severity = np.where(hospital.astype(bool), \n",
    "                    np.random.normal(20, 5, n),\n",
    "                    np.random.normal(10, 5, n))\n",
    "\n",
    "days = np.random.normal(15 + -5*treatment + 2*severity, 7).astype(int)\n",
    "\n",
    "hospital = pd.DataFrame(dict(hospital=hospital,\n",
    "                             treatment=treatment,\n",
    "                             severity=severity,\n",
    "                             days=days))\n",
    "\n",
    "hospital.to_csv(\"hospital_treatment.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## app engagement push"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(24)\n",
    "n = 10000\n",
    "\n",
    "push_assigned = np.random.binomial(1, 0.5, n)\n",
    "\n",
    "income = np.random.gamma(6, 200, n)\n",
    "\n",
    "push_delivered = np.random.normal(5 + 0.3+income, 500)\n",
    "push_delivered = ((push_delivered > 800) & (push_assigned == 1)).astype(int)\n",
    "\n",
    "in_app_purchase = (np.random.normal(100 + 20*push_delivered + 0.5*income, 75).astype(int) // 10)\n",
    "\n",
    "data = pd.DataFrame(dict(in_app_purchase=in_app_purchase,\n",
    "                         push_assigned=push_assigned,\n",
    "                         push_delivered=push_delivered))\n",
    "\n",
    "data.to_csv(\"app_engagement_push.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Drug Impact"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "def make_confounded_data(N):\n",
    "\n",
    "    def get_severity(df):\n",
    "        return ((np.random.beta(1, 3, size=df.shape[0]) * (df[\"age\"] < 30)) +\n",
    "                (np.random.beta(3, 1.5, size=df.shape[0]) * (df[\"age\"] >= 30)))\n",
    "\n",
    "    def get_treatment(df):\n",
    "        return ((.33 * df[\"sex\"] +\n",
    "                1.5 * df[\"severity\"] +  df[\"severity\"] ** 2 +\n",
    "                0.15 * np.random.normal(size=df.shape[0])) > 1.5).astype(int)\n",
    "\n",
    "    def get_recovery(df):\n",
    "        return ((2 +\n",
    "                0.5 * df[\"sex\"] +\n",
    "                0.03 * df[\"age\"] + 0.03 * ((df[\"age\"] * 0.1) ** 2) +\n",
    "                df[\"severity\"] + np.log(df[\"severity\"]) +\n",
    "                df[\"sex\"] * df[\"severity\"] -\n",
    "                df[\"medication\"]) * 10).astype(int)\n",
    "\n",
    "    np.random.seed(1111)\n",
    "    sexes = np.random.randint(0, 2, size=N)\n",
    "    ages = np.random.gamma(8, scale=4, size=N)\n",
    "    meds = np.random.beta(1, 1, size=N)\n",
    "\n",
    "    # dados com designação aleatória\n",
    "    df_rnd = pd.DataFrame(dict(sex=sexes, age=ages, medication=meds))\n",
    "    df_rnd['severity'] = get_severity(df_rnd)\n",
    "    df_rnd['recovery'] = get_recovery(df_rnd)\n",
    "\n",
    "    features = ['sex', 'age', 'severity', 'medication', 'recovery']\n",
    "    df_rnd = df_rnd[features]  # to enforce column order\n",
    "\n",
    "    # dados observacionais\n",
    "    df_obs = df_rnd.copy()\n",
    "    df_obs['medication'] = get_treatment(df_obs)\n",
    "    df_obs['recovery'] = get_recovery(df_obs)\n",
    "\n",
    "    # dados contrafactuais data\n",
    "    df_ctf = df_obs.copy()\n",
    "    df_ctf['medication'] = ((df_ctf['medication'] == 1) ^ 1).astype(float)\n",
    "    df_ctf['recovery'] = get_recovery(df_ctf)\n",
    "\n",
    "    return df_rnd, df_obs, df_ctf\n",
    "\n",
    "np.random.seed(1234)\n",
    "df_rnd, df_obs, df_ctf = make_confounded_data(20000)\n",
    "\n",
    "df_obs.to_csv(\"medicine_impact_recovery.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bilboard Mkt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.random.seed(123)\n",
    "POAMay = np.random.gamma(7,10, 500) * np.random.binomial(1, .7, 500)\n",
    "POAJul = np.random.gamma(7,15, 800) * np.random.binomial(1, .8, 800)\n",
    "FLMay = np.random.gamma(10,20, 1300) * np.random.binomial(1, .85, 1300)\n",
    "FLJul = np.random.gamma(11,21, 2000) * np.random.binomial(1, .9, 2000)\n",
    "\n",
    "data = pd.concat([\n",
    "    pd.DataFrame(dict(deposits = POAMay.astype(int), poa=1, jul=0)),\n",
    "    pd.DataFrame(dict(deposits = POAJul.astype(int), poa=1, jul=1)),\n",
    "    pd.DataFrame(dict(deposits = FLMay.astype(int), poa=0, jul=0)),\n",
    "    pd.DataFrame(dict(deposits = FLJul.astype(int), poa=0, jul=1))\n",
    "])\n",
    "data.to_csv(\"billboard_impact.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Customer Lifecycle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.3721\n",
      "[1.     1.     1.     1.     1.     1.     1.     0.9999 0.9994 0.9984\n",
      " 0.9966 0.994  0.9886 0.9791 0.9663 0.944  0.9128 0.8726 0.8205 0.7603\n",
      " 0.6932 0.6138 0.5295 0.4424 0.3618 0.2919 0.2308 0.1769 0.1286 0.0942]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "from toolz import merge\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "np.random.seed(12)\n",
    "\n",
    "n = 10000\n",
    "t = 30\n",
    "\n",
    "age = 18 + np.random.poisson(10, n)\n",
    "income = 500+np.random.exponential(2000, size=n).astype(int)\n",
    "region = np.random.choice(np.random.lognormal(4, size=50), size=n)\n",
    "\n",
    "freq = np.random.lognormal((1 + age/(18+10)).astype(int))\n",
    "churn = np.random.poisson((income-500)/2000 + 22, n)\n",
    "\n",
    "ones = np.ones((n, t))\n",
    "alive = (np.cumsum(ones, axis=1) <= churn.reshape(n, 1)).astype(int)\n",
    "buy = np.random.binomial(1, ((1/(freq+1)).reshape(n, 1) * ones))\n",
    "\n",
    "cacq = -1*abs(np.random.normal(region, 2, size=n).astype(int))\n",
    "transactions = np.random.lognormal(((income.mean() - 500) / 1000), size=(n, t)).astype(int) * buy * alive\n",
    "\n",
    "data = pd.DataFrame(merge({\"customer_id\": range(n), \"cacq\":cacq},\n",
    "                          {f\"day_{day}\": trans \n",
    "                           for day, trans in enumerate(transactions.T)}))\n",
    "\n",
    "encoced = {value:index for index, value in\n",
    "           enumerate(np.random.permutation(np.unique(region)))}\n",
    "\n",
    "customer_features = pd.DataFrame(dict(customer_id=range(n), \n",
    "                                      region=region,\n",
    "                                      income=income,\n",
    "                                      age=age)).replace({\"region\":encoced}).astype(int)\n",
    "\n",
    "print((data.drop(columns=[\"customer_id\"]).sum(axis=1) > 0).mean()) # proportion of profitable customers\n",
    "print((alive).mean(axis=0)) # alive customer per days\n",
    "\n",
    "data.to_csv(\"./causal-inference-for-the-brave-and-true/data/customer_transactions.csv\", index=False)\n",
    "customer_features.to_csv(\"./causal-inference-for-the-brave-and-true/data/customer_features.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Price and Sales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "np.random.seed(5)\n",
    "\n",
    "def price_elast(price, temp, weekday, cost):\n",
    "    return  -4 + 0.2*price + 0.05*temp + 2*np.isin(weekday, [1,7]) + 0.3 * cost\n",
    "\n",
    "def sales(price, temp, weekday, cost):\n",
    "    elast = -abs(price_elast(price, temp, weekday, cost))\n",
    "    output = np.random.normal(200 + 20*np.isin(weekday, [1,7]) + 1.3 * temp +\n",
    "                              5*elast * price, 5).astype(int)\n",
    "    \n",
    "    return output\n",
    "\n",
    "\n",
    "n_rnd = 5000\n",
    "\n",
    "temp = np.random.normal(24, 4, n_rnd).round(1)\n",
    "weekday = np.random.choice(list(range(1, 8)), n_rnd)\n",
    "cost = np.random.choice([0.3, 0.5, 1.0, 1.5], n_rnd)\n",
    "price_rnd = np.random.choice(list(range(3, 11)), n_rnd)\n",
    "\n",
    "price_df_rnd = pd.DataFrame(dict(temp=temp, weekday=weekday, cost=cost,\n",
    "                                 price=price_rnd, sales=sales(price_rnd, temp, weekday, cost)))\n",
    "\n",
    "n = 10000\n",
    "temp = np.random.normal(24, 4, n).round(1)\n",
    "weekday = np.random.choice(list(range(1, 8)), n)\n",
    "cost = np.random.choice([0.3, 0.5, 1.0, 1.5], n)\n",
    "price = np.random.normal(5 + cost + np.isin(weekday, [1,7])).round(1)\n",
    "\n",
    "price_df = pd.DataFrame(dict(temp=temp, weekday=weekday, cost=cost,\n",
    "                             price=price, sales=sales(price, temp, weekday, cost)))\n",
    "\n",
    "price_df_rnd.to_csv(\"./causal-inference-for-the-brave-and-true/data/ice_cream_sales_rnd.csv\", index=False)\n",
    "price_df.to_csv(\"./causal-inference-for-the-brave-and-true/data/ice_cream_sales.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Marketing Email"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "scaler = MinMaxScaler((0, 1))\n",
    "\n",
    "np.random.seed(12321)\n",
    "\n",
    "n_rnd=5000\n",
    "\n",
    "age = 18 + np.random.normal(24, 4, n_rnd).round(1)\n",
    "income = 500 + np.random.gamma(1, age * 100, n_rnd).round(2)\n",
    "insurance = np.random.gamma(30/age, age*1000, n_rnd).round(2)\n",
    "invested = np.random.gamma(age/10, income/2, n_rnd).round(2)\n",
    "\n",
    "em1_ps = income.min()/(income + 10)\n",
    "em2_ps = invested/(invested.max())\n",
    "em3_ps = np.where(age > 40, scaler.fit_transform(-income.reshape(-1,1)).ravel(), 0)\n",
    "\n",
    "em1 = np.random.binomial(1, em1_ps)\n",
    "em2 = np.random.binomial(1, em2_ps)\n",
    "em3 = np.random.binomial(1, em3_ps)\n",
    "\n",
    "elast_em1 = scaler.fit_transform((-3*age + 0.005*invested).reshape(-1,1)).ravel()\n",
    "elast_em2 = scaler.fit_transform((age + income*0.005).reshape(-1,1)).ravel()\n",
    "elast_em3 = scaler.fit_transform((-insurance).reshape(-1,1)).ravel()\n",
    "\n",
    "buy = scaler.fit_transform((1 + 0.4*age - invested/10000).reshape(-1,1)).ravel()\n",
    "buy += elast_em1*em1 + elast_em2*em2 + elast_em3*em3\n",
    "buy = scaler.fit_transform(buy.reshape(-1,1)).ravel()\n",
    "buy = np.random.binomial(1, buy).round(2)\n",
    "\n",
    "df = pd.DataFrame(dict(age=age, income=income, insurance=insurance, invested=invested,\n",
    "                       em1_ps=em1_ps, em2_ps=em2_ps, em3_ps=em3_ps,\n",
    "                       em1=em1, em2=em2, em3=em3,\n",
    "                       converted=buy))\n",
    "\n",
    "df.to_csv(\"./causal-inference-for-the-brave-and-true/data/invest_email.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "scaler = MinMaxScaler((0.001, 0.999))\n",
    "\n",
    "np.random.seed(12321)\n",
    "\n",
    "n_rnd=15000\n",
    "\n",
    "age = 18 + np.random.normal(24, 4, n_rnd).round(1)\n",
    "income = 500 + np.random.gamma(1, age * 100, n_rnd).round(2)\n",
    "insurance = np.random.gamma(30/age, age*1000, n_rnd).round(2)\n",
    "invested = np.random.gamma(age/10, income/2, n_rnd).round(2)\n",
    "\n",
    "em1 = np.random.binomial(1, 0.5, n_rnd)\n",
    "em2 = np.random.binomial(1, 0.2, n_rnd)\n",
    "em3 = np.random.binomial(1, 0.9, n_rnd)\n",
    "\n",
    "elast_em1 = scaler.fit_transform((-3*age + 0.005*invested).reshape(-1,1)).ravel()\n",
    "elast_em2 = scaler.fit_transform((age + income*0.005).reshape(-1,1)).ravel()\n",
    "elast_em3 = scaler.fit_transform((-insurance).reshape(-1,1)).ravel()\n",
    "\n",
    "buy = (200*elast_em1*em1 + 100*elast_em2*em2 + 10*elast_em3*em3 \n",
    "       + 1.5*age + 0.0005*invested - 0.0001*income)\n",
    "\n",
    "buy = scaler.fit_transform(buy.reshape(-1,1)).ravel()\n",
    "\n",
    "buy = np.random.binomial(1, buy)\n",
    "\n",
    "df = pd.DataFrame(dict(age=age, income=income, insurance=insurance, invested=invested,\n",
    "                       em1=em1, em2=em2, em3=em3,\n",
    "                       converted=buy))\n",
    "\n",
    "df.to_csv(\"./causal-inference-for-the-brave-and-true/data/invest_email_rnd.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "scaler = MinMaxScaler((0.001, 0.999))\n",
    "\n",
    "np.random.seed(12321)\n",
    "\n",
    "n_rnd=15000\n",
    "\n",
    "age = 18 + np.random.normal(24, 4, n_rnd).round(1)\n",
    "income = 500 + np.random.gamma(1, age * 100, n_rnd).round(2)\n",
    "insurance = np.random.gamma(30/age, age*1000, n_rnd).round(2)\n",
    "invested = np.random.gamma(age/10, income/2, n_rnd).round(2)\n",
    "\n",
    "em1_ps = income.min()/(income + 10)\n",
    "em2_ps = invested/(invested.max())\n",
    "em3_ps = np.where(age > 40, scaler.fit_transform(-income.reshape(-1,1)).ravel(), 0)\n",
    "\n",
    "\n",
    "em1 = np.random.binomial(1, em1_ps)\n",
    "em2 = np.random.binomial(1, em2_ps)\n",
    "em3 = np.random.binomial(1, em3_ps)\n",
    "\n",
    "elast_em1 = scaler.fit_transform((-3*age + 0.005*invested).reshape(-1,1)).ravel()\n",
    "elast_em2 = scaler.fit_transform((age + income*0.005).reshape(-1,1)).ravel()\n",
    "elast_em3 = scaler.fit_transform((-insurance).reshape(-1,1)).ravel()\n",
    "\n",
    "buy = (200*elast_em1*em1 + 100*elast_em2*em2 + 10*elast_em3*em3 \n",
    "       + 1.5*age + 0.0005*invested - 0.0001*income)\n",
    "\n",
    "buy = scaler.fit_transform(buy.reshape(-1,1)).ravel()\n",
    "\n",
    "buy = np.random.binomial(1, buy)\n",
    "\n",
    "df = pd.DataFrame(dict(age=age, income=income, insurance=insurance, invested=invested,\n",
    "                       em1=em1, em2=em2, em3=em3,\n",
    "                       converted=buy))\n",
    "\n",
    "df.to_csv(\"./causal-inference-for-the-brave-and-true/data/invest_email_biased.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "causal-glory",
   "language": "python",
   "name": "causal-glory"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}