{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## collections_email" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "np.random.seed(24)\n", "n = 5000\n", "email = np.random.binomial(1, 0.5, n)\n", "\n", "credit_limit = np.random.gamma(6, 200, n)\n", "risk_score = np.random.beta(credit_limit, credit_limit.mean(), n)\n", "\n", "opened = np.random.normal(5 + 0.001*credit_limit - 4*risk_score, 2)\n", "opened = (opened > 4).astype(float) * email\n", "\n", "\n", "agreement = np.random.normal(30 +(-0.003*credit_limit - 10*risk_score), 7) * 2 * opened\n", "agreement = (agreement > 40).astype(float)\n", "\n", "payments = (np.random.normal(500 + 0.16*credit_limit - 40*risk_score + 11*agreement + email, 75).astype(int) // 10) * 10\n", "\n", "data = pd.DataFrame(dict(payments=payments,\n", " email=email,\n", " opened=opened,\n", " agreement=agreement,\n", " credit_limit=credit_limit,\n", " risk_score=risk_score))\n", "\n", "data.to_csv(\"collections_email.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## hospital_treatment" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "np.random.seed(24)\n", "n = 80\n", "\n", "hospital = np.random.binomial(1, 0.5, n)\n", "\n", "treatment = np.where(hospital.astype(bool),\n", " np.random.binomial(1, 0.9, n),\n", " np.random.binomial(1, 0.1, n))\n", "\n", "severity = np.where(hospital.astype(bool), \n", " np.random.normal(20, 5, n),\n", " np.random.normal(10, 5, n))\n", "\n", "days = np.random.normal(15 + -5*treatment + 2*severity, 7).astype(int)\n", "\n", "hospital = pd.DataFrame(dict(hospital=hospital,\n", " treatment=treatment,\n", " severity=severity,\n", " days=days))\n", "\n", "hospital.to_csv(\"hospital_treatment.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## app engagement push" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "np.random.seed(24)\n", "n = 10000\n", "\n", "push_assigned = np.random.binomial(1, 0.5, n)\n", "\n", "income = np.random.gamma(6, 200, n)\n", "\n", "push_delivered = np.random.normal(5 + 0.3+income, 500)\n", "push_delivered = ((push_delivered > 800) & (push_assigned == 1)).astype(int)\n", "\n", "in_app_purchase = (np.random.normal(100 + 20*push_delivered + 0.5*income, 75).astype(int) // 10)\n", "\n", "data = pd.DataFrame(dict(in_app_purchase=in_app_purchase,\n", " push_assigned=push_assigned,\n", " push_delivered=push_delivered))\n", "\n", "data.to_csv(\"app_engagement_push.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drug Impact" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "def make_confounded_data(N):\n", "\n", " def get_severity(df):\n", " return ((np.random.beta(1, 3, size=df.shape[0]) * (df[\"age\"] < 30)) +\n", " (np.random.beta(3, 1.5, size=df.shape[0]) * (df[\"age\"] >= 30)))\n", "\n", " def get_treatment(df):\n", " return ((.33 * df[\"sex\"] +\n", " 1.5 * df[\"severity\"] + df[\"severity\"] ** 2 +\n", " 0.15 * np.random.normal(size=df.shape[0])) > 1.5).astype(int)\n", "\n", " def get_recovery(df):\n", " return ((2 +\n", " 0.5 * df[\"sex\"] +\n", " 0.03 * df[\"age\"] + 0.03 * ((df[\"age\"] * 0.1) ** 2) +\n", " df[\"severity\"] + np.log(df[\"severity\"]) +\n", " df[\"sex\"] * df[\"severity\"] -\n", " df[\"medication\"]) * 10).astype(int)\n", "\n", " np.random.seed(1111)\n", " sexes = np.random.randint(0, 2, size=N)\n", " ages = np.random.gamma(8, scale=4, size=N)\n", " meds = np.random.beta(1, 1, size=N)\n", "\n", " # dados com designação aleatória\n", " df_rnd = pd.DataFrame(dict(sex=sexes, age=ages, medication=meds))\n", " df_rnd['severity'] = get_severity(df_rnd)\n", " df_rnd['recovery'] = get_recovery(df_rnd)\n", "\n", " features = ['sex', 'age', 'severity', 'medication', 'recovery']\n", " df_rnd = df_rnd[features] # to enforce column order\n", "\n", " # dados observacionais\n", " df_obs = df_rnd.copy()\n", " df_obs['medication'] = get_treatment(df_obs)\n", " df_obs['recovery'] = get_recovery(df_obs)\n", "\n", " # dados contrafactuais data\n", " df_ctf = df_obs.copy()\n", " df_ctf['medication'] = ((df_ctf['medication'] == 1) ^ 1).astype(float)\n", " df_ctf['recovery'] = get_recovery(df_ctf)\n", "\n", " return df_rnd, df_obs, df_ctf\n", "\n", "np.random.seed(1234)\n", "df_rnd, df_obs, df_ctf = make_confounded_data(20000)\n", "\n", "df_obs.to_csv(\"medicine_impact_recovery.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bilboard Mkt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "np.random.seed(123)\n", "POAMay = np.random.gamma(7,10, 500) * np.random.binomial(1, .7, 500)\n", "POAJul = np.random.gamma(7,15, 800) * np.random.binomial(1, .8, 800)\n", "FLMay = np.random.gamma(10,20, 1300) * np.random.binomial(1, .85, 1300)\n", "FLJul = np.random.gamma(11,21, 2000) * np.random.binomial(1, .9, 2000)\n", "\n", "data = pd.concat([\n", " pd.DataFrame(dict(deposits = POAMay.astype(int), poa=1, jul=0)),\n", " pd.DataFrame(dict(deposits = POAJul.astype(int), poa=1, jul=1)),\n", " pd.DataFrame(dict(deposits = FLMay.astype(int), poa=0, jul=0)),\n", " pd.DataFrame(dict(deposits = FLJul.astype(int), poa=0, jul=1))\n", "])\n", "data.to_csv(\"billboard_impact.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }