{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "tinder_experiment_notebook.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "machine_shape": "hm",
      "background_execution": "on"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Tinder experiment analysis\n",
        "\n",
        "---\n",
        "\n",
        "## Part 1: difference between Men and Women\n",
        "\n",
        "* the treatment effect metric employed is the percentage decrease in conversion rate from the Princeton profile to the Rutgers profile, \"the percentage decrease metric\"\n",
        "* bootstrap resampling employed to generate a list of the percentage decrease metrics for females matching male profiles, \"df_male_full\", and for males matching female profiles, \"df_female_full\"\n",
        "* tests, null is difference is explained by chance, alternative is percentage decrease is higher for females matching males than for males matching females (females care more about academic prestige on Tinder than males)\n",
        "  * one-sided t test \n",
        "  * one-sided permutation test\n",
        "  * cohen's d effect size"
      ],
      "metadata": {
        "id": "rctirBEjjMGB"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install --upgrade scipy"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1eGcdgCYjMNE",
        "outputId": "83d289ea-4570-49cd-8127-ee69c11272ea"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (1.4.1)\n",
            "Collecting scipy\n",
            "  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)\n",
            "\u001b[K     |████████████████████████████████| 38.1 MB 1.3 MB/s \n",
            "\u001b[?25hRequirement already satisfied: numpy<1.23.0,>=1.16.5 in /usr/local/lib/python3.7/dist-packages (from scipy) (1.21.6)\n",
            "Installing collected packages: scipy\n",
            "  Attempting uninstall: scipy\n",
            "    Found existing installation: scipy 1.4.1\n",
            "    Uninstalling scipy-1.4.1:\n",
            "      Successfully uninstalled scipy-1.4.1\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n",
            "Successfully installed scipy-1.7.3\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "from random import choices, shuffle\n",
        "import pandas as pd\n",
        "from statistics import mean, stdev\n",
        "from math import sqrt\n",
        "from scipy import stats\n",
        "import time\n"
      ],
      "metadata": {
        "id": "pO_fm1I-QlHg"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def bootstrap(ground_truth, predictions, metric, B, confidence_level=0.95):\n",
        "    \"\"\"\n",
        "    helper function for providing bootstrap resampling\n",
        "    \n",
        "        ground_truth / predictions: ground truthed labels / model predictions\n",
        "        metric: metric to generate confidence interval for\n",
        "        B: number of iterations\n",
        "        confidence_level: percentage confidence interval desired (default is 2 sigma)\n",
        "    \"\"\"\n",
        "    \n",
        "    # compute lower and upper significance index\n",
        "    critical_value=(1-confidence_level)/2\n",
        "    lower_sig=100*critical_value\n",
        "    upper_sig=100*(1-critical_value)\n",
        "    data=[]\n",
        "    for g, p in zip(ground_truth, predictions):\n",
        "        data.append([g,p])\n",
        "\n",
        "    accuracies=[]\n",
        "    # bootstrap resampling loop\n",
        "    for b in range(B):\n",
        "        choice=choices(data, k=len(data))\n",
        "        choice=np.array(choice)\n",
        "        accuracy=metric(choice[:,0], choice[:,1])\n",
        "        \n",
        "        accuracies.append(accuracy)\n",
        "    \n",
        "    #percentiles=np.percentile(accuracies, [lower_sig, 50, upper_sig])\n",
        "    \n",
        "    #lower=percentiles[0]\n",
        "    #median=percentiles[1]\n",
        "    #upper=percentiles[2]\n",
        "    \n",
        "    return accuracies"
      ],
      "metadata": {
        "id": "D0D_533LQlag"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def percentage_decrease_metric(princeton, rutgers):\n",
        "  # grab outcomes for each group\n",
        "\n",
        "  printeton_outcomes = princeton\n",
        "  rutgers_outcomes = rutgers\n",
        "\n",
        "  # grab conversion rate for each group\n",
        "  princeton_matches = printeton_outcomes[printeton_outcomes==1] \n",
        "  princeton_conversion_rate = len(princeton_matches)/len(printeton_outcomes)\n",
        "\n",
        "  rutgers_matches = rutgers_outcomes[rutgers_outcomes==1] \n",
        "  rutgers_conversion_rate = len(rutgers_matches)/len(rutgers_outcomes)\n",
        "\n",
        "  # grab percentage increase from rutgers to princeton\n",
        "  percentage_decrease = ((princeton_conversion_rate - rutgers_conversion_rate)/(princeton_conversion_rate)) * 100\n",
        "  return percentage_decrease"
      ],
      "metadata": {
        "id": "re_J63Z8QmET"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# read the data\n",
        "\n",
        "df_female_full = pd.read_csv('https://raw.githubusercontent.com/daniel-furman/online-dating-field-experiment/main/data/processed_data/df_female_full.csv', index_col='Unnamed: 0')\n",
        "# shuffle the data\n",
        "df_female_full = df_female_full.sample(frac=1)\n",
        "print(df_female_full.head())\n",
        "\n",
        "df_male_full = pd.read_csv('https://raw.githubusercontent.com/daniel-furman/online-dating-field-experiment/main/data/processed_data/df_male_full_2.csv', index_col='Unnamed: 0')\n",
        "# shuffle the data\n",
        "df_male_full = df_male_full.sample(frac=1)\n",
        "print('\\n', df_male_full.head())\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RPlwk5w-RKAs",
        "outputId": "750bfd42-0901-47e3-e312-f6f68c45e764"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "         Name  Age  School  Work  Match  Treatment\n",
            "135     Tanay   23       1     0      1          1\n",
            "103  Jonathan   23       1     1      1          1\n",
            "43       John   22       0     1      0          0\n",
            "186      Dave   22       0     0      0          1\n",
            "190     Jesus   28       0     0      0          1\n",
            "\n",
            "        Name  Age  School  Work  Match  Treatment\n",
            "1       Ash   24       1     1      1          0\n",
            "27     Pope   25       0     0      0          0\n",
            "183  Ariana   22       0     0      0          1\n",
            "38   Nguyen   24       0     1      0          0\n",
            "91   Isabel   24       0     0      0          0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "female_list = bootstrap(df_female_full[df_female_full['Treatment']==1]['Match'],\n",
        "                   df_female_full[df_female_full['Treatment']==0]['Match'],\n",
        "                   percentage_decrease_metric,\n",
        "                   100,)\n",
        "np.mean(female_list)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "s63QbevMZlzv",
        "outputId": "eb81c293-acb0-4c3f-8612-e2b12f9d96a3"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "44.31304381474861"
            ]
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "male_list = bootstrap(df_male_full[df_male_full['Treatment']==1]['Match'],\n",
        "                           df_male_full[df_male_full['Treatment']==0]['Match'],\n",
        "                           percentage_decrease_metric,\n",
        "                           100,)\n",
        "np.mean(male_list)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "m83zox9Fav3V",
        "outputId": "08c72bbc-0885-49f8-f335-bd6990f5dc50"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "64.58484673287334"
            ]
          },
          "metadata": {},
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "# t testing\n",
        "# Null is that the mean percentage decrease from princeton to rutgers is the same between the two groups.\n",
        "# Alternative is that the mean percentage decrease from princeton to rutgers is larger for females matching men.\n",
        "\n",
        "p_val_list = []\n",
        "for i in range(0,1000):\n",
        "    male_bootstrap = pd.Series(male_list).sample(frac=1, replace=True).to_list()\n",
        "    female_bootstrap = pd.Series(female_list).sample(frac=1, replace=True).to_list()\n",
        "    p_val = stats.ttest_ind(male_bootstrap, female_bootstrap, equal_var=False, alternative='greater')[1]\n",
        "    p_val_list.append(p_val)\n",
        "\n",
        "p_val_list.sort()\n",
        "lower = p_val_list[25]\n",
        "median = p_val_list[500]\n",
        "upper = p_val_list[975]\n",
        "\n",
        "print(f\"\\nP_val for Welch's T-test: {median}, with a 95% confidence interval of [{lower},{upper}]\\n\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9o5HSw1qXdJX",
        "outputId": "a1971a6a-6992-4e32-b5d2-9225cc23e3a0"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "P_val for Welch's T-test: 3.6318763884074564e-21, with a 95% confidence interval of [7.109263185259275e-32,6.218972793583267e-14]\n",
            "\n",
            "CPU times: user 897 ms, sys: 0 ns, total: 897 ms\n",
            "Wall time: 896 ms\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# practical significance testing\n",
        "def cohens_d(list1, list2): # with correction for small sample\n",
        "    return (mean(list1) - mean(list2)) / (sqrt((stdev(list1) ** 2 + stdev(list2) ** 2) / 2)) * ((len(list1)- 3)/ (len(list1)- 2.25)) * sqrt(((len(list1)- 2)/len(list1)))\n",
        "\n",
        "print(\"Effect size, Cohens D (number of strandard deviations between distributions): \", cohens_d(male_list, female_list))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MnLEX-6XXdMM",
        "outputId": "10557c1d-a1ca-4b6b-f473-6c6a93519d80"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Effect size, Cohens D (number of strandard deviations between distributions):  1.5101440084277775\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "# Non-parametric testing (permutation testing) on median\n",
        "# Null is that the median percentage decrease from princeton to rutgers is explained by chance between the two groups.\n",
        "# Alternative is that the median percentage decrease from princeton to rutgers is larger for females matching men.\n",
        "\n",
        "\n",
        "# Testing on median\n",
        "p_val_list = []\n",
        "permutation_iters = 3000\n",
        "ground_truth = np.median(male_list) - np.median(female_list)\n",
        "# pool variables into one distribution, sample two distributions equal in size to the original \n",
        "pooled = list(male_list+female_list)\n",
        "for i in range(0,1000):\n",
        "    permuted_differences = []\n",
        "    for i in range(0,permutation_iters):    \n",
        "        shuffle(pooled)\n",
        "        permuted_differences.append(np.median(pooled[0:int(len(pooled)/2)]) - np.median(pooled[int(len(pooled)/2):]))\n",
        "    p_val = len(np.where(permuted_differences>=ground_truth)[0])/permutation_iters\n",
        "    p_val_list.append(p_val)\n",
        "p_val_list.sort()\n",
        "lower = p_val_list[25]\n",
        "median = p_val_list[500]\n",
        "upper = p_val_list[975]\n",
        "print(f'\\nP_val for One-Tailed Permutation Test on Median: {median}, with a 95% confidence interval of [{lower},{upper}]\\n')"
      ],
      "metadata": {
        "id": "Wht-rdTfXUs2",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "bc4d68c3-ef8e-4c5a-f0aa-d1349f999f0d"
      },
      "execution_count": null,
      "outputs": [
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "P_val for One-Tailed Permutation Test on Median: 0.0, with a 95% confidence interval of [0.0,0.0]\n",
            "\n",
            "CPU times: user 11min 22s, sys: 1.69 s, total: 11min 23s\n",
            "Wall time: 11min 22s\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Part 2: ATE for control versus treatment inter-sex\n",
        "---\n",
        "\n",
        "* pools the females and the males together and calculates the ATE of having Princeton versus Rutgers overall"
      ],
      "metadata": {
        "id": "fsqDDS_oAi2_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "full_df = pd.concat([df_male_full, df_female_full])\n",
        "full_df.sample(4)"
      ],
      "metadata": {
        "id": "4qIa6bGrBr0L",
        "outputId": "69c044f0-2ec4-4743-c4cf-4a54cd05b55b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 175
        }
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "       Name  Age  School  Work  Match  Treatment\n",
              "122  Edmond   25       0     0      1          1\n",
              "70    Naomi   22       0     1      0          0\n",
              "110     Jay   25       1     1      1          1\n",
              "74   Meylia   23       0     0      0          0"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-c2d931bd-beff-4916-8e7a-48ec9b29a549\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Name</th>\n",
              "      <th>Age</th>\n",
              "      <th>School</th>\n",
              "      <th>Work</th>\n",
              "      <th>Match</th>\n",
              "      <th>Treatment</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>122</th>\n",
              "      <td>Edmond</td>\n",
              "      <td>25</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>70</th>\n",
              "      <td>Naomi</td>\n",
              "      <td>22</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>110</th>\n",
              "      <td>Jay</td>\n",
              "      <td>25</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>74</th>\n",
              "      <td>Meylia</td>\n",
              "      <td>23</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c2d931bd-beff-4916-8e7a-48ec9b29a549')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-c2d931bd-beff-4916-8e7a-48ec9b29a549 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-c2d931bd-beff-4916-8e7a-48ec9b29a549');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import statsmodels.api as sm\n",
        "ytrain = full_df['Match']\n",
        "Xtrain = full_df[['Age', 'School', 'Work', 'Treatment']]\n",
        "log_reg = sm.Logit(ytrain, Xtrain).fit()\n",
        "log_reg.summary()"
      ],
      "metadata": {
        "id": "mpn3hcR6AqqY",
        "outputId": "df124b0d-ebe1-4072-b035-d06568edb3b4",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 404
        }
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
            "  import pandas.util.testing as tm\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Optimization terminated successfully.\n",
            "         Current function value: 0.550270\n",
            "         Iterations 6\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<class 'statsmodels.iolib.summary.Summary'>\n",
              "\"\"\"\n",
              "                           Logit Regression Results                           \n",
              "==============================================================================\n",
              "Dep. Variable:                  Match   No. Observations:                  413\n",
              "Model:                          Logit   Df Residuals:                      409\n",
              "Method:                           MLE   Df Model:                            3\n",
              "Date:                Mon, 02 May 2022   Pseudo R-squ.:                 0.06601\n",
              "Time:                        02:28:58   Log-Likelihood:                -227.26\n",
              "converged:                       True   LL-Null:                       -243.32\n",
              "Covariance Type:            nonrobust   LLR p-value:                 4.923e-07\n",
              "==============================================================================\n",
              "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
              "------------------------------------------------------------------------------\n",
              "Age           -0.0846      0.010     -8.520      0.000      -0.104      -0.065\n",
              "School         0.8424      0.238      3.545      0.000       0.377       1.308\n",
              "Work           0.6301      0.242      2.603      0.009       0.156       1.105\n",
              "Treatment      0.7861      0.232      3.395      0.001       0.332       1.240\n",
              "==============================================================================\n",
              "\"\"\""
            ],
            "text/html": [
              "<table class=\"simpletable\">\n",
              "<caption>Logit Regression Results</caption>\n",
              "<tr>\n",
              "  <th>Dep. Variable:</th>         <td>Match</td>      <th>  No. Observations:  </th>  <td>   413</td>  \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Model:</th>                 <td>Logit</td>      <th>  Df Residuals:      </th>  <td>   409</td>  \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Method:</th>                 <td>MLE</td>       <th>  Df Model:          </th>  <td>     3</td>  \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Date:</th>            <td>Mon, 02 May 2022</td> <th>  Pseudo R-squ.:     </th>  <td>0.06601</td> \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Time:</th>                <td>02:28:58</td>     <th>  Log-Likelihood:    </th> <td> -227.26</td> \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>converged:</th>             <td>True</td>       <th>  LL-Null:           </th> <td> -243.32</td> \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Covariance Type:</th>     <td>nonrobust</td>    <th>  LLR p-value:       </th> <td>4.923e-07</td>\n",
              "</tr>\n",
              "</table>\n",
              "<table class=\"simpletable\">\n",
              "<tr>\n",
              "      <td></td>         <th>coef</th>     <th>std err</th>      <th>z</th>      <th>P>|z|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Age</th>       <td>   -0.0846</td> <td>    0.010</td> <td>   -8.520</td> <td> 0.000</td> <td>   -0.104</td> <td>   -0.065</td>\n",
              "</tr>\n",
              "<tr>\n",
              "  <th>School</th>    <td>    0.8424</td> <td>    0.238</td> <td>    3.545</td> <td> 0.000</td> <td>    0.377</td> <td>    1.308</td>\n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Work</th>      <td>    0.6301</td> <td>    0.242</td> <td>    2.603</td> <td> 0.009</td> <td>    0.156</td> <td>    1.105</td>\n",
              "</tr>\n",
              "<tr>\n",
              "  <th>Treatment</th> <td>    0.7861</td> <td>    0.232</td> <td>    3.395</td> <td> 0.001</td> <td>    0.332</td> <td>    1.240</td>\n",
              "</tr>\n",
              "</table>"
            ]
          },
          "metadata": {},
          "execution_count": 11
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "* ATE is 0.7861 with a standard error of 0.232 and p-value of 0.001"
      ],
      "metadata": {
        "id": "xjPZ4hn1CyZi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#full_df['Match'][full_df['Treatment']==1]\n",
        "#full_df['Match'][full_df['Treatment']==0]\n"
      ],
      "metadata": {
        "id": "Rtl93Z5DmYeS"
      },
      "execution_count": 25,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "# t testing\n",
        "# Null is that the match rate for princeton and rutgers is the same.\n",
        "# Alternative is that the match rate is better for princeton then rutgers.\n",
        "\n",
        "p_val_list = []\n",
        "for i in range(0,1000):\n",
        "    princeton_bootstrap = pd.Series(male_list).sample(frac=1, replace=True).to_list()\n",
        "    rutgers_bootstrap = pd.Series(female_list).sample(frac=1, replace=True).to_list()\n",
        "    p_val = stats.ttest_ind(princeton_bootstrap, rutgers_bootstrap, equal_var=False, alternative='greater')[1]\n",
        "    p_val_list.append(p_val)\n",
        "\n",
        "p_val_list.sort()\n",
        "lower = p_val_list[25]\n",
        "median = p_val_list[500]\n",
        "upper = p_val_list[975]\n",
        "\n",
        "print(f\"\\nP_val for Welch's T-test: {median}, with a 95% confidence interval of [{lower},{upper}]\\n\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "f0656ff2-1e72-4f39-ed2c-792fca8b1306",
        "id": "ZR7ox247m14E"
      },
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "P_val for Welch's T-test: 4.318082381377927e-21, with a 95% confidence interval of [9.314061265384596e-31,1.3447208458393505e-13]\n",
            "\n",
            "CPU times: user 870 ms, sys: 15.5 ms, total: 885 ms\n",
            "Wall time: 865 ms\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# practical significance testing\n",
        "def cohens_d(list1, list2): # with correction for small sample\n",
        "    return (mean(list1) - mean(list2)) / (sqrt((stdev(list1) ** 2 + stdev(list2) ** 2) / 2)) * ((len(list1)- 3)/ (len(list1)- 2.25)) * sqrt(((len(list1)- 2)/len(list1)))\n",
        "\n",
        "print(\"Effect size, Cohens D (number of standard deviations between distributions): \", cohens_d(list(full_df['Match'][full_df['Treatment']==1]), list(full_df['Match'][full_df['Treatment']==0])))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a12d34a3-eb09-4cc1-90d5-ec5cec755000",
        "id": "VePyyMc3m14Q"
      },
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Effect size, Cohens D (number of strandard deviations between distributions):  0.41177368591696484\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%time\n",
        "# Non-parametric testing (permutation testing) on mean\n",
        "\n",
        "# Testing on mean\n",
        "p_val_list = []\n",
        "permutation_iters = 1000\n",
        "male_list = list(full_df['Match'][full_df['Treatment']==1])\n",
        "female_list = list(full_df['Match'][full_df['Treatment']==0])\n",
        "\n",
        "ground_truth = np.mean(male_list) - np.mean(female_list)\n",
        "# pool variables into one distribution, sample two distributions equal in size to the original \n",
        "pooled = list(male_list+female_list)\n",
        "for i in range(0,1000):\n",
        "    permuted_differences = []\n",
        "    for i in range(0,permutation_iters):    \n",
        "        shuffle(pooled)\n",
        "        permuted_differences.append(np.mean(pooled[0:int(len(pooled)/2)]) - np.mean(pooled[int(len(pooled)/2):]))\n",
        "    p_val = len(np.where(permuted_differences>=ground_truth)[0])/permutation_iters\n",
        "    p_val_list.append(p_val)\n",
        "p_val_list.sort()\n",
        "lower = p_val_list[25]\n",
        "median = p_val_list[500]\n",
        "upper = p_val_list[975]\n",
        "print(f'\\nP_val for One-Tailed Permutation Test on Median: {median}, with a 95% confidence interval of [{lower},{upper}]\\n')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "0e957dae-9585-4725-9aa8-3af755532333",
        "id": "nSJ5W0T8m14Q"
      },
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "P_val for One-Tailed Permutation Test on Median: 0.0, with a 95% confidence interval of [0.0,0.0]\n",
            "\n",
            "CPU times: user 5min 37s, sys: 725 ms, total: 5min 38s\n",
            "Wall time: 5min 37s\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        ""
      ],
      "metadata": {
        "id": "hZhmw93PoXkp"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}