{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5-final" }, "orig_nbformat": 2, "kernelspec": { "name": "Python 3.8.5 64-bit ('streamlit': conda)", "display_name": "Python 3.8.5 64-bit ('streamlit': conda)", "metadata": { "interpreter": { "hash": "d5b47a323bfc67f94a37e7cb8fd52db1ad06cc55d227a5b8c375f2f5af6e26dc" } } } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "source": [ "# p-values with Python from scratch" ], "cell_type": "markdown", "metadata": {} }, { "source": [ "Calculating p-values with python.\n", "\n" ], "cell_type": "markdown", "metadata": {} }, { "source": [ "## Libraries and helper functions" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from typing import Tuple\n", "import math as m" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def normal_probability_below(x: float, mu: float = 0, sigma: float = 1) -> float:\n", " return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) / 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def normal_probability_above(lo: float, mu: float = 0, sigma: float = 1) -> float:\n", " return 1 - normal_probability_below(lo, mu, sigma)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def normal_approximation_to_binomial(n: int, p: float) -> Tuple[float, float]:\n", " mu = p * n\n", " sigma = m.sqrt(p * (1 - p) * n)\n", "\n", " return mu, sigma" ] }, { "source": [ "## Two-sided p-values" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def two_sided_p_value(x: float, mu: float = 0, sigma: float = 1) -> float:\n", " \"\"\"Return the probability of getting at least as extreme value as `x`, given\n", " that our values are from a normal distribution with `mu` mean and `sigma` std.\n", " \"\"\"\n", "\n", " # If x is greater than the mean return everything above x\n", " if x >= mu:\n", " return 2 * normal_probability_above(x, mu, sigma)\n", " # If x is less than the mean than return everything below x\n", " else:\n", " return 2 * normal_probability_below(x, mu, sigma)" ] }, { "source": [ "E.g. if our normal distribution has a mean of 500 and standard deviation of 15, the probabilty to get 530 or above is ~5.78%" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(500.0, 15.811388300841896)" ] }, "metadata": {}, "execution_count": 6 } ], "source": [ "mu_0, sigma_0 = normal_approximation_to_binomial (1000 , 0.5 )\n", "mu_0, sigma_0" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.06207721579598835" ] }, "metadata": {}, "execution_count": 7 } ], "source": [ "two_sided_p_value(529.5, mu_0, sigma_0)" ] }, { "source": [ "As the p-value is greater than 5%, so we don't reject the null hypothesis." ], "cell_type": "markdown", "metadata": {} }, { "source": [ "However, if we look at values beyond 32 distance from the mean, however, the probability will be less than 5%." ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.046345287837786575" ] }, "metadata": {}, "execution_count": 8 } ], "source": [ "two_sided_p_value(531.5, mu_0, sigma_0)" ] }, { "source": [ "## Validating with simulation" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.07" ] }, "metadata": {}, "execution_count": 9 } ], "source": [ "import random\n", "\n", "# Run 1000 simulations with 1000 binomial trials\n", "extreme_value_count = 0\n", "for _ in range(1000):\n", " num_heads = sum(1 if random.random() < 0.5 else 0 for _ in range(1000))\n", "\n", " # Count the 'extreme' values (i.e. beyond 30 distance from the mean)\n", " if num_heads >= 530 or num_heads <= 470:\n", " extreme_value_count += 1\n", "\n", "extreme_value_count / 1000\n", "# assert 610 < extreme_value_count < 630, f\"{extreme_value_count}\"" ] }, { "source": [ "## One-sided p-values" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.06062885772582072" ] }, "metadata": {}, "execution_count": 10 } ], "source": [ "normal_probability_above(524.5, mu_0, sigma_0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.04686839508859242" ] }, "metadata": {}, "execution_count": 11 } ], "source": [ "normal_probability_above(526.5, mu_0, sigma_0)" ] } ] }