{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "Python 3.8.5 64-bit ('streamlit': conda)",
   "display_name": "Python 3.8.5 64-bit ('streamlit': conda)",
   "metadata": {
    "interpreter": {
     "hash": "d5b47a323bfc67f94a37e7cb8fd52db1ad06cc55d227a5b8c375f2f5af6e26dc"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# Confidence intervals from scratch with Python"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "From the [Data Science from Scratch book](https://www.oreilly.com/library/view/data-science-from/9781492041122/)."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## Libraries and helper functions"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math as m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:\n",
    "    return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) / 2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_inverse_normal_cdf(p: float, mu:float = 0, sigma: float = 1, tolerance: float = 1E-5, show_steps=False) -> float:\n",
    "\n",
    "    if p == 0: return -np.inf\n",
    "    if p == 1: return np.inf\n",
    "\n",
    "    # In case it is not a standard normal distribution, calculate the standard normal first and then rescale\n",
    "    if mu != 0 or sigma != 1:\n",
    "        return mu + sigma * calc_inverse_normal_cdf(p, tolerance=tolerance)\n",
    "\n",
    "    low_z = -10\n",
    "    hi_z = 10\n",
    "\n",
    "    if show_steps: print(f\"{'':<19}\".join(['low_z', 'mid_z', 'hi_z']), \"\\n\")\n",
    "\n",
    "    while hi_z - low_z > tolerance:\n",
    "        mid_z = (low_z + hi_z) / 2\n",
    "        mid_p = calc_normal_cdf(mid_z)\n",
    "\n",
    "        if mid_p < p:\n",
    "            low_z = mid_z\n",
    "        else:\n",
    "            hi_z = mid_z\n",
    "\n",
    "        if show_steps: print(\"\\t\".join(map(to_string, [low_z, mid_z, hi_z])))\n",
    "\n",
    "    return mid_z"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normal_upper_bound(probabilty: float, mu: float = 0, sigma: float = 1) -> float:\n",
    "    \"\"\"Return z for which P(Z <= z) = probability\"\"\"\n",
    "    return calc_inverse_normal_cdf(probabilty, mu, sigma)\n",
    "\n",
    "def normal_lower_bound(probabilty: float, mu: float = 0, sigma: float = 1) -> float:\n",
    "    \"\"\"Return z for which P(Z >= z) = probability\"\"\"\n",
    "    return calc_inverse_normal_cdf(1 - probabilty, mu, sigma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normal_two_sided_bounds(probability: float, mu: float = 0, sigma: float = 1) -> float:\n",
    "    if probability == 0: return 0, 0\n",
    "\n",
    "    tail_probability = (1 - probability) / 2\n",
    "    \n",
    "    lower_bound = normal_upper_bound(tail_probability, mu, sigma)\n",
    "    upper_bound = normal_lower_bound(tail_probability, mu, sigma)\n",
    "\n",
    "    return lower_bound, upper_bound"
   ]
  },
  {
   "source": [
    "## Example 1"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "We would like to know if a coin is 'fair' or not (i.e. p=0.5)\n",
    "\n",
    "We flip the coin 1000 times and observe 525 heads.\n",
    "\n",
    "As we do not know the 'real' probability, we need to rely on our observed results."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(0.525, 0.525, 0.015791611697353755)"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "p_hat = 525 / 1000\n",
    "mu = p_hat\n",
    "sigma = m.sqrt(p_hat * (1 - p_hat) / 1000)\n",
    "\n",
    "p_hat, mu, sigma"
   ]
  },
  {
   "source": [
    "While we cannot tell if this is really the probabiliy of heads, we can calculate the interval within which we expect the real probability fall with our chosen confidence (here: 95%)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(0.4940490278129096, 0.5559509721870904)"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "confidence = 0.95\n",
    "normal_two_sided_bounds(confidence, mu, sigma)"
   ]
  },
  {
   "source": [
    "As 0.5 is within the interval, we cannot reject the hypothesis that the coin is fair."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## Example 2"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "In another example we observe 540 heads in 1000 flips."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(0.54, 0.54, 0.015760710643876435)"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "p_hat = 540 / 1000\n",
    "mu = p_hat\n",
    "sigma = m.sqrt(p_hat * (1 - p_hat) / 1000)\n",
    "\n",
    "p_hat, mu, sigma"
   ]
  },
  {
   "source": [
    "Going through the same steps as above, we get that 0.5 falls outside our 95% confidence interval and therefore we reject the hypothesis that the coin is 'fair'."
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(0.5091095927295919, 0.5708904072704082)"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "normal_two_sided_bounds(confidence, mu, sigma)"
   ]
  }
 ]
}