{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "a283c66a-b223-4656-b7a5-79130f41ecd2", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "using Distributions\n", "using StatsPlots\n", "\n", "# warm up (少し時間を取られる。その間に下のセルでコードを入力する)\n", "histogram(randn(10^5); norm=true, alpha=0.3, size=(300, 200))\n", "plot!(Normal())" ] }, { "cell_type": "code", "execution_count": 2, "id": "06b4a207-6a23-4ef5-824a-e80697ecabb4", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# コインを20回投げたときの表が出る回数の分布は正規分布で近似される。\n", "\n", "n = 20\n", "p = 0.5\n", "sample_of_means = rand(Binomial(n, p), 10^5)\n", "histogram(sample_of_means; norm=true, alpha=0.3, bin=-0.5:n+0.5, label=\"rand(Binomial($n, $p))\")\n", "plot!(Normal(n*p, √(n*p*(1-p))), 0, 20; label=\"normal approx.\", lw=2)" ] }, { "cell_type": "code", "execution_count": 3, "id": "eef21b78-6277-438b-9702-31ceefb4081d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Binomial{Float64}(n=20, p=0.5)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 回数n、確率パラメータpの二項分布オブジェクトを次のように作れる。\n", "\n", "bin = Binomial(n, p)" ] }, { "cell_type": "code", "execution_count": 4, "id": "cbf80976-5197-4aa9-8b38-5127a7613aec", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 確率分布オブジェクトの乱数\n", "\n", "rand(bin)" ] }, { "cell_type": "code", "execution_count": 5, "id": "d2a384a5-70ea-4776-a415-8ecba1c614b3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rand(bin, 20) = [7, 14, 13, 12, 7, 7, 9, 13, 4, 10, 13, 13, 11, 10, 8, 12, 9, 8, 9, 12]\n" ] } ], "source": [ "# 確率分布オブジェクトの乱数を複数個生成\n", "\n", "@show rand(bin, 20);" ] }, { "cell_type": "code", "execution_count": 6, "id": "6511840c-d9b6-4a99-8697-f3931c8b825c", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# サンプルのヒストグラム\n", "# norm = true で確率密度函数と比較できるようにする。\n", "# alpha = 0.3 で半透明化pp\n", "# bin = -0.5:n+0.5 で 1 刻みの適切なビンを設定\n", "\n", "histogram(sample_of_means; norm=true, alpha=0.3, bin=-0.5:n+0.5, label=\"sample of means\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "29f9ae86-cbb2-489c-9701-2f3205b19d15", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.5:1.0:20.5" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# -0.5:n+0.5 は [-0.5, 0.5, 1.5, 2.5, ..., 19.5, 20.5] とほぼ等価\n", "\n", "-0.5:n+0.5" ] }, { "cell_type": "code", "execution_count": 8, "id": "7e0acfc7-36f7-4986-b2d8-0e1d8ae1944a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "collect(-0.5:n + 0.5) = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5]\n" ] } ], "source": [ "@show collect(-0.5:n+0.5);" ] }, { "cell_type": "code", "execution_count": 9, "id": "e09374c3-1801-4c1e-8bfe-68340b8d80b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Normal{Float64}(μ=10.0, σ=2.23606797749979)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 平均μ、標準偏差σの正規分布オブジェクトは次にようにして作れる。\n", "\n", "normal = Normal(n*p, √(n*p*(1-p)))" ] }, { "cell_type": "code", "execution_count": 10, "id": "0eaabf8b-2e51-4471-a22d-e563b235863e", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 確率分布オブジェクト dist は plot(dist) でプロットできる。\n", "# lw = 2 で2倍の太さでプロット\n", "\n", "plot(normal, 0, 20; label=\"normal dist.\", lw=2)" ] }, { "cell_type": "code", "execution_count": 11, "id": "76528bd4-765e-4d36-903b-d671c625362d", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 以上を重ねてプロット\n", "\n", "histogram(sample_of_means; norm=true, alpha=0.3, bin=-0.5:n+0.5, label=\"rand(Binomial($n, $p))\")\n", "plot!(Normal(n*p, √(n*p*(1-p))), 0, 20; label=\"normal approx.\", lw=2)" ] }, { "cell_type": "code", "execution_count": 12, "id": "0af47e0a-2459-40db-b763-6b4b4daa61fe", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# サイコロを10回振って出た目の平均値の分布は正規分布でよく近似される。\n", "\n", "n = 10\n", "M = [mean(rand(1:6, n)) for _ in 1:10^5]\n", "histogram(M; norm=true, alpha=0.3, bin=0.95:0.1:6.05, label=\"mean(rand(1:6, 10))\")\n", "plot!(Normal(mean(1:6), std(1:6; corrected=false)/√n), 1, 6; label=\"normal approx.\", lw=2)" ] }, { "cell_type": "code", "execution_count": 13, "id": "7a2f5bfd-ba99-40d6-ba31-4586b71c12db", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 2つの確率パラメータ(比率パラメータ)が等しい二項分布で生成されたサンプルのZ統計量は\n", "# 近似的に標準正規分布に従う。\n", "\n", "\"\"\"\"\"\"\n", "function gensample(bin1, bin2)\n", " m, n = ntrials(bin1), ntrials(bin2)\n", " a, b = rand(bin1), rand(bin2)\n", " c, d = m - a, n - b\n", " a, b, c, d\n", "end\n", "\n", "\"\"\"Z statistic of 2×2 contingency table\"\"\"\n", "function zstat(a, b, c, d)\n", " a*d - b*c == 0 && return zero(inv(a))\n", " m, n = a + c, b + d\n", " N = a + b + c + d\n", " p = (a + b)/N\n", " Z = (a/m - b/n) / √(p*(1-p)*(1/m + 1/n))\n", " Z\n", "end\n", "\n", "m, n, p = 30, 20, 0.3\n", "bin1 = Binomial(m, p)\n", "bin2 = Binomial(n, p)\n", "Z = [zstat(gensample(bin1, bin2)...) for _ in 1:10^5]\n", "\n", "histogram(Z; norm=true, alpha=0.3, bin=-4.25:0.5:4.25, label=\"Z statistics\")\n", "plot!(Normal(), -4, 4; label=\"std normal dist.\", lw=2)" ] }, { "cell_type": "code", "execution_count": 14, "id": "cec232b7-05dc-431a-9e70-50ba473b7b14", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Variables\n", " #self#\u001b[36m::Core.Const(zstat)\u001b[39m\n", " a\u001b[36m::Int64\u001b[39m\n", " b\u001b[36m::Int64\u001b[39m\n", " c\u001b[36m::Int64\u001b[39m\n", " d\u001b[36m::Int64\u001b[39m\n", " Z\u001b[36m::Float64\u001b[39m\n", " p\u001b[36m::Float64\u001b[39m\n", " N\u001b[36m::Int64\u001b[39m\n", " n\u001b[36m::Int64\u001b[39m\n", " m\u001b[36m::Int64\u001b[39m\n", "\n", "Body\u001b[36m::Float64\u001b[39m\n", "\u001b[90m1 ─\u001b[39m Core.NewvarNode(:(Z))\n", "\u001b[90m│ \u001b[39m Core.NewvarNode(:(p))\n", "\u001b[90m│ \u001b[39m Core.NewvarNode(:(N))\n", "\u001b[90m│ \u001b[39m Core.NewvarNode(:(n))\n", "\u001b[90m│ \u001b[39m Core.NewvarNode(:(m))\n", "\u001b[90m│ \u001b[39m %6 = (a * d)\u001b[36m::Int64\u001b[39m\n", "\u001b[90m│ \u001b[39m %7 = (b * c)\u001b[36m::Int64\u001b[39m\n", "\u001b[90m│ \u001b[39m %8 = (%6 - %7)\u001b[36m::Int64\u001b[39m\n", "\u001b[90m│ \u001b[39m %9 = (%8 == 0)\u001b[36m::Bool\u001b[39m\n", "\u001b[90m└──\u001b[39m goto #3 if not %9\n", "\u001b[90m2 ─\u001b[39m %11 = Main.inv(a)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %12 = Main.zero(%11)\u001b[36m::Core.Const(0.0)\u001b[39m\n", "\u001b[90m└──\u001b[39m return %12\n", "\u001b[90m3 ─\u001b[39m %14 = (a + c)\u001b[36m::Int64\u001b[39m\n", "\u001b[90m│ \u001b[39m %15 = (b + d)\u001b[36m::Int64\u001b[39m\n", "\u001b[90m│ \u001b[39m (m = %14)\n", "\u001b[90m│ \u001b[39m (n = %15)\n", "\u001b[90m│ \u001b[39m (N = a + b + c + d)\n", "\u001b[90m│ \u001b[39m %19 = (a + b)\u001b[36m::Int64\u001b[39m\n", "\u001b[90m│ \u001b[39m (p = %19 / N)\n", "\u001b[90m│ \u001b[39m %21 = (a / m)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %22 = (b / n)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %23 = (%21 - %22)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %24 = p\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %25 = (1 - p)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %26 = (1 / m)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %27 = (1 / n)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %28 = (%26 + %27)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %29 = (%24 * %25 * %28)\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m %30 = √%29\u001b[36m::Float64\u001b[39m\n", "\u001b[90m│ \u001b[39m (Z = %23 / %30)\n", "\u001b[90m└──\u001b[39m return Z\n" ] } ], "source": [ "@code_warntype zstat(1, 2, 3, 4)" ] }, { "cell_type": "code", "execution_count": 15, "id": "7d0b131e-d614-4078-ad05-0fa570301740", "metadata": {}, "outputs": [ { "data": { "text/latex": [ "$\\begin{equation*}\\frac{\\frac{a}{a + c} - \\frac{b}{b + d}}{\\sqrt{\\frac{\\left(a + b\\right) \\left(- \\frac{a + b}{a + b + c + d} + 1\\right) \\left(\\frac{1}{b + d} + \\frac{1}{a + c}\\right)}{a + b + c + d}}}\\end{equation*}$\n" ], "text/plain": [ " a b \n", " ----- - ----- \n", " a + c b + d \n", "-----------------------------------------------------\n", " _______________________________________________\n", " / / a + b \\ / 1 1 \\ \n", " / (a + b)*|- ------------- + 1|*|----- + -----| \n", " / \\ a + b + c + d / \\b + d a + c/ \n", " / --------------------------------------------- \n", "\\/ a + b + c + d " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 数式処理を使って、Z統計量の2乗がPearsonのχ²統計量に一致することを確認。\n", "# 上の数値計算で使ったzstat函数をそのまま数式処理で利用する。\n", "\n", "using SymPy\n", "\n", "@syms a b c d\n", "Z = zstat(a, b, c, d)" ] }, { "cell_type": "code", "execution_count": 16, "id": "f06b7daa-f2d3-424a-a8fc-0e0d574a15f2", "metadata": {}, "outputs": [ { "data": { "text/latex": [ "$\\begin{equation*}\\frac{\\left(a d - b c\\right)^{2} \\left(a + b + c + d\\right)}{\\left(a + b\\right) \\left(a + c\\right) \\left(b + d\\right) \\left(c + d\\right)}\\end{equation*}$\n" ], "text/plain": [ " 2 \n", " (a*d - b*c) *(a + b + c + d) \n", "-------------------------------\n", "(a + b)*(a + c)*(b + d)*(c + d)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Z^2 |> factor は factor(Z^2) に等価\n", "# 以下の計算結果はPearsonのχ²統計量の有名な表示に一致\n", "\n", "Z^2 |> factor" ] }, { "cell_type": "code", "execution_count": 17, "id": "08e1d0b0-635e-42d4-9ce9-416b6b9d5f41", "metadata": {}, "outputs": [ { "data": { "text/latex": [ "$\\begin{equation*}\\frac{\\left(d - \\frac{\\left(b + d\\right) \\left(c + d\\right)}{a + b + c + d}\\right)^{2} \\left(a + b + c + d\\right)}{\\left(b + d\\right) \\left(c + d\\right)} + \\frac{\\left(c - \\frac{\\left(a + c\\right) \\left(c + d\\right)}{a + b + c + d}\\right)^{2} \\left(a + b + c + d\\right)}{\\left(a + c\\right) \\left(c + d\\right)} + \\frac{\\left(b - \\frac{\\left(a + b\\right) \\left(b + d\\right)}{a + b + c + d}\\right)^{2} \\left(a + b + c + d\\right)}{\\left(a + b\\right) \\left(b + d\\right)} + \\frac{\\left(a - \\frac{\\left(a + b\\right) \\left(a + c\\right)}{a + b + c + d}\\right)^{2} \\left(a + b + c + d\\right)}{\\left(a + b\\right) \\left(a + c\\right)}\\end{equation*}$\n" ], "text/plain": [ " 2 2 \n", "/ (b + d)*(c + d)\\ / (a + c)*(c + d)\\ \n", "|d - ---------------| *(a + b + c + d) |c - ---------------| *(a + b + c + d\n", "\\ a + b + c + d / \\ a + b + c + d / \n", "-------------------------------------- + -------------------------------------\n", " (b + d)*(c + d) (a + c)*(c + d) \n", "\n", " 2 2 \n", " / (a + b)*(b + d)\\ / (a + b)*(a + c)\\ \n", ") |b - ---------------| *(a + b + c + d) |a - ---------------| *(a + b + c\n", " \\ a + b + c + d / \\ a + b + c + d / \n", "- + -------------------------------------- + ---------------------------------\n", " (a + b)*(b + d) (a + b)*(a + c) \n", "\n", " \n", " \n", " + d)\n", " \n", "-----\n", " " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# (O - E)²/E の和で定義されたPearsonのχ²統計量\n", "\n", "\"\"\"Pearson's χ² statistic of 2×2 contingency table\"\"\"\n", "function chisqstat(a, b, c, d)\n", " a*d - b*c == 0 && return zero(inv(a))\n", " N = a + b + c + d\n", " s, f = a + b, c + d\n", " m, n = a + c, b + d\n", " Ea, Eb, Ec, Ed = m*s/N, n*s/N, m*f/N, n*f/N\n", " X² = (a - Ea)^2/Ea + (b - Eb)^2/Eb + (c - Ec)^2/Ec + (d - Ed)^2/Ed\n", "end\n", "\n", "X² = chisqstat(a, b, c, d)" ] }, { "cell_type": "code", "execution_count": 18, "id": "5c8e9964-261e-4394-b6ae-00a522ac4516", "metadata": {}, "outputs": [ { "data": { "text/latex": [ "$\\begin{equation*}\\frac{\\left(a d - b c\\right)^{2} \\left(a + b + c + d\\right)}{\\left(a + b\\right) \\left(a + c\\right) \\left(b + d\\right) \\left(c + d\\right)}\\end{equation*}$\n" ], "text/plain": [ " 2 \n", " (a*d - b*c) *(a + b + c + d) \n", "-------------------------------\n", "(a + b)*(a + c)*(b + d)*(c + d)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 以下の計算結果は2×2の分割表のPearson χ²統計量に関する有名な公式\n", "\n", "X² |> factor" ] }, { "cell_type": "code", "execution_count": 19, "id": "6f78ccdf-7894-4e24-8017-606ff1f9c877", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pval_exact (generic function with 1 method)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 正確二項検定のP値の定義\n", "\n", "# \\lessapprox TAB → ⪅\n", "# \\approx TAB → ≈\n", "x ⪅ y = x < y || x ≈ y\n", "\n", "\"\"\"p-value function of exact binomial test\"\"\"\n", "pval_exact(dist, k) = sum(pdf(dist, j) for j in support(dist) if pdf(dist, j) ⪅ pdf(dist, k))" ] }, { "cell_type": "code", "execution_count": 20, "id": "59cf2602-5791-4524-88b5-faa4e77f6931", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 正確二項検定のP値の例 (パラメータpを固定した場合)\n", "\n", "n, p = 20, 0.3\n", "bin = Binomial(n, p)\n", "\n", "k = support(bin)\n", "y = pval_exact.(bin, k)\n", "plot(k, y; label=\"\")\n", "plot!(; xtick=0:20, ytick=0:0.05:1)\n", "plot!(; xlabel=\"data k\", ylabel=\"p-value for parameter n = $n, p = $p\")" ] }, { "cell_type": "code", "execution_count": 21, "id": "a1f35a6e-44f2-4b9f-9015-e631d039d94e", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 正確二項検定のP値の例 (データkを固定した場合)\n", "\n", "n, k = 20, 6\n", "\n", "p = 0:0.002:1\n", "y = @. pval_exact(Binomial(n, p), k)\n", "plot(p, y; label=\"\")\n", "plot!(; xtick=0:0.1:1, ytick=0:0.05:1)\n", "plot!(; xlabel=\"parameter p\", ylabel=\"p-value for data n = $n, k = $k\")" ] }, { "cell_type": "code", "execution_count": 22, "id": "262394cd-bddf-4c1e-9359-ed741bfdb829", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "confint_bin (generic function with 1 method)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 信頼区間函数の定義\n", "\n", "using Roots\n", "\n", "\"\"\"generic confidence interval function\"\"\"\n", "function confint(pvalfunc, mlefunc, n, k, pmin, pmax; α = 0.05)\n", " f(p) = pvalfunc(n, p, k) - α\n", " ci = find_zeros(f, pmin, pmax)\n", " length(ci) ≥ 2 && return ci\n", " pmle = mlefunc(n, k)\n", " first(ci) > pmle ? [pmin, first(ci)] : [first(ci), pmax]\n", "end\n", "\n", "\"\"\"confidence interval function for exact binomial test\"\"\"\n", "function confint_bin(n, k; α = 0.05)\n", " pvalfunc(n, p, k) = pval_exact(Binomial(n, p), k)\n", " mlefunc(n, k) = k/n\n", " pmin, pmax = 0.0, 1.0\n", " confint(pvalfunc, mlefunc, n, k, pmin, pmax; α)\n", "end" ] }, { "cell_type": "code", "execution_count": 23, "id": "5eca122d-0e3c-422a-a05f-9e3eafcb736a", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 正確二項検定のデータkを固定した場合のP値函数のプロットに\n", "# 95%信頼区間のプロットを追加\n", "\n", "n, k = 20, 6\n", "α = 0.05\n", "ci = confint_bin(n, k; α)\n", "\n", "p = 0:0.002:1\n", "y = @. pval_exact(Binomial(n, p), k)\n", "plot(p, y; label=\"\")\n", "plot!(ci, [α, α]; label=\"$(100(1 - α))% confidence interval\", lw=3)\n", "plot!(; xtick=0:0.1:1, ytick=0:0.05:1)\n", "plot!(; xlabel=\"parameter p\", ylabel=\"p-value for data n = $n, k = $k\")" ] }, { "cell_type": "code", "execution_count": 24, "id": "1e9220e3-54ce-4802-a902-3b4d967eccee", "metadata": { "tags": [] }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# データkを動かして信頼区間をプロット (1)\n", "\n", "n = 20\n", "k = 0:n\n", "α = 0.05\n", "ci = confint_bin.(n, k; α)\n", "ciL, ciR = first.(ci), last.(ci)\n", "\n", "plot(; legend=:topleft)\n", "plot!(k, ciL; label=\"min. of confidence interval\")\n", "plot!(k, ciR; label=\"max. of confidenceinterval\")\n", "plot!(k, ciL; label=\"$(100(1 - α))% confidence interval (n = $n)\", lw=0, frange=ciR, fa=0.1)\n", "plot!(; xtick=0:n, ytick=0:0.1:1)\n", "plot!(; xlabel=\"data k\", ylabel=\"parameter p\")" ] }, { "cell_type": "code", "execution_count": 25, "id": "afe102d5-3d10-4cfd-9e77-fa00bf8275a1", "metadata": { "tags": [] }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# データkを動かして信頼区間をプロット (2)\n", "\n", "n = 20\n", "k = 0:n\n", "α = 0.05\n", "ci = confint_bin.(n, k; α)\n", "\n", "plot(; legend=:topleft)\n", "for i in eachindex(ci)\n", " plot!([k[i], k[i]], ci[i]; label=\"\", lw=5)\n", "end\n", "plot!(; xtick=0:n, ytick=0:0.1:1)\n", "plot!(; xlabel=\"data k\", ylabel=\"parameter p\")\n", "title!(\"$(100(1 - α))% confidence intervals (n = $n)\"; titlefontsize=12)" ] }, { "cell_type": "code", "execution_count": 26, "id": "836e72b1-1a35-4795-9090-83a3d07ed05c", "metadata": { "tags": [] }, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# データkを動かして信頼区間をプロット (3)\n", "\n", "n = 20\n", "k = 0:n\n", "α = 0.05\n", "ci = confint_bin.(n, k; α)\n", "\n", "plot(; legend=:topleft)\n", "for i in eachindex(ci)\n", " plot!([k[i], k[i]], ci[i]; label=\"\", lw=5, c=1)\n", "end\n", "plot!(; xtick=0:n, ytick=0:0.1:1)\n", "plot!(; xlabel=\"data k\", ylabel=\"parameter p\")\n", "title!(\"$(100(1 - α))% confidence intervals (n = $n)\"; titlefontsize=12)" ] }, { "cell_type": "code", "execution_count": null, "id": "729a57cd-f8e6-4977-9e27-56a9ee5af173", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "jupytext": { "encoding": "# -*- coding: utf-8 -*-", "formats": "ipynb,jl:hydrogen" }, "kernelspec": { "display_name": "Julia 1.6.3", "language": "julia", "name": "julia-1.6" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.6.3" } }, "nbformat": 4, "nbformat_minor": 5 }