{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Chapter 3\n", "\n", "Examples and Exercises from Think Stats, 2nd Edition\n", "\n", "http://thinkstats2.com\n", "\n", "Copyright 2016 Allen B. Downey\n", "\n", "MIT License: https://opensource.org/licenses/MIT\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 2;\n", " var nbb_unformatted_code = \"from os.path import basename, exists\\n\\n\\ndef download(url):\\n filename = basename(url)\\n if not exists(filename):\\n from urllib.request import urlretrieve\\n\\n local, _ = urlretrieve(url, filename)\\n print(\\\"Downloaded \\\" + local)\\n\\n\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py\\\")\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py\\\")\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py\\\")\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/first.py\\\")\\n\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct\\\")\\ndownload(\\n \\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz\\\"\\n)\";\n", " var nbb_formatted_code = \"from os.path import basename, exists\\n\\n\\ndef download(url):\\n filename = basename(url)\\n if not exists(filename):\\n from urllib.request import urlretrieve\\n\\n local, _ = urlretrieve(url, filename)\\n print(\\\"Downloaded \\\" + local)\\n\\n\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py\\\")\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py\\\")\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py\\\")\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/first.py\\\")\\n\\ndownload(\\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct\\\")\\ndownload(\\n \\\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz\\\"\\n)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from os.path import basename, exists\n", "\n", "\n", "def download(url):\n", " filename = basename(url)\n", " if not exists(filename):\n", " from urllib.request import urlretrieve\n", "\n", " local, _ = urlretrieve(url, filename)\n", " print(\"Downloaded \" + local)\n", "\n", "\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py\")\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py\")\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py\")\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/first.py\")\n", "\n", "\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct\")\n", "download(\n", " \"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz\"\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 3;\n", " var nbb_unformatted_code = \"import numpy as np\";\n", " var nbb_formatted_code = \"import numpy as np\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Again, I'll load the NSFG pregnancy file and select live births:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 4;\n", " var nbb_unformatted_code = \"import nsfg\\nimport first\\nimport thinkstats2\\nimport thinkplot\";\n", " var nbb_formatted_code = \"import nsfg\\nimport first\\nimport thinkstats2\\nimport thinkplot\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import nsfg\n", "import first\n", "import thinkstats2\n", "import thinkplot" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 5;\n", " var nbb_unformatted_code = \"preg = nsfg.ReadFemPreg()\\nlive = preg[preg.outcome == 1]\";\n", " var nbb_formatted_code = \"preg = nsfg.ReadFemPreg()\\nlive = preg[preg.outcome == 1]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "preg = nsfg.ReadFemPreg()\n", "live = preg[preg.outcome == 1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's the histogram of birth weights:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 6;\n", " var nbb_unformatted_code = \"hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb')\\nthinkplot.Hist(hist)\\nthinkplot.Config(xlabel='Birth weight (pounds)', ylabel='Count')\";\n", " var nbb_formatted_code = \"hist = thinkstats2.Hist(live.birthwgt_lb, label=\\\"birthwgt_lb\\\")\\nthinkplot.Hist(hist)\\nthinkplot.Config(xlabel=\\\"Birth weight (pounds)\\\", ylabel=\\\"Count\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "hist = thinkstats2.Hist(live.birthwgt_lb, label=\"birthwgt_lb\")\n", "thinkplot.Hist(hist)\n", "thinkplot.Config(xlabel=\"Birth weight (pounds)\", ylabel=\"Count\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To normalize the disrtibution, we could divide through by the total count:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 7;\n", " var nbb_unformatted_code = \"n = hist.Total()\\npmf = hist.Copy()\\nfor x, freq in hist.Items():\\n pmf[x] = freq / n\";\n", " var nbb_formatted_code = \"n = hist.Total()\\npmf = hist.Copy()\\nfor x, freq in hist.Items():\\n pmf[x] = freq / n\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "n = hist.Total()\n", "pmf = hist.Copy()\n", "for x, freq in hist.Items():\n", " pmf[x] = freq / n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result is a Probability Mass Function (PMF)." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 8;\n", " var nbb_unformatted_code = \"thinkplot.Hist(pmf)\\nthinkplot.Config(xlabel='Birth weight (pounds)', ylabel='PMF')\";\n", " var nbb_formatted_code = \"thinkplot.Hist(pmf)\\nthinkplot.Config(xlabel=\\\"Birth weight (pounds)\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "thinkplot.Hist(pmf)\n", "thinkplot.Config(xlabel=\"Birth weight (pounds)\", ylabel=\"PMF\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "More directly, we can create a Pmf object." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pmf({1: 0.2, 2: 0.4, 3: 0.2, 5: 0.2})" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 9;\n", " var nbb_unformatted_code = \"pmf = thinkstats2.Pmf([1, 2, 2, 3, 5])\\npmf\";\n", " var nbb_formatted_code = \"pmf = thinkstats2.Pmf([1, 2, 2, 3, 5])\\npmf\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf = thinkstats2.Pmf([1, 2, 2, 3, 5])\n", "pmf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`Pmf` provides `Prob`, which looks up a value and returns its probability:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 10;\n", " var nbb_unformatted_code = \"pmf.Prob(2)\";\n", " var nbb_formatted_code = \"pmf.Prob(2)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf.Prob(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The bracket operator does the same thing." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 11;\n", " var nbb_unformatted_code = \"pmf[2]\";\n", " var nbb_formatted_code = \"pmf[2]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf[2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `Incr` method adds to the probability associated with a given values." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6000000000000001" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 12;\n", " var nbb_unformatted_code = \"pmf.Incr(2, 0.2)\\npmf[2]\";\n", " var nbb_formatted_code = \"pmf.Incr(2, 0.2)\\npmf[2]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf.Incr(2, 0.2)\n", "pmf[2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `Mult` method multiplies the probability associated with a value." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.30000000000000004" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 13;\n", " var nbb_unformatted_code = \"pmf.Mult(2, 0.5)\\npmf[2]\";\n", " var nbb_formatted_code = \"pmf.Mult(2, 0.5)\\npmf[2]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf.Mult(2, 0.5)\n", "pmf[2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`Total` returns the total probability (which is no longer 1, because we changed one of the probabilities)." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8999999999999999" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 14;\n", " var nbb_unformatted_code = \"pmf.Total()\";\n", " var nbb_formatted_code = \"pmf.Total()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf.Total()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`Normalize` divides through by the total probability, making it 1 again." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 15;\n", " var nbb_unformatted_code = \"pmf.Normalize()\\npmf.Total()\";\n", " var nbb_formatted_code = \"pmf.Normalize()\\npmf.Total()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf.Normalize()\n", "pmf.Total()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's the PMF of pregnancy length for live births." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 16;\n", " var nbb_unformatted_code = \"pmf = thinkstats2.Pmf(live.prglngth, label='prglngth')\";\n", " var nbb_formatted_code = \"pmf = thinkstats2.Pmf(live.prglngth, label=\\\"prglngth\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf = thinkstats2.Pmf(live.prglngth, label=\"prglngth\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's what it looks like plotted with `Hist`, which makes a bar graph." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 17;\n", " var nbb_unformatted_code = \"thinkplot.Hist(pmf)\\nthinkplot.Config(xlabel='Pregnancy length (weeks)', ylabel='Pmf')\";\n", " var nbb_formatted_code = \"thinkplot.Hist(pmf)\\nthinkplot.Config(xlabel=\\\"Pregnancy length (weeks)\\\", ylabel=\\\"Pmf\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "thinkplot.Hist(pmf)\n", "thinkplot.Config(xlabel=\"Pregnancy length (weeks)\", ylabel=\"Pmf\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's what it looks like plotted with `Pmf`, which makes a step function." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 18;\n", " var nbb_unformatted_code = \"thinkplot.Pmf(pmf)\\nthinkplot.Config(xlabel='Pregnancy length (weeks)', ylabel='Pmf')\";\n", " var nbb_formatted_code = \"thinkplot.Pmf(pmf)\\nthinkplot.Config(xlabel=\\\"Pregnancy length (weeks)\\\", ylabel=\\\"Pmf\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "thinkplot.Pmf(pmf)\n", "thinkplot.Config(xlabel=\"Pregnancy length (weeks)\", ylabel=\"Pmf\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use `MakeFrames` to return DataFrames for all live births, first babies, and others." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 19;\n", " var nbb_unformatted_code = \"live, firsts, others = first.MakeFrames()\";\n", " var nbb_formatted_code = \"live, firsts, others = first.MakeFrames()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "live, firsts, others = first.MakeFrames()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here are the distributions of pregnancy length." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 20;\n", " var nbb_unformatted_code = \"first_pmf = thinkstats2.Pmf(firsts.prglngth, label='firsts')\\nother_pmf = thinkstats2.Pmf(others.prglngth, label='others')\";\n", " var nbb_formatted_code = \"first_pmf = thinkstats2.Pmf(firsts.prglngth, label=\\\"firsts\\\")\\nother_pmf = thinkstats2.Pmf(others.prglngth, label=\\\"others\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "first_pmf = thinkstats2.Pmf(firsts.prglngth, label=\"firsts\")\n", "other_pmf = thinkstats2.Pmf(others.prglngth, label=\"others\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And here's the code that replicates one of the figures in the chapter." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 21;\n", " var nbb_unformatted_code = \"width=0.45\\naxis = [27, 46, 0, 0.6]\\nthinkplot.PrePlot(2, cols=2)\\nthinkplot.Hist(first_pmf, align='right', width=width)\\nthinkplot.Hist(other_pmf, align='left', width=width)\\nthinkplot.Config(xlabel='Pregnancy length(weeks)', ylabel='PMF', axis=axis)\\n\\nthinkplot.PrePlot(2)\\nthinkplot.SubPlot(2)\\nthinkplot.Pmfs([first_pmf, other_pmf])\\nthinkplot.Config(xlabel='Pregnancy length(weeks)', axis=axis)\";\n", " var nbb_formatted_code = \"width = 0.45\\naxis = [27, 46, 0, 0.6]\\nthinkplot.PrePlot(2, cols=2)\\nthinkplot.Hist(first_pmf, align=\\\"right\\\", width=width)\\nthinkplot.Hist(other_pmf, align=\\\"left\\\", width=width)\\nthinkplot.Config(xlabel=\\\"Pregnancy length(weeks)\\\", ylabel=\\\"PMF\\\", axis=axis)\\n\\nthinkplot.PrePlot(2)\\nthinkplot.SubPlot(2)\\nthinkplot.Pmfs([first_pmf, other_pmf])\\nthinkplot.Config(xlabel=\\\"Pregnancy length(weeks)\\\", axis=axis)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "width = 0.45\n", "axis = [27, 46, 0, 0.6]\n", "thinkplot.PrePlot(2, cols=2)\n", "thinkplot.Hist(first_pmf, align=\"right\", width=width)\n", "thinkplot.Hist(other_pmf, align=\"left\", width=width)\n", "thinkplot.Config(xlabel=\"Pregnancy length(weeks)\", ylabel=\"PMF\", axis=axis)\n", "\n", "thinkplot.PrePlot(2)\n", "thinkplot.SubPlot(2)\n", "thinkplot.Pmfs([first_pmf, other_pmf])\n", "thinkplot.Config(xlabel=\"Pregnancy length(weeks)\", axis=axis)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here's the code that generates a plot of the difference in probability (in percentage points) between first babies and others, for each week of pregnancy (showing only pregnancies considered \"full term\"). " ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 22;\n", " var nbb_unformatted_code = \"weeks = range(35, 46)\\ndiffs = []\\nfor week in weeks:\\n p1 = first_pmf.Prob(week)\\n p2 = other_pmf.Prob(week)\\n diff = 100 * (p1 - p2)\\n diffs.append(diff)\\n\\nthinkplot.Bar(weeks, diffs)\\nthinkplot.Config(xlabel='Pregnancy length(weeks)', ylabel='Difference (percentage points)')\";\n", " var nbb_formatted_code = \"weeks = range(35, 46)\\ndiffs = []\\nfor week in weeks:\\n p1 = first_pmf.Prob(week)\\n p2 = other_pmf.Prob(week)\\n diff = 100 * (p1 - p2)\\n diffs.append(diff)\\n\\nthinkplot.Bar(weeks, diffs)\\nthinkplot.Config(\\n xlabel=\\\"Pregnancy length(weeks)\\\", ylabel=\\\"Difference (percentage points)\\\"\\n)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "weeks = range(35, 46)\n", "diffs = []\n", "for week in weeks:\n", " p1 = first_pmf.Prob(week)\n", " p2 = other_pmf.Prob(week)\n", " diff = 100 * (p1 - p2)\n", " diffs.append(diff)\n", "\n", "thinkplot.Bar(weeks, diffs)\n", "thinkplot.Config(xlabel='Pregnancy length(weeks)', ylabel='Difference (percentage points)')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Biasing and unbiasing PMFs\n", "\n", "Here's the example in the book showing operations we can perform with `Pmf` objects.\n", "\n", "Suppose we have the following distribution of class sizes." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 23;\n", " var nbb_unformatted_code = \"d = { 7: 8, 12: 8, 17: 14, 22: 4, \\n 27: 6, 32: 12, 37: 8, 42: 3, 47: 2 }\\n\\npmf = thinkstats2.Pmf(d, label='actual')\";\n", " var nbb_formatted_code = \"d = {7: 8, 12: 8, 17: 14, 22: 4, 27: 6, 32: 12, 37: 8, 42: 3, 47: 2}\\n\\npmf = thinkstats2.Pmf(d, label=\\\"actual\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "d = {7: 8, 12: 8, 17: 14, 22: 4, 27: 6, 32: 12, 37: 8, 42: 3, 47: 2}\n", "\n", "pmf = thinkstats2.Pmf(d, label=\"actual\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This function computes the biased PMF we would get if we surveyed students and asked about the size of the classes they are in." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 24;\n", " var nbb_unformatted_code = \"def BiasPmf(pmf, label):\\n new_pmf = pmf.Copy(label=label)\\n\\n for x, p in pmf.Items():\\n new_pmf.Mult(x, x)\\n \\n new_pmf.Normalize()\\n return new_pmf\";\n", " var nbb_formatted_code = \"def BiasPmf(pmf, label):\\n new_pmf = pmf.Copy(label=label)\\n\\n for x, p in pmf.Items():\\n new_pmf.Mult(x, x)\\n\\n new_pmf.Normalize()\\n return new_pmf\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def BiasPmf(pmf, label):\n", " new_pmf = pmf.Copy(label=label)\n", "\n", " for x, p in pmf.Items():\n", " new_pmf.Mult(x, x)\n", "\n", " new_pmf.Normalize()\n", " return new_pmf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following graph shows the difference between the actual and observed distributions." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 25;\n", " var nbb_unformatted_code = \"biased_pmf = BiasPmf(pmf, label='observed')\\nthinkplot.PrePlot(2)\\nthinkplot.Pmfs([pmf, biased_pmf])\\nthinkplot.Config(xlabel='Class size', ylabel='PMF')\";\n", " var nbb_formatted_code = \"biased_pmf = BiasPmf(pmf, label=\\\"observed\\\")\\nthinkplot.PrePlot(2)\\nthinkplot.Pmfs([pmf, biased_pmf])\\nthinkplot.Config(xlabel=\\\"Class size\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "biased_pmf = BiasPmf(pmf, label=\"observed\")\n", "thinkplot.PrePlot(2)\n", "thinkplot.Pmfs([pmf, biased_pmf])\n", "thinkplot.Config(xlabel=\"Class size\", ylabel=\"PMF\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The observed mean is substantially higher than the actual." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Actual mean 23.692307692307693\n", "Observed mean 29.123376623376625\n" ] }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 26;\n", " var nbb_unformatted_code = \"print('Actual mean', pmf.Mean())\\nprint('Observed mean', biased_pmf.Mean())\";\n", " var nbb_formatted_code = \"print(\\\"Actual mean\\\", pmf.Mean())\\nprint(\\\"Observed mean\\\", biased_pmf.Mean())\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "print(\"Actual mean\", pmf.Mean())\n", "print(\"Observed mean\", biased_pmf.Mean())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we were only able to collect the biased sample, we could \"unbias\" it by applying the inverse operation." ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 27;\n", " var nbb_unformatted_code = \"def UnbiasPmf(pmf, label=None):\\n new_pmf = pmf.Copy(label=label)\\n\\n for x, p in pmf.Items():\\n new_pmf[x] *= 1/x\\n \\n new_pmf.Normalize()\\n return new_pmf\";\n", " var nbb_formatted_code = \"def UnbiasPmf(pmf, label=None):\\n new_pmf = pmf.Copy(label=label)\\n\\n for x, p in pmf.Items():\\n new_pmf[x] *= 1 / x\\n\\n new_pmf.Normalize()\\n return new_pmf\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def UnbiasPmf(pmf, label=None):\n", " new_pmf = pmf.Copy(label=label)\n", "\n", " for x, p in pmf.Items():\n", " new_pmf[x] *= 1 / x\n", "\n", " new_pmf.Normalize()\n", " return new_pmf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can unbias the biased PMF:" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unbiased mean 23.69230769230769\n" ] }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 28;\n", " var nbb_unformatted_code = \"unbiased = UnbiasPmf(biased_pmf, label='unbiased')\\nprint('Unbiased mean', unbiased.Mean())\";\n", " var nbb_formatted_code = \"unbiased = UnbiasPmf(biased_pmf, label=\\\"unbiased\\\")\\nprint(\\\"Unbiased mean\\\", unbiased.Mean())\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "unbiased = UnbiasPmf(biased_pmf, label=\"unbiased\")\n", "print(\"Unbiased mean\", unbiased.Mean())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And plot the two distributions to confirm they are the same." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 29;\n", " var nbb_unformatted_code = \"thinkplot.PrePlot(2)\\nthinkplot.Pmfs([pmf, unbiased])\\nthinkplot.Config(xlabel='Class size', ylabel='PMF')\";\n", " var nbb_formatted_code = \"thinkplot.PrePlot(2)\\nthinkplot.Pmfs([pmf, unbiased])\\nthinkplot.Config(xlabel=\\\"Class size\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "thinkplot.PrePlot(2)\n", "thinkplot.Pmfs([pmf, unbiased])\n", "thinkplot.Config(xlabel=\"Class size\", ylabel=\"PMF\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pandas indexing\n", "\n", "Here's an example of a small DataFrame." ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
01.360839-0.653037
10.055034-0.008179
2-0.9885342.137857
31.0281421.087482
\n", "
" ], "text/plain": [ " 0 1\n", "0 1.360839 -0.653037\n", "1 0.055034 -0.008179\n", "2 -0.988534 2.137857\n", "3 1.028142 1.087482" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 30;\n", " var nbb_unformatted_code = \"import numpy as np\\nimport pandas\\narray = np.random.randn(4, 2)\\ndf = pandas.DataFrame(array)\\ndf\";\n", " var nbb_formatted_code = \"import numpy as np\\nimport pandas\\n\\narray = np.random.randn(4, 2)\\ndf = pandas.DataFrame(array)\\ndf\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "import pandas\n", "\n", "array = np.random.randn(4, 2)\n", "df = pandas.DataFrame(array)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can specify column names when we create the DataFrame:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
01.360839-0.653037
10.055034-0.008179
2-0.9885342.137857
31.0281421.087482
\n", "
" ], "text/plain": [ " A B\n", "0 1.360839 -0.653037\n", "1 0.055034 -0.008179\n", "2 -0.988534 2.137857\n", "3 1.028142 1.087482" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 31;\n", " var nbb_unformatted_code = \"columns = ['A', 'B']\\ndf = pandas.DataFrame(array, columns=columns)\\ndf\";\n", " var nbb_formatted_code = \"columns = [\\\"A\\\", \\\"B\\\"]\\ndf = pandas.DataFrame(array, columns=columns)\\ndf\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "columns = [\"A\", \"B\"]\n", "df = pandas.DataFrame(array, columns=columns)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can also specify an index that contains labels for the rows." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
a1.360839-0.653037
b0.055034-0.008179
c-0.9885342.137857
d1.0281421.087482
\n", "
" ], "text/plain": [ " A B\n", "a 1.360839 -0.653037\n", "b 0.055034 -0.008179\n", "c -0.988534 2.137857\n", "d 1.028142 1.087482" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 32;\n", " var nbb_unformatted_code = \"index = ['a', 'b', 'c', 'd']\\ndf = pandas.DataFrame(array, columns=columns, index=index)\\ndf\";\n", " var nbb_formatted_code = \"index = [\\\"a\\\", \\\"b\\\", \\\"c\\\", \\\"d\\\"]\\ndf = pandas.DataFrame(array, columns=columns, index=index)\\ndf\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "index = [\"a\", \"b\", \"c\", \"d\"]\n", "df = pandas.DataFrame(array, columns=columns, index=index)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normal indexing selects columns." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "a 1.360839\n", "b 0.055034\n", "c -0.988534\n", "d 1.028142\n", "Name: A, dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 33;\n", " var nbb_unformatted_code = \"df['A']\";\n", " var nbb_formatted_code = \"df[\\\"A\\\"]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df[\"A\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use the `loc` attribute to select rows." ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A 1.360839\n", "B -0.653037\n", "Name: a, dtype: float64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 34;\n", " var nbb_unformatted_code = \"df.loc['a']\";\n", " var nbb_formatted_code = \"df.loc[\\\"a\\\"]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.loc[\"a\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you don't want to use the row labels and prefer to access the rows using integer indices, you can use the `iloc` attribute:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A 1.360839\n", "B -0.653037\n", "Name: a, dtype: float64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 35;\n", " var nbb_unformatted_code = \"df.iloc[0]\";\n", " var nbb_formatted_code = \"df.iloc[0]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.iloc[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`loc` can also take a list of labels." ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
a1.360839-0.653037
c-0.9885342.137857
\n", "
" ], "text/plain": [ " A B\n", "a 1.360839 -0.653037\n", "c -0.988534 2.137857" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 36;\n", " var nbb_unformatted_code = \"indices = ['a', 'c']\\ndf.loc[indices]\";\n", " var nbb_formatted_code = \"indices = [\\\"a\\\", \\\"c\\\"]\\ndf.loc[indices]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "indices = [\"a\", \"c\"]\n", "df.loc[indices]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you provide a slice of labels, `DataFrame` uses it to select rows." ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
a1.360839-0.653037
b0.055034-0.008179
c-0.9885342.137857
\n", "
" ], "text/plain": [ " A B\n", "a 1.360839 -0.653037\n", "b 0.055034 -0.008179\n", "c -0.988534 2.137857" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 37;\n", " var nbb_unformatted_code = \"df['a':'c']\";\n", " var nbb_formatted_code = \"df[\\\"a\\\":\\\"c\\\"]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df[\"a\":\"c\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you provide a slice of integers, `DataFrame` selects rows by integer index." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
a1.360839-0.653037
b0.055034-0.008179
\n", "
" ], "text/plain": [ " A B\n", "a 1.360839 -0.653037\n", "b 0.055034 -0.008179" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 38;\n", " var nbb_unformatted_code = \"df[0:2]\";\n", " var nbb_formatted_code = \"df[0:2]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df[0:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "But notice that one method includes the last elements of the slice and one does not.\n", "\n", "In general, I recommend giving labels to the rows and names to the columns, and using them consistently." ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Exercises" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Exercise:** In Chapter 3 we computed the mean of a sample by adding up\n", "the elements and dividing by n. If you are given a PMF, you can\n", "still compute the mean, but the process is slightly different:\n", "%\n", "$$ \\bar x = \\sum_i p_i~x_i $$\n", "%\n", "where the $x_i$ are the unique values in the PMF and $p_i=PMF(x_i)$.\n", "Similarly, you can compute variance like this:\n", "%\n", "$$ S^2 = \\sum_i p_i~(x_i - \\bar x)^2 $$\n", "% \n", "Write functions called `PmfMean` and `PmfVar` that take a\n", "Pmf object and compute the mean and variance. To test these methods,\n", "check that they are consistent with the methods `Mean` and `Var`\n", "provided by `Pmf`." ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 39;\n", " var nbb_unformatted_code = \"def PmfMean(pmf):\\n \\\"\\\"\\\"Computes the mean of a PMF.\\n Returns:\\n float mean\\n \\\"\\\"\\\"\\n return sum(p * x for x, p in pmf.Items())\";\n", " var nbb_formatted_code = \"def PmfMean(pmf):\\n \\\"\\\"\\\"Computes the mean of a PMF.\\n Returns:\\n float mean\\n \\\"\\\"\\\"\\n return sum(p * x for x, p in pmf.Items())\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def PmfMean(pmf):\n", " \"\"\"Computes the mean of a PMF.\n", " Returns:\n", " float mean\n", " \"\"\"\n", " return sum(p * x for x, p in pmf.Items())" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 40;\n", " var nbb_unformatted_code = \"def PmfVar(pmf, mu=None):\\n \\\"\\\"\\\"Computes the variance of a PMF.\\n mu: the point around which the variance is computed;\\n if omitted, computes the mean\\n returns: float variance\\n \\\"\\\"\\\"\\n if mu is None:\\n mu = PmfMean(pmf)\\n\\n return sum(p * (x-mu)**2 for x, p in pmf.Items())\";\n", " var nbb_formatted_code = \"def PmfVar(pmf, mu=None):\\n \\\"\\\"\\\"Computes the variance of a PMF.\\n mu: the point around which the variance is computed;\\n if omitted, computes the mean\\n returns: float variance\\n \\\"\\\"\\\"\\n if mu is None:\\n mu = PmfMean(pmf)\\n\\n return sum(p * (x - mu) ** 2 for x, p in pmf.Items())\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def PmfVar(pmf, mu=None):\n", " \"\"\"Computes the variance of a PMF.\n", " mu: the point around which the variance is computed;\n", " if omitted, computes the mean\n", " returns: float variance\n", " \"\"\"\n", " if mu is None:\n", " mu = PmfMean(pmf)\n", "\n", " return sum(p * (x - mu) ** 2 for x, p in pmf.Items())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Exercise:** Something like the class size paradox appears if you survey children and ask how many children are in their family. Families with many children are more likely to appear in your sample, and families with no children have no chance to be in the sample.\n", "\n", "Use the NSFG respondent variable `numkdhh` to construct the actual distribution for the number of children under 18 in the respondents' households.\n", "\n", "Now compute the biased distribution we would see if we surveyed the children and asked them how many children under 18 (including themselves) are in their household.\n", "\n", "Plot the actual and biased distributions, and compute their means." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dct\")\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemResp.dat.gz\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 41;\n", " var nbb_unformatted_code = \"resp = nsfg.ReadFemResp()\";\n", " var nbb_formatted_code = \"resp = nsfg.ReadFemResp()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "resp = nsfg.ReadFemResp()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 42;\n", " var nbb_unformatted_code = \"# Solution\\n\\npmf = thinkstats2.Pmf(resp.numkdhh, label='numkdhh')\";\n", " var nbb_formatted_code = \"# Solution\\n\\npmf = thinkstats2.Pmf(resp.numkdhh, label=\\\"numkdhh\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "pmf = thinkstats2.Pmf(resp.numkdhh, label=\"numkdhh\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAVHUlEQVR4nO3df5BdZX3H8c8nm92kpYBDsoOQTZtUQiFFA+EaBGIlGeiwiCSWYEAEQRxIazCA1OI4I9o6o0FlxCklAoUIWAISgYwG0amJKaCQDeFHws8U47AlyCZaMdr82OTbP+5J5maz2Zvs3rNn733er5kdz4/nnvs9K9nPOc855zmOCAEA0jWs6AIAAMUiCAAgcQQBACSOIACAxBEEAJC44UUXcKBGjx4d48aNK7oMAKgrq1at2hgRrb2tq7sgGDdunDo6OoouAwDqiu1f7WsdXUMAkDiCAAASRxAAQOLq7hoBgMa0fft2dXZ2asuWLUWXUtdGjhyptrY2NTc37/dnCAIAQ0JnZ6cOPvhgjRs3TraLLqcuRYQ2bdqkzs5OjR8/fr8/R9cQgCFhy5YtGjVqFCEwALY1atSoAz6rIggADBmEwMD153eYTNfQwz99Vvc90qGt27YXXUpNjGhp1uz2kmZMn1R0KQDqXDJnBI0UApK0ddt23fcID9YB9eC0006r+iDswoULNXfuXEnSJZdcogceeGCvNsuXL9fZZ59d8/qSCYJGCoFdGnGfAAy+ZLqGKi2+aU7RJQzIufMWFF0C0JDWr1+v9vZ2TZ06VU888YTGjBmjhx9+WO3t7fr617+uUqmkjRs3qlQqaf369Vq4cKEeeugh7dixQ2vWrNFnPvMZbdu2TXfffbdGjBihpUuX6rDDDtu9/Z07d+rSSy/V2LFj9eUvf1l33nmnvvKVr+iII47Q0UcfrREjRuxuu2LFCt1444168803dcMNN2jWrFmSpM2bN2vWrFlas2aNTjzxRN1zzz0DvraSZBAAGNryPNipdiD46quv6t5779Vtt92mj3zkI1q8eHGf7desWaPVq1dry5YtOuqoozR//nytXr1aV199te666y5dddVVkqTu7m5deOGFOu644/T5z39eGzZs0PXXX69Vq1bp0EMP1bRp03TCCSfs3u6GDRv02GOP6aWXXtI555yzOwhWr16ttWvX6sgjj9Spp56qxx9/XFOnTh3Q7ySZriEA2B/jx4/X8ccfL0k68cQTtX79+j7bT5s2TQcffLBaW1t16KGH6kMf+pAk6d3vfvcen73iiit2h4AkPfnkkzrttNPU2tqqlpYWzZ49e4/tzpw5U8OGDdPEiRP161//evfyKVOmqK2tTcOGDdPxxx9ftb79QRAAQIXK7pmmpiZ1d3dr+PDh2rlzpyTtdY9+Zfthw4btnh82bJi6u7t3rzvllFO0bNmyPT7fV5dO5XYjos/6BoquIQBDzlC7jjdu3DitWrVKU6ZM6fVunv1x2WWXacWKFTrvvPP04IMP6qSTTtK8efO0adMmHXLIIfre976nSZOKuR2cMwIAqOLaa6/VLbfcolNOOUUbN27s93auueYaTZ48WRdddJEOP/xwffGLX9TJJ5+s008/XZMnT65hxQfGlacc9aBUKkV/XkxTefFpqB1tHKhG2hdglxdffFHHHnts0WU0hN5+l7ZXRUSpt/acEQBA4ggCAEgcQQBgyKi3ruqhqD+/Q4IAwJAwcuRIbdq0iTAYgF3vIxg5cuQBfY7bRwEMCW1tbers7FRXV1fRpdS1XW8oOxAEAYAhobm5+YDeqoXaoWsIABJHEABA4ggCAEgcQQAAiSMIACBxBAEAJI4gAIDEEQQAkDiCAAASRxAAQOJyDQLbZ9p+2fY629f10e69tnfYnpVnPQCAveUWBLabJN0sqV3SREkX2J64j3bzJT2aVy0AgH3L84xgiqR1EfFaRGyTtEjSjF7aXSlpsaS3cqwFALAPeQbBGEmvV8x3Zst2sz1G0oclLVAfbF9uu8N2B0PUAkBt5RkE7mVZzzdOfFPSP0XEjr42FBG3RkQpIkqtra21qg8AoHzfR9ApaWzFfJukN3q0KUlaZFuSRks6y3Z3RDyUY10AgAp5BsFKSRNsj5f0P5LOl/TRygYRsfstFLYXSvoBIQAAgyu3IIiIbttzVb4bqEnSHRGx1vacbH2f1wUAAIMj11dVRsRSSUt7LOs1ACLikjxrAQD0jieLASBxBAEAJI4gAIDEEQQAkDiCAAASRxAAQOIIAgBIHEEAAIkjCAAgcQQBACSOIACAxBEEAJA4ggAAEkcQAEDiCAIASBxBAACJIwgAIHEEAQAkjiAAgMQRBACQOIIAABJHEABA4ggCAEgcQQAAiSMIACBxBAEAJI4gAIDEEQQAkDiCAAASRxAAQOIIAgBIHEEAAIkjCAAgcQQBACQu1yCwfabtl22vs31dL+tn2H7O9jO2O2xPzbMeAMDehue1YdtNkm6WdIakTkkrbS+JiBcqmv2npCUREbbfI+l+ScfkVRMAYG95nhFMkbQuIl6LiG2SFkmaUdkgIjZHRGSzB0kKAQAGVZ5BMEbS6xXzndmyPdj+sO2XJP1Q0id625Dty7Ouo46urq5cigWAVOUZBO5l2V5H/BHxYEQcI2mmpH/pbUMRcWtElCKi1NraWtsqASBxeQZBp6SxFfNtkt7YV+OIWCHpXbZH51gTAKCHPINgpaQJtsfbbpF0vqQllQ1sH2Xb2fRkSS2SNuVYEwCgh9zuGoqIbttzJT0qqUnSHRGx1vacbP0CSedKutj2dkn/J2l2xcVjAMAgyC0IJCkilkpa2mPZgorp+ZLm51kDAKBvPFkMAIkjCAAgcQQBACSOIACAxBEEAJA4ggAAEkcQAEDiCAIASBxBAACJIwgAIHEEAQAkjiAAgMQRBACQOIIAABJHEABA4voMAtsLK6Y/nns1AIBBV+2MYFLF9Lw8CwEAFKNaEPDaSABocNVeVdlm+1uSXDG9W0R8OrfKAACDoloQ/GPFdEeehQAAitFnEETEdwarEABAMfoMAttL+lofEefUthwAwGCr1jV0sqTXJd0r6UmVrxUAABpItSB4p6QzJF0g6aOSfijp3ohYm3dhAIDB0eftoxGxIyJ+FBEfl/Q+SeskLbd95aBUBwDIXbUzAtkeIemDKp8VjJP0LUnfz7csAMBgqXax+DuSjpP0iKQvRcSaQakKADBoqp0RXCTpD5KOljTP9q4njS0pIuKQPIsDAOSv2nMEjE4KAA2uWtfQSElzJB0l6TlJd0RE92AUhv1z7rwFRZcwYCNamjW7vaQZ0ydVbwyg5qod8X9HUknS85LOkvSN3CtCVSNamosuoaa2btuu+x5hBBOgKNWCYGJEfCwivi1plqT3D0JNqGJ2e6khwwBAMapdLN79rzMium0eLB4KZkyf1DDdKI3QtQXUu6ovprH9dvbze0nv2TVt++1qG7d9pu2Xba+zfV0v6y+0/Vz284TtxvjrBgB1pNpdQ0393bDtJkk3qzxERaeklbaXRMQLFc1+KekDEfFb2+2SbpV0Un+/EwBw4PK8PXSKpHUR8VpEbJO0SNKMygYR8URE/Dab/YWkthzrAQD0Is8gGKPyyKW7dGbL9uUylZ9g3ovty2132O7o6uqqYYkAgKpjDQ1Ab1eWe30Hsu1pKgfB1N7WR8StKncbqVQq8R7lBtUIF455JgL1KM8zgk5JYyvm2yS90bOR7fdIul3SjIjYlGM9GIIa8TZYnolAvckzCFZKmmB7vO0WSedL2uONZ7b/XOWRTC+KiFdyrAVDFM9EAMXLrWsoe+5grqRHJTWpPDzFWttzsvULJH1B0ihJ/5Y9o9AdEaW8asLQwzMRQPHyvEagiFgqaWmPZQsqpj8p6ZN51gAA6BujiwJA4ggCAEgcQQAAiSMIACBxBAEAJI4gAIDEEQQAkDiCAAASRxAAQOIIAgBIHEEAAIkjCAAgcQQBACSOIACAxBEEAJA4ggAAEkcQAEDiCAIASBxBAACJIwgAIHEEAQAkjiAAgMQRBACQOIIAABJHEABA4ggCAEgcQQAAiSMIACBxBAEAJI4gAIDEEQQAkDiCAAASRxAAQOJyDQLbZ9p+2fY629f1sv4Y2z+3vdX2tXnWAgDo3fC8Nmy7SdLNks6Q1Clppe0lEfFCRbPfSPq0pJl51QEA6FueZwRTJK2LiNciYpukRZJmVDaIiLciYqWk7TnWAQDoQ55BMEbS6xXzndkyAMAQkmcQuJdl0a8N2Zfb7rDd0dXVNcCyAACV8gyCTkljK+bbJL3Rnw1FxK0RUYqIUmtra02KAwCU5RkEKyVNsD3edouk8yUtyfH7AAD9kNtdQxHRbXuupEclNUm6IyLW2p6TrV9g+52SOiQdImmn7askTYyIt/OqCwCwp9yCQJIiYqmkpT2WLaiYflPlLiOgoZw7b0H1RkPYiJZmzW4vacb0SUWXgkHAk8VAjYxoaS66hJrZum277nuko+gyMEgIAqBGZreXGi4MkIZcu4aAlMyYPqkhulLqvVsLB44zAgBIHEEAAIkjCAAgcQQBACSOIACAxBEEAJA4ggAAEkcQAEDiCAIASBxBAACJIwgAIHEEAQAkjiAAgMQRBACQOIIAABJHEABA4ggCAEgcQQAAiSMIACBxBAEAJI4gAIDEEQQAkLjhRRcAYOg6d96CoksYsBEtzZrdXtKM6ZOKLmXI4owAwB5GtDQXXUJNbd22Xfc90lF0GUMaQQBgD7PbSw0ZBtg3uoYA7GHG9EkN043SCF1bg4EzAgBIHEEAAImjawhAEhqhmyivO6A4IwDQsBrxonced0ARBAAaFndA7Z9cu4ZsnynpJklNkm6PiK/2WO9s/VmS/ijpkoh4Os+aAKSDO6D2T25nBLabJN0sqV3SREkX2J7Yo1m7pAnZz+WSbsmrHgBA7/LsGpoiaV1EvBYR2yQtkjSjR5sZku6Ksl9IeoftI3KsCQDQQ55BMEbS6xXzndmyA20j25fb7rDd0dXVVfNCASBleQaBe1kW/WijiLg1IkoRUWptba1JcQCAsjwvFndKGlsx3ybpjX60qYnFN83JY7MAMCjy/BuW5xnBSkkTbI+33SLpfElLerRZIulil71P0u8iYkOONQEAesjtjCAium3PlfSoyreP3hERa23PydYvkLRU5VtH16l8++iledUDAOhdrs8RRMRSlf/YVy5bUDEdkj6VZw0AgL7xZDEAJI4gAIDEEQQAkDiCAAAS5/L12vphu0vSr4quo4rRkjYWXUSNNMq+NMp+SOzLUFQP+/EXEdHrE7l1FwT1wHZHRJSKrqMWGmVfGmU/JPZlKKr3/aBrCAASRxAAQOIIgnzcWnQBNdQo+9Io+yGxL0NRXe8H1wgAIHGcEQBA4ggCAEgcQVBDts+0/bLtdbavK7qe/rJ9h+23bK8pupaBsj3W9jLbL9pea3te0TX1l+2Rtp+y/Wy2L18quqaBsN1ke7XtHxRdy0DYXm/7edvP2O4oup7+4BpBjdhukvSKpDNUfuHOSkkXRMQLhRbWD7b/RtJmld8nfVzR9QxE9g7sIyLiadsHS1olaWad/v9iSQdFxGbbzZIekzQve9933bF9jaSSpEMi4uyi6+kv2+sllSJiqD9Qtk+cEdTOFEnrIuK1iNgmaZGkGQXX1C8RsULSb4quoxYiYkNEPJ1N/17Si+rlvdj1IMo2Z7PN2U9dHsnZbpP0QUm3F10LCIJaGiPp9Yr5TtXpH5xGZXucpBMkPVlwKf2Wdac8I+ktST+JiHrdl29K+qyknQXXUQsh6ce2V9m+vOhi+oMgqB33sqwuj9Yake0/k7RY0lUR8XbR9fRXROyIiONVfr/3FNt113Vn+2xJb0XEqqJrqZFTI2KypHZJn8q6VusKQVA7nZLGVsy3SXqjoFpQIetPXyzpuxHx/aLrqYWI+F9JyyWdWWwl/XKqpHOyvvVFkqbbvqfYkvovIt7I/vctSQ+q3E1cVwiC2lkpaYLt8bZbJJ0vaUnBNSUvu8D675JejIgbi65nIGy32n5HNv0nkk6X9FKhRfVDRHwuItoiYpzK/05+GhEfK7isfrF9UHYTgmwfJOlvJdXd3XYEQY1ERLekuZIeVfmC5P0RsbbYqvrH9r2Sfi7pr2x32r6s6JoG4FRJF6l81PlM9nNW0UX10xGSltl+TuUDj59ERF3fetkADpf0mO1nJT0l6YcR8aOCazpg3D4KAInjjAAAEkcQAEDiCAIASBxBAACJIwgAIHEEAYYU22H7GxXz19r+Yo22vdD2rFpsq8r3nJeNdrpsP9svt73Xi89tl2x/K5u+xPa/7uPzm3tbDuwvggBDzVZJf2d7dNGFVMpGl91fl0n6h4iYNpDvjIiOiPh0fz57gPUicQQBhppuld//enXPFT2P6HcdCds+zfbPbN9v+xXbX7V9YTZ2//O231WxmdNt/1fW7uzs8022v2Z7pe3nbF9Rsd1ltv9D0vO91HNBtv01tudny74gaaqkBba/1stnPpt95lnbX61YdV5W7yu231/x/Xs9MJY9vf7zrN5/qVi+R71V9mu57Qdsv2T7u9kT2EjU8KILAHpxs6TnbN9wAJ+ZJOlYlYfPfk3S7RExxeUX0Vwp6aqs3ThJH5D0LpWf0j1K0sWSfhcR77U9QtLjtn+ctZ8i6biI+GXll9k+UtJ8SSdK+q3Ko0/OjIh/tj1d0rUR0dHjM+2SZko6KSL+aPuwitXDs3rPknS9ysNH7MtNkm6JiLtsf6rHut31ZiNh7mu/TpD01yqPh/W4yk9gP9bHd6KBcUaAIScbHfQuSQfSLbIye/fAVkn/LWnXH7znVf7jv8v9EbEzIl5VOTCOUXl8mIuz4Z2flDRK0oSs/VM9QyDzXknLI6IrG17ku5KqjTp5uqQ7I+KP2X5WvvNh12B4q3rU25tTJd2bTd/dY11lvdX2qzMidkp6Zj++Ew2MMwIMVd+U9LSkOyuWdSs7eMm6Mloq1m2tmN5ZMb9Te/533nNMlVB5CPErI+LRyhW2T5P0h33U15+uFPfy/bvsqneH9u/f5b62U1lvX/tV+fva3+9Eg+KMAENSdrR8v8oXXndZr3JXjFR++1tzPzZ9nu1h2XWDv5T0ssoDBf59Nly1bB+djSTZlyclfcD26OzC7AWSflblMz+W9Anbf5p9z2FV2u/L4yqP2ilJF/bRrj/7hQQRBBjKviGp8u6h21T+4/uUpJO076P1vrys8h/sRyTNiYgtKr8u8QVJT9teI+nbqnKEHBEbJH1O0jJJz0p6OiIervKZH6k8NHlH1l1zbT/ql6R5Kr8AZaWkQ/tod8D7hTQx+igAJI4zAgBIHEEAAIkjCAAgcQQBACSOIACAxBEEAJA4ggAAEvf/5QjnIJiJ0WMAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 43;\n", " var nbb_unformatted_code = \"# Solution\\n\\nthinkplot.Pmf(pmf)\\nthinkplot.Config(xlabel='Number of children', ylabel='PMF')\";\n", " var nbb_formatted_code = \"# Solution\\n\\nthinkplot.Pmf(pmf)\\nthinkplot.Config(xlabel=\\\"Number of children\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "thinkplot.Pmf(pmf)\n", "thinkplot.Config(xlabel=\"Number of children\", ylabel=\"PMF\")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 44;\n", " var nbb_unformatted_code = \"# Solution\\n\\nbiased = BiasPmf(pmf, label='biased')\";\n", " var nbb_formatted_code = \"# Solution\\n\\nbiased = BiasPmf(pmf, label=\\\"biased\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "biased = BiasPmf(pmf, label=\"biased\")" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 45;\n", " var nbb_unformatted_code = \"# Solution\\n\\nthinkplot.PrePlot(2)\\nthinkplot.Pmfs([pmf, biased])\\nthinkplot.Config(xlabel='Number of children', ylabel='PMF')\";\n", " var nbb_formatted_code = \"# Solution\\n\\nthinkplot.PrePlot(2)\\nthinkplot.Pmfs([pmf, biased])\\nthinkplot.Config(xlabel=\\\"Number of children\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "thinkplot.PrePlot(2)\n", "thinkplot.Pmfs([pmf, biased])\n", "thinkplot.Config(xlabel=\"Number of children\", ylabel=\"PMF\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.024205155043831" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 46;\n", " var nbb_unformatted_code = \"# Solution\\n\\npmf.Mean()\";\n", " var nbb_formatted_code = \"# Solution\\n\\npmf.Mean()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "pmf.Mean()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.403679100664282" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 47;\n", " var nbb_unformatted_code = \"# Solution\\n\\nbiased.Mean()\";\n", " var nbb_formatted_code = \"# Solution\\n\\nbiased.Mean()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "biased.Mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Exercise:** I started this book with the question, \"Are first babies more likely to be late?\" To address it, I computed the difference in means between groups of babies, but I ignored the possibility that there might be a difference between first babies and others for the same woman.\n", "\n", "To address this version of the question, select respondents who have at least two live births and compute pairwise differences. Does this formulation of the question yield a different result?\n", "\n", "Hint: use `nsfg.MakePregMap`:" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 48;\n", " var nbb_unformatted_code = \"live, firsts, others = first.MakeFrames()\";\n", " var nbb_formatted_code = \"live, firsts, others = first.MakeFrames()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "live, firsts, others = first.MakeFrames()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 49;\n", " var nbb_unformatted_code = \"preg_map = nsfg.MakePregMap(live)\";\n", " var nbb_formatted_code = \"preg_map = nsfg.MakePregMap(live)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "preg_map = nsfg.MakePregMap(live)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 50;\n", " var nbb_unformatted_code = \"# Solution\\n\\nhist = thinkstats2.Hist()\\n\\nfor caseid, indices in preg_map.items():\\n if len(indices) >= 2:\\n pair = preg.loc[indices[0:2]].prglngth\\n diff = np.diff(pair)[0]\\n hist[diff] += 1\";\n", " var nbb_formatted_code = \"# Solution\\n\\nhist = thinkstats2.Hist()\\n\\nfor caseid, indices in preg_map.items():\\n if len(indices) >= 2:\\n pair = preg.loc[indices[0:2]].prglngth\\n diff = np.diff(pair)[0]\\n hist[diff] += 1\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "hist = thinkstats2.Hist()\n", "\n", "for caseid, indices in preg_map.items():\n", " if len(indices) >= 2:\n", " pair = preg.loc[indices[0:2]].prglngth\n", " diff = np.diff(pair)[0]\n", " hist[diff] += 1" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAStklEQVR4nO3dfYxc53me8esuadNfESxVK4EmiZIpWLeUkNb2glXrIjAgp2IcQ1SBCqARx2yjgqghN04/YJNVYKFoCChNkSZGKgeErJpGFAmE40CsCyVmmBhGAdnKypItUjSjdYiKGzHipkYatQGYUH76x7yKJ6slubuzmh3qvX7AYM55znvmPEsu73l55sxMqgpJUh/+2lo3IEkaH0Nfkjpi6EtSRwx9SeqIoS9JHVm/1g1cyfXXX19bt25d6zYk6ary5JNP/nFVTS2sT3zob926lZmZmbVuQ5KuKkn+12J1T+9IUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHJv4dudK4fexnH/4r67/8Mx9ao06k1edMX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kduWLoJ3kwyfkkJxbZ9u+SVJLrh2oHkswmOZ3ktqH6e5I807Z9OklW78eQJC3FUmb6nwN2LSwm2QL8CPD8UG0HsAe4qe1zf5J1bfNngH3A9nZ71WNKkl5bVwz9qvoq8N1FNv0X4BNADdV2A49U1YWqOgPMAjuTbASuqarHq6qAzwN3jNq8JGl5VnROP8ntwB9W1TcXbNoEnB1an2u1TW15Yf1Sj78vyUySmfn5+ZW0KElaxLJDP8lbgHuATy22eZFaXaa+qKo6VFXTVTU9NTW13BYlSZewkm/O+pvANuCb7bXYzcA3kuxkMIPfMjR2M/BCq29epC5JGqNlz/Sr6pmquqGqtlbVVgaB/u6q+iPgKLAnyYYk2xi8YPtEVZ0DXkpyS7tq5yPAo6v3Y0iSlmIpl2w+DDwOvDPJXJK7LjW2qk4CR4Bngd8E7q6ql9vmjwIPMHhx9zvAYyP2Lklapiue3qmqy34rdJvtD68fBA4uMm4GuHmZ/UmSVpHvyJWkjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JGlfEfug0nOJzkxVPv5JN9O8q0kv5Hk7UPbDiSZTXI6yW1D9fckeaZt+3T7gnRJ0hgtZab/OWDXgtox4Oaq+iHg94EDAEl2AHuAm9o+9ydZ1/b5DLAP2N5uCx9TkvQau2LoV9VXge8uqH25qi621a8Bm9vybuCRqrpQVWeAWWBnko3ANVX1eFUV8HngjlX6GSRJS7Qa5/R/EnisLW8Czg5tm2u1TW15YV2SNEYjhX6Se4CLwEOvlBYZVpepX+px9yWZSTIzPz8/SouSpCErDv0ke4EPAj/eTtnAYAa/ZWjYZuCFVt+8SH1RVXWoqqaranpqamqlLUqSFlhR6CfZBXwSuL2q/mxo01FgT5INSbYxeMH2iao6B7yU5JZ21c5HgEdH7F2StEzrrzQgycPA+4Drk8wB9zK4WmcDcKxdefm1qvqXVXUyyRHgWQanfe6uqpfbQ32UwZVAb2bwGsBjSJLG6oqhX1UfWqT82cuMPwgcXKQ+A9y8rO4kSavKd+RKUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHbli6Cd5MMn5JCeGatclOZbkuXZ/7dC2A0lmk5xOcttQ/T1JnmnbPp32jeqSpPFZykz/c8CuBbX9wPGq2g4cb+sk2QHsAW5q+9yfZF3b5zPAPmB7uy18TEnSa+yKoV9VXwW+u6C8Gzjclg8DdwzVH6mqC1V1BpgFdibZCFxTVY9XVQGfH9pHkjQmKz2nf2NVnQNo9ze0+ibg7NC4uVbb1JYX1heVZF+SmSQz8/PzK2xRkrTQar+Qu9h5+rpMfVFVdaiqpqtqempqatWak6TerTT0X2ynbGj351t9DtgyNG4z8EKrb16kLkkao5WG/lFgb1veCzw6VN+TZEOSbQxesH2inQJ6Kckt7aqdjwztI0kak/VXGpDkYeB9wPVJ5oB7gfuAI0nuAp4H7gSoqpNJjgDPAheBu6vq5fZQH2VwJdCbgcfaTZI0RlcM/ar60CU23XqJ8QeBg4vUZ4Cbl9WdJGlV+Y5cSeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdGSn0k/zrJCeTnEjycJI3JbkuybEkz7X7a4fGH0gym+R0kttGb1+StBwrDv0km4CfAqar6mZgHbAH2A8cr6rtwPG2TpIdbftNwC7g/iTrRmtfkrQco57eWQ+8Ocl64C3AC8Bu4HDbfhi4oy3vBh6pqgtVdQaYBXaOeHxJ0jKsOPSr6g+B/ww8D5wD/k9VfRm4sarOtTHngBvaLpuAs0MPMddqr5JkX5KZJDPz8/MrbVGStMAop3euZTB73wa8A3hrkg9fbpdFarXYwKo6VFXTVTU9NTW10hYlSQuMcnrn/cCZqpqvqr8Avgj8Q+DFJBsB2v35Nn4O2DK0/2YGp4MkSWMySug/D9yS5C1JAtwKnAKOAnvbmL3Ao235KLAnyYYk24DtwBMjHF+StEzrV7pjVX09yReAbwAXgaeAQ8DbgCNJ7mLwxHBnG38yyRHg2Tb+7qp6ecT+JUnLsOLQB6iqe4F7F5QvMJj1Lzb+IHBwlGNKklbOd+RKUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SerISKGf5O1JvpDk20lOJfkHSa5LcizJc+3+2qHxB5LMJjmd5LbR25ckLceoM/1fAn6zqv428HeBU8B+4HhVbQeOt3WS7AD2ADcBu4D7k6wb8fiSpGVYcegnuQb4YeCzAFX151X1J8Bu4HAbdhi4oy3vBh6pqgtVdQaYBXau9PiSpOUbZab/g8A88N+SPJXkgSRvBW6sqnMA7f6GNn4TcHZo/7lWe5Uk+5LMJJmZn58foUVJ0rBRQn898G7gM1X1LuD/0U7lXEIWqdViA6vqUFVNV9X01NTUCC1KkoaNEvpzwFxVfb2tf4HBk8CLSTYCtPvzQ+O3DO2/GXhhhONLkpZpxaFfVX8EnE3yzla6FXgWOArsbbW9wKNt+SiwJ8mGJNuA7cATKz2+JGn51o+4/78CHkryRuAPgH/O4InkSJK7gOeBOwGq6mSSIwyeGC4Cd1fVyyMeX5K0DCOFflU9DUwvsunWS4w/CBwc5ZiSpJXzHbmS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjoycugnWZfkqSRfauvXJTmW5Ll2f+3Q2ANJZpOcTnLbqMeWJC3Pasz0Pw6cGlrfDxyvqu3A8bZOkh3AHuAmYBdwf5J1q3B8SdISjRT6STYDPwY8MFTeDRxuy4eBO4bqj1TVhao6A8wCO0c5viRpeUad6f8i8Ange0O1G6vqHEC7v6HVNwFnh8bNtdqrJNmXZCbJzPz8/IgtSpJeseLQT/JB4HxVPbnUXRap1WIDq+pQVU1X1fTU1NRKW5QkLbB+hH3fC9ye5APAm4Brkvwq8GKSjVV1LslG4HwbPwdsGdp/M/DCCMeXJC3Timf6VXWgqjZX1VYGL9D+TlV9GDgK7G3D9gKPtuWjwJ4kG5JsA7YDT6y4c0nSso0y07+U+4AjSe4CngfuBKiqk0mOAM8CF4G7q+rl1+D4kqRLWJXQr6qvAF9py/8buPUS4w4CB1fjmJKk5fMduZLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOrLi0E+yJcnvJjmV5GSSj7f6dUmOJXmu3V87tM+BJLNJTie5bTV+AEnS0o3yHbkXgX9bVd9I8gPAk0mOAf8MOF5V9yXZD+wHPplkB7AHuAl4B/DbSf6WX46utfaxn314rVuQxmbFoV9V54BzbfmlJKeATcBu4H1t2GEGX5j+yVZ/pKouAGeSzAI7gcdX2oO0Eoa8erYq5/STbAXeBXwduLE9IbzyxHBDG7YJODu021yrSZLGZOTQT/I24NeBn66qP73c0EVqdYnH3JdkJsnM/Pz8qC1KkpqRQj/JGxgE/kNV9cVWfjHJxrZ9I3C+1eeALUO7bwZeWOxxq+pQVU1X1fTU1NQoLUqShoxy9U6AzwKnquoXhjYdBfa25b3Ao0P1PUk2JNkGbAeeWOnxJUnLN8rVO+8FfgJ4JsnTrfbvgfuAI0nuAp4H7gSoqpNJjgDPMrjy526v3JGk8Rrl6p3/yeLn6QFuvcQ+B4GDKz2mJGk0o8z0pS4tvOTzl3/mQ2vUibR8fgyDJHXEmb50Bb6ZS68nzvQlqSOGviR1xNCXpI4Y+pLUEV/IlVaZl3Rqkhn6et0xdKVL8/SOJHXEmb5e97zOXvo+Z/qS1BFDX5I6YuhLUkc8py+NyNcMdDVxpi9JHXGmr6ve8Ezba/KlyzP0pauAT2xaLZ7ekaSOjH2mn2QX8EvAOuCBqrpv3D1osix3Fnu1zXr9WAhNkrGGfpJ1wH8FfgSYA34vydGqenacfWiyLQz1qy3kl+JKP+Pr8WfWZBj3TH8nMFtVfwCQ5BFgN2DoX6WudLniYoFloI3uck8SS7Hwz93/jfQjVTW+gyX/FNhVVf+irf8E8Per6mMLxu0D9rXVdwKnx9bk0lwP/PFaN3EFk97jpPcH9rgaJr0/eP32+Deqamphcdwz/SxSe9WzTlUdAg699u2sTJKZqppe6z4uZ9J7nPT+wB5Xw6T3B/31OO6rd+aALUPrm4EXxtyDJHVr3KH/e8D2JNuSvBHYAxwdcw+S1K2xnt6pqotJPgb8FoNLNh+sqpPj7GGVTOyppyGT3uOk9wf2uBomvT/orMexvpArSVpbviNXkjpi6EtSRwz9ZUjyH5N8K8nTSb6c5B1D2w4kmU1yOslta9Tfzyf5duvxN5K8fZL6a33cmeRkku8lmV6wbVJ63NV6mE2yf636GJbkwSTnk5wYql2X5FiS59r9tWvc45Ykv5vkVPs7/vgk9ZnkTUmeSPLN1t9/mKT+FvS6LslTSb606j1Wlbcl3oBrhpZ/CviVtrwD+CawAdgGfAdYtwb9/WNgfVv+OeDnJqm/1svfYfCGu68A00P1ieiRwQUG3wF+EHhj62nHBPzu/TDwbuDEUO0/Afvb8v5X/r7XsMeNwLvb8g8Av9/+XieiTwbvE3pbW34D8HXglknpb0Gv/wb4NeBLq/137Ux/GarqT4dW38r331i2G3ikqi5U1RlglsFHToy7vy9X1cW2+jUG74OYmP5aj6eqarF3WE9Kj3/5USFV9efAKx8Vsqaq6qvAdxeUdwOH2/Jh4I5x9rRQVZ2rqm+05ZeAU8AmJqTPGvi/bfUN7VZMSH+vSLIZ+DHggaHyqvVo6C9TkoNJzgI/DnyqlTcBZ4eGzbXaWvpJ4LG2PIn9LTQpPU5KH0txY1Wdg0HgAjescT9/KclW4F0MZtMT02c7bfI0cB44VlUT1V/zi8AngO8N1VatR0N/gSS/neTEIrfdAFV1T1VtAR4CXvnMoCV9vMQ4+mtj7gEuth7H2t9Se1xst3H2eBmT0sdVK8nbgF8HfnrB/47XXFW9XFV/j8H/gncmuXmNW/orknwQOF9VT75Wx/CbsxaoqvcvceivAf8DuJcxfrzElfpLshf4IHBrtROA4+wPlvVnOGxSPqJjUvpYiheTbKyqc0k2Mpi9rqkkb2AQ+A9V1RdbeeL6rKo/SfIVYBeT1d97gduTfAB4E3BNkl9dzR6d6S9Dku1Dq7cD327LR4E9STYk2QZsB55Yg/52AZ8Ebq+qPxvaNBH9XcGk9Hg1fVTIUWBvW94LPLqGvZAkwGeBU1X1C0ObJqLPJFOvXNGW5M3A+xn8G56I/gCq6kBVba6qrQx+936nqj7Mava41q9SX003BjOYE8C3gP8ObBradg+Dqz5OAz+6Rv3NMjgf/XS7/cok9df6+CcMZtMXgBeB35rAHj/A4MqT7wD3rPXvXevpYeAc8Bftz+8u4K8Dx4Hn2v11a9zjP2JwKuxbQ7+DH5iUPoEfAp5q/Z0APtXqE9HfIv2+j+9fvbNqPfoxDJLUEU/vSFJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkf8PM4v5SJ3y7XgAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 51;\n", " var nbb_unformatted_code = \"# Solution\\n\\nthinkplot.Hist(hist)\";\n", " var nbb_formatted_code = \"# Solution\\n\\nthinkplot.Hist(hist)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "thinkplot.Hist(hist)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.056367432150313125" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 52;\n", " var nbb_unformatted_code = \"# Solution\\n\\npmf = thinkstats2.Pmf(hist)\\npmf.Mean()\";\n", " var nbb_formatted_code = \"# Solution\\n\\npmf = thinkstats2.Pmf(hist)\\npmf.Mean()\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "pmf = thinkstats2.Pmf(hist)\n", "pmf.Mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Exercise:** In most foot races, everyone starts at the same time. If you are a fast runner, you usually pass a lot of people at the beginning of the race, but after a few miles everyone around you is going at the same speed.\n", "When I ran a long-distance (209 miles) relay race for the first time, I noticed an odd phenomenon: when I overtook another runner, I was usually much faster, and when another runner overtook me, he was usually much faster.\n", "\n", "At first I thought that the distribution of speeds might be bimodal; that is, there were many slow runners and many fast runners, but few at my speed.\n", "\n", "Then I realized that I was the victim of a bias similar to the effect of class size. The race was unusual in two ways: it used a staggered start, so teams started at different times; also, many teams included runners at different levels of ability.\n", "\n", "As a result, runners were spread out along the course with little relationship between speed and location. When I joined the race, the runners near me were (pretty much) a random sample of the runners in the race.\n", "\n", "So where does the bias come from? During my time on the course, the chance of overtaking a runner, or being overtaken, is proportional to the difference in our speeds. I am more likely to catch a slow runner, and more likely to be caught by a fast runner. But runners at the same speed are unlikely to see each other.\n", "\n", "Write a function called `ObservedPmf` that takes a `Pmf` representing the actual distribution of runners’ speeds, and the speed of a running observer, and returns a new `Pmf` representing the distribution of runners’ speeds as seen by the observer.\n", "\n", "To test your function, you can use `relay.py`, which reads the results from the James Joyce Ramble 10K in Dedham MA and converts the pace of each runner to mph.\n", "\n", "Compute the distribution of speeds you would observe if you ran a relay race at 7 mph with this group of runners." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/relay.py\")\n", "download(\"https://github.com/AllenDowney/ThinkStats2/raw/master/code/Apr25_27thAn_set1.shtml\")" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 53;\n", " var nbb_unformatted_code = \"import relay\\n\\nresults = relay.ReadResults()\\nspeeds = relay.GetSpeeds(results)\\nspeeds = relay.BinData(speeds, 3, 12, 100)\";\n", " var nbb_formatted_code = \"import relay\\n\\nresults = relay.ReadResults()\\nspeeds = relay.GetSpeeds(results)\\nspeeds = relay.BinData(speeds, 3, 12, 100)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import relay\n", "\n", "results = relay.ReadResults()\n", "speeds = relay.GetSpeeds(results)\n", "speeds = relay.BinData(speeds, 3, 12, 100)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 54;\n", " var nbb_unformatted_code = \"pmf = thinkstats2.Pmf(speeds, 'actual speeds')\\nthinkplot.Pmf(pmf)\\nthinkplot.Config(xlabel='Speed (mph)', ylabel='PMF')\";\n", " var nbb_formatted_code = \"pmf = thinkstats2.Pmf(speeds, \\\"actual speeds\\\")\\nthinkplot.Pmf(pmf)\\nthinkplot.Config(xlabel=\\\"Speed (mph)\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pmf = thinkstats2.Pmf(speeds, \"actual speeds\")\n", "thinkplot.Pmf(pmf)\n", "thinkplot.Config(xlabel=\"Speed (mph)\", ylabel=\"PMF\")" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 55;\n", " var nbb_unformatted_code = \"# Solution\\n\\ndef ObservedPmf(pmf, speed, label=None):\\n \\\"\\\"\\\"Returns a new Pmf representing speeds observed at a given speed.\\n\\n The chance of observing a runner is proportional to the difference\\n in speed.\\n\\n Args:\\n pmf: distribution of actual speeds\\n speed: speed of the observing runner\\n label: string label for the new dist\\n\\n Returns:\\n Pmf object\\n \\\"\\\"\\\"\\n new = pmf.Copy(label=label)\\n for val in new.Values():\\n diff = abs(val - speed)\\n new[val] *= diff\\n new.Normalize()\\n return new\";\n", " var nbb_formatted_code = \"# Solution\\n\\n\\ndef ObservedPmf(pmf, speed, label=None):\\n \\\"\\\"\\\"Returns a new Pmf representing speeds observed at a given speed.\\n\\n The chance of observing a runner is proportional to the difference\\n in speed.\\n\\n Args:\\n pmf: distribution of actual speeds\\n speed: speed of the observing runner\\n label: string label for the new dist\\n\\n Returns:\\n Pmf object\\n \\\"\\\"\\\"\\n new = pmf.Copy(label=label)\\n for val in new.Values():\\n diff = abs(val - speed)\\n new[val] *= diff\\n new.Normalize()\\n return new\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "def ObservedPmf(pmf, speed, label=None):\n", " \"\"\"Returns a new Pmf representing speeds observed at a given speed.\n", "\n", " The chance of observing a runner is proportional to the difference\n", " in speed.\n", "\n", " Args:\n", " pmf: distribution of actual speeds\n", " speed: speed of the observing runner\n", " label: string label for the new dist\n", "\n", " Returns:\n", " Pmf object\n", " \"\"\"\n", " new = pmf.Copy(label=label)\n", " for val in new.Values():\n", " diff = abs(val - speed)\n", " new[val] *= diff\n", " new.Normalize()\n", " return new\n" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 56;\n", " var nbb_unformatted_code = \"# Solution\\n\\nbiased = ObservedPmf(pmf, 7, label='observed speeds')\\nthinkplot.Pmf(biased)\\nthinkplot.Config(xlabel='Speed (mph)', ylabel='PMF')\";\n", " var nbb_formatted_code = \"# Solution\\n\\nbiased = ObservedPmf(pmf, 7, label=\\\"observed speeds\\\")\\nthinkplot.Pmf(biased)\\nthinkplot.Config(xlabel=\\\"Speed (mph)\\\", ylabel=\\\"PMF\\\")\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", " nbb_cells[i].set_text(nbb_formatted_code);\n", " }\n", " break;\n", " }\n", " }\n", " }, 500);\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Solution\n", "\n", "biased = ObservedPmf(pmf, 7, label=\"observed speeds\")\n", "thinkplot.Pmf(biased)\n", "thinkplot.Config(xlabel=\"Speed (mph)\", ylabel=\"PMF\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" } }, "nbformat": 4, "nbformat_minor": 1 }