{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "import scipy.stats" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Implement ContingencyTable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I know scipy has a method to do this, but I thought it would be fun to implement it myself." ] }, { "cell_type": "code", "collapsed": false, "input": [ "class HtmlObject(object):\n", " \"\"\"An object that can be displayed as HTML.\"\"\"\n", " def __init__(self, html):\n", " self.html = html\n", "\n", " def _repr_html_(self):\n", " return self.html\n", "\n", "\n", "class ContingencyTable(object):\n", " \"\"\"A generic contingency table.\"\"\"\n", "\n", " def __init__(self, row_names, col_names, counts):\n", " assert counts.shape == (len(row_names), len(col_names))\n", " self.row_names = row_names\n", " self.col_names = col_names\n", " self.counts = counts\n", "\n", " def get_chisq_stat(self):\n", " \"\"\"Compute the chi-squared test statistic.\"\"\"\n", " expected = self.get_expected_counts()\n", " return np.sum((self.counts - expected)**2 / expected)\n", "\n", " def get_degrees_of_freedom(self):\n", " \"\"\"Get the number of degrees of freedom for the chi-squared distribution.\"\"\"\n", " return np.product(self.counts.shape) - np.sum(self.counts.shape) + 1\n", "\n", " def get_expected_counts(self):\n", " \"\"\"Get a matric of the expected counts under the null hypothesis.\"\"\"\n", " expected = np.zeros(self.counts.shape)\n", " for i in xrange(expected.shape[0]):\n", " for j in xrange(expected.shape[1]):\n", " n_i_dot = self.marginal(i, '.')\n", " n_j_dot = self.marginal('.', j)\n", " expected[i, j] = n_i_dot * n_j_dot\n", " expected = expected / self.marginal('.', '.')\n", " return expected\n", "\n", " def get_p_value(self):\n", " \"\"\"Compute a p-value.\"\"\"\n", " chisq_stat = self.get_chisq_stat()\n", " dof = self.get_degrees_of_freedom()\n", " return 1.0 - scipy.stats.chi2.cdf(chisq_stat, dof)\n", "\n", " def marginal(self, i, j):\n", " \"\"\"Compute a marginal (sum of row/column). Each argument can be either\n", " an index or a '.'\"\"\"\n", " return self._marginal(i, j, self.counts)\n", "\n", " def to_null_hypothesis_html(self):\n", " \"\"\"Get a table in HTML format showing the expected counts.\"\"\"\n", " expected = self.get_expected_counts()\n", " return HtmlObject(\n", " self._to_html(expected, caption='Expected Counts', summary=False))\n", "\n", " def _marginal(self, i, j, cell_values):\n", " i_dot = i == '.'\n", " j_dot = j == '.'\n", " assert i_dot or j_dot\n", " if i_dot and j_dot:\n", " return cell_values.sum()\n", " elif i_dot:\n", " return cell_values[:, j].sum()\n", " elif j_dot:\n", " return cell_values[i].sum()\n", "\n", " def _to_html(self, cell_values, caption='Counts', summary=True):\n", " lines = ['']\n", " lines.append(''.format(caption))\n", " \n", " def start_row():\n", " lines.append('')\n", "\n", " def end_row():\n", " lines.append('')\n", " \n", " def cell(contents):\n", " lines.append(''.format(str(contents)))\n", " \n", " # Column names\n", " start_row()\n", " cell('')\n", " for col_name in self.col_names:\n", " cell(col_name)\n", " cell('')\n", " end_row()\n", " \n", " # Rows\n", " for i, row_name in enumerate(self.row_names):\n", " start_row()\n", " cell(row_name)\n", " for j in xrange(len(self.col_names)):\n", " cell(cell_values[i, j])\n", " cell(self._marginal(i, '.', cell_values))\n", " end_row()\n", "\n", " start_row()\n", " cell('')\n", " for j in xrange(len(self.col_names)):\n", " cell(self._marginal('.', j, cell_values))\n", " cell(self._marginal('.', '.', cell_values))\n", " end_row()\n", " \n", " lines.append('
{0}
{0}
')\n", "\n", " if summary:\n", " chi2 = self.get_chisq_stat()\n", " df = self.get_degrees_of_freedom()\n", " s = 's' if df > 1 else ''\n", " pval = self.get_p_value()\n", " fmt = \"$\\\\chi^2 = {0}$. {1} degree{2} of freedom. P-value {3}.\"\n", " lines.append(fmt.format(chi2, df, s, pval))\n", " \n", " return '\\n'.join(lines)\n", "\n", " def _repr_html_(self):\n", " return self._to_html(self.counts)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Test ContingencyTable on the example in segment 32." ] }, { "cell_type": "code", "collapsed": false, "input": [ "rows, cols = ('1 Exon', '>1 Exon'), ('AT Fraction < .4', 'AT Fraction > .6')\n", "counts = np.array([[2386, 689], [13369, 3982]])\n", "table = ContingencyTable(rows, cols, counts)\n", "table" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Counts
AT Fraction < .4AT Fraction > .6
1 Exon23866893075
>1 Exon13369398217351
15755467120426
\n", "$\\chi^2 = 0.43693330893$. 1 degree of freedom. P-value 0.508606342548." ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "<__main__.ContingencyTable at 0x9b10d2c>" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "print 'Null Hypothesis'\n", "table.to_null_hypothesis_html()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Null Hypothesis\n" ] }, { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Expected Counts
AT Fraction < .4AT Fraction > .6
1 Exon2371.81166161703.1883383923075.0
>1 Exon13383.18833843967.8116616117351.0
15755.04671.020426.0
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "<__main__.HtmlObject at 0xa2377ac>" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "p_val = table.get_p_value()\n", "print 'p-value', p_val\n", "assert np.allclose(1.0 - p_val, 0.4914, atol=0.0001)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "p-value 0.508606342548\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "table.to_null_hypothesis_html()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Expected Counts
AT Fraction < .4AT Fraction > .6
1 Exon2371.81166161703.1883383923075.0
>1 Exon13383.18833843967.8116616117351.0
15755.04671.020426.0
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "<__main__.HtmlObject at 0xa237e0c>" ] } ], "prompt_number": 6 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "In Class: Ice Cream Flavors" ] }, { "cell_type": "code", "collapsed": false, "input": [ "rows, cols = ('Texas Tech', 'A&M', 'UT'), ('Vanilla', 'Strawberry', 'Chocolate')\n", "counts = np.array([[1, 1, 13], [16, 4, 15], [45, 32, 80]])\n", "table = ContingencyTable(rows, cols, counts)\n", "table" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Counts
VanillaStrawberryChocolate
Texas Tech111315
A&M1641535
UT453280157
6237108207
\n", "$\\chi^2 = 12.173427174$. 4 degrees of freedom. P-value 0.0161071520031." ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "<__main__.ContingencyTable at 0xa0de4ac>" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "table.to_null_hypothesis_html()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Expected Counts
VanillaStrawberryChocolate
Texas Tech4.492753623192.681159420297.8260869565215.0
A&M10.48309178746.2560386473418.260869565235.0
UT47.024154589428.062801932481.9130434783157.0
62.037.0108.0207.0
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "<__main__.HtmlObject at 0xa23796c>" ] } ], "prompt_number": 8 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "In Class: Grades" ] }, { "cell_type": "code", "collapsed": false, "input": [ "rows, cols = ('A&M', 'UT'), ('A', 'B', 'C', 'D')\n", "counts = np.array([[5, 24, 32, 1], [17, 80, 50, 18]])\n", "table = ContingencyTable(rows, cols, counts)\n", "table" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Counts
ABCD
A&M52432162
UT17805018165
221048219227
\n", "$\\chi^2 = 11.4912235216$. 3 degrees of freedom. P-value 0.00934566370554." ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "<__main__.ContingencyTable at 0xa0de7cc>" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "table.to_null_hypothesis_html()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Expected Counts
ABCD
A&M6.0088105726928.405286343622.39647577095.1894273127862.0
UT15.991189427375.594713656459.603524229113.8105726872165.0
22.0104.082.019.0227.0
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "<__main__.HtmlObject at 0xa2377ec>" ] } ], "prompt_number": 11 } ], "metadata": {} } ] }