{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import scipy.stats"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Implement ContingencyTable"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I know scipy has a method to do this, but I thought it would be fun to implement it myself."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"class HtmlObject(object):\n",
" \"\"\"An object that can be displayed as HTML.\"\"\"\n",
" def __init__(self, html):\n",
" self.html = html\n",
"\n",
" def _repr_html_(self):\n",
" return self.html\n",
"\n",
"\n",
"class ContingencyTable(object):\n",
" \"\"\"A generic contingency table.\"\"\"\n",
"\n",
" def __init__(self, row_names, col_names, counts):\n",
" assert counts.shape == (len(row_names), len(col_names))\n",
" self.row_names = row_names\n",
" self.col_names = col_names\n",
" self.counts = counts\n",
"\n",
" def get_chisq_stat(self):\n",
" \"\"\"Compute the chi-squared test statistic.\"\"\"\n",
" expected = self.get_expected_counts()\n",
" return np.sum((self.counts - expected)**2 / expected)\n",
"\n",
" def get_degrees_of_freedom(self):\n",
" \"\"\"Get the number of degrees of freedom for the chi-squared distribution.\"\"\"\n",
" return np.product(self.counts.shape) - np.sum(self.counts.shape) + 1\n",
"\n",
" def get_expected_counts(self):\n",
" \"\"\"Get a matric of the expected counts under the null hypothesis.\"\"\"\n",
" expected = np.zeros(self.counts.shape)\n",
" for i in xrange(expected.shape[0]):\n",
" for j in xrange(expected.shape[1]):\n",
" n_i_dot = self.marginal(i, '.')\n",
" n_j_dot = self.marginal('.', j)\n",
" expected[i, j] = n_i_dot * n_j_dot\n",
" expected = expected / self.marginal('.', '.')\n",
" return expected\n",
"\n",
" def get_p_value(self):\n",
" \"\"\"Compute a p-value.\"\"\"\n",
" chisq_stat = self.get_chisq_stat()\n",
" dof = self.get_degrees_of_freedom()\n",
" return 1.0 - scipy.stats.chi2.cdf(chisq_stat, dof)\n",
"\n",
" def marginal(self, i, j):\n",
" \"\"\"Compute a marginal (sum of row/column). Each argument can be either\n",
" an index or a '.'\"\"\"\n",
" return self._marginal(i, j, self.counts)\n",
"\n",
" def to_null_hypothesis_html(self):\n",
" \"\"\"Get a table in HTML format showing the expected counts.\"\"\"\n",
" expected = self.get_expected_counts()\n",
" return HtmlObject(\n",
" self._to_html(expected, caption='Expected Counts', summary=False))\n",
"\n",
" def _marginal(self, i, j, cell_values):\n",
" i_dot = i == '.'\n",
" j_dot = j == '.'\n",
" assert i_dot or j_dot\n",
" if i_dot and j_dot:\n",
" return cell_values.sum()\n",
" elif i_dot:\n",
" return cell_values[:, j].sum()\n",
" elif j_dot:\n",
" return cell_values[i].sum()\n",
"\n",
" def _to_html(self, cell_values, caption='Counts', summary=True):\n",
" lines = ['
']\n",
" lines.append('{0}'.format(caption))\n",
" \n",
" def start_row():\n",
" lines.append('')\n",
"\n",
" def end_row():\n",
" lines.append('
')\n",
" \n",
" def cell(contents):\n",
" lines.append('{0} | '.format(str(contents)))\n",
" \n",
" # Column names\n",
" start_row()\n",
" cell('')\n",
" for col_name in self.col_names:\n",
" cell(col_name)\n",
" cell('')\n",
" end_row()\n",
" \n",
" # Rows\n",
" for i, row_name in enumerate(self.row_names):\n",
" start_row()\n",
" cell(row_name)\n",
" for j in xrange(len(self.col_names)):\n",
" cell(cell_values[i, j])\n",
" cell(self._marginal(i, '.', cell_values))\n",
" end_row()\n",
"\n",
" start_row()\n",
" cell('')\n",
" for j in xrange(len(self.col_names)):\n",
" cell(self._marginal('.', j, cell_values))\n",
" cell(self._marginal('.', '.', cell_values))\n",
" end_row()\n",
" \n",
" lines.append('
')\n",
"\n",
" if summary:\n",
" chi2 = self.get_chisq_stat()\n",
" df = self.get_degrees_of_freedom()\n",
" s = 's' if df > 1 else ''\n",
" pval = self.get_p_value()\n",
" fmt = \"$\\\\chi^2 = {0}$. {1} degree{2} of freedom. P-value {3}.\"\n",
" lines.append(fmt.format(chi2, df, s, pval))\n",
" \n",
" return '\\n'.join(lines)\n",
"\n",
" def _repr_html_(self):\n",
" return self._to_html(self.counts)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Test ContingencyTable on the example in segment 32."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"rows, cols = ('1 Exon', '>1 Exon'), ('AT Fraction < .4', 'AT Fraction > .6')\n",
"counts = np.array([[2386, 689], [13369, 3982]])\n",
"table = ContingencyTable(rows, cols, counts)\n",
"table"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"Counts\n",
"\n",
" | \n",
"AT Fraction < .4 | \n",
"AT Fraction > .6 | \n",
" | \n",
"
\n",
"\n",
"1 Exon | \n",
"2386 | \n",
"689 | \n",
"3075 | \n",
"
\n",
"\n",
">1 Exon | \n",
"13369 | \n",
"3982 | \n",
"17351 | \n",
"
\n",
"\n",
" | \n",
"15755 | \n",
"4671 | \n",
"20426 | \n",
"
\n",
"
\n",
"$\\chi^2 = 0.43693330893$. 1 degree of freedom. P-value 0.508606342548."
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"<__main__.ContingencyTable at 0x9b10d2c>"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print 'Null Hypothesis'\n",
"table.to_null_hypothesis_html()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Null Hypothesis\n"
]
},
{
"html": [
"\n",
"Expected Counts\n",
"\n",
" | \n",
"AT Fraction < .4 | \n",
"AT Fraction > .6 | \n",
" | \n",
"
\n",
"\n",
"1 Exon | \n",
"2371.81166161 | \n",
"703.188338392 | \n",
"3075.0 | \n",
"
\n",
"\n",
">1 Exon | \n",
"13383.1883384 | \n",
"3967.81166161 | \n",
"17351.0 | \n",
"
\n",
"\n",
" | \n",
"15755.0 | \n",
"4671.0 | \n",
"20426.0 | \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"<__main__.HtmlObject at 0xa2377ac>"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"p_val = table.get_p_value()\n",
"print 'p-value', p_val\n",
"assert np.allclose(1.0 - p_val, 0.4914, atol=0.0001)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"p-value 0.508606342548\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"table.to_null_hypothesis_html()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"Expected Counts\n",
"\n",
" | \n",
"AT Fraction < .4 | \n",
"AT Fraction > .6 | \n",
" | \n",
"
\n",
"\n",
"1 Exon | \n",
"2371.81166161 | \n",
"703.188338392 | \n",
"3075.0 | \n",
"
\n",
"\n",
">1 Exon | \n",
"13383.1883384 | \n",
"3967.81166161 | \n",
"17351.0 | \n",
"
\n",
"\n",
" | \n",
"15755.0 | \n",
"4671.0 | \n",
"20426.0 | \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"<__main__.HtmlObject at 0xa237e0c>"
]
}
],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"In Class: Ice Cream Flavors"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"rows, cols = ('Texas Tech', 'A&M', 'UT'), ('Vanilla', 'Strawberry', 'Chocolate')\n",
"counts = np.array([[1, 1, 13], [16, 4, 15], [45, 32, 80]])\n",
"table = ContingencyTable(rows, cols, counts)\n",
"table"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"Counts\n",
"\n",
" | \n",
"Vanilla | \n",
"Strawberry | \n",
"Chocolate | \n",
" | \n",
"
\n",
"\n",
"Texas Tech | \n",
"1 | \n",
"1 | \n",
"13 | \n",
"15 | \n",
"
\n",
"\n",
"A&M | \n",
"16 | \n",
"4 | \n",
"15 | \n",
"35 | \n",
"
\n",
"\n",
"UT | \n",
"45 | \n",
"32 | \n",
"80 | \n",
"157 | \n",
"
\n",
"\n",
" | \n",
"62 | \n",
"37 | \n",
"108 | \n",
"207 | \n",
"
\n",
"
\n",
"$\\chi^2 = 12.173427174$. 4 degrees of freedom. P-value 0.0161071520031."
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": [
"<__main__.ContingencyTable at 0xa0de4ac>"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"table.to_null_hypothesis_html()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"Expected Counts\n",
"\n",
" | \n",
"Vanilla | \n",
"Strawberry | \n",
"Chocolate | \n",
" | \n",
"
\n",
"\n",
"Texas Tech | \n",
"4.49275362319 | \n",
"2.68115942029 | \n",
"7.82608695652 | \n",
"15.0 | \n",
"
\n",
"\n",
"A&M | \n",
"10.4830917874 | \n",
"6.25603864734 | \n",
"18.2608695652 | \n",
"35.0 | \n",
"
\n",
"\n",
"UT | \n",
"47.0241545894 | \n",
"28.0628019324 | \n",
"81.9130434783 | \n",
"157.0 | \n",
"
\n",
"\n",
" | \n",
"62.0 | \n",
"37.0 | \n",
"108.0 | \n",
"207.0 | \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"<__main__.HtmlObject at 0xa23796c>"
]
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"In Class: Grades"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"rows, cols = ('A&M', 'UT'), ('A', 'B', 'C', 'D')\n",
"counts = np.array([[5, 24, 32, 1], [17, 80, 50, 18]])\n",
"table = ContingencyTable(rows, cols, counts)\n",
"table"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"Counts\n",
"\n",
" | \n",
"A | \n",
"B | \n",
"C | \n",
"D | \n",
" | \n",
"
\n",
"\n",
"A&M | \n",
"5 | \n",
"24 | \n",
"32 | \n",
"1 | \n",
"62 | \n",
"
\n",
"\n",
"UT | \n",
"17 | \n",
"80 | \n",
"50 | \n",
"18 | \n",
"165 | \n",
"
\n",
"\n",
" | \n",
"22 | \n",
"104 | \n",
"82 | \n",
"19 | \n",
"227 | \n",
"
\n",
"
\n",
"$\\chi^2 = 11.4912235216$. 3 degrees of freedom. P-value 0.00934566370554."
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"<__main__.ContingencyTable at 0xa0de7cc>"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"table.to_null_hypothesis_html()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"Expected Counts\n",
"\n",
" | \n",
"A | \n",
"B | \n",
"C | \n",
"D | \n",
" | \n",
"
\n",
"\n",
"A&M | \n",
"6.00881057269 | \n",
"28.4052863436 | \n",
"22.3964757709 | \n",
"5.18942731278 | \n",
"62.0 | \n",
"
\n",
"\n",
"UT | \n",
"15.9911894273 | \n",
"75.5947136564 | \n",
"59.6035242291 | \n",
"13.8105726872 | \n",
"165.0 | \n",
"
\n",
"\n",
" | \n",
"22.0 | \n",
"104.0 | \n",
"82.0 | \n",
"19.0 | \n",
"227.0 | \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"<__main__.HtmlObject at 0xa2377ec>"
]
}
],
"prompt_number": 11
}
],
"metadata": {}
}
]
}