{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Credit\n",
" - http://www.cyclismo.org/tutorial/R/confidence.html"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import pandas as pd, numpy as np\n",
"%matplotlib inline\n",
"%pylab inline\n",
"import seaborn as sns \n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('ggplot')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1-1) Calculating a Confidence Interval From a Normal Distribution"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from scipy.stats import norm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.959963984540054"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# quantile function in python\n",
"# R \n",
"# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Normal.html\n",
"# PYTHON \n",
"# https://stackoverflow.com/questions/24695174/python-equivalent-of-qnorm-qf-and-qchi2-of-r\n",
"# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html\n",
"\n",
"# qnorm(0.975) IN R \n",
"norm.ppf(.975)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"left: 5.876522540576581\n",
"right: 4.123477459423419\n"
]
}
],
"source": [
"# assume \n",
"# sample mean = 5,\n",
"# standard deviation = 2\n",
"# sample size = 20. \n",
"# use a 95% confidence level \n",
"\n",
"a = 5 \n",
"s = -2 \n",
"n = 20\n",
"error = norm.ppf(.975)*s/np.sqrt(20)\n",
"left = a-error\n",
"right = a+error\n",
"\n",
"print ('left: {}'.format(left))\n",
"print ('right: {}'.format(right))\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`The mean of predicting values are within confidence interval between 4.12 and 5.88 with 95% confidence interval, data is normally distributed and samples are independent`\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1-2) Calculating a Confidence Interval From a t Distribution"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from scipy.stats import t"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.093024054408263"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# quantile function in the The Student t Distribution\n",
"# R \n",
"# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/TDist.html\n",
"# PYTHON\n",
"# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.t.html\n",
"# https://stackoverflow.com/questions/19339305/python-function-to-get-the-t-statistic\n",
"\n",
"# qt(0.975,df=n-1) IN R \n",
"t.ppf(.975, df=n-1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"left: 5.936028812839819\n",
"right: 4.063971187160181\n"
]
}
],
"source": [
"# assume \n",
"# sample mean = 5,\n",
"# standard deviation = 2\n",
"# sample size = 20. \n",
"# use a 95% confidence level \n",
"\n",
"a = 5 \n",
"s = -2 \n",
"n = 20\n",
"error = t.ppf(.975, df=n-1)*s/np.sqrt(20)\n",
"left = a-error\n",
"right = a+error\n",
"\n",
"print ('left: {}'.format(left))\n",
"print ('right: {}'.format(right))\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`The true mean has a probability of 95% of being in the interval between 4.06 and 5.94 assuming that the original random variable is normally distributed, and the samples are independent.`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1-3) Calculating a Confidence Interval From a DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_csv('http://www.cyclismo.org/tutorial/R/_static/w1.dat')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" vals | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.43 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.40 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.45 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" vals\n",
"0 0.43\n",
"1 0.40\n",
"2 0.45"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" vals | \n",
" 54.0 | \n",
" 0.765 | \n",
" 0.378122 | \n",
" 0.13 | \n",
" 0.48 | \n",
" 0.72 | \n",
" 1.0075 | \n",
" 1.76 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min 25% 50% 75% max\n",
"vals 54.0 0.765 0.378122 0.13 0.48 0.72 1.0075 1.76"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe().T"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"error : vals 0.102247\n",
"dtype: float64\n"
]
}
],
"source": [
"error = t.ppf(.975, df=len(df)-1)*std(df)/np.sqrt(len(df))\n",
"\n",
"print ('error : {}'.format(error))\n",
"\n",
"# R \n",
"# error <- qt(0.975,df=length(w1$vals)-1)*std(df)/sqrt(length(w1$vals))\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"left: vals 0.662753\n",
"dtype: float64\n",
"right: vals 0.867247\n",
"dtype: float64\n"
]
}
],
"source": [
"left = mean(df)-error\n",
"right = mean(df)+error\n",
"\n",
"print ('left: {}'.format(left))\n",
"print ('right: {}'.format(right))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`There is a 95% probability that the true mean is between 0.66 and 0.87 assuming that the original random variable is normally distributed, and the samples are independent.`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1-4) Calculating Many Confidence Intervals From a t Distribution"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Consider we have following test results :"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# df1 \n",
"df_Comparison1 = pd.DataFrame({'Mean':[10,15],\n",
" 'Std. Dev.':[3,2.5],\n",
" 'Number':[300,230]})\n",
"df_Comparison1 = df_Comparison1.rename({0: 'Group1', 1: 'Group1'}) \n",
"\n",
"# df2 \n",
"df_Comparison2 = pd.DataFrame({'Mean':[12,13],\n",
" 'Std. Dev.':[4,5.3],\n",
" 'Number':[210,340]})\n",
"df_Comparison2 = df_Comparison2.rename({0: 'Group1', 1: 'Group1'}) \n",
"\n",
"\n",
"# df3\n",
"df_Comparison3 = pd.DataFrame({'Mean':[30,28.5],\n",
" 'Std. Dev.':[4.5,3],\n",
" 'Number':[420,400]})\n",
"df_Comparison3 = df_Comparison3.rename({0: 'Group1', 1: 'Group1'}) \n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Mean | \n",
" Number | \n",
" Std. Dev. | \n",
"
\n",
" \n",
" \n",
" \n",
" Group1 | \n",
" 10 | \n",
" 300 | \n",
" 3.0 | \n",
"
\n",
" \n",
" Group1 | \n",
" 15 | \n",
" 230 | \n",
" 2.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Mean Number Std. Dev.\n",
"Group1 10 300 3.0\n",
"Group1 15 230 2.5"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_Comparison1"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Mean | \n",
" Number | \n",
" Std. Dev. | \n",
"
\n",
" \n",
" \n",
" \n",
" Group1 | \n",
" 12 | \n",
" 210 | \n",
" 4.0 | \n",
"
\n",
" \n",
" Group1 | \n",
" 13 | \n",
" 340 | \n",
" 5.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Mean Number Std. Dev.\n",
"Group1 12 210 4.0\n",
"Group1 13 340 5.3"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_Comparison2"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Mean | \n",
" Number | \n",
" Std. Dev. | \n",
"
\n",
" \n",
" \n",
" \n",
" Group1 | \n",
" 30.0 | \n",
" 420 | \n",
" 4.5 | \n",
"
\n",
" \n",
" Group1 | \n",
" 28.5 | \n",
" 400 | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Mean Number Std. Dev.\n",
"Group1 30.0 420 4.5\n",
"Group1 28.5 400 3.0"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_Comparison3"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# R \n",
"# pmin() function returns the parallel minima vector of multiple vectors or matrix.\n",
"# http://www.endmemo.com/program/R/pmin.php\n",
"# e.g.\n",
"#> x <- c(3, 26, 122, 6)\n",
"#> y <- c(43,2,54,8)\n",
"#> z <- c(9,32,1,9)\n",
"#> pmax(x,y,z)\n",
"#[1] 43 32 122 9\n",
"#> pmin(x,y,z)\n",
"#[1] 3 2 1 6\n",
"\n",
"# PYTHON \n",
"# np.minimum()\n",
"# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.minimum.html\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"m1 = np.array([10,12,30])\n",
"m2 = np.array([10.5,13,28.5])\n",
"sd1 = np.array([3,4,4.5])\n",
"sd2 = np.array([2.5,5.3,3])\n",
"num1 = np.array([300,210,420])\n",
"num2 = np.array([230,340,400])\n",
"se = np.sqrt(sd1*sd1/num1+sd2*sd2/num2)\n",
"#error = qt(0.975,df=pmin(num1,num2)-1)*se\n",
"error = t.ppf(.975, df=np.minimum(num1,num2) -1)*se\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.23911067, 0.39850737, 0.26592158])"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"se"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.47113823, 0.78560924, 0.52278249])"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"error"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"left: [-0.97113823 -1.78560924 0.97721751]\n",
"right: [-0.02886177 -0.21439076 2.02278249]\n"
]
}
],
"source": [
"left = (m1-m2)-error\n",
"right = (m1-m2)+error\n",
"\n",
"print ('left: {}'.format(left))\n",
"print ('right: {}'.format(right))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This gives the confidence intervals for each of the three tests. For example, in the first experiment the 95% confidence interval is between -0.97 and -0.03 assuming that the random variables are normally distributed, and the samples are independent."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}