{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "- Credit\n", " - http://www.cyclismo.org/tutorial/R/confidence.html" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "source": [ "import pandas as pd, numpy as np\n", "%matplotlib inline\n", "%pylab inline\n", "import seaborn as sns \n", "import matplotlib.pyplot as plt\n", "plt.style.use('ggplot')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1-1) Calculating a Confidence Interval From a Normal Distribution" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from scipy.stats import norm" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.959963984540054" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# quantile function in python\n", "# R \n", "# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Normal.html\n", "# PYTHON \n", "# https://stackoverflow.com/questions/24695174/python-equivalent-of-qnorm-qf-and-qchi2-of-r\n", "# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html\n", "\n", "# qnorm(0.975) IN R \n", "norm.ppf(.975)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "left: 4.123477459423419\n", "right: 5.876522540576581\n" ] } ], "source": [ "# assume \n", "# sample mean = 5,\n", "# standard deviation = 2\n", "# sample size = 20. 
\n", "# use a 95% confidence level \n", "\n", "a = 5 \n", "s = 2 # a standard deviation is never negative\n", "n = 20\n", "# margin of error = z * s / sqrt(n)\n", "error = norm.ppf(.975)*s/np.sqrt(n)\n", "left = a-error\n", "right = a+error\n", "\n", "print ('left: {}'.format(left))\n", "print ('right: {}'.format(right))\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`With 95% confidence, the true mean lies between 4.12 and 5.88, assuming the data are normally distributed and the samples are independent.`\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1-2) Calculating a Confidence Interval From a t Distribution" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from scipy.stats import t" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.093024054408263" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# quantile function of the Student t distribution\n", "# R \n", "# https://stat.ethz.ch/R-manual/R-devel/library/stats/html/TDist.html\n", "# PYTHON\n", "# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.t.html\n", "# https://stackoverflow.com/questions/19339305/python-function-to-get-the-t-statistic\n", "\n", "# qt(0.975,df=n-1) IN R \n", "t.ppf(.975, df=n-1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "left: 4.063971187160181\n", "right: 5.936028812839819\n" ] } ], "source": [ "# assume \n", "# sample mean = 5,\n", "# standard deviation = 2\n", "# sample size = 20. 
\n", "# use a 95% confidence level \n", "\n", "a = 5 \n", "s = 2 # a standard deviation is never negative\n", "n = 20\n", "# margin of error = t(n-1) * s / sqrt(n)\n", "error = t.ppf(.975, df=n-1)*s/np.sqrt(n)\n", "left = a-error\n", "right = a+error\n", "\n", "print ('left: {}'.format(left))\n", "print ('right: {}'.format(right))\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`We are 95% confident that the true mean lies in the interval between 4.06 and 5.94, assuming that the original random variable is normally distributed and the samples are independent.`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1-3) Calculating a Confidence Interval From a DataFrame" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.read_csv('http://www.cyclismo.org/tutorial/R/_static/w1.dat')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
vals
00.43
10.40
20.45
\n", "
" ], "text/plain": [ " vals\n", "0 0.43\n", "1 0.40\n", "2 0.45" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(3)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
vals54.00.7650.3781220.130.480.721.00751.76
\n", "
" ], "text/plain": [ " count mean std min 25% 50% 75% max\n", "vals 54.0 0.765 0.378122 0.13 0.48 0.72 1.0075 1.76" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe().T" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# DataFrame.std() uses ddof=1 (sample standard deviation, like R's sd());\n", "# numpy's std() defaults to ddof=0 and would understate the margin of error\n", "error = t.ppf(.975, df=len(df)-1)*df.std()/np.sqrt(len(df))\n", "\n", "print ('error : {}'.format(error))\n", "\n", "# R \n", "# error <- qt(0.975,df=length(w1$vals)-1)*sd(w1$vals)/sqrt(length(w1$vals))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "left = df.mean()-error\n", "right = df.mean()+error\n", "\n", "print ('left: {}'.format(left))\n", "print ('right: {}'.format(right))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`We are 95% confident that the true mean is between 0.66 and 0.87, assuming that the original random variable is normally distributed and the samples are independent.`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1-4) Calculating Many Confidence Intervals From a t Distribution" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Suppose we have the following test results:" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# df1 (group means must agree with m1/m2 used in the interval computation below)\n", "df_Comparison1 = pd.DataFrame({'Mean':[10,10.5],\n", " 'Std. Dev.':[3,2.5],\n", " 'Number':[300,230]})\n", "df_Comparison1 = df_Comparison1.rename({0: 'Group1', 1: 'Group2'}) \n", "\n", "# df2 \n", "df_Comparison2 = pd.DataFrame({'Mean':[12,13],\n", " 'Std. 
Dev.':[4,5.3],\n", " 'Number':[210,340]})\n", "df_Comparison2 = df_Comparison2.rename({0: 'Group1', 1: 'Group2'}) \n", "\n", "\n", "# df3\n", "df_Comparison3 = pd.DataFrame({'Mean':[30,28.5],\n", " 'Std. Dev.':[4.5,3],\n", " 'Number':[420,400]})\n", "df_Comparison3 = df_Comparison3.rename({0: 'Group1', 1: 'Group2'}) \n" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MeanNumberStd. Dev.
Group110.03003.0
Group210.52302.5
\n", "
" ], "text/plain": [ " Mean Number Std. Dev.\n", "Group1 10.0 300 3.0\n", "Group2 10.5 230 2.5" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_Comparison1" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MeanNumberStd. Dev.
Group1122104.0
Group2133405.3
\n", "
" ], "text/plain": [ " Mean Number Std. Dev.\n", "Group1 12 210 4.0\n", "Group2 13 340 5.3" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_Comparison2" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MeanNumberStd. Dev.
Group130.04204.5
Group228.54003.0
\n", "
" ], "text/plain": [ " Mean Number Std. Dev.\n", "Group1 30.0 420 4.5\n", "Group2 28.5 400 3.0" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_Comparison3" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# R \n", "# pmin() function returns the parallel minima vector of multiple vectors or matrix.\n", "# http://www.endmemo.com/program/R/pmin.php\n", "# e.g.\n", "#> x <- c(3, 26, 122, 6)\n", "#> y <- c(43,2,54,8)\n", "#> z <- c(9,32,1,9)\n", "#> pmax(x,y,z)\n", "#[1] 43 32 122 9\n", "#> pmin(x,y,z)\n", "#[1] 3 2 1 6\n", "\n", "# PYTHON \n", "# np.minimum()\n", "# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.minimum.html\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "m1 = np.array([10,12,30])\n", "m2 = np.array([10.5,13,28.5])\n", "sd1 = np.array([3,4,4.5])\n", "sd2 = np.array([2.5,5.3,3])\n", "num1 = np.array([300,210,420])\n", "num2 = np.array([230,340,400])\n", "se = np.sqrt(sd1*sd1/num1+sd2*sd2/num2)\n", "#error = qt(0.975,df=pmin(num1,num2)-1)*se\n", "error = t.ppf(.975, df=np.minimum(num1,num2) -1)*se\n" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.23911067, 0.39850737, 0.26592158])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "se" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.47113823, 0.78560924, 0.52278249])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "error" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "left: [-0.97113823 -1.78560924 0.97721751]\n", "right: [-0.02886177 -0.21439076 2.02278249]\n" ] } ], "source": [ "left = 
(m1-m2)-error\n", "right = (m1-m2)+error\n", "\n", "print ('left: {}'.format(left))\n", "print ('right: {}'.format(right))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This gives the confidence intervals for each of the three tests. For example, in the first experiment the 95% confidence interval is between -0.97 and -0.03 assuming that the random variables are normally distributed, and the samples are independent." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.5" } }, "nbformat": 4, "nbformat_minor": 2 }