{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Analysis of multivariate data\n", "\n", "- Regression line\n", "- Correlation\n", "\n", "Author: Thomas Haslwanter, Date: Jun-2017" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "%matplotlib inline\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import scipy.stats as stats\n", "import pandas as pd\n", "from numpy.linalg import lstsq\n", "from urllib.request import urlopen\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Regression Line\n", "\n", "Fit a line, using the powerful \"ordinary least square\" method of pandas.\n", "\n", "*Data from 24 type 1 diabetic patients, relating Fasting blood glucose (mmol/l) to mean circumferential shortening velocity (%/sec), derived form echocardiography.*" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# Get the data\n", "url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_altman/'\n", "inFile = 'altman_11_6.txt'\n", "url = url_base + inFile\n", "data = np.genfromtxt(urlopen(url), delimiter=',')" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### Solve equations \"by hand\" ..." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(array([ 1.098, 0.022]), array([ 0.986]), 2, array([ 54.079, 1.838]))\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEYVJREFUeJzt3XuMXOV9xvHnMevslsTc4jWpudhRFZwEEkwywbTQBkMV\nDFR2IrWo2E0KJUIJSUpQVCANDYrCH23oJbFQAIu4Tmtw1CY4SUmJCLQNlgqGNVcDAUeFLOaSXWq1\n5iJvWfnXP2YcFnt3Z3bm7LznvPP9SCvv7BzN+Wk859l33/M773FECACQlzmpCwAAFI9wB4AMEe4A\nkCHCHQAyRLgDQIYIdwDIEOEOABki3AEgQ4Q7AGSoL9WO58+fH4sXL061ewCopG3btr0UEYPNtksW\n7osXL9bQ0FCq3QNAJdn+RSvbMS0DABki3AEgQ4Q7AGSIcAeADBHuAJAhwh0ACjCye4/Ou/Eejby8\nJ3Upkgh3ACjE2rt26P5ndmntnTtSlyIpYZ87AORgyVW3a2x8768eb9w6rI1bh9XfN0dPXnN2sroY\nuQNAB7Zcvlwrly7UwNx6nA7MnaNVSxdqyxXLk9ZFuANABxYcMqB5/X0aG9+r/r45Ghvfq3n9fVow\nbyBpXUzLAECHXnplTGuWLdLqk4/VLfcNa7QEJ1UdEUl2XKvVgrVlAGBmbG+LiFqz7ZiWAYAMEe4A\nkCHCHQAyRLgDQIYIdwDIEOEOABki3AEgQ4Q7AGSIcAeADBHuAJAhwh0AMkS4A0CGCHcAyBDhDgAZ\nItwBIEOEOwBkiHAHgAwR7gCQIcIdADJEuANAhgh3AMgQ4Q4AGSLcASBDhDsAZIhwB4AMNQ132+tt\nj9jePsXzh9r+F9sP237M9oXFlwkAmIlWRu4bJK2Y5vnPSHo8Ik6UdLqkv7H9ls5LAwC0q2m4R8Td\nknZNt4mkebYt6W2NbceLKQ8A0I4i5tyvk/QeSc9LelTSpRGxd7INbV9se8j20OjoaAG7BgBMpohw\nP0vSQ5IWSloq6Trbh0y2YUSsi4haRNQGBwcL2DUAYDJFhPuFkm6Nup9LelrSuwt4XQBAm4oI92FJ\nZ0qS7SMlLZH0XwW8LgCgTX3NNrC9SfUumPm2d0q6WtJcSYqIGyR9VdIG249KsqQrIuKlWasYANBU\n03CPiPObPP+8pI8UVhEAoGNcoQpgWiO79+i8G+/RyMt7UpeCGSDcAUxr7V07dP8zu7T2zh2pS8EM\nNJ2WAdCbllx1u8bG37hkZePWYW3cOqz+vjl68pqzE1aGVjByBzCpLZcv18qlCzUwtx4TA3PnaNXS\nhdpyxfLElaEVhDuASS04ZEDz+vs0Nr5X/X1zNDa+V/P6+7Rg3kDq0tACpmUATOmlV8a0ZtkirT75\nWN1y37BGOalaGY6IJDuu1WoxNDSUZN8AUFW2t0VErdl2TMsAQIYIdwDIEOEOABki3AEgQ4Q7AGSI\ncAeADBHumBUsNgWkRbhjVrDYFJAWV6iiUCw2BZQDI3cUisWmgHIg3FEoFpsCyoFpGRSOxaaA9Fg4\nDAAqhIXDAKCHEe4AkCHCHQAyRLgDQIYIdwDIEOEOABki3AEgQ4Q7AGSIcAeADBHuAJAhwh0AMkS4\nA0CGCHcAyBDhDgAZahruttfbHrG9fZptTrf9kO3HbP+02BIBADPVysh9g6QVUz1p+zBJ35S0MiKO\nl/QHxZQGAGhX03CPiLsl7Zpmk9WSbo2I4cb2IwXVBgBoUxFz7sdJOtz2f9jeZvsTU21o+2LbQ7aH\nRkdHC9g1AGAyRYR7n6QPSjpX0lmS/sL2cZNtGBHrIqIWEbXBwcECdg0AmEwRN8jeKem/I+JVSa/a\nvlvSiZKeKuC1AQBtKGLk/gNJp9nus32wpGWSnijgdQEAbWo6cre9SdLpkubb3inpaklzJSkiboiI\nJ2z/WNIjkvZKuikipmybBADMvqbhHhHnt7DNtZKuLaQiAEDHuEIVADJEuANAhgh3AMgQ4Q4AGSLc\nASBDhDsAZIhwB4AMEe4AkCHCHQAyRLgnMrJ7j8678R6NvLwndSkAMkS4J7L2rh26/5ldWnvnjtSl\nAMhQEUv+YgaWXHW7xsb3/urxxq3D2rh1WP19c/TkNWcnrAxAThi5d9mWy5dr5dKFGphbf+sH5s7R\nqqULteWK5YkrA5ATwr3LFhwyoHn9fRob36v+vjkaG9+ref19WjBvIHVpADLCtEwCL70ypjXLFmn1\nycfqlvuGNcpJVQAFc0Qk2XGtVouhoaEk+57MyO49+uymB3Xd6pMYRQMoLdvbIqLWbDumZRroXgGQ\nk56flqF7BUCOen7kTvcKgBz1fLjTvQKgm7p1dXrPh7v0RvfK5ktO1ZplizT6yljqkgBkqlvn9+iW\nAYAu2P/83j4zPb9HtwwAlEi3z+8R7gDQBd0+v9fzrZAA0C3dvDqdOXcAqBDm3AGghxHuAJAhwh0A\nMkS4A0CGCHcAyBDhDgAZItwBIEOEOwBkqGm4215ve8T29ibbfcj2uO3fL648AEA7Whm5b5C0YroN\nbB8k6a8k3VFATQCADjUN94i4W9KuJpt9TtL3JI0UURQAoDMdz7nbPkrSxyRd38K2F9sesj00Ojra\n6a4BAFMo4oTq1yVdEREHrkK/n4hYFxG1iKgNDg4WsGsAwGSKWPK3Juk7tiVpvqRzbI9HxPcLeG0A\nQBs6DveIeOe+721vkHQbwQ4AabXSCrlJ0j2Sltjeafsi25+y/anZLw+9qlt3iAdy1XTkHhHnt/pi\nEXFBR9UADRPvEH/Nx96XuhygcrjNHkpl/zvEb9w6rI1bh2d8h3ig17H8AEql23eIB3JFuKNUun2H\neCBXTMugdLp5h3ggV46IJDuu1WoxNDSUZN/Ix8juPfrspgd13eqTGN2jJ9jeFhG1ZtsxLYNKm9hV\nA+ANTMugkuiqAabHyB2VRFcNMD3CHZVEVw0wPcIdlbWvq2bzJadqzbJFGn1lLHVJbWGpBcwGumWA\nxK7a/Khuvm9Ya04+lqUW0FSr3TKcUEUhaEmcOU4KYzYxLYNC0JI4c5wUxmxi5I6OMPpsHyeFMZsY\nuaMjjD47k8tJYZQPI3d0hNFnZ278+Bvnxa756AkJK0FuCHd0jIW+gPKhFRLAm9D5VG4sHAagLXQ+\n5YFwL7mirl7kKkg0s+Sq27X4yh9p49ZhRdQ7nxZf+SMtuer21KWhDYR7yRU1imI0hmbofMoLJ1RL\nqqj+cfrQ0So6n/LCyL2kihpFMRrDTNB3nw9G7iVV1CiK0Rhmgr77fDByT6SVE5xFjaIYjQG9hz73\nRFjmFUA7WPK3pDjBCaAbmJbpMk5wAugGwr3LOMEJoBuYlkmAhbYAzDZOqAIzwKJaSI2Fw4BZwDIO\nqAqmZYAW0OWEqmHkDrSALidUTdNwt73e9ojt7VM8v8b2I7Yftf2ftk8svkwgLbqcUDWtjNw3SFox\nzfNPS/pwRLxP0lclrSugLqB0WMYBVdJSt4ztxZJui4hpVxKyfbik7RFxVLPXpFsGAGYuVbfMRZKm\nvG2L7YttD9keGh0dLXjX7eEORSgCnyOUTWHhbnu56uF+xVTbRMS6iKhFRG1wcLCoXXeE1jYUgc8R\nyqaQaRnb75e0WdLZEfFUKztOPS2zf2vbPlVpbeNimnKo+ucI1dO1aRnbx0q6VdLHWw32Mqh6axsj\nxXKo+ucI+Wp6EZPtTZJOlzTf9k5JV0uaK0kRcYOkL0t6u6Rv2pak8VZ+q6RW1dY2LqYpl6p+jpC/\npuEeEec3ef6Tkj5ZWEVdVMUFvLZcvlzX/OsTuuOxF7Xn9b0amDtHZx3/Dn3p3PekLq1nVfFzhPz1\n9PIDVbxfJCPF8qni5wj56+lwrypGigCaYclfAKgQlvwFgB5GuANAhgh3AMgQ4Q4AGSLcASBDhDsA\nZKhy4c7Sqt3F+w1UU+XCnQWzuov3G6imylzExNKq3cX7DZRTdhcxsbRqd/F+A9VWmXBnwazu4v0G\nqq1SC4exYFZ38X4D1VWZOXdgJrgNIXKV3Zw7MBN0+aDXVWpaBmiG2xACdYzckRW6fIA6wh1ZocsH\nqGNaBtmhywegWwYAKoVuGQDoYYQ7AGSIcAcwq1g2Og3CHcCs4oKyNOiWATAruKAsLUbuAGYFF5Sl\n1TPhzrwf0F1cUJZWz4Q7835A9+27oGzzJadqzbJFGn1lLHVJPSP7i5i4XRyAnHARUwPzfgB6Ufbh\nzrwfgF7UE62QLCQFoNc0nXO3vV7S70kaiYgTJnnekr4h6RxJr0m6ICIeaLZjFg4DgJkrcs59g6QV\n0zx/tqR3Nb4ulnR9KwUCAGZP03CPiLsl7Zpmk1WS/iHq7pV0mO1fL6pAAMDMFXFC9ShJz054vLPx\nswPYvtj2kO2h0dHRAnYNAJhMV7tlImJdRNQiojY4ONjNXQNATyki3J+TdMyEx0c3fgYASKSIcP+h\npE+47hRJ/xsRLxTwugCANrXSCrlJ0umS5kv6paSrJc2VpIi4odEKeZ3qHTWvSbowIpr2ONoelfSL\nFuucL+mlFrdNgfo6Q33tK3NtEvV1arL6FkVE03ntZGvLzITtoVb6OlOhvs5QX/vKXJtEfZ3qpL7s\nlx8AgF5EuANAhqoS7utSF9AE9XWG+tpX5tok6utU2/VVYs4dADAzVRm5AwBmoBLhbvsg2w/avi11\nLfuzfZjt79r+me0nbP9m6pomsn2Z7cdsb7e9yXbShextr7c9Ynv7hJ8dYfsntnc0/j28RLVd2/i/\nfcT2ZtuHpahtqvomPPcF22F7foraGjVMWp/tzzXew8dsf61M9dleavte2w81lkY5OVFtx9j+d9uP\nN96nSxs/b/vYqES4S7pU0hOpi5jCNyT9OCLeLelElahO20dJ+lNJtcZyzQdJ+sO0VU26yuiVku6K\niHdJuqvxOIUNOrC2n0g6ISLeL+kpSV/sdlETbNAkK7TaPkbSRyQNd7ug/WzQfvXZXq764oInRsTx\nkv46QV37bNCB79/XJH0lIpZK+nLjcQrjkr4QEe+VdIqkz9h+rzo4Nkof7raPlnSupJtS17I/24dK\n+h1J35KkiPi/iPiftFUdoE/Sr9nuk3SwpOdTFjPFKqOrJH278f23JX20q0U1TFZbRNwREeONh/eq\nvrxGEtOs0Pp3ki6XlPQE2hT1fVrSX0bEWGObka4X1jBFfSHpkMb3hyrR8RERL+y7D0ZEvKz6IPEo\ndXBslD7cJX1d9Q/ugXe5Tu+dkkYl/X1j2ugm229NXdQ+EfGc6iOlYUkvqL40xB1pq5rUkROWrHhR\n0pEpi5nGn0i6PXURE9leJem5iHg4dS1TOE7Sb9veavuntj+UuqD9fF7StbafVf1YSfmXmSTJ9mJJ\nJ0naqg6OjVKHu+19d4DalrqWKfRJ+oCk6yPiJEmvKt2UwgEa83OrVP8ltFDSW23/Udqqphf19q3S\ntXDZ/pLqfzrfnLqWfWwfLOnPVZ9OKKs+SUeoPtXwZ5L+qbFkSVl8WtJlEXGMpMvU+Cs8Fdtvk/Q9\nSZ+PiN0Tn5vpsVHqcJd0qqSVtp+R9B1JZ9jemLakN9kpaWdEbG08/q7qYV8Wvyvp6YgYjYjXJd0q\n6bcS1zSZX+67wUvj32R/uk/G9gWq32pyTZSrd/g3VP/F/XDjGDla0gO235G0qjfbKenWxs187lP9\nL/BkJ30n8ceqHxeS9M+SkpxQlSTbc1UP9psjYl9NbR8bpQ73iPhiRBwdEYtVPxH4bxFRmpFnRLwo\n6VnbSxo/OlPS4wlL2t+wpFNsH9wYLZ2pEp3wneCHqh9kavz7g4S1vIntFapPC66MiNdS1zNRRDwa\nEQsiYnHjGNkp6QONz2VZfF/SckmyfZykt6hcC3U9L+nDje/PkLQjRRGN4/Nbkp6IiL+d8FT7x0ZE\nVOJL9ZUpb0tdxyR1LZU0JOkR1T/Ih6euab/6viLpZ5K2S/pHSf2J69mk+vz/66qH0UWS3q56J8AO\nSXdKOqJEtf1c9TuNPdT4uqFM791+zz8jaX6Z6lM9zDc2Pn8PSDqjZPWdJmmbpIdVn+P+YKLaTlN9\nyuWRCZ+1czo5NrhCFQAyVOppGQBAewh3AMgQ4Q4AGSLcASBDhDsAZIhwB4AMEe4AkCHCHQAy9P+e\nb3ujZYMyEQAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# First I have to delete rows containing \"nan\"\n", "a,b = np.where(np.isnan(data))\n", "data = np.delete(data, a, axis=0)\n", "\n", "x,y = data[:,0], data[:,1]\n", "plt.plot(x,y,'*')\n", "\n", "# Create the design matrix\n", "Xmat = sm.add_constant(x)\n", "\n", "# Calculate the parameters\n", "params = lstsq(Xmat, y)\n", "np.set_printoptions(precision=3)\n", "print(params)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### ... then solve them with *pandas* and *statsmodels*\n", "\n", "pandas handles \"nan\" gracefully, and also provides more information about the fit. So let's use pandas, and compare the results" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Results: Ordinary least squares\n", "=================================================================\n", "Model: OLS Adj. R-squared: 0.134 \n", "Dependent Variable: Vcf AIC: -3.1672 \n", "Date: 2017-06-07 22:00 BIC: -0.8962 \n", "No. Observations: 23 Log-Likelihood: 3.5836 \n", "Df Model: 1 F-statistic: 4.414 \n", "Df Residuals: 21 Prob (F-statistic): 0.0479 \n", "R-squared: 0.174 Scale: 0.046957\n", "-------------------------------------------------------------------\n", " Coef. Std.Err. t P>|t| [0.025 0.975]\n", "-------------------------------------------------------------------\n", "Intercept 1.0978 0.1175 9.3446 0.0000 0.8535 1.3421\n", "glucose 0.0220 0.0105 2.1010 0.0479 0.0002 0.0437\n", "-----------------------------------------------------------------\n", "Omnibus: 1.717 Durbin-Watson: 1.802\n", "Prob(Omnibus): 0.424 Jarque-Bera (JB): 1.270\n", "Skew: 0.562 Prob(JB): 0.530\n", "Kurtosis: 2.756 Condition No.: 29 \n", "=================================================================\n", "\n" ] } ], "source": [ "import statsmodels.formula.api as smf\n", "\n", "# Convert them into a pandas DataFrame\n", "df = pd.DataFrame(data, columns=['glucose', 'Vcf'])\n", "\n", "model_fit = smf.ols('Vcf~glucose', df).fit()\n", "\n", "print(model_fit.summary2())" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Correlation\n", "\n", "Pearson correlation, and two types of rank correlation (Spearman, Kendall)\n", "\n", "*Comparing age and percentage of body-fat (measured by dual-photon absorptiometry) for 18 normal adults.*" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# Get the data\n", "inFile = 'altman_11_1.txt'\n", "url = url_base + inFile\n", "data = np.genfromtxt(urlopen(url), delimiter=',')\n", "\n", "x = data[:,0]\n", "y = data[:,1]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'pearson': 0.79208623217849128, 'spearman': 0.75387958553761558, 'kendall': 0.57620948508912284}\n" ] } ], "source": [ "# Calculate correlations\n", "corr = {}\n", "corr['pearson'], _ = stats.pearsonr(x,y)\n", "corr['spearman'], _ = stats.spearmanr(x,y)\n", "corr['kendall'], _ = stats.kendalltau(x,y)\n", "\n", "print(corr) " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spearman's rho = 0.754, and Pearson's r (rankordered) = 0.754\n" ] } ], "source": [ "# Show that Spearman's rho is just the Pearson's R of the rank-ordered data\n", "r_rankordered = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]\n", "print(\"Spearman's rho = {0:5.3f}, and Pearson's r (rankordered) = {1:5.3f}\".format(corr['spearman'], r_rankordered))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 }