{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import matplotlib\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(arr):\n",
    "    \"\"\"Convert an array to standard units: (value - average) / SD.\"\"\"\n",
    "    return (arr - np.average(arr))/np.std(arr)\n",
    "\n",
    "def correlation(t, x, y):\n",
    "    \"\"\"Return the correlation of columns x and y of table t.\n",
    "\n",
    "    Computed as the average of the products of the two columns in standard units.\n",
    "    \"\"\"\n",
    "    x_standard = standard_units(t.column(x))\n",
    "    y_standard = standard_units(t.column(y))\n",
    "    return np.average(x_standard * y_standard)\n",
    "\n",
    "def slope(t, x, y):\n",
    "    \"\"\"Return the slope of the regression line: r * SD(y) / SD(x).\"\"\"\n",
    "    r = correlation(t, x, y)\n",
    "    y_sd = np.std(t.column(y))\n",
    "    x_sd = np.std(t.column(x))\n",
    "    return r * y_sd / x_sd\n",
    "\n",
    "def intercept(t, x, y):\n",
    "    \"\"\"Return the intercept of the regression line: mean(y) - slope * mean(x).\"\"\"\n",
    "    x_mean = np.mean(t.column(x))\n",
    "    y_mean = np.mean(t.column(y))\n",
    "    return y_mean - slope(t, x, y)*x_mean\n",
    "\n",
    "def fitted_values(t, x, y):\n",
    "    \"\"\"Return an array of the regression estimates at all the x values\"\"\"\n",
    "    a = slope(t, x, y)\n",
    "    b = intercept(t, x, y)\n",
    "    return a*t.column(x) + b\n",
    "\n",
    "def residuals(t, x, y):\n",
    "    \"\"\"Return an array of residuals: each y value minus its regression estimate.\"\"\"\n",
    "    predictions = fitted_values(t, x, y)\n",
    "    return t.column(y) - predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hypothesis testing for the slope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap_slope(t, x, y, repetitions=5000):\n",
    "    \"\"\"Bootstrap the slope of the regression line and show a 95% confidence interval.\n",
    "\n",
    "    t is a table; x and y are the labels of the predictor and response columns.\n",
    "    Draws `repetitions` bootstrap resamples of t with t.sample(), computes the\n",
    "    regression slope of each, then plots a histogram of those slopes with the\n",
    "    2.5th-to-97.5th percentile interval marked in yellow and prints the\n",
    "    observed slope along with the interval endpoints. Returns nothing.\n",
    "    \"\"\"\n",
    "    # Bootstrap the scatter, find the slope, collect.\n",
    "    # Gather the slopes in a list and convert once at the end: np.append copies\n",
    "    # the whole array on every call, which makes the loop quadratic.\n",
    "    resampled_slopes = []\n",
    "    for _ in np.arange(repetitions):\n",
    "        bootstrap_sample = t.sample()\n",
    "        resampled_slopes.append(slope(bootstrap_sample, x, y))\n",
    "    slopes = np.array(resampled_slopes)\n",
    "\n",
    "    # Find the endpoints of the 95% confidence interval for the true slope\n",
    "    left = percentile(2.5, slopes)\n",
    "    right = percentile(97.5, slopes)\n",
    "\n",
    "    # Slope of the regression line from the original sample\n",
    "    observed_slope = slope(t, x, y)\n",
    "\n",
    "    # Display results\n",
    "    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)\n",
    "    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);\n",
    "    print('Slope of regression line:', observed_slope)\n",
    "    print('Approximate 95%-confidence interval for the slope of the true line:')\n",
    "    print(left, 'to', right)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "baby = Table.read_table('baby.csv')\n",
    "baby.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "slope(baby, 'Maternal Age', 'Birth Weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "baby.scatter('Maternal Age', 'Birth Weight', fit_line=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bootstrap_slope(baby, 'Maternal Age', 'Birth Weight', 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ckd = chronic kidney disease\n",
    "#class = 1 = has ckd\n",
    "#class = 0 = does not have ckd\n",
    "ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')\n",
    "ckd.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ckd.group('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Could you predict if a patient has ckd?\n",
    "ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ckd.scatter('Hemoglobin', 'Glucose', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Can you tell if a bank note is counterfeit or legitimate?\n",
    "#Variables based on photographs of many banknotes (a few numbers for each image calculated)\n",
    "banknotes = Table.read_table('banknote.csv')\n",
    "banknotes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Two attributes have some overlap of classes...what happens with three attributes?\n",
    "fig = plots.figure(figsize=(8,8))\n",
    "# Create the 3D axes through the figure. Constructing Axes3D(fig) directly no\n",
    "# longer attaches the axes to the figure in recent matplotlib releases, which\n",
    "# leaves the plot blank. The Axes3D import above is still needed on older\n",
    "# releases to register the '3d' projection.\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "ax.scatter(banknotes.column('WaveletSkew'),\n",
    "           banknotes.column('WaveletVar'),\n",
    "           banknotes.column('WaveletCurt'),\n",
    "           c=banknotes.column('Class'),\n",
    "           cmap='viridis',\n",
    "           s=50)\n",
    "# Label the axes so the three attributes can be told apart\n",
    "ax.set_xlabel('WaveletSkew')\n",
    "ax.set_ylabel('WaveletVar')\n",
    "ax.set_zlabel('WaveletCurt');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}