{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "import matplotlib\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(arr):\n",
    "    return (arr - np.average(arr))/np.std(arr)\n",
    "\n",
    "def correlation(t, x, y):\n",
    "    x_standard = standard_units(t.column(x))\n",
    "    y_standard = standard_units(t.column(y))\n",
    "    return np.average(x_standard * y_standard)\n",
    "\n",
    "def slope(t, x, y):\n",
    "    r = correlation(t, x, y)\n",
    "    y_sd = np.std(t.column(y))\n",
    "    x_sd = np.std(t.column(x))\n",
    "    return r * y_sd / x_sd\n",
    "\n",
    "def intercept(t, x, y):\n",
    "    x_mean = np.mean(t.column(x))\n",
    "    y_mean = np.mean(t.column(y))\n",
    "    return y_mean - slope(t, x, y)*x_mean\n",
    "\n",
    "def fitted_values(t, x, y):\n",
    "    \"\"\"Return an array of the regression estimates at all the x values\"\"\"\n",
    "    a = slope(t, x, y)\n",
    "    b = intercept(t, x, y)\n",
    "    return a*t.column(x) + b\n",
    "\n",
    "def residuals(t, x, y):\n",
    "    predictions = fitted_values(t, x, y)\n",
    "    return t.column(y) - predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hypothesis testing for the slope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap_slope(t, x, y, repetitions=5000):\n",
    "    \n",
    "    # Bootstrap the scatter, find the slope, collect\n",
    "    slopes = make_array()\n",
    "    for i in np.arange(repetitions):\n",
    "        bootstrap_sample = t.sample()\n",
    "        bootstrap_slope = slope(bootstrap_sample, x, y)\n",
    "        slopes = np.append(slopes, bootstrap_slope)\n",
    "    \n",
    "    # Find the endpoints of the 95% confidence interval for the true slope\n",
    "    left = percentile(2.5, slopes)\n",
    "    right = percentile(97.5, slopes)\n",
    "    \n",
    "    # Slope of the regression line from the original sample\n",
    "    observed_slope = slope(t, x, y)\n",
    "    \n",
    "    # Display results\n",
    "    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)\n",
    "    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);\n",
    "    print('Slope of regression line:', observed_slope)\n",
    "    print('Approximate 95%-confidence interval for the slope of the true line:')\n",
    "    print(left, 'to', right)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "baby = Table.read_table('baby.csv')\n",
    "baby.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "slope(baby, 'Maternal Age', 'Birth Weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "baby.scatter('Maternal Age', 'Birth Weight', fit_line=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bootstrap_slope(baby, 'Maternal Age', 'Birth Weight', 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ckd = chronic kidney disease\n",
    "#class = 1 = has ckd\n",
    "#class = 0 = does not have ckd\n",
    "ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')\n",
    "ckd.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ckd.group('Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Could you predict if a patient has ckd?\n",
    "ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ckd.scatter('Hemoglobin', 'Glucose', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Can you tell if a bank note is counterfeit or legitimate?\n",
    "#Variables based on photgraphs of many banknotes (a few numbers for each image calculated)\n",
    "banknotes = Table.read_table('banknote.csv')\n",
    "banknotes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Two attributes have some overlap of classes...what happens with three attributes?\n",
    "fig = plots.figure(figsize=(8,8))\n",
    "ax = Axes3D(fig)\n",
    "ax.scatter(banknotes.column('WaveletSkew'), \n",
    "           banknotes.column('WaveletVar'), \n",
    "           banknotes.column('WaveletCurt'), \n",
    "           c=banknotes.column('Class'),\n",
    "           cmap='viridis',\n",
    "          s=50);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}