{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "<h1>Contents<span class=\"tocSkip\"></span></h1>\n", "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Part-0:-Standard-Library\" data-toc-modified-id=\"Part-0:-Standard-Library-0\">Part 0: Standard Library</a></span><ul class=\"toc-item\"><li><span><a href=\"#Static-type-declarations:-typing\" data-toc-modified-id=\"Static-type-declarations:-typing-0.1\">Static type declarations: <code>typing</code></a></span></li><li><span><a href=\"#Looping\" data-toc-modified-id=\"Looping-0.2\">Looping</a></span></li><li><span><a href=\"#os-and-sys\" data-toc-modified-id=\"os-and-sys-0.3\"><code>os</code> and <code>sys</code></a></span></li><li><span><a href=\"#Miscellaneous\" data-toc-modified-id=\"Miscellaneous-0.4\">Miscellaneous</a></span></li></ul></li><li><span><a href=\"#Part-1:-Scikit-learn-and-Matplotlib\" data-toc-modified-id=\"Part-1:-Scikit-learn-and-Matplotlib-1\">Part 1: Scikit-learn and Matplotlib</a></span><ul class=\"toc-item\"><li><span><a href=\"#Loading-Data\" data-toc-modified-id=\"Loading-Data-1.1\">Loading Data</a></span></li><li><span><a href=\"#Exploratory-Data-Analysis-(EDA)\" data-toc-modified-id=\"Exploratory-Data-Analysis-(EDA)-1.2\">Exploratory Data Analysis (EDA)</a></span></li><li><span><a href=\"#Data-Visualization-(matplotlib-and-seaborn)\" data-toc-modified-id=\"Data-Visualization-(matplotlib-and-seaborn)-1.3\">Data Visualization (matplotlib and seaborn)</a></span></li><li><span><a href=\"#Preprocessing\" data-toc-modified-id=\"Preprocessing-1.4\">Preprocessing</a></span></li><li><span><a href=\"#Evaluating-ML-Algorithms\" data-toc-modified-id=\"Evaluating-ML-Algorithms-1.5\">Evaluating ML Algorithms</a></span></li><li><span><a href=\"#Algorithm-Tuning\" data-toc-modified-id=\"Algorithm-Tuning-1.6\">Algorithm Tuning</a></span></li></ul></li><li><span><a href=\"#Part-2:-NumPy-&-Pandas\" data-toc-modified-id=\"Part-2:-NumPy-&-Pandas-2\">Part 2: NumPy & Pandas</a></span><ul class=\"toc-item\"><li><span><a href=\"#NumPy\" data-toc-modified-id=\"NumPy-2.1\">NumPy</a></span></li><li><span><a href=\"#PCA\" data-toc-modified-id=\"PCA-2.2\">PCA</a></span></li><li><span><a href=\"#Pandas\" data-toc-modified-id=\"Pandas-2.3\">Pandas</a></span></li></ul></li><li><span><a href=\"#Part-3:-Deep-Learning-(PyTorch)\" data-toc-modified-id=\"Part-3:-Deep-Learning-(PyTorch)-3\">Part 3: Deep Learning (PyTorch)</a></span><ul class=\"toc-item\"><li><span><a href=\"#Dataset,-DataLoader,-and-MLPs\" data-toc-modified-id=\"Dataset,-DataLoader,-and-MLPs-3.1\">Dataset, DataLoader, and MLPs</a></span></li></ul></li><li><span><a href=\"#Part-4:-Statistics\" data-toc-modified-id=\"Part-4:-Statistics-4\">Part 4: Statistics</a></span></li><li><span><a href=\"#Part-5:-Miscellaneous\" data-toc-modified-id=\"Part-5:-Miscellaneous-5\">Part 5: Miscellaneous</a></span></li></ul></div>" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns; sns.set()\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "# embed static images in the ipynb\n", "%matplotlib inline " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "## Part 0: Standard Library\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [ ] TODO: lists and dictionaries\n", "- [ ] TODO: File handling \n", "- [ ] 
TODO: os \n", "- [ ] TODO: sys " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Static type declarations: `typing`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from typing import List, Set, Dict, Tuple, Optional\n", "\n", "# primitive types\n", "x: int = 1\n", "x: float = 1.0\n", "x: bool = True\n", "x: str = \"test\"\n", "x: bytes = b\"test\"\n", " \n", "# Collection types\n", "x: Set[int] = {1, 2}\n", "x: List[int] = [0]\n", "x: Dict[str, float] = {'credit': 705.0}\n", "x: Tuple[str, ...] = ('a', 'b', 'c')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# annotation for function declarations\n", "from typing import Callable, Iterable, Union, Optional\n", "\n", "def fn(n: int, m: int = 2) -> str:\n", "    return str(n + m)\n", "\n", "x: Callable[[int, int], str] = fn\n", " \n", "# Anything that we can use a for loop on is an 'Iterable' \n", "\n", "def generate_up_to(n: int) -> Iterable[int]:\n", "    i = 0 \n", "    while i <= n:\n", "        yield i\n", "        i += 1\n", " \n", "from typing import Generator\n", "# A generator (a function with a yield statement) is a specific kind of Iterable.\n", "# Generator takes three type parameters: Generator[YieldType, SendType, ReturnType].\n", "def generate_up_to(n: int) -> Generator[str, None, None]:\n", "    i = 0\n", "    while i <= n:\n", "        yield str(i)\n", "        i += 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Looping" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Looping: `zip` vs. `enumerate`" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 0\n", "b 1\n", "c 2\n", "a 0\n", "b 1\n", "c 2\n" ] } ], "source": [ "letters = ['a', 'b', 'c']\n", "nums = [0, 1, 2]\n", "# Goal: Create (letter, int) pairs\n", "\n", "# Without zip\n", "for idx, letter in enumerate(letters):\n", "    num = nums[idx]\n", "    print(f\"{letter} {num}\")\n", "\n", "# With zip\n", "for num, letter in zip(nums, letters):\n", "    print(f\"{letter} {num}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What's the advantage of using zip? \n", "- We've got less code, and we're not distracted by an unnecessary index when we only want to deal with the elements of the iterables. \n", "- Powerful: `zip` takes iterables as arguments and returns an iterator of tuples, which is itself an iterable that can feed other functions. \n", "\n", "**Note** that `zip` stops as soon as the shortest input iterable is exhausted, silently dropping any extra elements from the longer one. If the lengths are supposed to match, verify them explicitly (or use `itertools.zip_longest` to keep the extras). 
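\n", "\n", "A quick illustration of the truncation behavior (a minimal sketch):\n", "\n", "```python\n", "list(zip([0, 1, 2], ['a', 'b']))  # [(0, 'a'), (1, 'b')] -- the 2 is dropped\n", "\n", "import itertools\n", "list(itertools.zip_longest([0, 1, 2], ['a', 'b']))  # [(0, 'a'), (1, 'b'), (2, None)]\n", "```\n", "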
\n", "\n", "\n", "Since `zip` accepts a stream of pairs, we can use it to generate dictionaries:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'a': 0, 'b': 1, 'c': 2}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "letters = ['a', 'b', 'c']\n", "nums = [0, 1, 2]\n", "dict(zip(letters, nums))" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.9906713395043445\n", "('b6', 0.9906713395043445)\n", "b6\n" ] } ], "source": [ "import numpy as np \n", "buildings = ['b' + str(i) for i in np.arange(10)]\n", "heights = np.random.random(10)\n", "bh: dict = dict(zip(buildings, heights))\n", "\n", "# Print the height of the tallest building in 'bh'\n", "print(max(bh.values()))\n", "\n", "# Print the building-height pair corresponding to the tallest building\n", "print(max(bh.items(), key=lambda b: b[1])) # max of the seq sorted by seq[1] \n", "\n", "# Print the name of the tallest building\n", "print(max(bh, key=bh.get))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex. Break out of a nested loop " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\t[0, 0]\n", "1\t[0, 1]\n", "4\t[0, 2]\n", "9\t[1, 0]\n", "16\t[1, 1]\n" ] } ], "source": [ "# Create object to loop through.\n", "import numpy as np\n", "n_rows = 3\n", "M: np.ndarray = np.arange(n_rows**2).reshape(n_rows, -1)\n", " \n", "# Create generator to replace nested loop.\n", "def element_generator(arr):\n", " assert arr.ndim == 2\n", " for i, row in enumerate(arr):\n", " for j, element in enumerate(row):\n", " idx = [i, j]\n", " yield idx, element\n", "\n", "# Exiting the generator loop only requires 1 break statement.\n", "for idx, element in element_generator(M):\n", " print(f\"{element**2}\\t{idx}\")\n", " if idx == [1, 1]:\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `os` and `sys`\n", "\n", "The `os` library is for operating system dependent functionality, whereas `sys` is for functionality related to interactions between the program and the Python interpreter. We manipulate the Python runtime environment with `sys`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Mine for commands: [link](https://topic.alibabacloud.com/a/the-difference-between-os-and-sys-two-modules-in-python_1_29_30262075.html#:~:text=Os%3Athis%20module%20provides%20a,St%20Rongly%20with%20the%20interpreter.)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os \n", "# returns the current working director as a string\n", "os.getcwd()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check if a path exists\n", "path: str = os.getcwd()\n", "os.path.exists(path) \n", "\n", "# Joining paths with strings\n", "os.path.join(path, \"..\", \"Website\")\n", "\n", "os.path.exists( os.path.join(path, \"..\", \"Website\") )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When you use the `import` statement, a module can be loaded from the \"PYTHONPATH\" environment variable, the current working directory, or other directories configured when Python was installed. 
\n", "\n", "In order to access (and add to) the available Python system paths, we use the `sys` library. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "sys.path # module search paths" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since `sys.path` is a list, it can be appended. This means modules can be installed from any desired folder by means of `sys.path.append(some_path)`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Display Python version \n", "\n", "You can display the Python version either from the terminal or using the `sys` library." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python 3.8.5\n" ] } ], "source": [ "!python --version" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above cell uses the Jupyter magic command for shell inputs. To accomplish the same thing outside of a Jupyter notebook, you'd either enter text directly or, equivalently, use the `os` library." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "os.system(\"python --version\")\n", "os.system(\"cd\") # Note that these commands output to the terminal" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'3.8.5 (default, Sep 3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Display Python version with 'sys'\n", "import sys\n", "sys.version" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Miscellaneous" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Return the attributes of an object.\n", "\n", "You have a few options for this. Try, `dir()` and `vars()`:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex. 
Create quick classes w/ `collections.namedtuple`" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tesla 2.86 black\n", "Car(brand='Tesla', miles_per_kWh=2.86, color='black')\n" ] }, { "data": { "text/plain": [ "{'brand': 'Tesla', 'miles_per_kWh': 2.86, 'color': 'black'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# collections.namedtuple \n", "\n", "import collections\n", "from typing import NamedTuple\n", "\n", "def make_car(brand, fuel_efficiency, color) -> NamedTuple:\n", "    if brand == \"Tesla\":\n", "        Car = collections.namedtuple(\n", "            typename=\"Car\", field_names=[\"brand\", \"miles_per_kWh\", \"color\"])\n", "    else:\n", "        Car = collections.namedtuple(\n", "            typename=\"Car\", field_names=[\"brand\", \"mpg\", \"color\"])\n", "    return Car(brand, fuel_efficiency, color)\n", "\n", "black_model_X = make_car(\"Tesla\", 2.86, \"black\")\n", "# namedtuples have named, dot-accessible attributes.\n", "print(black_model_X.brand, black_model_X.miles_per_kWh, black_model_X.color)\n", "print(black_model_X)\n", "black_model_X._asdict() # return as dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: sorted" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sorted([1, 4, 3, 2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sorted(range(4, 0, -1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "letters = ['a', 'abcd', 'ab', 'abc', 'cb', 'defg', 'def', 'e']\n", "sorted(letters, key=len) # sorts based on output of len()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Part 1: Scikit-learn and Matplotlib" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loading Data \n", "\n", "Toy datasets in scikit-learn come from `sklearn.datasets`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Return the attributes of an object.\n", "\n", "You have a few options for this. 
Try `dir()` and `vars()`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sklearn.datasets\n", "\n", "obj = sklearn.datasets.load_iris()\n", "print(f\"dir(obj):\\t{dir(obj)}\\nfor object type:\\t{type(obj)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sklearn.datasets\n", "\n", "sklearn_dataset = sklearn.datasets.load_breast_cancer()\n", "dir(sklearn_dataset) # Display attributes of sklearn.utils.Bunch object" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(sklearn_dataset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sklearn.utils.Bunch description\n", "type(sklearn_dataset.DESCR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(sklearn_dataset.DESCR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sklearn_dataset.data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Convert `sklearn.datasets` dataset into a `pd.DataFrame`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_boston\n", "import pandas as pd\n", "# Note: load_boston is deprecated and was removed in scikit-learn 1.2.\n", "boston = load_boston()\n", "boston_df = pd.DataFrame(boston.data)\n", "boston_df.columns = boston.feature_names # Set column indices as feature names\n", "boston_df['PRICE'] = boston.target # Specify 'PRICE' as the target variable\n", "boston_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "### Exploratory Data Analysis (EDA)\n", "\n", "- Statistical descriptions\n", "- Data visualization (matplotlib and seaborn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Correlation matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert a dataset from 'sklearn.datasets' into a pd.DataFrame.\n", "\n", "from sklearn import datasets\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "def convert_boston_to_df():\n", "    boston = datasets.load_boston()\n", "    boston_df = pd.DataFrame(\n", "        data=boston.data, columns=boston.feature_names) \n", "    boston_df['PRICE'] = boston.target # Specify 'PRICE' as the target variable\n", "    return boston_df\n", " \n", "def correlation_matrix(df, plot: bool, digits: int = 2) -> pd.DataFrame:\n", "    \"\"\"Compute (and optionally plot) the correlation matrix of a pd.DataFrame.\"\"\"\n", "    corr_df = df.corr().round(digits)\n", "    if plot:\n", "        fig = plt.figure(figsize=(16,12))\n", "        # Plot the computed matrix ('corr_df'), not the enclosing function.\n", "        ax = sns.heatmap(corr_df, annot=True, cmap='Blues')\n", "        plt.show()\n", "    return corr_df\n", "\n", "correlation_matrix(df=convert_boston_to_df(), plot=True)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "pd.DataFrame?" 
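] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A quick statistical description pairs well with the correlation matrix. A minimal sketch, assuming the `convert_boston_to_df` helper above has been run:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = convert_boston_to_df()\n", "df.info()      # dtypes and non-null counts per column\n", "df.describe()  # count, mean, std, min, quartiles, max per column"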
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Visualization (matplotlib and seaborn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Random Walk: [wikipedia link](https://en.wikipedia.org/wiki/Random_walk#:~:text=In%20mathematics%2C%20a%20random%20walk,space%20such%20as%20the%20integers.)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Generate simple random walk plot \n", "rng = np.random.RandomState(1)\n", "x = np.linspace(0, 10, 500)\n", "y = np.cumsum(rng.randn(500, 6), 0) # sample 500 by 6 array from std normal dist\n", "plt.plot(x, y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: What does the `np.cumsum` method do?\n", "> Return the **cumulative sum** of the elements of a `np.ndarray` along a given axis.\n", "\n", "Q: What does the `np.linspace` method do?\n", "> Return a `np.ndarray` of evenly spaced numbers over a specified interval." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# plot sine and cosine graphs from -pi to pi\n", "x = np.linspace(start=-np.pi, stop=np.pi, num=100) \n", "fig = plt.figure()\n", "plt.plot(x, np.sin(x), '-')\n", "plt.plot(x, np.cos(x), '--')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: What is the type of `plt.figure`?\n", "- `function` | a method inside the matplotlib.pyplot module\n", "\n", "Q: What does the `plt.figure` method do?\n", "- Create a new figure, or activate an existing figure.\n", "\n", "Q: The figure object generated by `plt.figure()` has what type?\n", "- `mpl.figure.Figure`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = plt.figure()\n", "type(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: Save the figure to the working directory as a png." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# plot sine and cosine graphs from -pi to pi\n", "x = np.linspace(start=-np.pi, stop=np.pi, num=100) \n", "fig = plt.figure()\n", "plt.plot(x, np.sin(x), '-')\n", "plt.plot(x, np.cos(x), '--')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig.savefig('trig_functions.png')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: Use markdown to confirm that the image, 'trig_functions.png', is saved in the working directory.\n", "```markdown\n", "\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: Use a Jupyter module to confirm that the image, 'trig_functions.png', is saved in the working directory." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Image\n", "Image(\"trig_functions.png\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "cloze: \n", "- In the `mpl.figure.Figure.savefig()` method, the file format is inferred from the extension of the given filename." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Display supported file types for the mpl.figure.Figure.savefig() method.\n", "fig.canvas.get_supported_filetypes()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Creat a grid of plots with the object-oriented interface" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))\n", "\n", "x = np.linspace(start=-10, stop=10, num=100)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ax[0].plot(x, np.sin(x))\n", "ax[1].plot(x, np.cos(x))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: What's the type of `ax`?\n", "\n", "AxesSubplot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", "type(ax)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Q: What's the type of `fig`?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(fig)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# simple sinusoids using figure and axes instances\n", "fig, ax = plt.subplots()\n", "\n", "x = np.linspace(start=0, stop=4*np.pi)\n", "ax.plot(x, np.sin(x))\n", "ax.plot(x, np.cos(x))\n", "print(type(fig), type(ax))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", "x = np.linspace(start=0, stop=4*np.pi)\n", "# Line colors\n", "ax.plot(x, np.sin(x - 0), color='blue') # specify color by name\n", "ax.plot(x, np.sin(x - 1), color='g') # short color code (rgbcmyk)\n", "ax.plot(x, np.sin(x - 2), color='0.75') # Grayscale between 0 and 1\n", "ax.plot(x, np.sin(x - 3), color='#FFDD44') # Hex code (RRGGBB from 00 to FF)\n", "ax.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # RGB tuple, values 0 and 1\n", "ax.plot(x, np.sin(x - 5), color='chartreuse'); # all HTML color names supported" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "rgbcmyk: cmyk stands for Cyan, Magenta, Yellow, blacK" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots()\n", "x = np.linspace(start=0, stop=4*np.pi)\n", "\n", "# Line styles\n", "ax.plot(x, x + 0, linestyle='solid')\n", "ax.plot(x, x + 1, linestyle='dashed')\n", "ax.plot(x, x + 2, linestyle='dashdot')\n", "ax.plot(x, x + 3, linestyle='dotted');\n", "# shorthand for line styles\n", "ax.plot(x, x + 4, linestyle='-') # solid\n", "ax.plot(x, x + 5, linestyle='--') # dashed\n", "ax.plot(x, x + 6, linestyle='-.') # dashdot\n", "ax.plot(x, x + 7, linestyle=':'); # dotted" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set multiple properties of an AxesSubplot with ax.set() method\n", "x = np.linspace(start=0, stop=4*np.pi)\n", "\n", "ax = plt.axes()\n", "ax.plot(x, np.sin(x))\n", "ax.set(xlim=(0,4*np.pi), ylim=(-1, 1),\n", " xlabel='x', ylabel='sin(x)',\n", " title='Simple Sinusoid');" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax_array = plt.subplots(nrows=4, ncols=3, figsize=(10,10))\n", "fig.tight_layout() # prevents subplots from overlapping\n", "\n", "count = 0\n", "shift = 0\n", "for row in ax_array: \n", " for axes in row:\n", " axes.plot(x, np.sin(x + shift))\n", " 
axes.set(title='t=' + str(count))\n", "        count += 1\n", "        shift += np.pi / 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# currently in Section 4.2.3: Labeling Plots" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "<a id='s3'></a>\n", " \n", "### Preprocessing\n", "\n", "- Data cleaning\n", "- Feature selection\n", "- Feature Engineering\n", "- Data transforms \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Null Values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def check_null_values(df):\n", "    \"\"\"Check if there are missing values in a pd.DataFrame.\"\"\"\n", "    null_counts = df.isnull().sum()\n", "    \n", "    if not null_counts.any():\n", "        print(\"The DataFrame has no null values.\")\n", "    else:\n", "        return null_counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Train-test split (random)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "df = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv')\n", "\n", "def tt_split(data):\n", "    # Specify feature and target matrix\n", "    if isinstance(data, pd.DataFrame):\n", "        X, Y = data.iloc[:, :4], data.iloc[:, -1]\n", "    elif isinstance(data, np.ndarray):\n", "        X, Y = data[:, :4], data[:, -1].astype(float)\n", "\n", "    # Perform train-test split\n", "    X_train, X_test, Y_train, Y_test = train_test_split(\n", "        X, Y, test_size=0.3, random_state=7)\n", "    return X_train, X_test, Y_train, Y_test\n", "\n", "X_train, X_test, Y_train, Y_test = tt_split(df) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Scaling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn import preprocessing\n", "\n", "def scale_data(X_train, X_test, kind=\"standard\"):\n", "    # 'kind' avoids shadowing the builtin 'type'.\n", "    scaler = (preprocessing.StandardScaler() \n", "              if kind == \"standard\"\n", "              else preprocessing.MinMaxScaler())\n", "\n", "    # Fit on the training set only; reuse that fit to transform the test set.\n", "    X_train_scaled = scaler.fit_transform(X_train)\n", "    X_test_scaled = scaler.transform(X_test)\n", "    return X_train_scaled, X_test_scaled\n", "\n", "# Gives minimum of 0, max of 1\n", "X_train, X_test = scale_data(X_train, X_test, \"minmax\")\n", "pd.DataFrame(X_train).describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Gives mean of 0, standard dev of 1\n", "X_train, X_test = scale_data(X_train, X_test, \"standard\")\n", "pd.DataFrame(X_train).describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Evaluating ML Algorithms\n", "\n", "- Test options and evaluation metrics\n", "- \"Benchmark\" algorithms\n", "- Compare algorithms\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics\n", "\n", "# metrics.auc?\n", "# metrics.roc_auc_score?" 
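] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch of comparing benchmark algorithms with k-fold cross-validation (self-contained; it loads iris rather than reusing the variables above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_iris\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.model_selection import cross_val_score\n", "\n", "X, y = load_iris(return_X_y=True)\n", "for model in [LogisticRegression(max_iter=1000), DecisionTreeClassifier()]:\n", "    # 5-fold CV accuracy: mean +/- standard deviation per model\n", "    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')\n", "    print(f\"{type(model).__name__}: {scores.mean():.3f} +/- {scores.std():.3f}\")"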
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "### Algorithm Tuning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Part 2: NumPy & Pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### NumPy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Generalized argmax function for an np.ndarray" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Note: The 'torch.topk' method accomplishes this task\n", "import numpy as np\n", "\n", "def get_index_n_highest(a: np.ndarray, n: int):\n", " \"\"\"\n", " Arguments:\n", " a (np.ndarray, 1D): input vector\n", " n (int): how many of the highest indices you want\n", " \n", " Returns:\n", " highest_indices (np.ndarray): array containing the inidices of \n", " the n highest elements\n", " \n", " Examples:\n", " --------\n", " Return the indices of the rows containing the 6 highest values in \n", " the last column of A, a 2x2 matrix.\n", " \n", " >>> import numpy as np\n", " >>> import copy\n", " >>> rng = np.random.RandomState(7)\n", " >>> A = rng.randint(low=1, high=100, size=(25,25))\n", " >>> last_col = A[:,-1].copy()\n", " \n", " >>> get_index_n_highest(last_col, 6)\n", " array([12 8 23 13 21 20])\n", " >>> last_col[highest_indices]\n", " array([92, 90, 88, 85, 83, 77])\n", " \"\"\"\n", " \n", " highest_indices = a.argsort()[-n:][::-1]\n", " return highest_indices" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "rng = np.random.RandomState()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "M = rng.random((3,4))\n", "np.min(M,v axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ex: Centering data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Let's assume you have some data matrix X.\n", "rng = np.random.RandomState()\n", "X: np.ndarray = rng.randint(low=0, high=5, size=9).reshape(3, -1)\n", "assert X.ndim == 2, \"'X' is a matrix.\"\n", "\n", "# column-wise centering\n", "col_means = np.mean(X, axis=0)\n", "for col_idx, col_vals in enumerate(X.T):\n", " X[:, col_idx] -= col_means[col_idx]\n", "\n", "# row-wise centering\n", "row_means = np.mean(X, axis=1)\n", "for row_idx, row_vals in enumerate(X):\n", " X[row_idx, :] -= row_means[row_idx]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### PCA\n", "\n", "The goal of principal component analysis is to compute the most meaningful basis to re-express a noisy dataset. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from sklearn import decomposition\n", "from numpy import linalg\n", "from typing import Union, Iterable\n", "\n", "def PCA_decomposition(X: np.ndarray, n: int = None):\n", " \"\"\"Perform PCA decomposition on X.\n", " Args:\n", " X (np.ndarray): A tabular (2-D) data matrix consisting entirely \n", " of numerical features.\n", " n (int, optional): Number of principal components to keep for the \n", " decomposition. 
If None, the decomposition is done using the maximum\n", "            possible number of components.\n", "    Returns:\n", "        (sklearn.decomposition.PCA): A PCA decomposition that has been fitted\n", "        to data matrix X.\n", "    \"\"\"\n", "    # Find the maximum valid number of principal components\n", "    n_components: int = linalg.matrix_rank(X) \n", "\n", "    # Perform PCA decomposition on X\n", "    if n is not None:\n", "        if not isinstance(n, int):\n", "            raise ValueError(\"n must be an integer.\")\n", "        elif n > n_components:\n", "            raise ValueError(f\"'n' value of {n} is too high.\" \n", "                + f\" 'n' can be at most {n_components}.\")\n", "        n_components = n\n", "    # 'n_components' is the keyword sklearn's PCA expects.\n", "    pca = decomposition.PCA(n_components=n_components)\n", "    pca.fit(X)\n", "    return pca\n", "\n", "def PCA_req_components(pca_X: decomposition.PCA, \n", "                       threshold: Union[float, Iterable[float]] = 0.99,\n", "                       plot=True, verbose=False) -> pd.DataFrame:\n", "    \"\"\"Compute the number of required components for different variance\n", "    explained values. \n", "    \n", "    Args:\n", "        pca_X (decomposition.PCA): An instance of sklearn.decomposition.PCA \n", "            that has been fitted to data matrix X. \n", "        threshold (Union[float, Iterable[float]]): Variance-explained \n", "            threshold(s) to draw on the plot.\n", "        plot (bool, optional): Toggles whether to display a visualization of \n", "            how the number of principal components varies with variance \n", "            explained.\n", "    Returns:\n", "        req_components (pd.DataFrame): A table containing the required number\n", "            of components for the following threshold values \n", "            [0.9, 0.95, 0.97, 0.99, 0.999].\n", "    \"\"\"\n", "    n_components: int\n", "    n_features: int\n", "    # components_ is an (n_components, n_features) array; unpack its shape.\n", "    n_components, n_features = pca_X.components_.shape\n", "    explained_variance = np.cumsum(pca_X.explained_variance_ratio_)\n", "    threshold_line = np.ones(n_components) * threshold\n", "    p_component_idxs = np.arange(n_components) + 1\n", "\n", "    if plot:\n", "        fig, ax = plt.subplots(figsize=(8, 6))\n", "        ax.plot(p_component_idxs, explained_variance, \n", "                label='cumulative variance explained')\n", "        ax.plot(p_component_idxs, threshold_line, '--', label='threshold')\n", "        ax.set(title = f\"Variance Explained, n_features = {n_features}\",\n", "               xlabel = \"Principal components\", \n", "               ylabel = \"Percentage of Variance Explained\")\n", "        ax.legend()\n", "        plt.show()\n", "\n", "    # Thresholds, required components table\n", "    thresholds = [0.9, 0.95, 0.97, 0.99, 0.999]\n", "    idx = []\n", "    for t in thresholds:\n", "        idx.append(list(explained_variance >= t).index(True))\n", "    thresholds, idx = [np.array(l) for l in [thresholds, idx]]\n", "    req_components = pd.DataFrame(np.vstack([thresholds, idx]),\n", "        index = [\"threshold\", \"principal components\"])\n", "    return req_components\n", "\n", "def PCA_reduction(X: np.ndarray, pca_X: decomposition.PCA) -> np.ndarray:\n", "    \"\"\" Transforms a feature matrix with PCA feature reduction.\n", "    Args:\n", "        X (np.ndarray): Feature matrix\n", "        pca_X (decomposition.PCA): An sklearn.decomposition.PCA instance that \n", "            has been fitted to X.\n", "    Returns:\n", "        X_new (np.ndarray): X with PCA feature reduction.\n", "    \"\"\"\n", "    # 'pca_X' is already fitted, so transform (rather than refit) here.\n", "    X_new = pca_X.transform(X)\n", "    return X_new" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pandas \n", "\n", "[pandas cookbook](https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] 
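}, { "cell_type": "markdown", "metadata": {}, "source": [ "A couple of bread-and-butter pandas idioms before moving on (a minimal sketch on a small made-up DataFrame):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({'city': ['NY', 'NY', 'SF'],\n", "                   'rent': [3000, 3500, 3400]})\n", "df.groupby('city')['rent'].mean()                # mean rent per city\n", "df.sort_values('rent', ascending=False).head(2)  # two highest rents" ] 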
}, { "cell_type": "markdown", "metadata": {}, "source": [ "## Part 3: Deep Learning (PyTorch)\n", "\n", "### Dataset, DataLoader, and MLPs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "import numpy as np\n", "from torch.utils.data import Dataset, DataLoader\n", "from typing import Union\n", "from numpy import ndarray\n", "from torch import Tensor\n", "\n", "class SupervisedTabular(Dataset):\n", " def __init__(self, \n", " X: Union[ndarray, Tensor], \n", " Y: Union[ndarray, Tensor], \n", " kind: str = \"r\"):\n", " self.X = X\n", " self.Y = Y\n", " self.kind = kind\n", " self.check_for_valid_inputs()\n", " self.convert_data_to_tensors()\n", " self.n_samples = self.X.shape[0]\n", "\n", " def __getitem__(self, idx):\n", " return self.X[idx], self.Y[idx]\n", " \n", " def __len__(self) -> int:\n", " return self.n_samples\n", " \n", " def check_for_valid_inputs(self):\n", " X, Y = self.X, self.Y\n", " assert X.ndim in [0, 1, 2], (\n", " f\"The array dimension of X is too high. X.ndim: {X.ndim}\")\n", " assert Y.ndim in [0, 1, 2], (\n", " f\"The array dimension of Y is too high. Y.ndim: {Y.ndim}\")\n", " assert X.shape[0] == Y.shape[0], (\n", " f\"X and Y have different numbers of samples. Dim 0 should match.\")\n", " assert isinstance(X, (ndarray, Tensor))\n", " assert isinstance(Y, (ndarray, Tensor))\n", "\n", " assert self.kind in [\"c\", \"classification\", \"r\", \"regression\"], (\n", " f\"Attribute 'kind' must be 'c' or 'r' for classification\"\n", " +\" or regression.\")\n", " if self.kind in [\"c\", \"classification\"]:\n", " self.kind = \"c\"\n", " else:\n", " self.kind = \"r\"\n", "\n", " def convert_data_to_tensors(self):\n", " X, Y = self.X, self.Y\n", " \n", " if isinstance(X, ndarray):\n", " self.X = torch.from_numpy(X).float()\n", " elif isinstance(X, Tensor):\n", " self.X = X.float()\n", " else:\n", " raise Exception(\"Impossible!\")\n", "\n", " if isinstance(Y, ndarray):\n", " Y = Y.reshape(-1)\n", " if self.kind == \"r\":\n", " self.Y = torch.from_numpy(Y).float()\n", " else:\n", " self.Y = torch.from_numpy(Y).long()\n", " elif isinstance(Y, Tensor):\n", " Y = Y.view(-1)\n", " if self.kind == \"r\":\n", " self.Y = Y.float()\n", " else:\n", " self.Y = Y.long()\n", " else:\n", " raise Exception(\"Impossible!\")\n", " \n", " assert isinstance(X, (ndarray, Tensor))\n", " \n", "# TODO: Dataloader\n", "\n", "class MLPClassifier(nn.Module):\n", " \"\"\"Feed Forward Neural Network (Classification)\"\"\"\n", " def __init__(self, in_nodes: int, n_classes: int):\n", " super(MLPClassifier, self).__init__()\n", " hidden_dim: int = np.around(np.sqrt(in_nodes * n_classes))\n", " self.fc_layers = nn.Sequential(\n", " nn.Linear(in_features=in_nodes, out_features=hidden_dim),\n", " nn.ReLU(),\n", " nn.Linear(hidden_dim, hidden_dim),\n", " nn.ReLU(),\n", " nn.Linear(hidden_dim, n_classes),\n", " nn.LeakyReLU())\n", " \n", " def forward(self, x):\n", " logits = self.fc_layers(x)\n", " return logits\n", " \n", "class MLPRegressor(nn.Module):\n", " \"\"\"Feed Forward Neural Netowrk (Regression)\"\"\"\n", " def __init__(self, in_nodes: int):\n", " super(MLPRegressor, self).__init__()\n", " self.fc_layers = nn.Sequential(\n", " nn.Linear(in_features=in_nodes, out_features=10),\n", " nn.ReLU(),\n", " nn.Linear(10, 5),\n", " nn.ReLU(),\n", " nn.Linear(5, 1),\n", " nn.LeakyReLU())\n", " \n", " def forward(self, x):\n", " return self.fc_layers(x)\n", " \n", "# TODO: Recurrent Neural Network on sequential 
data\n", "\n", "# TODO: CNN for Image classification\n", "\n", "# TODO: NLP\n", "\n", "# TODO: Simple GANs" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import torch.nn as nn\n", "import numpy as np\n", "np.around?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Just uncomment the next cell if you need to install. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install pytorch_lightning --quiet" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pytorch_lightning as pl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Part 4: Statistics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from scipy import stats\n", "\n", "# stats.norm (norm_gen): A normal continuous random variable\n", "mean, var = stats.norm.stats(loc=0, scale=1, moments=\"mv\")\n", " # loc kw specifies mean\n", " # scale kw specifies var\n", "mean, var, skew, kurt = stats.norm.stats(loc=0, scale=1, moments=\"mvsk\")\n", "\n", "stats.norm.rvs(size=5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stats.norm?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Part 5: Miscellaneous" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Anki Interval Modifier\n", "\n", "Informed by the [Janki method](http://www.jackkinsella.ie/articles/janki-method-refined) and my own personal research, it's clear that the interval modifier should be set in such a way that the desired mature retention rate is achieved. In the Anki manual (i.e. SuperMemo), an equation based on Ebbinghaus's [forgetting curve](https://en.wikipedia.org/wiki/Forgetting_curve) is cited from SuperMemo as being able to calculate what setting Anki's interval modifier should have for a desired retention rate. - See [docs.ankiweb.net](https://docs.ankiweb.net/#/deck-options?id=reviews)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "# Note, the arrow is a type hint: https://www.python.org/dev/peps/pep-0484/\n", "\n", "def calcIntervalModifier(modifier_now, success_now, success_wanted=0.85) -> int:\n", " \"\"\"Calculates the interval modifier needed for a target retention rate in Anki.\n", " \n", " Args: \n", " modifier_now (int): Current interval modifier. \n", " success_wanted (real number): Desired retention rate. This can be lower\n", " or higher than `success_now` depending on your goals. 
\n", " success_now (real number): Current mature retention rate.\n", " Returns:\n", " modifier_new (int): Optimal interval modifier.\n", " \n", " Examples:\n", " --------\n", " Suppose you have a mature accuracy of 96% for the past month in a deck.\n", " If you interval modifier is currently set to 100%, you could acheive higher\n", " efficiency by studying more cards and lowering the accuracy to 90%, and \n", " even higher efficiency by going down to 80 or 85%.\n", " \n", " >>> calcIntervalModifier(100, .96, .90)\n", " 258\n", " \n", " The above indicates an optimal setting of 258% for the interval modifier.\n", " \n", " >>> calcIntervalModifier(130, .95)\n", " 412\n", " \"\"\"\n", " modifier_new = modifier_now * np.log(success_wanted) / np.log(success_now)\n", " modifier_new = round(modifier_new)\n", " \n", " return modifier_new" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sharing conda virtual environments\n", "\n", "In order to save an environment's packages and versions so that the environment can quickly be reproduced, you need the `environment.yml`. To get one named \"env_name\", activate your environment in the conda prompt and then enter:\n", "```\n", "conda env export > env_name.yml\n", "```\n", "\n", "This will create a file in the current working directory. Then, recreate the environment from the YAML file:\n", "```\n", "conda env create -f env_name.yml\n", "```\n", "\n", "To verify the installation, enter `conda env list` and then activate the env with `conda activate env_name`. Entering `conda info --envs` will also work.\n", "\n", "source: [docs.conda.io](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### How long caffeine stays in the body\n", "\n", "Caffeine has a half-life of about 6 hours in the body for most people. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def caffeine_in_body(hours: float, pct: bool = True, \n", " input_caff: float = None ) -> float:\n", " \"\"\"Args: \n", " hours (float): Time passed in hours.\n", " pct (bool, optional): Returns caffeine in the body as a percentage \n", " if True. Defaults to True.\n", " input_caff (float, optional): Starting amount of caffeine in the body.\n", " \"\"\"\n", " if pct:\n", " current_caffeine_pct = 0.5**(hours / 6)\n", " return current_caff_pct \n", " else:\n", " if input_caff is None: \n", " raise ValueError(\"'input_caff' must be >= 0.\")\n", " current_caffeine = input_caff * 0.5 **(hours / 6)\n", " return current_caffeine" ] } ], "metadata": { "kernelspec": { "display_name": "Python (ds_env)", "language": "python", "name": "ds_env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "toc": { "base_numbering": 0, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": true, "title_cell": "Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }