{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "(**Click the icon below to open this notebook in Colab**)\n", "[![Open InColab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/xiangshiyin/machine-learning-for-actuarial-science/blob/main/2025-spring/week03/notebook/demo.ipynb)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/xiangshiyin/Documents/Teaching/machine-learning-for-actuarial-science/.venv/bin/python\n" ] } ], "source": [ "%%sh\n", "\n", "which python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load the MNIST data\n", "\n", "We will take the built-in hand-written digits dataset in `scikit-learn` as an example. This is a copy of the test set of the UCI ML hand-written digits datasets https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n", "\n", "Each datapoint is a 8x8 image of a digit.\n", "\n", "| Attribute | Value |\n", "|---------------------|---------------|\n", "| Classes | 10 |\n", "| Samples per class | ~180 |\n", "| Samples total | 1797 |\n", "| Dimensionality | 64 |\n", "| Features | integers 0-16 |\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sklearn import datasets\n", "\n", "digits = datasets.load_digits()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data = digits.data # the input X\n", "target = digits.target # the output Y\n", "feature_names = digits.feature_names\n", "target_names = digits.target_names" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The shape of the data is (1797, 64)\n", "The shape of the target is (1797,)\n", "First 5 feature names are ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4']\n", "Target names are [0 1 2 3 4 5 6 7 8 9]\n" ] } ], "source": [ "print(f\"The shape of the data is {data.shape}\")\n", "print(f\"The shape of the target is {target.shape}\")\n", "print(f\"First 5 feature names are {feature_names[:5]}\")\n", "print(f\"Target names are {target_names}\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2, 3, 4])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualize the MNIST digits" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAxsAAACvCAYAAACVbcM3AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAEixJREFUeJzt3X+s1XX9B/DXFfgCinIvGjS0wJsmOJk0EIt0XEVCkylsQPxRQUm50oUOEmoprC2DlGSGJpUKLPsjCC6tMZskbNkY+GMXsQkSlzt/LBSCi7iAQD7fP8q7DBTwfd73cM59PLa7yed83q/P63x4cT/36eeec2qKoigCAACgxM4odwMAAEB1EjYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAsujwYaOlpSVqamri/vvvL1nNdevWRU1NTaxbt65kNalO5o9yMn+UmxmknMxf+6jIsLF48eKoqamJ5557rtytZPPGG2/ExIkTo7a2Ns4555y4+eabo7m5udxtEdU/f1u3bo0777wzhg8fHt26dYuamppoaWkpd1v8R7XP34oVK+JLX/pS1NfXx5lnnhmXXHJJTJ8+PVpbW8vdGv9R7TO4cuXKGD16dPTt2ze6du0aF1xwQYwfPz5eeumlcrdGVP/8/a9Ro0ZFTU1N3H777eVu5SPrXO4GONY777wT11xzTezbty++//3vR5cuXeKBBx6IESNGRFNTU5x77rnlbpEqtn79+njwwQfj0ksvjYEDB0ZTU1O5W6ID+eY3vxl9+/aNL3/5y/HJT34yNm/eHAsXLozVq1fHCy+8EN27dy93i1S5zZs3R11dXUybNi3OO++82LlzZzz22GMxbNiwWL9+fVx++eXlbpEOYsWKFbF+/fpyt5FM2DgNPfzww7Ft27bYuHFjXHHFFRERccMNN8Rll10W8+fPj3vvvbfMHVLNbrrppmhtbY2zzz477r//fmGDdrV8+fJoaGh437YhQ4bE5MmT44knnoipU6eWpzE6jHvuueeYbVOnTo0LLrggfv7zn8cjjzxShq7oaA4ePBjTp0+PmTNnHncmK0lF/hrVyfjXv/4V99xzTwwZMiR69uwZZ511Vlx99dWxdu3aD1zzwAMPRL9+/aJ79+4xYsSI494y3bJlS4wfPz569eoV3bp1i6FDh8bvf//7E/bzz3/+M7Zs2RK7d+8+4b7Lly+PK664oi1oREQMGDAgRo4cGb/97W9PuJ7yq+T569WrV5x99tkn3I/TVyXP3/8GjYiIcePGRUTEyy+/fML1nB4qeQaPp3fv3nHmmWf6db4KUQ3z95Of/CSOHj0aM2bMOOk1p6uqDRtvv/12/OpXv4qGhoaYN29ezJkzJ3bt2hWjR48+7v+pXbp0aTz44INx2223xfe+97146aWX4tprr40333yzbZ+//vWv8dnPfjZefvnlmDVrVsyfPz/OOuusGDt2bKxcufJD+9m4cWMMHDgwFi5c+KH7HT16NF588cUYOnToMY8NGzYstm/fHvv37z+5k0DZVOr8UR2qbf527twZERHnnXfeR1pP+6uGGWxtbY1du3bF5s2bY+rUqfH222/HyJEjT3o95VPp8/fqq6/G3LlzY968edXxq6NFBXr88ceLiCieffbZD9znyJEjxaFDh963be/evUWfPn2Kr3/9623bduzYUURE0b179+L1119v275hw4YiIoo777yzbdvIkSOLQYMGFQcPHmzbdvTo0WL48OHFxRdf3LZt7dq1RUQUa9euPWbb7NmzP/S57dq1q4iI4oc//OExjz300ENFRBRbtmz50BrkVc3z97/uu+++IiKKHTt2nNI68ulI8/eeW265pejUqVPxyiuvfKT1lFZHmcFLLrmkiIgiIooePXoUP/jBD4p33333pNeTR0eYv/HjxxfDhw9v+3NEFLfddttJrT0dVe2djU6dOsX//d//RcS/7xbs2bMnjhw5EkOHDo0XXnjhmP3Hjh0b559/ftufhw0bFldeeWWsXr06IiL27NkTTz/9dEycODH2798fu3fvjt27d8c//vGPGD16dGzbti3eeOOND+ynoaEhiqKIOXPmfGjfBw4ciIiIrl27HvNYt27d3rcPp69KnT+qQzXN329+85t49NFHY/r06XHxxRef8nrKoxpm8PHHH48nn3wyHn744Rg4cGAcOHAg3n333ZNeT/lU8vytXbs2fve738WCBQtO7Umfxqr6BeJLliyJ+fPnx5YtW+Lw4cNt2y+88MJj9j3eRezTn/5022sk/va3v0VRFHH33XfH3XfffdzjvfXWW+8b1o/ivdtlhw4dOuaxgwcPvm8fTm+VOH9Uj2qYvz//+c9xyy23xOjRo+NHP/pRSWuTX6XP4Oc+97m2/540aVIMHDgwIqKkn8lAPpU4f0eOHInvfOc78ZWvfOV9r9utdFUbNn7961/HlClTYuzYsfHd7343evfuHZ06dYof//jHsX379lOud/To0YiImDFjRowePfq4+1x00UVJPUf8+8W5Xbt2jb///e/HPPbetr59+yYfh7wqdf6oDtUwf5s2bYqbbropLrvssli+fHl07ly1l6uqVA0z+N/q6uri2muvjSeeeELYqACVOn9Lly6NrVu3xqJFi475fKv9+/dHS0tL25sVVJKq/e69fPnyqK+vjxUrVkRNTU3b9tmzZx93/23bth2z7ZVXXon+/ftHRER9fX1ERHTp0iWuu+660jf8H2eccUYMGjTouB9Ws2HDhqivr/dOQRWgUueP6lDp87d9+/a4/vrro3fv3rF69ero0aNH9mNSWpU+g8dz4MCB2LdvX1mOzamp1Pl79dVX4/Dhw/H5z3/+mMeWLl0aS5cujZUrV8bYsWOz9ZBDVb9mIyKiKIq2bRs2bPjAD0dpbGx83+/bbdy4MTZs2BA33HBDRPz7be8aGhpi0aJFx73rsGvXrg/t51Te9mz8+PHx7LPPvi9wbN26NZ5++umYMGHCCddTfpU8f1S+Sp6/nTt3xhe+8IU444wz4o9//GN87GMfO+EaTj+VPINvvfXWMdtaWlriT3/603HfKZLTT6XO36RJk2LlypXHfEVEfPGLX4yVK1fGlVde+aE1TkcVfWfjscceiyeffPKY7dOmTYsxY8bEihUrYty4cXHjjTfGjh074pFHHolLL7003nnnnWPWXHTRRXHVVVfFt771rTh06FAsWLAgzj333Ljrrrva9nnooYfiqquuikGDBsU3vvGNqK+vjzfffDPWr18fr7/+emzatOkDe924cWNcc801MXv27BO+QOjb3/52/PKXv4wbb7wxZsyYEV26dImf/vSn0adPn5g+ffrJnyCyqtb527dvX/zsZz+LiIi//OUvERGxcOHCqK2tjdra2rj99ttP5vSQWbXO3/XXXx/Nzc1x1113xTPPPBPPPPNM22N9+vSJUaNGncTZoT1U6wwOGjQoRo4cGYMHD466urrYtm1bPProo3H48OGYO3fuyZ8gsqrG+RswYEAMGDDguI9deOGFFXdHo00Z3gEr2Xtve/
ZBX6+99lpx9OjR4t577y369etXdO3atfjMZz5T/OEPfygmT55c9OvXr63We297dt999xXz588vPvGJTxRdu3Ytrr766mLTpk3HHHv79u3FV7/61eLjH/940aVLl+L8888vxowZUyxfvrxtn1K87dlrr71WjB8/vjjnnHOKHj16FGPGjCm2bdv2UU8ZJVTt8/deT8f7+u/eKY9qn78Pe24jRoxIOHOUSrXP4OzZs4uhQ4cWdXV1RefOnYu+ffsWkyZNKl588cWU00aJVPv8HU9U+Fvf1hTFf91jAgAAKJGqfc0GAABQXsIGAACQhbABAABkIWwAAABZCBsAAEAWwgYAAJDFSX+o339/3Hu5lOLTs0vxgTxr1qxJrjFr1qyk9Xv37k3uoRTa652TT4f5K4V169Yl16itrU2ucaIPtTqRxsbG5B5KoT3fubtaZrChoSG5Rin+/puampLWl+J5lEJH+h44c+bM5BqluAY3Nzcn10j9JHDX4MpUiuvn4sWLk2tU7Ifz/Y+TnT93NgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAsuhc7gZOxdy5c5Nr1NfXJ9eoq6tLrrFnz56k9RMnTkzuYdmyZck1ODWtra3JNUaMGJFco6GhIWl9Y2Njcg+cusGDByfXWLt2bXKNffv2Jdfo379/cg1OTeo1dMKECck93Hrrrck1Fi1alFxjyJAhSevXrFmT3APtb8qUKck1mpqakmt0NO5sAAAAWQgbAABAFsIGAACQhbABAABkIWwAAABZCBsAAEAWwgYAAJCFsAEAAGQhbAAAAFkIGwAAQBbCBgAAkIWwAQAAZCFsAAAAWQgbAABAFsIGAACQRef2PNiQIUOS1tfX1yf38KlPfSq5RnNzc3KNp556Kml96rmMiFi2bFlyjY5k8ODByTUaGhqSa5RCU1NTuVvgIxg7dmxyjU2bNiXXaGxsTK4xe/bs5Bqcml/84hdJ6+fNm5fcw3PPPZdcoxTX4DVr1iTXoH3V1tYm15gyZUpyjQULFiTX6N+/f3KNVC0tLe12LHc2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACy6NyeB6urq0ta//zzzyf30NzcnFyjFErxXDg1d9xxR9L6OXPmJPfQs2fP5BqlsG7dunK3wEewYMGC5BotLS2nRR+rVq1KrsGpSb3+1dfXJ/dQihpr1qxJrpH688jevXuTe+DUTJkyJblG//79k2ssXrw4uUbq99DW1tbkHkrxM83JcmcDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACCLzu15sLq6uqT1a9asKVEn5Zd6Lvbu3VuiTjqOBQsWJK1fvHhxcg+ny99bbW1tuVvokFLP+x133JHcw9ixY5NrlMKUKVPK3QKnqLm5OblGr169kms89dRTZa8xatSo5B5Ol+tBe0n93vPAAw8k97BkyZLkGqUwbdq0pPVf+9rXStRJ+3BnAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgi87tebC9e/cmrR8yZEiJOklTV1eXXCP1uSxbtiy5BzquwYMHJ61vamoqSR8dzZw5c5LWT5s2rTSNJBo3blxyjdbW1vRGqDipPwdERIwaNSq5xqJFi5LWz5w5M7mHWbNmJdeoJKn/5vft25fcw+TJk5NrpF4/S6GxsbHcLZwSdzYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMiic3serLm5OWn9kCFDknuYMGHCaVEj1bx588rdAnCKFi9enLS+oaEhuYfLL788ucbKlSuTa6xatSppfeq5jIhobGxMrtGRzJ07N7nGmjVrkmvU1dUl17juuuuS1i9btiy5h45m3bp1Setra2uTexg8eHByjdTnERGxZMmSpPWtra3JPbQndzYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALLo3J4Ha25uTlo/a9as5B7mzp2bXOP5559PrjF06NDkGrSv1tbW5BqrVq1KrnHzzTcn12hoaEhav3jx4uQeOqKmpqak9YMHD07uoRQ15syZk1wjdY5bWlqSe2hsbEyu0ZHs3bs3ucaiRYtK0Em6ZcuWJa2/9dZbS9QJ7akU1/GePXsm1+ho11B3NgAAgCyEDQAAIAthAwAAyELYAAAAshA2AACALIQNAAAgC2EDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyELYAAAAsqgpiqIodxMAAED1cWcDAADIQtgAAACyEDYAAIAshA0AACALYQMAAMhC2AAAALIQNgAAgCyEDQAAIAthAwAAyOL/AQiY2mXU6VNhAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# visualize the first 5 images\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "fig, ax = plt.subplots(1, 5, figsize=(10, 5))\n", "for i in range(5):\n", " ax[i].imshow(data[i].reshape(8, 8), cmap='gray')\n", " ax[i].set_title(f\"Label: {target[i]}\")\n", " ax[i].axis('off')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The format of the model output\n", "The output of the algorithm could be\n", "- Option 1: The actual digits [0,9]\n", "- Option 2: The one-hot encoded vecor\n", " - Example: number 1 will be represented as [0,1,0,0,0,0,0,0,0,0]\n", " \n", "If we go with option 2, here are dimensions of data we are dealing with\n", "- Input shape: 1797 x 64\n", "- Output shape: 1797 x 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The loss function\n", "\n", "We use the cross-entropy loss function for this classification problem. \n", "\n", "## Single Data Point\n", "For a single data point, the categorical cross-entropy loss function is defined as:\n", "\n", "$$\n", "\\begin{aligned}\n", "L = -\\sum_{j=1}^{C}y_j log(p_j)\n", "\\end{aligned}\n", "$$\n", "\n", "Where:\n", "* $C$ is the number of classes\n", "* $y_i$ is the true label (a one-hot encoded vector, where $y_i=1$ for the correct class and $y_i=0$ for the incorrect classes)\n", "* $p_i$ is the predicted probability for class $i$ (output of the softmax function)\n", "\n", "## Multiple Data Points\n", "\n", "For a batch of N data points, the loss function is defined as the average over the whole batch:\n", "\n", "$$\n", "\\begin{aligned}\n", "L = -\\frac{1}{N}\\sum_{i=1}^{N}\\sum_{j=1}^{C}y_{ij} log(p_{ij})\n", "\\end{aligned}\n", "$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Implement the cross-entropy loss function with `numpy`" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Implement the single-sample version of the loss function\n", "def cross_entropy_loss_single(y_true, y_pred):\n", " # Clip the predicted values to avoid log(0)\n", " y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)\n", " # Calculate the cross-entropy loss for a single sample\n", " loss = -np.sum(y_true * np.log(y_pred))\n", " return loss" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loss 1: 0.01005033585350145\n", "Loss 2: 0.35667494393873245\n", "Loss 3: 1.6094379124341003\n", "Loss 4: 1.1086626245216111\n" ] } ], "source": [ "# Example: test the single-sample loss function\n", "y_true = np.array([0,1,0])\n", "y_pred_1 = np.array([0.,0.99,0.01])\n", "y_pred_2 = np.array([0.2,0.7,0.1])\n", "y_pred_3 = np.array([0.7,0.2,0.1])\n", "y_pred_4 = np.array([0.33,0.33,0.34])\n", "loss_1 = cross_entropy_loss_single(y_true,y_pred_1)\n", "loss_2 = cross_entropy_loss_single(y_true,y_pred_2)\n", "loss_3 = cross_entropy_loss_single(y_true,y_pred_3)\n", "loss_4 = cross_entropy_loss_single(y_true,y_pred_4)\n", "print(f\"Loss 1: {loss_1}\")\n", "print(f\"Loss 2: {loss_2}\")\n", "print(f\"Loss 3: {loss_3}\")\n", "print(f\"Loss 4: {loss_4}\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Implement the multi-sample version of the loss function\n", "def 
"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Implement the multi-sample version of the loss function\n", "def cross_entropy_loss_multi(y_true, y_pred):\n", "    # Calculate the number of samples in the batch\n", "    num_samples = y_true.shape[0]\n", "    # Calculate the cross-entropy loss for each sample\n", "    # (assumes y_pred > 0; clip as in the single-sample version if needed)\n", "    losses = -np.sum(y_true * np.log(y_pred), axis=1)\n", "    # Calculate the average loss across all samples\n", "    loss = np.sum(losses) / num_samples\n", "    return loss" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[1 0 0]\n", " [0 1 0]\n", " [0 0 1]]\n", "[[0.58544872 0.10647212 0.30807916]\n", " [0.10495975 0.15775603 0.73728422]\n", " [0.11822677 0.45973382 0.42203941]]\n", "[1. 1. 1.]\n", "Loss = 1.0815796007689922\n" ] } ], "source": [ "# Example: test the multi-sample loss function\n", "## Generate random predicted probabilities whose rows sum to 1\n", "y_true = np.array([\n", "    [1,0,0],\n", "    [0,1,0],\n", "    [0,0,1]\n", "])\n", "y_pred = np.random.random(size=(3, 3))\n", "y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)\n", "loss = cross_entropy_loss_multi(y_true, y_pred)\n", "print(y_true)\n", "print(y_pred)\n", "print(y_pred.sum(axis=1))  # check that each row sums to 1\n", "print(f\"Loss = {loss}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The gradient function\n", "**Q**: What would be the corresponding gradient of the cross-entropy function we defined earlier?\n", "\n", "**A**: It depends on the actual prediction model ..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Softmax function\n", "\n", "The softmax function transforms a vector of real numbers into a vector of probabilities, such that the elements of the output vector are in the range (0, 1) and sum to 1. The softmax function is defined as:\n", "\n", "$$\n", "\\begin{aligned}\n", "\\sigma(z_i) = \\frac{e^{z_i}}{\\sum_{j=1}^C e^{z_j}}\n", "\\end{aligned}\n", "$$\n", "\n", "Where:\n", "- $z_i$ is the $i$-th element of the input vector\n", "- $C$ is the number of elements (classes) in the input vector\n", "- The exponential function in the numerator ensures the obtained values are positive.\n", "- The normalization term in the denominator ensures that the outputs sum to 1 and each individual value lies between 0 and 1.\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def softmax(z):\n", "    # Subtract the row-wise max for numerical stability to avoid overflow\n", "    # (see the illustration below)\n", "    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))\n", "    return exp_z / np.sum(exp_z, axis=1, keepdims=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0.09003057 0.24472847 0.66524096]\n", " [0.84379473 0.04201007 0.1141952 ]]\n" ] } ], "source": [ "z = np.array([\n", "    [1, 2, 3],\n", "    [4, 1, 2]\n", "])\n", "probabilities = softmax(z)\n", "print(probabilities)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.],\n", "       [1.]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.sum(probabilities, axis=1, keepdims=True)" ] }, { 
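"cell_type": "markdown", "metadata": {}, "source": [ "To see why the max-subtraction matters: softmax is invariant to shifting all logits by a constant, so subtracting the row-wise max changes nothing mathematically, while keeping `np.exp` from overflowing on large logits." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Without the max-subtraction trick, np.exp overflows for large logits\n", "print(np.exp(1001.))  # inf (with an overflow warning)\n", "\n", "# With the trick, the result equals softmax of the shifted logits [1, 2, 3]\n", "print(softmax(np.array([[1001., 1002., 1003.]])))" ] }, { 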
"cell_type": "markdown", "metadata": {}, "source": [ "## The gradient of the loss function\n", "\n", "We know that the cross-entropy loss function for multiple data points can be expressed as\n", "$$\n", "\\begin{aligned}\n", "L = -\\frac{1}{N}\\sum_{i=1}^{N}\\sum_{j=1}^{C}y_{ij} \\log(p_{ij})\n", "\\end{aligned}\n", "$$\n", "\n", "Where\n", "- $y_{ij}$ is the ground-truth indicator that the $i^{th}$ data point belongs to class $j$.\n", "- $p_{ij}$ is the predicted probability that the $i^{th}$ data point belongs to class $j$.\n", "\n", "Assume $p_{ij}$ is given by the softmax of a logit vector $z_i = (z_{i1}, \\ldots, z_{iC})$:\n", "$$\n", "\\begin{aligned}\n", "p_{ij} = \\frac{e^{z_{ij}}}{\\sum_{k=1}^{C}e^{z_{ik}}}\n", "\\end{aligned}\n", "$$\n", "\n", "The loss function can then be rewritten as\n", "$$\n", "\\begin{aligned}\n", "L = -\\frac{1}{N}\\sum_{i=1}^{N}\\sum_{j=1}^{C}y_{ij} \\log(\\frac{e^{z_{ij}}}{\\sum_{k=1}^{C}e^{z_{ik}}}) = \\frac{1}{N}\\sum_{i=1}^{N}L_i\n", "\\end{aligned}\n", "$$\n", "\n", "Where\n", "$$\n", "\\begin{aligned}\n", "L_i = - \\sum_{j=1}^{C}y_{ij} \\log(p_{ij}) = - \\sum_{j=1}^{C}y_{ij} \\log(\\frac{e^{z_{ij}}}{\\sum_{k=1}^{C}e^{z_{ik}}})\n", "\\end{aligned}\n", "$$\n", "\n", "To calculate the gradient, we can take the following steps:\n", "### Step 1: Compute $\\frac{\\partial L_i}{\\partial z_{il}}$\n", "$$\n", "\\begin{aligned}\n", "\\frac{\\partial L_i}{\\partial z_{il}} = - \\sum_{j=1}^{C}y_{ij} \\frac{\\partial \\log(p_{ij})}{\\partial z_{il}}\n", "\\end{aligned}\n", "$$\n", "\n", "$$\n", "\\begin{aligned}\n", "\\frac{\\partial \\log(p_{ij})}{\\partial z_{il}} = \\frac{\\partial (z_{ij} - \\log(\\sum_{k=1}^{C}e^{z_{ik}}))}{\\partial z_{il}} = \\delta_{jl} - \\frac{e^{z_{il}}}{\\sum_{k=1}^{C}e^{z_{ik}}} = \\delta_{jl} - p_{il}\n", "\\end{aligned}\n", "$$\n", "\n", "where $\\delta_{jl}$ is the Kronecker delta ($\\delta_{jl} = 1$ if $j = l$, and $0$ otherwise). Therefore, using $\\sum_{j=1}^{C}y_{ij} = 1$ for one-hot labels,\n", "$$\n", "\\begin{aligned}\n", "\\frac{\\partial L_i}{\\partial z_{il}} = - \\sum_{j=1}^{C}y_{ij} (\\delta_{jl} - p_{il}) = -(y_{il} - p_{il} \\sum_{j=1}^{C}y_{ij}) = p_{il} - y_{il}\n", "\\end{aligned}\n", "$$\n", "\n", "### Step 2: Gradient for the batch with regard to the input matrix $Z$\n", "$$\n", "\\begin{aligned}\n", "\\frac{\\partial L}{\\partial z_{il}} = \\frac{1}{N} (p_{il} - y_{il})\n", "\\end{aligned}\n", "$$\n", "\n", "If we rewrite the formula in matrix form, we get:\n", "$$\n", "\\begin{aligned}\n", "\\frac{\\partial L}{\\partial Z} = \\frac{1}{N} (P - Y)\n", "\\end{aligned}\n", "$$\n", "\n", "Where:\n", "- $Z$ is the input (logit) matrix (shape $(N, C)$)\n", "- $P$ is the matrix of predicted probabilities (shape $(N, C)$)\n", "- $Y$ is the one-hot encoded matrix of ground truth labels (shape $(N, C)$)\n", "\n", "### Step 3: Gradient for the batch with regard to the weight matrix $\\theta$\n", "\n", "Assume the following relation between $X$ and $Z$:\n", "$$\n", "\\begin{aligned}\n", "Z = X \\cdot \\theta\n", "\\end{aligned}\n", "$$\n", "\n", "Where:\n", "- $X$ is the input training data matrix (shape $(N, D)$), where $D$ is the number of features\n", "- $\\theta$ is the weight matrix (shape $(D, C)$)\n", "\n", "According to the chain rule, we can compute the gradient of the loss function with regard to $\\theta$ as follows:\n", "$$\n", "\\begin{aligned}\n", "\\frac{\\partial L}{\\partial \\theta} = \\frac{\\partial L}{\\partial Z} \\cdot \\frac{\\partial Z}{\\partial \\theta} = \\frac{1}{N} X^T \\cdot (P - Y)\n", "\\end{aligned}\n", "$$\n", "\n" ] }, { 
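"cell_type": "markdown", "metadata": {}, "source": [ "As a sanity check on the derivation, we can compare the analytic gradient $\\frac{1}{N} X^T \\cdot (P - Y)$ against a centered finite-difference approximation on a small random problem, reusing the `softmax` and `cross_entropy_loss_multi` helpers from above (the `_chk`-suffixed names below are just local variables for this check)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Numerical check of the analytic gradient dL/dtheta = (1/N) X^T (P - Y)\n", "rng = np.random.default_rng(0)\n", "N_chk, D_chk, C_chk = 5, 4, 3\n", "X_chk = rng.normal(size=(N_chk, D_chk))\n", "Y_chk = np.eye(C_chk)[rng.integers(0, C_chk, size=N_chk)]  # random one-hot labels\n", "theta_chk = rng.normal(size=(D_chk, C_chk))\n", "\n", "def loss_fn(th):\n", "    return cross_entropy_loss_multi(Y_chk, softmax(X_chk.dot(th)))\n", "\n", "# Analytic gradient from the derivation above\n", "P_chk = softmax(X_chk.dot(theta_chk))\n", "grad_analytic = X_chk.T.dot(P_chk - Y_chk) / N_chk\n", "\n", "# Centered finite differences, one entry of theta at a time\n", "eps = 1e-6\n", "grad_numeric = np.zeros_like(theta_chk)\n", "for i in range(D_chk):\n", "    for j in range(C_chk):\n", "        th_plus, th_minus = theta_chk.copy(), theta_chk.copy()\n", "        th_plus[i, j] += eps\n", "        th_minus[i, j] -= eps\n", "        grad_numeric[i, j] = (loss_fn(th_plus) - loss_fn(th_minus)) / (2 * eps)\n", "\n", "# The maximum absolute difference should be tiny (on the order of 1e-9)\n", "print(np.abs(grad_analytic - grad_numeric).max())" ] }, { 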
"cell_type": "markdown", "metadata": {}, "source": [ "# Put it all together\n", "\n", "In order to train a multi-class classification model, we need to repeat the following steps until the model converges or until the maximum number of iterations (also known as `epochs`) is reached:\n", "- Iterate over the training data $X_{N \\times D}$, $Y_{N \\times C}$\n", "- Compute $Z = X \\cdot \\theta$ and $P = \\text{softmax}(Z)$\n", "- Update the parameters $\\theta = \\theta - \\frac{\\alpha}{N} X^T \\cdot (P - Y)$, where $\\alpha$ is the learning rate" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The shape of the data is (1797, 64)\n", "The shape of the target is (1797,)\n", "First 5 samples of the target are [0 1 2 3 4]\n", "Target names are [0 1 2 3 4 5 6 7 8 9]\n" ] } ], "source": [ "# Let's look at the data we have again\n", "print(f\"The shape of the data is {data.shape}\")\n", "print(f\"The shape of the target is {target.shape}\")\n", "print(f\"First 5 samples of the target are {target[:5]}\")\n", "print(f\"Target names are {target_names}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Now we have a problem\n", "The target is of shape $(N,)$, not $(N, C)$!\n", "- Specifically, we need the target $Y$ to be of shape $(1797, 10)$" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "N: 1797, C: 10\n" ] } ], "source": [ "# We need to do one-hot encoding for each target label\n", "\n", "N = data.shape[0]\n", "C = len(target_names)\n", "print(f\"N: {N}, C: {C}\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "Y = np.zeros((N, C))\n", "Y[np.arange(N), target] = 1" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", "       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],\n", "       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Y[:3]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's implement the algorithm" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 0: Loss = 60.566761553085776\n", "Epoch 10: Loss = 10.957079130784745\n", "Epoch 20: Loss = 4.073395896554243\n", "Epoch 30: Loss = 2.6622148037966116\n", "Epoch 40: Loss = 1.9901399990076862\n", "Epoch 50: Loss = 1.6523905417468232\n", "Epoch 60: Loss = 1.3407614744731022\n", "Epoch 70: Loss = 1.1322092677026916\n", "Epoch 80: Loss = 0.996374294120693\n", "Epoch 90: Loss = 0.8945413101988137\n" ] } ], "source": [ "import numpy as np\n", "from sklearn import datasets\n", "from sklearn.model_selection import train_test_split\n", "\n", "digits = datasets.load_digits()\n", "\n", "N = digits.data.shape[0]\n", "C = len(digits.target_names)\n", "X = digits.data\n", "Y = np.zeros((N, C))\n", "Y[np.arange(N), digits.target] = 1\n", "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n", "\n", "epochs = 100\n", "alpha = 0.0001\n", "def train(X, Y, alpha, epochs):\n", "    D, C = X.shape[1], Y.shape[1]\n", "    theta = np.random.randn(D, C)\n", "    for epoch in range(epochs):\n", "        Z = X.dot(theta)\n", "        P = softmax(Z)\n", "        loss = cross_entropy_loss_multi(Y, P)\n", "        if epoch % 10 == 0:\n", "            print(f\"Epoch {epoch}: Loss = {loss}\")\n", "        # The 1/N factor from the gradient is absorbed into the learning rate alpha\n", "        theta = theta - alpha * X.T.dot(P - Y)\n", "    return theta\n", "\n", "theta = train(X_train, Y_train, alpha, epochs)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's make some predictions" ] }, { 
"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def predict(X, theta):\n", " P = softmax(X @ theta)\n", " # Convert the one-hot encoded predictions to a single integer\n", " return np.argmax(P, axis=1)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "y_pred = predict(X_test, theta)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([6, 9, 3, 7, 6, 2, 5, 2, 5, 2])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred[:10]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([6, 9, 3, 7, 2, 1, 5, 2, 5, 2])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test = np.argmax(Y_test, axis=1)\n", "y_test[:10]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8916666666666667\n" ] } ], "source": [ "# Measure the accuracy of the model we just built\n", "accuracy = np.mean(y_test == y_pred)\n", "print(f\"Accuracy: {accuracy}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's clean up the code a bit" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's try a pre-built model\n", "\n", "[[Source](https://scikit-learn.org/1.5/auto_examples/classification/plot_digits_classification.html)]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test Accuracy: 97.50%\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/xiangshiyin/Documents/Teaching/machine-learning-for-actuarial-science/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. 
"cell_type": "markdown", "metadata": {}, "source": [ "# Let's try a pre-built model\n", "\n", "[[Source](https://scikit-learn.org/1.5/auto_examples/classification/plot_digits_classification.html)]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test Accuracy: 97.50%\n" ] } ], "source": [ "import numpy as np\n", "from sklearn import datasets\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Load the scikit-learn digits dataset\n", "digits = datasets.load_digits()\n", "\n", "# Prepare the data\n", "X = digits.data  # Features (8x8 pixel images flattened into 64-dimensional vectors)\n", "y = digits.target  # Labels (digits from 0 to 9)\n", "\n", "# Split the data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Initialize the logistic regression model\n", "# (the lbfgs solver uses the multinomial/softmax formulation by default;\n", "# passing multi_class='multinomial' explicitly is deprecated since scikit-learn 1.5)\n", "model = LogisticRegression(max_iter=1000, solver='lbfgs')\n", "\n", "# Train the model\n", "model.fit(X_train, y_train)\n", "\n", "# Make predictions on the test set\n", "y_pred = model.predict(X_test)\n", "\n", "# Evaluate the model\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"Test Accuracy: {accuracy * 100:.2f}%\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 4 }