# %% [markdown]
# ## Information Gain
#
# Nbviewer [https://nbviewer.jupyter.org/github/shaundsouza/deep-learning/blob/master/ig.ipynb](https://nbviewer.jupyter.org/github/shaundsouza/ai-ecosystems-enabling/blob/master/ig.ipynb)
#
# Execute on Binder [https://mybinder.org/v2/gh/shaundsouza/ai-ecosystems-enabling/master?filepath=ig.ipynb](https://mybinder.org/v2/gh/shaundsouza/ai-ecosystems-enabling/master?filepath=ig.ipynb)
#
# Code [https://nbviewer.jupyter.org/format/script/github/shaundsouza/ai-ecosystems-enabling/blob/master/ig.ipynb](https://nbviewer.jupyter.org/format/script/github/shaundsouza/deep-learning/blob/master/ig.ipynb)

# %%
import numpy as np
import pandas as pd


# %%
def B(q):
    """Return the binary entropy, in bits, of a Boolean variable.

    Parameters
    ----------
    q : float
        Probability that the variable is true.

    Returns
    -------
    float
        -(q*log2(q) + (1-q)*log2(1-q)); 0 when q is exactly 0 or 1.
    """
    # log2(0) is -inf, so the two degenerate cases are answered directly.
    if q == 0 or q == 1:
        return 0.0
    # Both terms use base 2 so the result is in bits (B(0.5) == 1).
    return -(q * np.log2(q) + (1 - q) * np.log2(1 - q))


def H(q):
    """Return the entropy, in bits, of the empirical distribution of ``q``.

    Parameters
    ----------
    q : pandas.Series or numpy.ndarray
        Observed values; each distinct value is treated as one category.

    Returns
    -------
    float
        -sum(p * log2(p)) over the distinct values of ``q``.

    Notes
    -----
    Categories are counted with ``q == value``. NaN never compares equal to
    itself, so NaN entries yield a zero count and a NaN result; fill missing
    values before calling.
    """
    total = len(q)
    h = 0
    for value in pd.unique(q):
        p = (q == value).sum() / total
        h = h + p * np.log2(p)
    return -h


def remainder(q, t, positive="Yes"):
    """Return the expected binary entropy of ``t`` after splitting on ``q``.

    Parameters
    ----------
    q : pandas.Series
        Attribute values used to partition the rows.
    t : pandas.Series
        Target values, aligned with ``q``.
    positive : object, default "Yes"
        Target value counted as the positive class.

    Returns
    -------
    float
        Sum over the distinct values of ``q`` of
        (fraction of rows in the subset) * B(fraction of positives in it).
    """
    total = len(q)
    h = 0
    for value in pd.unique(q):
        in_subset = q == value
        subset_size = in_subset.sum()
        positives = (t[in_subset] == positive).sum()
        h = h + subset_size / total * B(positives / subset_size)
    return h


def remainderH(q, t):
    """Return the expected entropy of ``t`` after splitting on ``q``.

    Same weighting as ``remainder``, but each subset is scored with the
    multi-class entropy ``H`` instead of the binary entropy ``B``.

    Parameters
    ----------
    q : pandas.Series
        Attribute values used to partition the rows.
    t : pandas.Series
        Target values, aligned with ``q``.

    Returns
    -------
    float
        Sum over the distinct values of ``q`` of
        (fraction of rows in the subset) * H(target values in it).
    """
    total = len(q)
    h = 0
    for value in pd.unique(q):
        in_subset = q == value
        h = h + in_subset.sum() / total * H(t[in_subset])
    return h


def gain(q, t, positive="Yes"):
    """Return the information gain of attribute ``q`` for a binary target ``t``.

    Parameters
    ----------
    q : pandas.Series
        Attribute values.
    t : pandas.Series
        Target values, aligned with ``q``.
    positive : object, default "Yes"
        Target value counted as the positive class.

    Returns
    -------
    float
        B(fraction of positives in ``t``) - remainder(q, t, positive).
    """
    b = B((t == positive).sum() / len(t))
    return b - remainder(q, t, positive)


def gainH(q, t):
    """Return the information gain of attribute ``q`` for target ``t``.

    Computed as ``H(t) - remainderH(q, t)``; unlike ``gain`` it makes no
    assumption about which labels ``t`` contains.
    """
    return H(t) - remainderH(q, t)


# %%
# In a notebook kernel __name__ is "__main__", so this cell runs as before;
# the guard only keeps the CSV from being read when the definitions above are
# imported from elsewhere.
if __name__ == "__main__":
    dataset = pd.read_csv("house/train.csv")
    size = dataset.shape

    # Missing values are replaced with 0 so they form an ordinary category.
    dataset = dataset.fillna(0)
    names = dataset.columns.values

    # Print the gain of every column with respect to every column.
    for i in range(size[1]):
        for j in range(size[1]):
            print(names[i], names[j], gainH(dataset.iloc[:, i], dataset.iloc[:, j]))