{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Common imports\n", "import numpy as np\n", "import os\n", "\n", "from math import log\n", "import pandas as pd\n", "import operator\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 前置知识\n", "pandas\n", " 选择行列:https://blog.csdn.net/qq_38328378/article/details/81166518\n", " 统计数量与频率iris['Species'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 载入数据并查看" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
15.13.51.40.2setosa
24.93.01.40.2setosa
34.73.21.30.2setosa
44.63.11.50.2setosa
55.03.61.40.2setosa
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "1 5.1 3.5 1.4 0.2 setosa\n", "2 4.9 3.0 1.4 0.2 setosa\n", "3 4.7 3.2 1.3 0.2 setosa\n", "4 4.6 3.1 1.5 0.2 setosa\n", "5 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "csv_path=os.path.join(\"datas\", \"iris\",\"iris.csv\")\n", "iris=pd.read_csv(csv_path,sep=',',header=0,index_col=0)\n", "iris.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.describe" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',\n", " 'Species'],\n", " dtype='object')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spec=iris.columns\n", "spec" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "versicolor 50\n", "virginica 50\n", "setosa 50\n", "Name: Species, dtype: int64\n", "Index(['versicolor', 'virginica', 'setosa'], dtype='object')\n", "50\n" ] } ], "source": [ "print(iris['Species'].value_counts(),iris['Species'].value_counts().index,iris['Species'].value_counts()[2],sep='\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 构建信息函数,其实就是熵" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.584962500721156" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def ent(daframe,colum):\n", " \"\"\"输入dataframe以及需要计算熵的列索引,输出熵\"\"\"\n", " coun=daframe[colum].value_counts()\n", " rat=coun/sum(coun)\n", " return sum(-rat*rat.apply(np.log2))\n", "ent(iris,'Species')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5.0 0.066667\n", "6.3 0.060000\n", "5.1 0.060000\n", "6.7 0.053333\n", "5.7 0.053333\n", "5.5 0.046667\n", "5.8 0.046667\n", "6.4 0.046667\n", "6.0 0.040000\n", "4.9 0.040000\n", "6.1 0.040000\n", "5.4 0.040000\n", "5.6 0.040000\n", "6.5 0.033333\n", "4.8 0.033333\n", "7.7 0.026667\n", "6.9 0.026667\n", "5.2 0.026667\n", "6.2 0.026667\n", "4.6 0.026667\n", "7.2 0.020000\n", "6.8 0.020000\n", "4.4 0.020000\n", "5.9 0.020000\n", "6.6 0.013333\n", "4.7 0.013333\n", "7.6 0.006667\n", "7.4 0.006667\n", "4.3 0.006667\n", "7.9 0.006667\n", "7.3 0.006667\n", "7.0 0.006667\n", "4.5 0.006667\n", "5.3 0.006667\n", "7.1 0.006667\n", "Name: Sepal.Length, dtype: float64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coun=iris['Sepal.Length'].value_counts()\n", "rat=coun/sum(coun)\n", "rat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 构建gain函数" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.584962500721156 Sepal.Length\n", "1.584962500721156 Sepal.Width\n", "1.584962500721156 Petal.Length\n", "1.584962500721156 Petal.Width\n", "1.584962500721156 Species\n" ] } ], "source": [ "def Gain(daframe,D):\n", " \"\"\"输入:\n", " dataframe,通常是上一次分支后的\n", " D,当前列\n", " a,计划试探的列\"\"\"\n", " coun=iris[D].value_counts()\n", " rat=coun/sum(coun)\n", " gain=ent(daframe,'Species')\n", " for i in range(len(coun)):\n", " #为dataframe中D的每一个取值拆分出叫做temp的,包含该取值的dataframe\n", " temp=daframe[daframe[D]==coun.index[i]]\n", " #print(temp)\n", " gain-=rat.iloc[i]*ent(temp,D)\n", " #print(ent(temp,D))\n", " #print(gain)\n", " return gain\n", "#由于这些是连续值,所以算出来都是0,但是计算ent和gain任务算是完成了\n", "for i in spec:\n", " print(Gain(iris,i),i)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
55.03.61.40.2setosa
65.43.91.70.4setosa
74.63.41.40.3setosa
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "5 5.0 3.6 1.4 0.2 setosa\n", "6 5.4 3.9 1.7 0.4 setosa\n", "7 4.6 3.4 1.4 0.3 setosa" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.iloc[4:7,0:len(iris.columns)+1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }