{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Common imports\n",
"import numpy as np\n",
"import os\n",
"\n",
"from math import log\n",
"import pandas as pd\n",
"import operator\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 前置知识\n",
"**pandas**\n",
"\n",
"- 选择行列:https://blog.csdn.net/qq_38328378/article/details/81166518\n",
"- 统计数量与频率:`iris['Species'].value_counts()`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 载入数据并查看"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Sepal.Length | \n",
" Sepal.Width | \n",
" Petal.Length | \n",
" Petal.Width | \n",
" Species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 5 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"1 5.1 3.5 1.4 0.2 setosa\n",
"2 4.9 3.0 1.4 0.2 setosa\n",
"3 4.7 3.2 1.3 0.2 setosa\n",
"4 4.6 3.1 1.5 0.2 setosa\n",
"5 5.0 3.6 1.4 0.2 setosa"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"# Build the CSV location from relative path components.\n",
"csv_path=os.path.join(\"datas\", \"iris\",\"iris.csv\")\n",
"# Row 0 of the file supplies the column names and column 0 becomes the row index.\n",
"iris=pd.read_csv(csv_path,sep=',',header=0,index_col=0)\n",
"iris.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.describe()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',\n",
" 'Species'],\n",
" dtype='object')"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the column labels of the frame; a later cell iterates over them.\n",
"spec=iris.columns\n",
"spec"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"versicolor 50\n",
"virginica 50\n",
"setosa 50\n",
"Name: Species, dtype: int64\n",
"Index(['versicolor', 'virginica', 'setosa'], dtype='object')\n",
"50\n"
]
}
],
"source": [
"# Count each species once and reuse the result. The third entry is fetched with\n",
"# .iloc because the counts are labelled by species name, not by integer position.\n",
"species_counts=iris['Species'].value_counts()\n",
"print(species_counts,species_counts.index,species_counts.iloc[2],sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 构建信息函数,其实就是熵"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.584962500721156"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def ent(daframe,colum):\n",
"    \"\"\"Return the Shannon entropy, in bits, of one column of a DataFrame.\n",
"\n",
"    Args:\n",
"        daframe: DataFrame holding the data.\n",
"        colum: Label of the column whose value distribution is measured.\n",
"\n",
"    Returns:\n",
"        The sum of -p*log2(p) over the relative frequency p of each distinct value.\n",
"    \"\"\"\n",
"    occurrences=daframe[colum].value_counts()\n",
"    total=sum(occurrences)\n",
"    probabilities=occurrences/total\n",
"    log_probabilities=probabilities.apply(np.log2)\n",
"    return sum(-probabilities*log_probabilities)\n",
"ent(iris,'Species')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.0 0.066667\n",
"6.3 0.060000\n",
"5.1 0.060000\n",
"6.7 0.053333\n",
"5.7 0.053333\n",
"5.5 0.046667\n",
"5.8 0.046667\n",
"6.4 0.046667\n",
"6.0 0.040000\n",
"4.9 0.040000\n",
"6.1 0.040000\n",
"5.4 0.040000\n",
"5.6 0.040000\n",
"6.5 0.033333\n",
"4.8 0.033333\n",
"7.7 0.026667\n",
"6.9 0.026667\n",
"5.2 0.026667\n",
"6.2 0.026667\n",
"4.6 0.026667\n",
"7.2 0.020000\n",
"6.8 0.020000\n",
"4.4 0.020000\n",
"5.9 0.020000\n",
"6.6 0.013333\n",
"4.7 0.013333\n",
"7.6 0.006667\n",
"7.4 0.006667\n",
"4.3 0.006667\n",
"7.9 0.006667\n",
"7.3 0.006667\n",
"7.0 0.006667\n",
"4.5 0.006667\n",
"5.3 0.006667\n",
"7.1 0.006667\n",
"Name: Sepal.Length, dtype: float64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Relative frequency of each distinct Sepal.Length value: counts divided by their total.\n",
"coun=iris['Sepal.Length'].value_counts()\n",
"rat=coun/sum(coun)\n",
"rat"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 构建gain函数"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.584962500721156 Sepal.Length\n",
"1.584962500721156 Sepal.Width\n",
"1.584962500721156 Petal.Length\n",
"1.584962500721156 Petal.Width\n",
"1.584962500721156 Species\n"
]
}
],
"source": [
"def Gain(daframe,D,target='Species'):\n",
"    \"\"\"Return the information gain of splitting `daframe` on column `D`.\n",
"\n",
"    Gain = Ent(target) - sum over each value v of D of\n",
"           (share of rows with D == v) * Ent(target within those rows)\n",
"\n",
"    Args:\n",
"        daframe: DataFrame to split, typically the subset left by the previous branch.\n",
"        D: Label of the candidate split column.\n",
"        target: Label of the class column whose entropy is reduced.\n",
"\n",
"    Returns:\n",
"        The information gain in bits.\n",
"    \"\"\"\n",
"    # Weights come from the frame that was passed in, not from the global data set,\n",
"    # so the function stays correct when called on a branch subset.\n",
"    coun=daframe[D].value_counts()\n",
"    rat=coun/sum(coun)\n",
"    gain=ent(daframe,target)\n",
"    for value,weight in zip(rat.index,rat):\n",
"        # Rows that take this particular value of D.\n",
"        subset=daframe[daframe[D]==value]\n",
"        # Entropy of the class labels inside the subset. D itself is constant here,\n",
"        # so measuring D would always give 0 and leave the gain unchanged.\n",
"        gain-=weight*ent(subset,target)\n",
"    return gain\n",
"# These features are continuous, so splitting on every distinct value is only a\n",
"# demonstration that ent and Gain run end to end. The label column is skipped\n",
"# because it is the target, not a candidate split.\n",
"for col in spec:\n",
"    if col=='Species':\n",
"        continue\n",
"    print(Gain(iris,col),col)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Sepal.Length | \n",
" Sepal.Width | \n",
" Petal.Length | \n",
" Petal.Width | \n",
" Species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 5 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 6 | \n",
" 5.4 | \n",
" 3.9 | \n",
" 1.7 | \n",
" 0.4 | \n",
" setosa | \n",
"
\n",
" \n",
" | 7 | \n",
" 4.6 | \n",
" 3.4 | \n",
" 1.4 | \n",
" 0.3 | \n",
" setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"5 5.0 3.6 1.4 0.2 setosa\n",
"6 5.4 3.9 1.7 0.4 setosa\n",
"7 4.6 3.4 1.4 0.3 setosa"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris.iloc[4:7,:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}