{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"from sklearn import datasets, linear_model\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" NaN | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.6 | \n",
" NaN | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width species\n",
"0 5.1 NaN 1.4 0.2 setosa\n",
"1 4.9 3.0 1.4 0.2 setosa\n",
"2 4.7 3.2 1.3 0.2 setosa\n",
"3 4.6 NaN 1.5 0.2 setosa\n",
"4 5.0 3.6 1.4 0.2 setosa"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"### 1.0.0 Exploratory data analysis (EDA)\n",
"# Explore every new dataset first; the findings drive feature engineering\n",
"# and model selection.\n",
"\n",
"# Load the iris dataset; some feature values were removed from it by hand.\n",
"iris_data = pd.read_csv(\n",
"    \"../_Datasets/iris_miss.data\",\n",
"    sep=\",\",  # field separator; a comma is already the default\n",
")\n",
"\n",
"## (1) Preview the first few rows.\n",
"iris_data.head(n=5)\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(150, 5)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## (2) Check the dimensions of the data (row and column counts for 2-D data).\n",
"iris_data.shape  # 150 rows, 5 columns\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
" 'species'],\n",
" dtype='object')"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## (3) List the feature column names.\n",
"iris_data.columns"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['SL', 'SW', 'PL', 'PW', 'species'], dtype='object')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## (4) Rename feature columns\n",
"\n",
"# Features can be selected by column name, so columns may be renamed for\n",
"# convenience. Renaming is optional; here the four iris measurements are\n",
"# shortened to SL / SW / PL / PW.\n",
"\n",
"# Option 1 - brute force: assign a complete list to `.columns`. Every column\n",
"# must be listed, otherwise pandas raises an error.\n",
"#   iris_data.columns = ['feature1', 'feature2', 'feature3', 'feature4', 'species']\n",
"#   iris_data.columns\n",
"\n",
"# Option 2 - `rename` with a mapping: only the columns being renamed are listed.\n",
"# The result is assigned back rather than using inplace=True, so the call stays\n",
"# chainable; re-running the cell is harmless because mapping keys that no\n",
"# longer exist are ignored by default.\n",
"iris_data = iris_data.rename(\n",
"    columns={\n",
"        'sepal_length': 'SL',\n",
"        'sepal_width': 'SW',\n",
"        'petal_length': 'PL',\n",
"        'petal_width': 'PW',\n",
"    }\n",
")\n",
"iris_data.columns"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SL | \n",
" SW | \n",
" PL | \n",
" PW | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 147.000000 | \n",
" 143.000000 | \n",
" 148.000000 | \n",
" 149.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 5.859184 | \n",
" 3.042657 | \n",
" 3.737162 | \n",
" 1.205369 | \n",
"
\n",
" \n",
" | std | \n",
" 0.828413 | \n",
" 0.432075 | \n",
" 1.766055 | \n",
" 0.761292 | \n",
"
\n",
" \n",
" | min | \n",
" 4.300000 | \n",
" 2.000000 | \n",
" 1.000000 | \n",
" 0.100000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 5.100000 | \n",
" 2.800000 | \n",
" 1.575000 | \n",
" 0.300000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 5.800000 | \n",
" 3.000000 | \n",
" 4.300000 | \n",
" 1.300000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 6.400000 | \n",
" 3.300000 | \n",
" 5.100000 | \n",
" 1.800000 | \n",
"
\n",
" \n",
" | max | \n",
" 7.900000 | \n",
" 4.400000 | \n",
" 6.900000 | \n",
" 2.500000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SL SW PL PW\n",
"count 147.000000 143.000000 148.000000 149.000000\n",
"mean 5.859184 3.042657 3.737162 1.205369\n",
"std 0.828413 0.432075 1.766055 0.761292\n",
"min 4.300000 2.000000 1.000000 0.100000\n",
"25% 5.100000 2.800000 1.575000 0.300000\n",
"50% 5.800000 3.000000 4.300000 1.300000\n",
"75% 6.400000 3.300000 5.100000 1.800000\n",
"max 7.900000 4.400000 6.900000 2.500000"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## (5) Summary statistics with describe()\n",
"# describe() builds a statistical summary of the numeric columns of a DataFrame:\n",
"# count, mean, standard deviation, minimum, the 25th / 50th (median) / 75th\n",
"# percentiles, and maximum for each numeric column.\n",
"iris_data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 150 entries, 0 to 149\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 SL 147 non-null float64\n",
" 1 SW 143 non-null float64\n",
" 2 PL 148 non-null float64\n",
" 3 PW 149 non-null float64\n",
" 4 species 145 non-null object \n",
"dtypes: float64(4), object(1)\n",
"memory usage: 6.0+ KB\n"
]
}
],
"source": [
"## (6) info() summary\n",
"# info() prints a summary of the DataFrame, including the non-null count and\n",
"# the dtype of every column.\n",
"iris_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SL 3\n",
"SW 7\n",
"PL 2\n",
"PW 1\n",
"species 5\n",
"dtype: int64"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A quick alternative: isna() (also available as isnull()) marks missing cells,\n",
"# and summing that boolean mask gives the number of missing values per column.\n",
"missing_values = iris_data.isna().sum()\n",
"missing_values"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SL | \n",
" SW | \n",
" PL | \n",
" PW | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 5 | \n",
" 5.4 | \n",
" 3.9 | \n",
" 1.7 | \n",
" 0.4 | \n",
" setosa | \n",
"
\n",
" \n",
" | 6 | \n",
" 4.6 | \n",
" 3.4 | \n",
" 1.4 | \n",
" 0.3 | \n",
" setosa | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 144 | \n",
" 6.7 | \n",
" 3.3 | \n",
" 5.7 | \n",
" 2.5 | \n",
" virginica | \n",
"
\n",
" \n",
" | 145 | \n",
" 6.7 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.3 | \n",
" virginica | \n",
"
\n",
" \n",
" | 147 | \n",
" 6.5 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.0 | \n",
" virginica | \n",
"
\n",
" \n",
" | 148 | \n",
" 6.2 | \n",
" 3.4 | \n",
" 5.4 | \n",
" 2.3 | \n",
" virginica | \n",
"
\n",
" \n",
" | 149 | \n",
" 5.9 | \n",
" 3.0 | \n",
" 5.1 | \n",
" 1.8 | \n",
" virginica | \n",
"
\n",
" \n",
"
\n",
"
132 rows × 5 columns
\n",
"
"
],
"text/plain": [
" SL SW PL PW species\n",
"1 4.9 3.0 1.4 0.2 setosa\n",
"2 4.7 3.2 1.3 0.2 setosa\n",
"4 5.0 3.6 1.4 0.2 setosa\n",
"5 5.4 3.9 1.7 0.4 setosa\n",
"6 4.6 3.4 1.4 0.3 setosa\n",
".. ... ... ... ... ...\n",
"144 6.7 3.3 5.7 2.5 virginica\n",
"145 6.7 3.0 5.2 2.3 virginica\n",
"147 6.5 3.0 5.2 2.0 virginica\n",
"148 6.2 3.4 5.4 2.3 virginica\n",
"149 5.9 3.0 5.1 1.8 virginica\n",
"\n",
"[132 rows x 5 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## (7) Handling missing values\n",
"# Samples with missing values can simply be dropped, or the gaps can be imputed\n",
"# instead (see section 5.4 for feature imputation).\n",
"df_filtered = iris_data.dropna(axis=0, how=\"any\")\n",
"df_filtered  # 18 samples contain missing values; 132 samples remain after dropping them"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
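"### Summary\n",
"# Once the data has been explored, use the findings to impute missing samples and engineer features, and only then train a model."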
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}