{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "from sklearn import datasets, linear_model\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_length</th>\n",
       "      <th>sepal_width</th>\n",
       "      <th>petal_length</th>\n",
       "      <th>petal_width</th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal_length  sepal_width  petal_length  petal_width species\n",
       "0           5.1          NaN           1.4          0.2  setosa\n",
       "1           4.9          3.0           1.4          0.2  setosa\n",
       "2           4.7          3.2           1.3          0.2  setosa\n",
       "3           4.6          NaN           1.5          0.2  setosa\n",
       "4           5.0          3.6           1.4          0.2  setosa"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### 1.0.0 数据探索EDA <每次面对新数据都应该进行数据探索，依据探索结果进行特征工程和选择模型>\n",
    "#加载iris数据集，数据集被手动删除一些特征值。\n",
    "iris_data = pd.read_csv(\"../_Datasets/iris_miss.data\",sep=',')  #指定字段分隔符，默认逗号\n",
    "\n",
    "##（1）查看前几行数据。\n",
    "iris_data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150, 5)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##（2）查看数据维度（例如二维数据的行列数）\n",
    "iris_data.shape  #150行5列\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',\n",
       "       'species'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##（3）查看特征列名。\n",
    "iris_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['SL', 'SW', 'PL', 'PW', 'species'], dtype='object')"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##（4）特征列重命名\n",
    "\n",
    "#可以根据列名筛选特征。也可以重命名该列。例如将diabetes_X特征s1-s6重命名为feature1 - feature6。\n",
    "##方法1-暴力重命名，此方法需要写全所有列名，否则报错。\n",
    "#diabetes_X.columns = ['feature1', 'feature2', 'feature3', 'feature4', 'species']\n",
    "#diabetes_X.columns\n",
    "\n",
    "#方法2-rename方法，此方法只需写需要重命名的字段。根据需要，非必需\n",
    "iris_data.rename(columns={'sepal_length':'SL','sepal_width':'SW','petal_length':'PL','petal_width':'PW'},inplace=True)\n",
    "iris_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SL</th>\n",
       "      <th>SW</th>\n",
       "      <th>PL</th>\n",
       "      <th>PW</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>147.000000</td>\n",
       "      <td>143.000000</td>\n",
       "      <td>148.000000</td>\n",
       "      <td>149.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>5.859184</td>\n",
       "      <td>3.042657</td>\n",
       "      <td>3.737162</td>\n",
       "      <td>1.205369</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.828413</td>\n",
       "      <td>0.432075</td>\n",
       "      <td>1.766055</td>\n",
       "      <td>0.761292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>4.300000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.100000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>5.100000</td>\n",
       "      <td>2.800000</td>\n",
       "      <td>1.575000</td>\n",
       "      <td>0.300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>5.800000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>4.300000</td>\n",
       "      <td>1.300000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.400000</td>\n",
       "      <td>3.300000</td>\n",
       "      <td>5.100000</td>\n",
       "      <td>1.800000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>7.900000</td>\n",
       "      <td>4.400000</td>\n",
       "      <td>6.900000</td>\n",
       "      <td>2.500000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               SL          SW          PL          PW\n",
       "count  147.000000  143.000000  148.000000  149.000000\n",
       "mean     5.859184    3.042657    3.737162    1.205369\n",
       "std      0.828413    0.432075    1.766055    0.761292\n",
       "min      4.300000    2.000000    1.000000    0.100000\n",
       "25%      5.100000    2.800000    1.575000    0.300000\n",
       "50%      5.800000    3.000000    4.300000    1.300000\n",
       "75%      6.400000    3.300000    5.100000    1.800000\n",
       "max      7.900000    4.400000    6.900000    2.500000"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##（5）统计摘要describe\n",
    "#describe() 方法用于生成DataFrame中数值列的统计摘要。\n",
    "#它提供了每个数值列的计数、均值、标准差、最小值、25th、50th（中位数）、75th 四分位数和最大值。\n",
    "iris_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 150 entries, 0 to 149\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype  \n",
      "---  ------   --------------  -----  \n",
      " 0   SL       147 non-null    float64\n",
      " 1   SW       143 non-null    float64\n",
      " 2   PL       148 non-null    float64\n",
      " 3   PW       149 non-null    float64\n",
      " 4   species  145 non-null    object \n",
      "dtypes: float64(4), object(1)\n",
      "memory usage: 6.0+ KB\n"
     ]
    }
   ],
   "source": [
    "##（6）INFO摘要\n",
    "#info() 方法用于获取DataFrame的摘要信息，包括每列的非空值数量、列的数据类型等\n",
    "iris_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SL         3\n",
       "SW         7\n",
       "PL         2\n",
       "PW         1\n",
       "species    5\n",
       "dtype: int64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 也可以用isnull方法简单计算含缺失特征的样本数。\n",
    "missing_values = iris_data.isnull().sum()\n",
    "missing_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SL</th>\n",
       "      <th>SW</th>\n",
       "      <th>PL</th>\n",
       "      <th>PW</th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.4</td>\n",
       "      <td>3.9</td>\n",
       "      <td>1.7</td>\n",
       "      <td>0.4</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.4</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.3</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>6.7</td>\n",
       "      <td>3.3</td>\n",
       "      <td>5.7</td>\n",
       "      <td>2.5</td>\n",
       "      <td>virginica</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>6.7</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.2</td>\n",
       "      <td>2.3</td>\n",
       "      <td>virginica</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>6.5</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>virginica</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>6.2</td>\n",
       "      <td>3.4</td>\n",
       "      <td>5.4</td>\n",
       "      <td>2.3</td>\n",
       "      <td>virginica</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>5.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.1</td>\n",
       "      <td>1.8</td>\n",
       "      <td>virginica</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>132 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      SL   SW   PL   PW    species\n",
       "1    4.9  3.0  1.4  0.2     setosa\n",
       "2    4.7  3.2  1.3  0.2     setosa\n",
       "4    5.0  3.6  1.4  0.2     setosa\n",
       "5    5.4  3.9  1.7  0.4     setosa\n",
       "6    4.6  3.4  1.4  0.3     setosa\n",
       "..   ...  ...  ...  ...        ...\n",
       "144  6.7  3.3  5.7  2.5  virginica\n",
       "145  6.7  3.0  5.2  2.3  virginica\n",
       "147  6.5  3.0  5.2  2.0  virginica\n",
       "148  6.2  3.4  5.4  2.3  virginica\n",
       "149  5.9  3.0  5.1  1.8  virginica\n",
       "\n",
       "[132 rows x 5 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## （7）缺失值处理\n",
    "#可简单删除缺失值样本，也可补全（特征补全参考5.4节）\n",
    "df_filtered = iris_data.dropna()\n",
    "df_filtered  #一共有18条样本包含缺失值，删除后剩余132条样本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 总结\n",
    "# 数据探索完后，根据探索结果进行样本补全、特征工程等，再训练模型"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}