{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.simplefilter(action='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import urllib.request\n",
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import preprocessing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. 下载 Titanic 号上旅客的数据集"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'\n",
"file_path = 'data/titanic3.xls'\n",
"if not os.path.isfile(file_path):\n",
" result = urllib.request.urlretrieve(url, file_path)\n",
" print('download:', result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. 使用 pandas 读取数据并进行预处理"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.1 读取 titanic3.xls 文件为 DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"all_df = pd.read_excel(file_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.2 查看前两项数据"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" survived | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home.dest | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" Allen, Miss. Elisabeth Walton | \n",
" female | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 24160 | \n",
" 211.3375 | \n",
" B5 | \n",
" S | \n",
" 2 | \n",
" NaN | \n",
" St Louis, MO | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Allison, Master. Hudson Trevor | \n",
" male | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 113781 | \n",
" 151.5500 | \n",
" C22 C26 | \n",
" S | \n",
" 11 | \n",
" NaN | \n",
" Montreal, PQ / Chesterville, ON | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass survived name sex age sibsp \\\n",
"0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 \n",
"1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 \n",
"\n",
" parch ticket fare cabin embarked boat body \\\n",
"0 0 24160 211.3375 B5 S 2 NaN \n",
"1 2 113781 151.5500 C22 C26 S 11 NaN \n",
"\n",
" home.dest \n",
"0 St Louis, MO \n",
"1 Montreal, PQ / Chesterville, ON "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_df[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.3 把需要的字段选取到 DataFrame 中"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']\n",
"all_df = all_df[cols]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" name | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Allen, Miss. Elisabeth Walton | \n",
" 1 | \n",
" female | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 211.3375 | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Allison, Master. Hudson Trevor | \n",
" 1 | \n",
" male | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived name pclass sex age sibsp \\\n",
"0 1 Allen, Miss. Elisabeth Walton 1 female 29.0000 0 \n",
"1 1 Allison, Master. Hudson Trevor 1 male 0.9167 1 \n",
"\n",
" parch fare embarked \n",
"0 0 211.3375 S \n",
"1 2 151.5500 S "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_df[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.4 找出含有 null 值的字段"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"survived 0\n",
"name 0\n",
"pclass 0\n",
"sex 0\n",
"age 263\n",
"sibsp 0\n",
"parch 0\n",
"fare 1\n",
"embarked 2\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.5 将 name 字段删除"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df = all_df.drop(['name'], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.6 将 age 与 fare 为 null 的数据替换成平均值"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"age_mean = df['age'].mean()\n",
"df['age'] = df['age'].fillna(age_mean)\n",
"\n",
"fare_mean = df['fare'].mean()\n",
"df['fare'] = df['fare'].fillna(fare_mean)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" female | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 211.3375 | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" male | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked\n",
"0 1 1 female 29.0000 0 0 211.3375 S\n",
"1 1 1 male 0.9167 1 2 151.5500 S"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.7 转换性别字段为 0 与 1"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 211.3375 | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked\n",
"0 1 1 0 29.0000 0 0 211.3375 S\n",
"1 1 1 1 0.9167 1 2 151.5500 S"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.8 将 embarked 字段进行 one-hot 编码"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" survived | \n",
" pclass | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" embarked_C | \n",
" embarked_Q | \n",
" embarked_S | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 29.0000 | \n",
" 0 | \n",
" 0 | \n",
" 211.3375 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0.9167 | \n",
" 1 | \n",
" 2 | \n",
" 151.5500 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" survived pclass sex age sibsp parch fare embarked_C \\\n",
"0 1 1 0 29.0000 0 0 211.3375 0 \n",
"1 1 1 1 0.9167 1 2 151.5500 0 \n",
"\n",
" embarked_Q embarked_S \n",
"0 0 1 \n",
"1 0 1 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_one_hot_df[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. 将 DataFrame 转换为 Array"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.1 DataFrame 转换为 Array"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"ndarray = x_one_hot_df.values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.2 查看 ndarray 的 shape, 以及前两项数据"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1309, 10)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ndarray.shape"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1. , 1. , 0. , 29. , 0. , 0. ,\n",
" 211.3375, 0. , 0. , 1. ],\n",
" [ 1. , 1. , 1. , 0.9167, 1. , 2. ,\n",
" 151.55 , 0. , 0. , 1. ]])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ndarray[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.3 提取 features 与 label"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"label = ndarray[:, 0]\n",
"features = ndarray[:, 1:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.4 分别查看 features 与 label 的 shape, 以及前两项数据"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1309,)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label.shape"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1., 1.])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label[:2]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1309, 9)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features.shape"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1. , 0. , 29. , 0. , 0. , 211.3375,\n",
" 0. , 0. , 1. ],\n",
" [ 1. , 1. , 0.9167, 1. , 2. , 151.55 ,\n",
" 0. , 0. , 1. ]])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4. 将 Array 进行标准化"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1)) # 标准仳之后的范围在0与1之间\n",
"scaled_features = minmax_scale.fit_transform(features)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. , 0.36116884, 0. , 0. ,\n",
" 0.41250333, 0. , 0. , 1. ],\n",
" [0. , 1. , 0.00939458, 0.125 , 0.22222222,\n",
" 0.2958059 , 0. , 0. , 1. ]])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaled_features[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5. 将数据分为训练数据与测试数据"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 5.1 将数据以随机方式分为训练数据与测试数据"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"mask = np.random.rand(len(all_df)) < 0.8\n",
"train_df = all_df[mask]\n",
"test_df = all_df[~mask]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total size: 1309\n",
"train size: 1043\n",
"test size: 266\n"
]
}
],
"source": [
"print('total size:', len(all_df))\n",
"print('train size:', len(train_df))\n",
"print('test size:', len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 5.2 创建函数进行数据的预处理"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_data(raw_df):\n",
" df = raw_df.drop(['name'], axis=1)\n",
" age_mean = df['age'].mean()\n",
" df['age'] = df['age'].fillna(age_mean)\n",
" fare_mean = df['fare'].mean()\n",
" df['fare'] = df['fare'].fillna(fare_mean)\n",
" df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)\n",
" x_one_hot_df = pd.get_dummies(data=df, columns=['embarked'])\n",
" \n",
" ndarray = x_one_hot_df.values\n",
" label = ndarray[:, 0]\n",
" features = ndarray[:, 1:]\n",
" \n",
" minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))\n",
" scaled_features = minmax_scale.fit_transform(features)\n",
" \n",
" return scaled_features, label"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"train_features, train_label = preprocess_data(train_df)\n",
"test_features, test_label = preprocess_data(test_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.3 查看数据预处理后训练数据的特征与标签字段"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. , 0.36116884, 0. , 0. ,\n",
" 0.41250333, 0. , 0. , 1. ],\n",
" [0. , 0. , 0.31106443, 0.125 , 0.22222222,\n",
" 0.2958059 , 0. , 0. , 1. ]])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_features[:2]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1., 0.])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_label[:2]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tensorflow-keras-practice",
"language": "python",
"name": "tensorflow-keras-practice"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}