# %%
import warnings

# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the rendered outputs.
warnings.simplefilter(action='ignore')

# %%
import urllib.request
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing

# %% [markdown]
# ### 1. Download the Titanic passenger dataset

# %%
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
file_path = 'data/titanic3.xls'
if not os.path.isfile(file_path):
    # urlretrieve opens the target path directly and does not create
    # missing parent folders, so make sure data/ exists on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    result = urllib.request.urlretrieve(url, file_path)
    print('download:', result)

# %% [markdown]
# ### 2. Read the data with pandas and preprocess it
# #### 2.1 Load titanic3.xls into a DataFrame

# %%
all_df = pd.read_excel(file_path)

# %% [markdown]
# #### 2.2 Look at the first two rows
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhome.dest
011Allen, Miss. Elisabeth Waltonfemale29.00000024160211.3375B5S2NaNSt Louis, MO
111Allison, Master. Hudson Trevormale0.916712113781151.5500C22 C26S11NaNMontreal, PQ / Chesterville, ON
\n", "
# %%
all_df.head(2)

# %% [markdown]
# #### 2.3 Keep only the columns that are needed

# %%
# `survived` is listed first on purpose: later cells slice column 0 as the label.
cols = [
    'survived', 'name', 'pclass', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]
all_df = all_df.loc[:, cols]
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivednamepclasssexagesibspparchfareembarked
01Allen, Miss. Elisabeth Walton1female29.000000211.3375S
11Allison, Master. Hudson Trevor1male0.916712151.5500S
\n", "
# %%
all_df.head(2)

# %% [markdown]
# #### 2.4 Find the columns that contain null values

# %%
all_df.isna().sum()

# %% [markdown]
# #### 2.5 Drop the name column

# %%
df = all_df.drop(columns=['name'])

# %% [markdown]
# #### 2.6 Replace null age and fare values with the column mean

# %%
# Both means are taken over the non-null entries of their own column,
# then applied in a single fill.
age_mean = df['age'].mean()
fare_mean = df['fare'].mean()
df = df.fillna({'age': age_mean, 'fare': fare_mean})
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarked
011female29.000000211.3375S
111male0.916712151.5500S
\n", "
# %%
df.head(2)

# %% [markdown]
# #### 2.7 Convert the sex column to 0 and 1

# %%
# NOTE: this cell rewrites df['sex'] in place, so running it a second time
# on the already-encoded column fails in astype(int) (unmapped values become NaN).
sex_codes = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(sex_codes).astype(int)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarked
011029.000000211.3375S
11110.916712151.5500S
\n", "
# %%
df.head(2)

# %% [markdown]
# #### 2.8 One-hot encode the embarked column

# %%
# Each distinct embarked value becomes its own embarked_<value> indicator column.
x_one_hot_df = pd.get_dummies(df, columns=['embarked'])
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssexagesibspparchfareembarked_Cembarked_Qembarked_S
011029.000000211.3375001
11110.916712151.5500001
\n", "
# %%
x_one_hot_df.head(2)

# %% [markdown]
# ### 3. Convert the DataFrame to an array
# #### 3.1 DataFrame to array

# %%
# Cast explicitly to float. Recent pandas releases emit boolean dummy
# columns, and a frame mixing bool with numeric columns converts to an
# object-dtype array; on older releases the array is already float and the
# cast changes nothing.
ndarray = x_one_hot_df.values.astype(float)

# %% [markdown]
# #### 3.2 Check the shape of ndarray and its first two rows

# %%
ndarray.shape
# %%
ndarray[:2]

# %% [markdown]
# #### 3.3 Extract features and label

# %%
# Column 0 is taken as the label; all remaining columns are the features.
label, features = ndarray[:, 0], ndarray[:, 1:]

# %% [markdown]
# #### 3.4 Check the shape and the first two rows of features and label

# %%
label.shape

# %%
label[:2]

# %%
features.shape

# %%
features[:2]

# %% [markdown]
# ### 4. Scale the array

# %%
# After scaling, every feature lies in the range 0 to 1.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_features = minmax_scale.fit(features).transform(features)
# %%
scaled_features[:2]

# %% [markdown]
# ### 5. Split the data into training and test sets
# #### 5.1 Randomly split the data into training and test sets

# %%
# A fixed seed keeps the split identical across Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
mask = split_rng.rand(len(all_df)) < 0.8  # roughly 80% of rows go to training
train_df = all_df[mask]
test_df = all_df[~mask]
assert len(train_df) + len(test_df) == len(all_df)

# %%
print('total size:', len(all_df))
print('train size:', len(train_df))
print('test size:', len(test_df))

# %% [markdown]
# #### 5.2 Create a function that preprocesses the data

# %%
def _encode_passengers(frame, age_fill, fare_fill):
    """Drop `name`, fill missing age/fare, encode sex as 0/1, one-hot `embarked`."""
    encoded = frame.drop(['name'], axis=1)
    encoded['age'] = encoded['age'].fillna(age_fill)
    encoded['fare'] = encoded['fare'].fillna(fare_fill)
    encoded['sex'] = encoded['sex'].map({'female': 0, 'male': 1}).astype(int)
    return pd.get_dummies(data=encoded, columns=['embarked'])


def preprocess_data(raw_df, reference_df=None):
    """Turn a raw passenger frame into scaled features and a label vector.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Rows to transform. Needs the columns `name`, `age`, `fare`, `sex`
        and `embarked`; the first remaining column after dropping `name`
        is used as the label.
    reference_df : pandas.DataFrame, optional
        Frame from which the fill means, the set of dummy columns and the
        min/max scaling range are derived. Defaults to `raw_df` itself,
        which reproduces the previous single-argument behaviour. Pass the
        training frame here when transforming held-out data so both splits
        share one scale and one column layout.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        `scaled_features` with one row per input row, and `label`, the
        first column of the encoded frame.

    Notes
    -----
    With a separate `reference_df`, scaled values may fall outside [0, 1]
    when `raw_df` contains values beyond the reference range, and an
    `embarked` value absent from the reference yields all-zero indicators.
    """
    if reference_df is None:
        reference_df = raw_df

    # Statistics come from the reference frame only, never from held-out rows.
    age_mean = reference_df['age'].mean()
    fare_mean = reference_df['fare'].mean()

    reference_encoded = _encode_passengers(reference_df, age_mean, fare_mean)
    if reference_df is raw_df:
        encoded = reference_encoded
    else:
        # Align to the reference layout so the feature count never depends
        # on which ports happen to appear in this particular split.
        encoded = _encode_passengers(raw_df, age_mean, fare_mean).reindex(
            columns=reference_encoded.columns, fill_value=0)

    # Explicit float cast: boolean dummy columns would otherwise produce an
    # object-dtype array.
    reference_array = reference_encoded.values.astype(float)
    ndarray = encoded.values.astype(float)
    label = ndarray[:, 0]
    features = ndarray[:, 1:]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    minmax_scale.fit(reference_array[:, 1:])
    scaled_features = minmax_scale.transform(features)

    return scaled_features, label

# %%
train_features, train_label = preprocess_data(train_df)
# The test split reuses the training means, columns and scaling range.
test_features, test_label = preprocess_data(test_df, reference_df=train_df)
assert train_features.shape[1] == test_features.shape[1]
# %% [markdown]
# #### 5.3 Inspect the features and labels of the preprocessed training data

# %%
# First two rows of the scaled training feature matrix.
train_features[:2, :]

# %%
# Labels belonging to those same two rows.
train_label[0:2]