{
"cells": [
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" customerID gender SeniorCitizen Partner Dependents tenure PhoneService \\\n",
"0 7590-VHVEG Female 0 Yes No 1 No \n",
"1 5575-GNVDE Male 0 No No 34 Yes \n",
"2 3668-QPYBK Male 0 No No 2 Yes \n",
"3 7795-CFOCW Male 0 No No 45 No \n",
"4 9237-HQITU Female 0 No No 2 Yes \n",
"5 9305-CDSKC Female 0 No No 8 Yes \n",
"6 1452-KIOVK Male 0 No Yes 22 Yes \n",
"7 6713-OKOMC Female 0 No No 10 No \n",
"8 7892-POOKP Female 0 Yes No 28 Yes \n",
"9 6388-TABGU Male 0 No Yes 62 Yes \n",
"\n",
" MultipleLines InternetService OnlineSecurity ... DeviceProtection \\\n",
"0 No phone service DSL No ... No \n",
"1 No DSL Yes ... Yes \n",
"2 No DSL Yes ... No \n",
"3 No phone service DSL Yes ... Yes \n",
"4 No Fiber optic No ... No \n",
"5 Yes Fiber optic No ... Yes \n",
"6 Yes Fiber optic No ... No \n",
"7 No phone service DSL Yes ... No \n",
"8 Yes Fiber optic No ... Yes \n",
"9 No DSL Yes ... No \n",
"\n",
" TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \\\n",
"0 No No No Month-to-month Yes \n",
"1 No No No One year No \n",
"2 No No No Month-to-month Yes \n",
"3 Yes No No One year No \n",
"4 No No No Month-to-month Yes \n",
"5 No Yes Yes Month-to-month Yes \n",
"6 No Yes No Month-to-month Yes \n",
"7 No No No Month-to-month No \n",
"8 Yes Yes Yes Month-to-month Yes \n",
"9 No No No One year No \n",
"\n",
" PaymentMethod MonthlyCharges TotalCharges Churn \n",
"0 Electronic check 29.85 29.85 No \n",
"1 Mailed check 56.95 1889.5 No \n",
"2 Mailed check 53.85 108.15 Yes \n",
"3 Bank transfer (automatic) 42.30 1840.75 No \n",
"4 Electronic check 70.70 151.65 Yes \n",
"5 Electronic check 99.65 820.5 Yes \n",
"6 Credit card (automatic) 89.10 1949.4 No \n",
"7 Mailed check 29.75 301.9 No \n",
"8 Electronic check 104.80 3046.05 Yes \n",
"9 Bank transfer (automatic) 56.15 3487.95 No \n",
"\n",
"[10 rows x 21 columns]\n"
]
}
],
"source": [
"# P31 导入文件\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv('/Users/emmett/Downloads/tel_data.csv')\n",
"print(df.head(10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"从上面的结果可以看出,这个文件一共有21列内容,有的是数据,有的是字符。"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customerID | \n",
" gender | \n",
" SeniorCitizen | \n",
" Partner | \n",
" Dependents | \n",
" tenure | \n",
" PhoneService | \n",
" MultipleLines | \n",
" InternetService | \n",
" OnlineSecurity | \n",
" ... | \n",
" DeviceProtection | \n",
" TechSupport | \n",
" StreamingTV | \n",
" StreamingMovies | \n",
" Contract | \n",
" PaperlessBilling | \n",
" PaymentMethod | \n",
" MonthlyCharges | \n",
" TotalCharges | \n",
" Churn | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 2 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 7038 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 7039 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 7040 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 7041 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" 7042 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
7043 rows × 21 columns
\n",
"
"
],
"text/plain": [
" customerID gender SeniorCitizen Partner Dependents tenure \\\n",
"0 False False False False False False \n",
"1 False False False False False False \n",
"2 False False False False False False \n",
"3 False False False False False False \n",
"4 False False False False False False \n",
"... ... ... ... ... ... ... \n",
"7038 False False False False False False \n",
"7039 False False False False False False \n",
"7040 False False False False False False \n",
"7041 False False False False False False \n",
"7042 False False False False False False \n",
"\n",
" PhoneService MultipleLines InternetService OnlineSecurity ... \\\n",
"0 False False False False ... \n",
"1 False False False False ... \n",
"2 False False False False ... \n",
"3 False False False False ... \n",
"4 False False False False ... \n",
"... ... ... ... ... ... \n",
"7038 False False False False ... \n",
"7039 False False False False ... \n",
"7040 False False False False ... \n",
"7041 False False False False ... \n",
"7042 False False False False ... \n",
"\n",
" DeviceProtection TechSupport StreamingTV StreamingMovies Contract \\\n",
"0 False False False False False \n",
"1 False False False False False \n",
"2 False False False False False \n",
"3 False False False False False \n",
"4 False False False False False \n",
"... ... ... ... ... ... \n",
"7038 False False False False False \n",
"7039 False False False False False \n",
"7040 False False False False False \n",
"7041 False False False False False \n",
"7042 False False False False False \n",
"\n",
" PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn \n",
"0 False False False False False \n",
"1 False False False False False \n",
"2 False False False False False \n",
"3 False False False False False \n",
"4 False False False False False \n",
"... ... ... ... ... ... \n",
"7038 False False False False False \n",
"7039 False False False False False \n",
"7040 False False False False False \n",
"7041 False False False False False \n",
"7042 False False False False False \n",
"\n",
"[7043 rows x 21 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P32 检查文件中是否有缺失值\n",
"df.isnull()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到,所有没有缺失的单元格被标上了False。如果该单元格缺失,会被填上True。\n",
"但是这里,我们没有办法看到所有单元格的情况,我们按列进行统计看看。"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customerID 0\n",
"gender 0\n",
"SeniorCitizen 0\n",
"Partner 0\n",
"Dependents 0\n",
"tenure 0\n",
"PhoneService 0\n",
"MultipleLines 0\n",
"InternetService 0\n",
"OnlineSecurity 0\n",
"OnlineBackup 0\n",
"DeviceProtection 0\n",
"TechSupport 0\n",
"StreamingTV 0\n",
"StreamingMovies 0\n",
"Contract 0\n",
"PaperlessBilling 0\n",
"PaymentMethod 0\n",
"MonthlyCharges 0\n",
"TotalCharges 0\n",
"Churn 0\n",
"dtype: int64"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 按行求和,所有的False会被转化为0,所有True会被转化为1,如果一整列都是0,说明该列没有缺失。\n",
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 7043 entries, 0 to 7042\n",
"Data columns (total 21 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 customerID 7043 non-null object \n",
" 1 gender 7043 non-null object \n",
" 2 SeniorCitizen 7043 non-null int64 \n",
" 3 Partner 7043 non-null object \n",
" 4 Dependents 7043 non-null object \n",
" 5 tenure 7043 non-null int64 \n",
" 6 PhoneService 7043 non-null object \n",
" 7 MultipleLines 7043 non-null object \n",
" 8 InternetService 7043 non-null object \n",
" 9 OnlineSecurity 7043 non-null object \n",
" 10 OnlineBackup 7043 non-null object \n",
" 11 DeviceProtection 7043 non-null object \n",
" 12 TechSupport 7043 non-null object \n",
" 13 StreamingTV 7043 non-null object \n",
" 14 StreamingMovies 7043 non-null object \n",
" 15 Contract 7043 non-null object \n",
" 16 PaperlessBilling 7043 non-null object \n",
" 17 PaymentMethod 7043 non-null object \n",
" 18 MonthlyCharges 7043 non-null float64\n",
" 19 TotalCharges 7043 non-null object \n",
" 20 Churn 7043 non-null object \n",
"dtypes: float64(1), int64(2), object(18)\n",
"memory usage: 1.1+ MB\n"
]
}
],
"source": [
"# P33 对数据类型进行修正\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" 11\n",
"20.2 11\n",
"19.75 9\n",
"20.05 8\n",
"19.9 8\n",
" ..\n",
"6849.4 1\n",
"692.35 1\n",
"130.15 1\n",
"3211.9 1\n",
"6844.5 1\n",
"Name: TotalCharges, Length: 6531, dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 在以上的结果中我们发现TotalCharges这一列的数据类型是object,然而这一列应该是数字。肯定是其中混入了一些不是数字的元素,我们对这一列进行统计频次,看看这一列都是啥东西。\n",
"df[\"TotalCharges\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1397.475 11\n",
"20.200 11\n",
"19.750 9\n",
"20.050 8\n",
"19.900 8\n",
" ..\n",
"6849.400 1\n",
"692.350 1\n",
"130.150 1\n",
"3211.900 1\n",
"6844.500 1\n",
"Name: TotalCharges, Length: 6531, dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 发现:有个含空格的单元格出现了11次。现在把这些空单元格替换为这组数据的中位数。\n",
"\n",
"# 中位数\n",
"median = df[\"TotalCharges\"][df[\"TotalCharges\"] != \" \"].median()\n",
"\n",
"# 将median填到空格里\n",
"df.loc[df[\"TotalCharges\"] == \" \", \"TotalCharges\"] = median\n",
"\n",
"# 再把整列的数据类型修改为float\n",
"df[\"TotalCharges\"] = df[\"TotalCharges\"].astype(float)\n",
"\n",
"# 检查\n",
"df[\"TotalCharges\"].value_counts()\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',\n",
" 'tenure', 'PhoneService', 'MultipleLines', 'InternetService',\n",
" 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',\n",
" 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',\n",
" 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],\n",
" dtype='object')"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P34 将数字类型外的其他列都改成分类变量(category)\n",
"# 先看下都有哪些列\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 7043 entries, 0 to 7042\n",
"Data columns (total 21 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 customerID 7043 non-null category\n",
" 1 gender 7043 non-null category\n",
" 2 SeniorCitizen 7043 non-null category\n",
" 3 Partner 7043 non-null category\n",
" 4 Dependents 7043 non-null category\n",
" 5 tenure 7043 non-null float64 \n",
" 6 PhoneService 7043 non-null category\n",
" 7 MultipleLines 7043 non-null category\n",
" 8 InternetService 7043 non-null category\n",
" 9 OnlineSecurity 7043 non-null category\n",
" 10 OnlineBackup 7043 non-null category\n",
" 11 DeviceProtection 7043 non-null category\n",
" 12 TechSupport 7043 non-null category\n",
" 13 StreamingTV 7043 non-null category\n",
" 14 StreamingMovies 7043 non-null category\n",
" 15 Contract 7043 non-null category\n",
" 16 PaperlessBilling 7043 non-null category\n",
" 17 PaymentMethod 7043 non-null category\n",
" 18 MonthlyCharges 7043 non-null float64 \n",
" 19 TotalCharges 7043 non-null float64 \n",
" 20 Churn 7043 non-null category\n",
"dtypes: category(18), float64(3)\n",
"memory usage: 611.1 KB\n"
]
}
],
"source": [
"# 只有tenure, MonthlyCharges, TotalCharges是数字类型(float),其他的都改成分类变量(categorical)\n",
"\n",
"# 先把这三列的列名挑出来\n",
"\n",
"number_columns = [\"tenure\", \"MonthlyCharges\", \"TotalCharges\"]\n",
"for column in number_columns:\n",
" df[column] = df[column].astype(float)\n",
"\n",
"# 修改其他列 set的减法:所有列名的set减去number_columns的set\n",
"for column in set(df.columns) - set(number_columns):\n",
" df[column] = pd.Categorical(df[column])\n",
"\n",
"# 再查看每一列的数据类型\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customerID | \n",
" gender | \n",
" SeniorCitizen | \n",
" Partner | \n",
" Dependents | \n",
" PhoneService | \n",
" MultipleLines | \n",
" InternetService | \n",
" OnlineSecurity | \n",
" OnlineBackup | \n",
" DeviceProtection | \n",
" TechSupport | \n",
" StreamingTV | \n",
" StreamingMovies | \n",
" Contract | \n",
" PaperlessBilling | \n",
" PaymentMethod | \n",
" Churn | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
" 7043 | \n",
"
\n",
" \n",
" unique | \n",
" 7043 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 2 | \n",
" 4 | \n",
" 2 | \n",
"
\n",
" \n",
" top | \n",
" 0002-ORFBO | \n",
" Male | \n",
" 0 | \n",
" No | \n",
" No | \n",
" Yes | \n",
" No | \n",
" Fiber optic | \n",
" No | \n",
" No | \n",
" No | \n",
" No | \n",
" No | \n",
" No | \n",
" Month-to-month | \n",
" Yes | \n",
" Electronic check | \n",
" No | \n",
"
\n",
" \n",
" freq | \n",
" 1 | \n",
" 3555 | \n",
" 5901 | \n",
" 3641 | \n",
" 4933 | \n",
" 6361 | \n",
" 3390 | \n",
" 3096 | \n",
" 3498 | \n",
" 3088 | \n",
" 3095 | \n",
" 3473 | \n",
" 2810 | \n",
" 2785 | \n",
" 3875 | \n",
" 4171 | \n",
" 2365 | \n",
" 5174 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" customerID gender SeniorCitizen Partner Dependents PhoneService \\\n",
"count 7043 7043 7043 7043 7043 7043 \n",
"unique 7043 2 2 2 2 2 \n",
"top 0002-ORFBO Male 0 No No Yes \n",
"freq 1 3555 5901 3641 4933 6361 \n",
"\n",
" MultipleLines InternetService OnlineSecurity OnlineBackup \\\n",
"count 7043 7043 7043 7043 \n",
"unique 3 3 3 3 \n",
"top No Fiber optic No No \n",
"freq 3390 3096 3498 3088 \n",
"\n",
" DeviceProtection TechSupport StreamingTV StreamingMovies \\\n",
"count 7043 7043 7043 7043 \n",
"unique 3 3 3 3 \n",
"top No No No No \n",
"freq 3095 3473 2810 2785 \n",
"\n",
" Contract PaperlessBilling PaymentMethod Churn \n",
"count 7043 7043 7043 7043 \n",
"unique 3 2 4 2 \n",
"top Month-to-month Yes Electronic check No \n",
"freq 3875 4171 2365 5174 "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P35 Describe描述所有分类变量的列\n",
"df.describe(include=[\"category\"])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里和数据类型的describe结果是不一样的\n",
"\n",
"count表示了所有的个数,unique表示种类数,也就是去重之后的,比如gender里面,只有男女,所以unique是2。Top和freq是最多的分类和对应的频数。"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"No 5174\n",
"Yes 1869\n",
"Name: Churn, dtype: int64"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P36 看某一列的数据分布\n",
"df[\"Churn\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Churn PaymentMethod \n",
"No Bank transfer (automatic) 65.049417\n",
" Credit card (automatic) 64.562209\n",
" Electronic check 74.232032\n",
" Mailed check 41.403911\n",
"Yes Bank transfer (automatic) 77.875581\n",
" Credit card (automatic) 77.356034\n",
" Electronic check 78.700980\n",
" Mailed check 54.557143\n",
"Name: MonthlyCharges, dtype: float64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P37 多维度查看月费字段统计\n",
"# Churn:用户是否流失\n",
"# PaymentMethod:支付方式\n",
"# MonthlyCharges:月费\n",
"\n",
"# 查看按照Churn、PaymentMethod两个指标分组后的月费均值\n",
"df.groupby([\"Churn\", \"PaymentMethod\"])[\"MonthlyCharges\"].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"从这里可以很明显的的发现,无论是哪种支付方式,用户流失了(Churn=Yes)的月费均值都大于未流失的。这就是隐藏在数据中的规律。"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 5174\n",
"1 1869\n",
"Name: Churn, dtype: int64"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P38 Churn字段的数据映射\n",
"# 需要把Yes/No转化为1-0数据,这对于机器学习数据的处理是一个很有用的操作\n",
"\n",
"df[\"Churn\"] = df[\"Churn\"].map({\"Yes\":1,\"No\":0})\n",
"df[\"Churn\"].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这和之前对于Churn的描述是一致的,只不过No被替换为了0,Yes被替换为了1。"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/3g/cxwzy4z940n8qvfv8vrkc97m0000gn/T/ipykernel_84349/1043428941.py:4: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n",
" df.corr()\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tenure | \n",
" MonthlyCharges | \n",
" TotalCharges | \n",
"
\n",
" \n",
" \n",
" \n",
" tenure | \n",
" 1.000000 | \n",
" 0.247900 | \n",
" 0.825464 | \n",
"
\n",
" \n",
" MonthlyCharges | \n",
" 0.247900 | \n",
" 1.000000 | \n",
" 0.650864 | \n",
"
\n",
" \n",
" TotalCharges | \n",
" 0.825464 | \n",
" 0.650864 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tenure MonthlyCharges TotalCharges\n",
"tenure 1.000000 0.247900 0.825464\n",
"MonthlyCharges 0.247900 1.000000 0.650864\n",
"TotalCharges 0.825464 0.650864 1.000000"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P39 查看字段的相关性矩阵\n",
"\n",
"# 统计DataFrame的相关性矩阵\n",
"df.corr()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以发现,相关性矩阵只会通过数值型的变量来计算。"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customerID | \n",
" gender | \n",
" SeniorCitizen | \n",
" Partner | \n",
" Dependents | \n",
" tenure | \n",
" PhoneService | \n",
" MultipleLines | \n",
" InternetService | \n",
" OnlineSecurity | \n",
" ... | \n",
" DeviceProtection | \n",
" TechSupport | \n",
" StreamingTV | \n",
" StreamingMovies | \n",
" Contract | \n",
" PaperlessBilling | \n",
" PaymentMethod | \n",
" MonthlyCharges | \n",
" TotalCharges | \n",
" Churn | \n",
"
\n",
" \n",
" \n",
" \n",
" 4824 | \n",
" 3339-EAQNV | \n",
" Male | \n",
" 1 | \n",
" Yes | \n",
" No | \n",
" 72.0 | \n",
" Yes | \n",
" Yes | \n",
" Fiber optic | \n",
" No | \n",
" ... | \n",
" No | \n",
" No | \n",
" Yes | \n",
" Yes | \n",
" One year | \n",
" Yes | \n",
" Credit card (automatic) | \n",
" 97.25 | \n",
" 7133.10 | \n",
" 0 | \n",
"
\n",
" \n",
" 1539 | \n",
" 5071-FBJFS | \n",
" Female | \n",
" 0 | \n",
" Yes | \n",
" Yes | \n",
" 4.0 | \n",
" Yes | \n",
" No | \n",
" DSL | \n",
" Yes | \n",
" ... | \n",
" No | \n",
" No | \n",
" No | \n",
" No | \n",
" Month-to-month | \n",
" No | \n",
" Electronic check | \n",
" 50.30 | \n",
" 217.10 | \n",
" 0 | \n",
"
\n",
" \n",
" 6006 | \n",
" 1360-RCYRT | \n",
" Male | \n",
" 0 | \n",
" Yes | \n",
" Yes | \n",
" 23.0 | \n",
" No | \n",
" No phone service | \n",
" DSL | \n",
" No | \n",
" ... | \n",
" No | \n",
" Yes | \n",
" No | \n",
" No | \n",
" Month-to-month | \n",
" Yes | \n",
" Bank transfer (automatic) | \n",
" 30.35 | \n",
" 678.75 | \n",
" 0 | \n",
"
\n",
" \n",
" 4841 | \n",
" 9058-MJLZC | \n",
" Female | \n",
" 0 | \n",
" No | \n",
" No | \n",
" 24.0 | \n",
" Yes | \n",
" No | \n",
" Fiber optic | \n",
" Yes | \n",
" ... | \n",
" No | \n",
" No | \n",
" Yes | \n",
" Yes | \n",
" Month-to-month | \n",
" Yes | \n",
" Electronic check | \n",
" 94.60 | \n",
" 2283.15 | \n",
" 0 | \n",
"
\n",
" \n",
" 6002 | \n",
" 3727-RJMEO | \n",
" Male | \n",
" 0 | \n",
" Yes | \n",
" No | \n",
" 6.0 | \n",
" Yes | \n",
" Yes | \n",
" Fiber optic | \n",
" No | \n",
" ... | \n",
" No | \n",
" No | \n",
" Yes | \n",
" No | \n",
" Month-to-month | \n",
" Yes | \n",
" Electronic check | \n",
" 82.85 | \n",
" 460.25 | \n",
" 1 | \n",
"
\n",
" \n",
" 5155 | \n",
" 8294-UIMBA | \n",
" Female | \n",
" 0 | \n",
" No | \n",
" No | \n",
" 30.0 | \n",
" Yes | \n",
" No | \n",
" Fiber optic | \n",
" No | \n",
" ... | \n",
" No | \n",
" Yes | \n",
" Yes | \n",
" Yes | \n",
" One year | \n",
" Yes | \n",
" Bank transfer (automatic) | \n",
" 94.40 | \n",
" 2638.10 | \n",
" 0 | \n",
"
\n",
" \n",
" 3793 | \n",
" 6278-FEPBZ | \n",
" Female | \n",
" 0 | \n",
" No | \n",
" No | \n",
" 9.0 | \n",
" Yes | \n",
" No | \n",
" No | \n",
" No internet service | \n",
" ... | \n",
" No internet service | \n",
" No internet service | \n",
" No internet service | \n",
" No internet service | \n",
" Month-to-month | \n",
" No | \n",
" Mailed check | \n",
" 20.25 | \n",
" 186.15 | \n",
" 0 | \n",
"
\n",
" \n",
" 2273 | \n",
" 0219-QAERP | \n",
" Male | \n",
" 0 | \n",
" Yes | \n",
" No | \n",
" 30.0 | \n",
" Yes | \n",
" No | \n",
" No | \n",
" No internet service | \n",
" ... | \n",
" No internet service | \n",
" No internet service | \n",
" No internet service | \n",
" No internet service | \n",
" One year | \n",
" No | \n",
" Credit card (automatic) | \n",
" 19.80 | \n",
" 576.65 | \n",
" 0 | \n",
"
\n",
" \n",
" 5258 | \n",
" 8634-CILSZ | \n",
" Male | \n",
" 0 | \n",
" No | \n",
" No | \n",
" 69.0 | \n",
" Yes | \n",
" Yes | \n",
" Fiber optic | \n",
" No | \n",
" ... | \n",
" No | \n",
" Yes | \n",
" Yes | \n",
" Yes | \n",
" One year | \n",
" Yes | \n",
" Bank transfer (automatic) | \n",
" 104.70 | \n",
" 7220.35 | \n",
" 1 | \n",
"
\n",
" \n",
" 2766 | \n",
" 1229-RCALF | \n",
" Female | \n",
" 0 | \n",
" Yes | \n",
" No | \n",
" 64.0 | \n",
" Yes | \n",
" Yes | \n",
" Fiber optic | \n",
" Yes | \n",
" ... | \n",
" Yes | \n",
" Yes | \n",
" Yes | \n",
" Yes | \n",
" One year | \n",
" Yes | \n",
" Electronic check | \n",
" 115.00 | \n",
" 7396.15 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
10 rows × 21 columns
\n",
"
"
],
"text/plain": [
" customerID gender SeniorCitizen Partner Dependents tenure \\\n",
"4824 3339-EAQNV Male 1 Yes No 72.0 \n",
"1539 5071-FBJFS Female 0 Yes Yes 4.0 \n",
"6006 1360-RCYRT Male 0 Yes Yes 23.0 \n",
"4841 9058-MJLZC Female 0 No No 24.0 \n",
"6002 3727-RJMEO Male 0 Yes No 6.0 \n",
"5155 8294-UIMBA Female 0 No No 30.0 \n",
"3793 6278-FEPBZ Female 0 No No 9.0 \n",
"2273 0219-QAERP Male 0 Yes No 30.0 \n",
"5258 8634-CILSZ Male 0 No No 69.0 \n",
"2766 1229-RCALF Female 0 Yes No 64.0 \n",
"\n",
" PhoneService MultipleLines InternetService OnlineSecurity ... \\\n",
"4824 Yes Yes Fiber optic No ... \n",
"1539 Yes No DSL Yes ... \n",
"6006 No No phone service DSL No ... \n",
"4841 Yes No Fiber optic Yes ... \n",
"6002 Yes Yes Fiber optic No ... \n",
"5155 Yes No Fiber optic No ... \n",
"3793 Yes No No No internet service ... \n",
"2273 Yes No No No internet service ... \n",
"5258 Yes Yes Fiber optic No ... \n",
"2766 Yes Yes Fiber optic Yes ... \n",
"\n",
" DeviceProtection TechSupport StreamingTV \\\n",
"4824 No No Yes \n",
"1539 No No No \n",
"6006 No Yes No \n",
"4841 No No Yes \n",
"6002 No No Yes \n",
"5155 No Yes Yes \n",
"3793 No internet service No internet service No internet service \n",
"2273 No internet service No internet service No internet service \n",
"5258 No Yes Yes \n",
"2766 Yes Yes Yes \n",
"\n",
" StreamingMovies Contract PaperlessBilling \\\n",
"4824 Yes One year Yes \n",
"1539 No Month-to-month No \n",
"6006 No Month-to-month Yes \n",
"4841 Yes Month-to-month Yes \n",
"6002 No Month-to-month Yes \n",
"5155 Yes One year Yes \n",
"3793 No internet service Month-to-month No \n",
"2273 No internet service One year No \n",
"5258 Yes One year Yes \n",
"2766 Yes One year Yes \n",
"\n",
" PaymentMethod MonthlyCharges TotalCharges Churn \n",
"4824 Credit card (automatic) 97.25 7133.10 0 \n",
"1539 Electronic check 50.30 217.10 0 \n",
"6006 Bank transfer (automatic) 30.35 678.75 0 \n",
"4841 Electronic check 94.60 2283.15 0 \n",
"6002 Electronic check 82.85 460.25 1 \n",
"5155 Bank transfer (automatic) 94.40 2638.10 0 \n",
"3793 Mailed check 20.25 186.15 0 \n",
"2273 Credit card (automatic) 19.80 576.65 0 \n",
"5258 Bank transfer (automatic) 104.70 7220.35 1 \n",
"2766 Electronic check 115.00 7396.15 0 \n",
"\n",
"[10 rows x 21 columns]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# P40 数据的采样\n",
"\n",
"# 随机挑选一定的行数\n",
"df.sample(10)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}