{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Author:马肖\n",
"#### E-Mail:maxiaoscut@aliyun.com\n",
"#### GitHub:https://github.com/Albertsr"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"\n",
"def high_categorical(dataframe, high_discrete, k=3):\n",
" # df为pandas.DataFrame格式\n",
" # feature为df的某一列高势集离散型特征,为pandas.Series格式\n",
" # k表示上述离散型特征出现频次最高的k个不重复取值\n",
" \n",
" value_counts = high_discrete.value_counts()\n",
" top_categories = list(value_counts[:k].index)\n",
" top_categories.append('other')\n",
" \n",
" high_discrete = high_discrete.apply(lambda category: category if category in top_categories else 'other')\n",
" #print(high_discrete)\n",
" feature_dummies = pd.get_dummies(high_discrete, prefix=high_discrete.name)\n",
" \n",
" dataframe = dataframe.join(feature_dummies)\n",
" dataframe.drop(high_discrete.name, axis=1, inplace=True)\n",
" return dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 实验"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 销售额 | \n",
" 邮编 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 142 | \n",
" 10072 | \n",
"
\n",
" \n",
" | 1 | \n",
" 140 | \n",
" 10114 | \n",
"
\n",
" \n",
" | 2 | \n",
" 130 | \n",
" 10037 | \n",
"
\n",
" \n",
" | 3 | \n",
" 108 | \n",
" 10024 | \n",
"
\n",
" \n",
" | 4 | \n",
" 136 | \n",
" 10029 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 销售额 邮编\n",
"0 142 10072\n",
"1 140 10114\n",
"2 130 10037\n",
"3 108 10024\n",
"4 136 10029"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(2019)\n",
"zipcode = np.random.randint(10000, 10150, size=5000)\n",
"sales = np.random.randint(100, 150, size=5000)\n",
"df = pd.DataFrame({'销售额':sales, '邮编':zipcode})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 销售额 | \n",
" 邮编_10001 | \n",
" 邮编_10012 | \n",
" 邮编_10075 | \n",
" 邮编_10114 | \n",
" 邮编_10126 | \n",
" 邮编_other | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 142 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 140 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 130 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 108 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 136 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 销售额 邮编_10001 邮编_10012 邮编_10075 邮编_10114 邮编_10126 邮编_other\n",
"0 142 0 0 0 0 0 1\n",
"1 140 0 0 0 1 0 0\n",
"2 130 0 0 0 0 0 1\n",
"3 108 0 0 0 0 0 1\n",
"4 136 0 0 0 0 0 1"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"high_categorical(df, df['邮编'], k=5).head(5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}