{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Author:马肖\n", "#### E-Mail:maxiaoscut@aliyun.com\n", "#### GitHub:https://github.com/Albertsr" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "\n", "def high_categorical(dataframe, high_discrete, k=3):\n", " # df为pandas.DataFrame格式\n", " # feature为df的某一列高势集离散型特征,为pandas.Series格式\n", " # k表示上述离散型特征出现频次最高的k个不重复取值\n", " \n", " value_counts = high_discrete.value_counts()\n", " top_categories = list(value_counts[:k].index)\n", " top_categories.append('other')\n", " \n", " high_discrete = high_discrete.apply(lambda category: category if category in top_categories else 'other')\n", " #print(high_discrete)\n", " feature_dummies = pd.get_dummies(high_discrete, prefix=high_discrete.name)\n", " \n", " dataframe = dataframe.join(feature_dummies)\n", " dataframe.drop(high_discrete.name, axis=1, inplace=True)\n", " return dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 实验" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
销售额邮编
014210072
114010114
213010037
310810024
413610029
\n", "
" ], "text/plain": [ " 销售额 邮编\n", "0 142 10072\n", "1 140 10114\n", "2 130 10037\n", "3 108 10024\n", "4 136 10029" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.seed(2019)\n", "zipcode = np.random.randint(10000, 10150, size=5000)\n", "sales = np.random.randint(100, 150, size=5000)\n", "df = pd.DataFrame({'销售额':sales, '邮编':zipcode})\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
销售额邮编_10001邮编_10012邮编_10075邮编_10114邮编_10126邮编_other
0142000001
1140000100
2130000001
3108000001
4136000001
\n", "
" ], "text/plain": [ " 销售额 邮编_10001 邮编_10012 邮编_10075 邮编_10114 邮编_10126 邮编_other\n", "0 142 0 0 0 0 0 1\n", "1 140 0 0 0 1 0 0\n", "2 130 0 0 0 0 0 1\n", "3 108 0 0 0 0 0 1\n", "4 136 0 0 0 0 0 1" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "high_categorical(df, df['邮编'], k=5).head(5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }