{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Author:马肖\n", "#### E-Mail:maxiaoscut@aliyun.com\n", "#### GitHub:https://github.com/Albertsr" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "\n", "def high_categorical(dataframe, high_discrete, k=3):\n", " # df为pandas.DataFrame格式\n", " # feature为df的某一列高势集离散型特征,为pandas.Series格式\n", " # k表示上述离散型特征出现频次最高的k个不重复取值\n", " \n", " value_counts = high_discrete.value_counts()\n", " top_categories = list(value_counts[:k].index)\n", " top_categories.append('other')\n", " \n", " high_discrete = high_discrete.apply(lambda category: category if category in top_categories else 'other')\n", " #print(high_discrete)\n", " feature_dummies = pd.get_dummies(high_discrete, prefix=high_discrete.name)\n", " \n", " dataframe = dataframe.join(feature_dummies)\n", " dataframe.drop(high_discrete.name, axis=1, inplace=True)\n", " return dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 实验" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>销售额</th>\n", " <th>邮编</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>142</td>\n", " <td>10072</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>140</td>\n", " <td>10114</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>130</td>\n", " <td>10037</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>108</td>\n", " <td>10024</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>136</td>\n", " <td>10029</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 销售额 邮编\n", "0 142 10072\n", "1 140 10114\n", "2 130 10037\n", "3 108 10024\n", "4 136 10029" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.seed(2019)\n", "zipcode = np.random.randint(10000, 10150, size=5000)\n", "sales = np.random.randint(100, 150, size=5000)\n", "df = pd.DataFrame({'销售额':sales, '邮编':zipcode})\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>销售额</th>\n", " <th>邮编_10001</th>\n", " <th>邮编_10012</th>\n", " <th>邮编_10075</th>\n", " <th>邮编_10114</th>\n", " <th>邮编_10126</th>\n", " <th>邮编_other</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>142</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>140</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>130</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>108</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>136</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 销售额 邮编_10001 邮编_10012 邮编_10075 邮编_10114 邮编_10126 邮编_other\n", "0 142 0 0 0 0 0 1\n", "1 140 0 0 0 1 0 0\n", "2 130 0 0 0 0 0 1\n", "3 108 0 0 0 0 0 1\n", "4 136 0 0 0 0 0 1" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "high_categorical(df, df['邮编'], k=5).head(5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }