{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "179E3C64C6D247D2866F1CE7E776D441", "scrolled": false }, "outputs": [], "source": [ "# 模型1--许景益模型" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "75A3AD2021FA47AE9011FAC6F5DBEDA6" }, "outputs": [], "source": [ "!mkdir /home/kesci/work/features/\n", "!mkdir /home/kesci/work/basic/" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "B432DE9ADF29479D8D95E6EFB022DD75" }, "outputs": [], "source": [ "-*- coding:utf-8 -*-\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn import metrics\n", "'''\n", "一些基础的工具或环境函数\n", "'''\n", "\n", "def data_path():\n", " return '/mnt/datasets/fusai/'\n", "\n", "def basic_path():\n", " return '/home/kesci/work/basic/'\n", "\n", "def features_path():\n", " return '/home/kesci/work/features/'\n", "\n", "def split_data(data, columns, start_day, end_day):\n", " data = data[(data[columns] >= start_day) & (data[columns] <= end_day)]\n", " return data \n", "\n", "'''\n", "下面返回的lsit说明下:\n", "以1-10天为起始特征区间,用于返回需要划多少天\n", "例如划1-18,就返回18-10=8,\n", "划测试集1-30,就返回30-10=20\n", "需要少划几个对应修改就好\n", "... 要修改起始特征区间,修改下面的 ups 和 downs 函数\n", "''' \n", "def features_addday_list():\n", " return [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20]\n", "\n", "def ups():\n", " return 1\n", "\n", "def downs():\n", " return 10\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "35B9D8B1791740D5AD3D7B80D6FDD5D1" }, "outputs": [], "source": [ "'''\n", "存放标签数据\n", "划几个窗口存几列标签\n", "'''\n", "launch = pd.read_csv(data_path()+'app_launch_log.txt', sep='\\t', header=None,\n", " names=['user_id', 'launch_day'],\n", " dtype={0: np.uint32, 1: np.uint8})\n", "register = pd.read_csv(data_path() + 'user_register_log.txt', sep='\\t', header=None,\n", " names=['user_id', 'register_day', 'register_type', 'device_type'],\n", " dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})\n", "\n", "def get_label_list(start_day, end_day):\n", " result = split_data(launch, 'launch_day', start_day, end_day)['user_id'].drop_duplicates()\n", " return pd.Series(result)\n", "\n", "\n", "if __name__ == '__main__':\n", " up = downs()+1\n", " down = downs()+7\n", " data = register.loc[:, ['user_id']]\n", " for label_num in range(len(features_addday_list())-1):\n", " label_list = get_label_list(up + label_num, down + label_num)\n", " label_name = 'label_' + str(label_num)\n", " data[label_name] = data['user_id'].isin(label_list).replace({True: 1, False: 0})\n", " data.to_csv(basic_path()+'data_label.csv', index=None)\n", " print('data_label.csv complete!')\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "908D4C43DB82447F8EFBD15CA341D393" }, "outputs": [], "source": [ "'''\n", "注册表特征\n", "'''\n", "\n", "if __name__ == '__main__':\n", " up = ups()\n", " down = downs()\n", "\n", " for feature_num in features_addday_list():\n", " # 读数据\n", " register = pd.read_csv(data_path()+'user_register_log.txt', sep='\\t', header=None,\n", " names=['user_id','register_day','register_type','device_type'],\n", " dtype={0: np.uint32, 1: np.uint8, 2: np.uint8, 3: np.uint16})\n", "\n", " # 基础变量定义\n", " feature_start = up + 0\n", " feature_end = down + feature_num\n", "\n", " '''\n", " result_data 是存放特征的结果文件\n", " feature_data 用于存放被提取的原文件\n", " *****_tmp 存放临时特征文件\n", " 类似文件后续不再注释\n", " '''\n", " result_data = split_data(register, 'register_day', 1, feature_end)\n", " feature_data = split_data(register, 
"        # # # # # # # # #\n", "        # extract features (the register_type and device_type columns are kept as-is)\n", "        #\n", "        # last day of the feature window minus the register day\n", "        result_data['maxday_red_registerday'] = max(feature_data['register_day']) - feature_data['register_day']\n", "        result_data = result_data.fillna(max(feature_data['register_day']))\n", "\n", "        del result_data['register_day']\n", "\n", "        # # # # # # # # #\n", "        # save result\n", "        result_file_name = 'register_feature_'+str(feature_num)+'.csv'\n", "        result_data.to_csv(features_path()+result_file_name, index=None)\n", "        print(result_file_name+' complete!')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7D037E68D37742F79FAA8328D56B16BC" }, "outputs": [], "source": [ "'''\n", "Video-creation features\n", "'''\n", "\n", "if __name__ == '__main__':\n", "    up = ups()\n", "    down = downs()\n", "\n", "    for feature_num in features_addday_list():\n", "        # load data\n", "        register = pd.read_csv(data_path() + 'user_register_log.txt', sep='\\t', header=None,\n", "                               names=['user_id', 'register_day', 'register_type', 'device_type'],\n", "                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})\n", "        create = pd.read_csv(data_path() + 'video_create_log.txt', sep='\\t', header=None,\n", "                             names=['user_id', 'create_day'],\n", "                             dtype={0: np.uint32, 1: np.uint8})\n", "\n", "        # basic window bounds\n", "        feature_start = up\n", "        feature_end = down + feature_num\n", "        result_data = split_data(register, 'register_day', 1, feature_end).loc[:, ['user_id', 'register_day']]\n", "        feature_data = split_data(create, 'create_day', feature_start, feature_end)\n", "        del register\n", "        del create\n", "\n", "        # # # # # # # # #\n", "        # extract features\n", "        #\n", "        # per-user video-creation count\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc='count').reset_index().rename(columns={\"create_day\": 'create_count'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        # gaps between the mean/max/min creation day and the register day / window end\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc='mean').reset_index().rename(columns={\"create_day\": 'create_mean'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data['createmean_red_register'] = result_data['create_mean'] - result_data['register_day']\n", "        result_data['maxday_red_createmean'] = max(result_data['register_day']) - result_data['create_mean']\n", "\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc=np.max).reset_index().rename(columns={\"create_day\": 'create_max'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data['createmax_red_register'] = result_data['create_max'] - result_data['register_day']\n", "        result_data['maxday_red_createmax'] = max(result_data['register_day']) - result_data['create_max']\n", "\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc=np.min).reset_index().rename(columns={\"create_day\": 'create_min'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data['createmin_red_register'] = result_data['create_min'] - result_data['register_day']\n", "        result_data['maxday_red_createmin'] = max(result_data['register_day']) - result_data['create_min']\n", "        result_data = result_data.fillna(-1)\n", "\n", "        # span between first and last creation\n", "        result_data['max_red_min_create'] = result_data['create_max'] - result_data['create_min']\n", "\n", "        # active on the window's last day?\n", "        result_data['create_at_lastday'] = pd.Series(\n", "            result_data['create_max'] == max(feature_data['create_day'])).replace({True: 1, False: 0})\n", "\n", "        # recenter mean/max/min onto days before the window end\n", "        result_data['create_mean'] = max(feature_data['create_day']) - result_data['create_mean']\n", "        result_data['create_max'] = max(feature_data['create_day']) - result_data['create_max']\n", "        result_data['create_min'] = max(feature_data['create_day']) - result_data['create_min']\n", "\n", 
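"        # Note (added): the gap feature below subtracts a one-row rotation of the\n", "        # sorted (user_id, day) frame, so each row holds day[i] - day[i-1]. The first\n", "        # value per user mixes two users (and the very first row wraps around to the\n", "        # last), which is why the aggregations skip it via a[1:]. An equivalent\n", "        # per-user sketch (illustrative only):\n", "        def _gap_demo(days):\n", "            arr = np.sort(np.array(days))\n", "            return arr[1:] - arr[:-1]  # e.g. [1, 2, 4] -> [1, 2]\n", "\n", 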
"        # variance/mean of the day gaps\n", "        feature_data_tmp = feature_data.drop_duplicates(['user_id', 'create_day']).sort_values(\n", "            by=['user_id', 'create_day'])\n", "        feature_data_tmp['create_gap'] = np.array(feature_data_tmp['create_day']) - np.array(\n", "            feature_data_tmp.tail(1).append(feature_data_tmp.head(len(feature_data_tmp) - 1))['create_day'])\n", "\n", "        feature_tmp = pd.pivot_table(feature_data_tmp, index='user_id', values='create_gap',\n", "                                     aggfunc=(lambda a: np.average(a[1:]))).reset_index().rename(\n", "            columns={\"create_gap\": 'create_gap_mean'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data_tmp, index='user_id', values='create_gap',\n", "                                     aggfunc=(lambda a: np.var(a[1:]))).reset_index().rename(\n", "            columns={\"create_gap\": 'create_gap_var'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        # created daily / daily right up to the window end?\n", "        result_data['always_create'] = [1 if i == 1 else 0 for i in result_data['create_gap_mean']]\n", "        tmp = (result_data['create_at_lastday'] == 1).replace({True: 1, False: 0})\n", "        result_data['always_create_atlast'] = tmp * result_data['always_create']\n", "        del tmp\n", "\n", "        # variance/kurtosis/skewness of creation days\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc=np.var).reset_index().rename(columns={\"create_day\": 'create_var'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc=pd.Series.kurt).reset_index().rename(columns={\"create_day\": 'create_kurt'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='create_day',\n", "                                     aggfunc=pd.Series.skew).reset_index().rename(columns={\"create_day\": 'create_skew'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        # max creations in a single day\n", "        feature_data['max_create_in_oneday'] = 0\n", "        feature_tmp = pd.pivot_table(feature_data, index=['user_id', 'create_day'], values='max_create_in_oneday',\n", "                                     aggfunc='count').reset_index()\n", "        feature_tmp = pd.DataFrame(feature_tmp.groupby(['user_id'])['max_create_in_oneday'].max()).reset_index()\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data.fillna(0, inplace=True)\n", "\n", "        del result_data['register_day']\n", "\n", "        # # # # # # # #\n", "        # save result\n", "        result_file_name = 'create_feature_' + str(feature_num) + '.csv'\n", "        result_data.to_csv(features_path() + result_file_name, index=None)\n", "        print(result_file_name + ' complete!')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "EB0C0C83E6D84408853D462DE274DAAC" }, "outputs": [], "source": [ "'''\n", "Launch-log features\n", "'''\n", "\n", 
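"# Note (added): 'launch_ratio' below divides by (max - min) launch day over the\n", "# whole window. On the full log that span is positive, but on a degenerate window\n", "# it would be 0; a defensive variant (an assumption, not part of the original) is:\n", "def _safe_span(days):\n", "    span = int(days.max() - days.min())\n", "    return span if span > 0 else 1\n", "\n", 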
"if __name__ == '__main__':\n", " up = ups()\n", " down = downs()\n", "\n", " for feature_num in features_addday_list():\n", " # 读数据\n", " register = pd.read_csv(data_path()+'user_register_log.txt', sep='\\t', header=None,\n", " names=['user_id','register_day','register_type','device_type'],\n", " dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})\n", " launch = pd.read_csv(data_path() + 'app_launch_log.txt', sep='\\t', header=None,\n", " names=['user_id', 'launch_day'],\n", " dtype={0: np.uint32, 1: np.uint8})\n", "\n", " # 基础变量定义\n", " feature_start = up\n", " feature_end = down + feature_num\n", " result_data = split_data(register, 'register_day', 1, feature_end).loc[:, ['user_id', 'register_day']]\n", " feature_data = split_data(launch, 'launch_day', feature_start, feature_end)\n", " del register\n", " del launch\n", "\n", " # # # # # # # # #\n", " # 提特征\n", " #\n", " # 登录计数/登录率\n", " feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day', \n", " aggfunc='count').reset_index().rename(columns={\"launch_day\": 'launch_count'})\n", " result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", " distance = (max(feature_data['launch_day']) - min(feature_data['launch_day']))\n", " result_data['launch_ratio'] = result_data['launch_count'] * 1.0 / distance\n", " result_data = result_data.fillna(0)\n", "\n", " # 登录的 平均/最大/最小日期 与 注册日期/最大时间 的时间差\n", " feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day', \n", " aggfunc='mean').reset_index().rename(columns={\"launch_day\": 'launch_mean'})\n", " result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", " result_data['launchmean_red_register'] = result_data['launch_mean'] - result_data['register_day']\n", " result_data['maxday_red_launchmean'] = max(result_data['register_day']) - result_data['launch_mean']\n", "\n", " feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day', \n", " aggfunc=np.max).reset_index().rename(columns={\"launch_day\": 'launch_max'})\n", " result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", " result_data['launchmax_red_register'] = result_data['launch_max'] - result_data['register_day']\n", " result_data['maxday_red_launchmax'] = max(result_data['register_day']) - result_data['launch_max']\n", "\n", " feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day',\n", " aggfunc=np.min).reset_index().rename(columns={\"launch_day\": 'launch_min'})\n", " result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", " # result_data['launchmin_red_register'] = result_data['launch_min'] - result_data['register_day']\n", " result_data['maxday_red_launchmin'] = max(result_data['register_day']) - result_data['launch_min']\n", " result_data = result_data.fillna(-1)\n", "\n", " # 登录最大与最小差\n", " result_data['max_red_min_launch'] = result_data['launch_max'] - result_data['launch_min']\n", "\n", " # 最后一天是否有活动\n", " result_data['launch_at_lastday'] = pd.Series(result_data['launch_max'] == max(feature_data['launch_day'])).replace({True: 1, False: 0})\n", "\n", " # 均值/最大/最小 天数处理\n", " result_data['launch_mean'] = max(feature_data['launch_day']) - result_data['launch_mean']\n", " result_data['launch_max'] = max(feature_data['launch_day']) - result_data['launch_max']\n", " result_data['launch_min'] = max(feature_data['launch_day']) - result_data['launch_min']\n", "\n", " # 间隔的 方差/均值/最大\n", " feature_data_tmp = feature_data.drop_duplicates(['user_id', 
"        feature_tmp = pd.pivot_table(feature_data_tmp, index='user_id', values='launch_gap',\n", "                                     aggfunc=(lambda a: np.average(a[1:]))).reset_index().rename(columns={\"launch_gap\": 'launch_gap_mean'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data_tmp, index='user_id', values='launch_gap',\n", "                                     aggfunc=(lambda a: np.var(a[1:]))).reset_index().rename(columns={\"launch_gap\": 'launch_gap_var'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data_tmp, index='user_id', values='launch_gap',\n", "                                     aggfunc=(lambda a: np.max(a[1:]))).reset_index().rename(columns={\"launch_gap\": 'launch_gap_max'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        # launched daily / daily right up to the window end?\n", "        result_data['always_launch'] = [1 if i == 1 else 0 for i in result_data['launch_gap_mean']]\n", "        tmp = (result_data['launch_at_lastday'] == 1).replace({True: 1, False: 0})\n", "        result_data['always_launch_atlast'] = tmp * result_data['always_launch']\n", "        del tmp\n", "\n", "        # variance/kurtosis/skewness of launch days\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day',\n", "                                     aggfunc=np.var).reset_index().rename(columns={\"launch_day\": 'launch_var'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day',\n", "                                     aggfunc=pd.Series.kurt).reset_index().rename(columns={\"launch_day\": 'launch_kurt'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='launch_day',\n", "                                     aggfunc=pd.Series.skew).reset_index().rename(columns={\"launch_day\": 'launch_skew'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        del result_data['register_day']\n", "\n", "        # # # # # # # #\n", "        # save result\n", "        result_file_name = 'launch_feature_' + str(feature_num) + '.csv'\n", "        result_data.to_csv(features_path() + result_file_name, index=None)\n", "        print(result_file_name + ' complete!')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "F8379C27B3F7424689D77A3DF2FEFFDA" }, "outputs": [], "source": [ "'''\n", "Activity-log features\n", "\n", "Note:\n", "\n", "this one is really slow\n", "'''\n", "\n", 
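"# Note (added): user_activity_log.txt is by far the largest input, which is what\n", "# makes this cell slow. A lower-memory way to load it (an illustrative sketch using\n", "# pandas' chunksize option; the cell below keeps the original one-shot read):\n", "def _read_activity_in_chunks(path, chunksize=10 ** 6):\n", "    parts = pd.read_csv(path, sep='\\t', header=None,\n", "                        names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'action_type'],\n", "                        dtype={0: np.uint32, 1: np.uint8, 2: np.uint8, 3: np.uint32, 4: np.uint32, 5: np.uint8},\n", "                        chunksize=chunksize)\n", "    return pd.concat(parts, ignore_index=True)\n", "\n", 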
"if __name__ == '__main__':\n", "    up = ups()\n", "    down = downs()\n", "\n", "    for feature_num in features_addday_list():\n", "        # load data\n", "        register = pd.read_csv(data_path() + 'user_register_log.txt', sep='\\t', header=None,\n", "                               names=['user_id', 'register_day', 'register_type', 'device_type'],\n", "                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})\n", "        activity = pd.read_csv(data_path() + 'user_activity_log.txt', sep='\\t', header=None,\n", "                               names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'action_type'],\n", "                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint8, 3: np.uint32, 4: np.uint32, 5: np.uint8})\n", "\n", "        # basic window bounds\n", "        feature_start = up\n", "        feature_end = down + feature_num\n", "        result_data = split_data(register, 'register_day', 1, feature_end).loc[:, ['user_id', 'register_day']]\n", "        feature_data = split_data(activity, 'act_day', feature_start, feature_end)\n", "        del register\n", "        del activity\n", "\n", "        # # # # # # # # #\n", "        # extract features\n", "        #\n", "        # activity count\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc='count').reset_index().rename(columns={\"act_day\": 'act_count'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        # gaps between the mean/max/min activity day and the register day / window end\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc='mean').reset_index().rename(columns={\"act_day\": 'act_mean'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data['actmean_red_register'] = result_data['act_mean'] - result_data['register_day']\n", "        result_data['maxday_red_actmean'] = max(result_data['register_day']) - result_data['act_mean']\n", "\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc=np.max).reset_index().rename(columns={\"act_day\": 'act_max'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data['actmax_red_register'] = result_data['act_max'] - result_data['register_day']\n", "        result_data['maxday_red_actmax'] = max(result_data['register_day']) - result_data['act_max']\n", "\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc=np.min).reset_index().rename(columns={\"act_day\": 'act_min'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data['actmin_red_register'] = result_data['act_min'] - result_data['register_day']\n", "        result_data['maxday_red_actmin'] = max(result_data['register_day']) - result_data['act_min']\n", "        result_data = result_data.fillna(-1)\n", "\n", "        # active on the window's last day?\n", "        result_data['act_at_lastday'] = pd.Series(result_data['act_max'] == max(feature_data['act_day'])).replace({True: 1, False: 0})\n", "\n", "        # recenter mean/max/min onto days before the window end\n", "        result_data['act_mean'] = max(feature_data['act_day']) - result_data['act_mean']\n", "        result_data['act_max'] = max(feature_data['act_day']) - result_data['act_max']\n", "        result_data['act_min'] = max(feature_data['act_day']) - result_data['act_min']\n", "\n", "        # count of actions on the user's own videos\n", "        feature_tmp = pd.pivot_table(feature_data[feature_data['user_id'] == feature_data['author_id']],\n", "                                     index='user_id', values='author_id', aggfunc='count').reset_index().rename(columns={\"author_id\": 'act_self_count'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", "        # variance/kurtosis/skewness of activity days\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc=np.var).reset_index().rename(columns={\"act_day\": 'act_var'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc=pd.Series.kurt).reset_index().rename(columns={\"act_day\": 'act_kurt'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        feature_tmp = pd.pivot_table(feature_data, index='user_id', values='act_day',\n", "                                     aggfunc=pd.Series.skew).reset_index().rename(columns={\"act_day\": 'act_skew'})\n", "        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')\n", "        result_data = result_data.fillna(0)\n", "\n", 
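"        # Note (added): the two blocks below build per-category counts by merging one\n", "        # pivot per action_type/page value. A one-pass equivalent (illustrative only;\n", "        # the original loops are kept) could use groupby + unstack:\n", "        def _category_counts(df, key):\n", "            tab = df.groupby(['user_id', key]).size().unstack(fill_value=0)\n", "            return tab.add_prefix(key + '_').reset_index()\n", "\n", 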
"        # action counts / ratios\n", "        feature_tmp = feature_data.loc[:, ['user_id', 'action_type', 'act_day']].groupby(['user_id', 'action_type']).count().reset_index().rename(columns={\"act_day\": 'action_count'})\n", "        for i in range(6):\n", "            fea_name = 'action_' + str(i) + '_count'\n", "            action_tmp = feature_tmp[feature_tmp['action_type'] == i].loc[:, ['user_id', 'action_count']].rename(columns={\"action_count\": fea_name})\n", "            result_data = pd.merge(result_data, action_tmp, how='left', on='user_id')\n", "            result_data = result_data.fillna(0)\n", "        result_data['action_all'] = (result_data['action_0_count']+result_data['action_1_count']+\n", "                                     result_data['action_2_count']+result_data['action_3_count']+\n", "                                     result_data['action_4_count']+result_data['action_5_count']).replace(0, 1)\n", "        for i in range(6):\n", "            fea_name = 'action_' + str(i) + '_ratio'\n", "            fea_name_2 = 'action_' + str(i) + '_count'\n", "            result_data[fea_name] = result_data[fea_name_2] / result_data['action_all']\n", "\n", "        # page counts / ratios\n", "        feature_tmp = feature_data.loc[:, ['user_id', 'page', 'act_day']].groupby(['user_id', 'page']).count().reset_index().rename(columns={\"act_day\": 'page_count'})\n", "        for i in range(5):\n", "            fea_name = 'page_' + str(i) + '_count'\n", "            page_tmp = feature_tmp[feature_tmp['page'] == i].loc[:, ['user_id', 'page_count']].rename(columns={\"page_count\": fea_name})\n", "            result_data = pd.merge(result_data, page_tmp, how='left', on='user_id')\n", "            result_data = result_data.fillna(0)\n", "        result_data['page_all'] = (result_data['page_0_count']+result_data['page_1_count']+\n", "                                   result_data['page_2_count']+result_data['page_3_count']+\n", "                                   result_data['page_4_count']).replace(0, 1)\n", "        for i in range(5):\n", "            fea_name = 'page_' + str(i) + '_ratio'\n", "            fea_name_2 = 'page_' + str(i) + '_count'\n", "            result_data[fea_name] = result_data[fea_name_2] / result_data['page_all']\n", "\n", "        del result_data['page_all']\n", "        del result_data['action_all']\n", "        del result_data['register_day']\n", "\n", "        # # # # # # # #\n", "        # save result\n", "        result_file_name = 'activity_feature_' + str(feature_num) + '.csv'\n", "        result_data.to_csv(features_path() + result_file_name, index=None)\n", "        print(result_file_name + ' complete!')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "D699409D99294B088F2C0B5F64EA5D88" }, "outputs": [], "source": [ "'''\n", "Run the model\n", "'''\n", "import xgboost as xgb\n", "\n", "def get_feature(num, data_label=None):\n", "    register = pd.read_csv(features_path()+'register_feature_'+str(num)+'.csv')\n", "    create = pd.read_csv(features_path()+'create_feature_'+str(num)+'.csv')\n", "    launch = pd.read_csv(features_path()+'launch_feature_'+str(num)+'.csv')\n", "    activity = pd.read_csv(features_path()+'activity_feature_'+str(num)+'.csv')\n", "    feature = pd.merge(register, launch, on='user_id', how='left')\n", "    feature = pd.merge(feature, activity, on='user_id', how='left')\n", "    feature = pd.merge(feature, create, on='user_id', how='left')\n", "    del register\n", "    del create\n", "    del launch\n", "\n", "    if data_label is not None:\n", "        label_name = 'label_' + str(num)\n", "        data_label_tmp = data_label[data_label['user_id'].isin(feature['user_id'])]\n", "        data_label_tmp = data_label_tmp.loc[:, ['user_id', label_name]]\n", "        data_label_tmp.columns = ['user_id', 'label']\n", "        feature = pd.merge(feature, data_label_tmp, on='user_id', how='left')\n", "    return feature\n", "\n", "\n", "if __name__ == '__main__':\n", "    # load labels\n", "    data_label = pd.read_csv(basic_path()+'data_label.csv')\n", "\n", "    # load features: windows 0-10 for training, window 20 (days 1-30) for the test set\n", "    test_x = get_feature('20')\n", "    train_x = pd.concat([get_feature(str(i), data_label) for i in range(11)])\n", "\n", "    train_y = train_x['label']\n", "    test_user = test_x['user_id']\n", "\n", "    del train_x['user_id']\n", "    del test_x['user_id']\n", "    del train_x['label']\n", "\n", 
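"    # Note (added): with the 'rank:pairwise' objective chosen below, bst.predict()\n", "    # returns uncalibrated ranking scores rather than probabilities (the commented-out\n", "    # 'binary:logistic' would give probabilities). The submission only needs an\n", "    # ordering, but if [0, 1] scores were wanted, a min-max rescale (illustrative) works:\n", "    def _minmax(scores):\n", "        lo, hi = float(np.min(scores)), float(np.max(scores))\n", "        return (scores - lo) / (hi - lo) if hi > lo else np.zeros_like(scores)\n", "\n", 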
"    # XGBoost training\n", "    dtrain = xgb.DMatrix(train_x, label=train_y)\n", "    dtest = xgb.DMatrix(test_x)\n", "    params = {\n", "        # 'objective': 'binary:logistic',\n", "        'objective': 'rank:pairwise',\n", "        'eta': 0.03,\n", "        'max_depth': 5,\n", "        'colsample_bytree': 0.8,\n", "        'subsample': 0.8,\n", "        'min_child_weight': 16,\n", "        'silent': 1,\n", "    }\n", "    bst = xgb.train(params, dtrain, 1500, evals=[(dtrain, 'train')])\n", "    pre_label = bst.predict(dtest)\n", "\n", "    # write the result file\n", "    pd.DataFrame(data={0: test_user, 1: pre_label}).to_csv('/home/kesci/xjy_model.txt', index=None, header=None)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6F0B9DE3160C4FDB8015EE77B669F2A1" }, "outputs": [], "source": [ "# Model 2: 余薇's model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2B195A6ED6EE4A7281DFCE1A53B04CA0" }, "outputs": [], "source": [ "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "Created on Mon May 28 12:20:47 2018\n", "\n", "@author: yuwei\n", "\"\"\"\n", "\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import xgboost as xgb\n", "import gc\n", "\n", "\n", "#%%\n", "def loadData():\n", "    \"Load the data sets\"\n", "    # fully covers the register table\n", "    app = pd.read_table(r'/mnt/datasets/fusai/app_launch_log.txt',names=['user_id','day'],encoding='utf-8',sep='\\t',)\n", "    # covers 43708 of the registered users\n", "    user_act = pd.read_table(r'/mnt/datasets/fusai/user_activity_log.txt',names=['user_id','day','page','video_id','author_id','action_type'],encoding='utf-8',sep='\\t')\n", "    # register table: 51709 users in total\n", "    user_reg = pd.read_table(r'/mnt/datasets/fusai/user_register_log.txt',names=['user_id','register_day','register_type','device_type'],encoding='utf-8',sep='\\t')\n", "    # covers only 7606 users\n", "    vedio = pd.read_table(r'/mnt/datasets/fusai/video_create_log.txt',names=['user_id','day'],encoding='utf-8',sep='\\t')\n", "    return app,user_act,user_reg,vedio\n", "#%%\n", "def usersBetween(app,user_act,user_reg,vedio,lo,hi):\n", "    \"All user_ids appearing in any of the four logs between day lo and day hi.\"\n", "    tr1 = app[(app.day>=lo)&(app.day<=hi)][['user_id']].drop_duplicates()\n", "    tr2 = user_act[(user_act.day>=lo)&(user_act.day<=hi)][['user_id']].drop_duplicates()\n", "    tr3 = user_reg[(user_reg.register_day>=lo)&(user_reg.register_day<=hi)][['user_id']].drop_duplicates()\n", "    tr4 = vedio[(vedio.day>=lo)&(vedio.day<=hi)][['user_id']].drop_duplicates()\n", "    tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)\n", "    return tr[['user_id']].drop_duplicates()\n", "\n", "def makeTrainSet(app,user_act,user_reg,vedio,end_day):\n", "    \"Candidates seen on days 1..end_day, labeled 1 if seen again in the next 7 days.\"\n", "    train = usersBetween(app,user_act,user_reg,vedio,1,end_day)\n", "    # left-join to attach labels\n", "    train_true = usersBetween(app,user_act,user_reg,vedio,end_day+1,end_day+7)\n", "    train_true['label'] = 1\n", "    train = pd.merge(train,train_true,on='user_id',how='left')\n", "    train = train.fillna(0)\n", "    del train_true;gc.collect();\n", "    return train\n", "\n", "def makeLabel(app,user_act,user_reg,vedio):\n", "    \"Build labels\"\n", "\n", "    \"Test set\"\n", "    # the whole register table is used as the test set; an earlier (unused) variant\n", "    # kept only users seen on days 10-30, whose next seven days would be 31-37\n", "    test = user_reg[['user_id']].drop_duplicates()\n", "\n", "    # Training set 1 (also used as the validation split) takes days 1-23 with labels\n", "    # from days 24-30; every following set slides the window back one day, down to\n", "    # training set 14 (days 1-10, labels from days 11-17).\n", "    trains = [makeTrainSet(app,user_act,user_reg,vedio,end_day) for end_day in range(23,9,-1)]\n", "    return (test, *trains)\n", "\n", 
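"# Window layout produced by makeLabel above (feature days 1..E, labels E+1..E+7):\n", "# train1: E=23 -> labels 24-30 (validation) ... train5: E=19 -> labels 20-26 ...\n", "# train14: E=10 -> labels 11-17. A single window can be rebuilt directly, e.g.\n", "# (illustrative call):\n", "#   train5 = makeTrainSet(app, user_act, user_reg, vedio, end_day=19)\n", "\n", 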
"#%%\n", "def continuousRunScores(s,day_min):\n", "    \"Scores of each consecutive-day streak; shared by the three helpers below.\"\n", "    f_start = day_min - 21\n", "    launch_day = [int(x) for x in list(set(s.split(':')))]\n", "    launch_day.sort()\n", "\n", "    continuous_day_count = []\n", "    count = 0\n", "    for i in range(len(launch_day) - 1):\n", "        if ((int(launch_day[i + 1]) - int(launch_day[i]) == 1)):\n", "            if (i == len(launch_day) - 2):\n", "                count += (int(launch_day[i]) - f_start + 1)\n", "                continuous_day_count.append(\n", "                    count + (int(launch_day[i + 1]) - f_start + 1))\n", "            else:\n", "                count += (int(launch_day[i]) - f_start + 1)\n", "        else:\n", "            if (count != 0):\n", "                continuous_day_count.append(\n", "                    count + (int(launch_day[i + 1]) - f_start + 1))\n", "            count = 0\n", "\n", "    if (len(continuous_day_count) == 0):\n", "        continuous_day_count.append(0)\n", "    return np.array(continuous_day_count)\n", "\n", "#%%\n", "def culContinuousMeanLaunchDay(s,day_min):\n", "    return np.mean(continuousRunScores(s,day_min))\n", "\n", "#%%\n", "def culContinuousMaxLaunchDay(s,day_min):\n", "    return np.max(continuousRunScores(s,day_min))\n", "\n", "#%%\n", "def culContinuousMinLaunchDay(s,day_min):\n", "    return np.min(continuousRunScores(s,day_min))\n", "\n", 
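"# Worked example (added) of the streak scoring above: with day_min = 24, f_start = 3,\n", "# launch days '5:6:7' sit at in-window positions 3, 4 and 5, and the single run\n", "# scores 3 + 4 + 5 = 12, so mean = max = min = 12:\n", "#   culContinuousMeanLaunchDay('5:6:7', 24)  # -> 12.0\n", "\n", 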
"#%%\n", "\n", "def genFeature(day_min,day_max,data,app,user_act,user_reg,vedio):\n", "    \"Feature extraction\"\n", "    # keep the original id table\n", "    ans = data.copy()\n", "\n", "    # mean/max streak scores of consecutive active days\n", "    app = app[(app.day>=day_min-21)&(app.day<=day_max-7)]\n", "    vedio = vedio[(vedio.day>=day_min-21)&(vedio.day<=day_max-7)]\n", "    user_act = user_act[(user_act.day>=day_min-21)&(user_act.day<=day_max-7)]\n", "\n", "    a = app.copy()\n", "    a['s'] = a.day\n", "    a['s'] = a.s.astype('str')\n", "    a = a.groupby(['user_id'])['s'].agg(lambda x:':'.join(x)).reset_index()\n", "    print(1)\n", "    a['s_mean_day'] = a.s.apply(lambda x:culContinuousMeanLaunchDay(x,day_min))\n", "    print(2)\n", "    a['s_max_day'] = a.s.apply(lambda x:culContinuousMaxLaunchDay(x,day_min))\n", "    print(3)\n", "    a = a[['user_id','s_mean_day','s_max_day']]\n", "    ans = pd.merge(ans,a,on='user_id',how='left')\n", "    del a;gc.collect();\n", "\n", "    a = vedio.copy()\n", "    a['s'] = a.day\n", "    a['s'] = a.s.astype('str')\n", "    a = a.groupby(['user_id'])['s'].agg(lambda x:':'.join(x)).reset_index()\n", "    print(1)\n", "    a['s_mean_day_1'] = a.s.apply(lambda x:culContinuousMeanLaunchDay(x,day_min))\n", "    print(2)\n", "    a['s_max_day_1'] = a.s.apply(lambda x:culContinuousMaxLaunchDay(x,day_min))\n", "    print(3)\n", "    a = a[['user_id','s_mean_day_1','s_max_day_1']]\n", "    ans = pd.merge(ans,a,on='user_id',how='left')\n", "    del a;gc.collect();\n", "\n", "    a = user_act.copy()\n", "    a['s'] = a.day\n", "    a['s'] = a.s.astype('str')\n", "    a = a.groupby(['user_id'])['s'].agg(lambda x:':'.join(x)).reset_index()\n", "    print(1)\n", "    a['s_mean_day_2'] = a.s.apply(lambda x:culContinuousMeanLaunchDay(x,day_min))\n", "    print(2)\n", "    a['s_max_day_2'] = a.s.apply(lambda x:culContinuousMaxLaunchDay(x,day_min))\n", "    print(3)\n", "    a = a[['user_id','s_mean_day_2','s_max_day_2']]\n", "    ans = pd.merge(ans,a,on='user_id',how='left')\n", "    del a;gc.collect();\n", "\n", "#%%\n", "    \"Register-table features\"\n", "    ans = pd.merge(ans,user_reg,on='user_id',how='left')\n", "    # distance from the register day to the window's min/max day\n", "    ans['register_sub_min'] = day_min - ans['register_day']\n", "    ans['register_sub_max'] = day_max - ans['register_day']\n", "    # one-hot encode the register type\n", "    register_type_df = pd.get_dummies(ans['register_type'],prefix = 'register_type')\n", "    ans = pd.concat([ans,register_type_df],axis=1)\n", "    del ans['register_type']\n", "    del register_type_df;gc.collect();\n", "#%%\n", "    \"Add the account-age time span\"\n", "    ans['day_median_sub'] = day_min - ans['register_day']\n", "\n", 
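"    # Caveat (added): pd.get_dummies above is applied per data set, so a\n", "    # register_type value seen in training but not in the test window would\n", "    # produce mismatched columns. One common fix (illustrative, not wired in)\n", "    # is to align the finished frames afterwards:\n", "    #   train_x, test_x = train_x.align(test_x, join='left', axis=1, fill_value=0)\n", "\n", 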
ans['appmax_sub_day_min'] = day_min - ans['app_day_max']\n", " ans['appmin_sub_day_max'] = day_max - ans['app_day_min']\n", " ans['appmax_sub_day_max'] = day_max - ans['app_day_max']\n", " del ans['app_day_min'];del ans['app_day_max']\n", "\n", " \"划分粒度\"\n", " #统计用户在前14天启用app多少次\n", " app = app[(app.day>=day_min-14)&(app.day<=day_max-7)]\n", " app['app_count_14'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_14',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['app_count_14'] = ans['app_count_14']/ans['day_median_sub']\n", " #统计用户在前10天启用app多少次\n", " app = app[(app.day>=day_min-10)&(app.day<=day_max-7)]\n", " app['app_count_10'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_10',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['app_count_10'] = ans['app_count_10']/ans['day_median_sub']\n", " #统计用户在前7天启用app多少次\n", " app = app[(app.day>=day_min-7)&(app.day<=day_max-7)]\n", " app['app_count_7'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_7',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['app_count_7'] = ans['app_count_7']/ans['day_median_sub']\n", " #统计用户在前5天启用app多少次\n", " app = app[(app.day>=day_min-5)&(app.day<=day_max-7)]\n", " app['app_count_5'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_5',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前4天启用app多少次\n", " app = app[(app.day>=day_min-4)&(app.day<=day_max-7)]\n", " app['app_count_4'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_4',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前3天启用app多少次\n", " app = app[(app.day>=day_min-3)&(app.day<=day_max-7)]\n", " app['app_count_3'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_3',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前2天启用app多少次\n", " app = app[(app.day>=day_min-2)&(app.day<=day_max-7)]\n", " app['app_count_2'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_2',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前1天启用app多少次\n", " app = app[(app.day>=day_min-1)&(app.day<=day_max-7)]\n", " app['app_count_1'] = app['user_id']\n", " feat = pd.pivot_table(app,index=['user_id'],values='app_count_1',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " \n", "#%%\n", " \"提取 vedio 表特征\"\n", " vedio = vedio[(vedio.day>=day_min-21)&(vedio.day<=day_max-7)]\n", " #统计用户在前21天拍摄视频多少次\n", " vedio['vedio_count'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['vedio_count'] = ans['vedio_count']/ans['day_median_sub']\n", " #统计用户最近一次拍摄视频距离最小/最大日期距离\n", " #最小使用vedio时间\n", " vedio['vedio_day_min'] = vedio['day']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_day_min',aggfunc='min').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #最大使用vedio时间\n", " vedio['vedio_day_max'] = vedio['day']\n", " feat = 
pd.pivot_table(vedio,index=['user_id'],values='vedio_day_max',aggfunc='max').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['vediomin_sub_day_min'] = day_min - ans['vedio_day_min']\n", " ans['vediomax_sub_day_min'] = day_min - ans['vedio_day_max']\n", " ans['vediomin_sub_day_max'] = day_max - ans['vedio_day_min']\n", " ans['vediomax_sub_day_max'] = day_max - ans['vedio_day_max']\n", " del ans['vedio_day_min'];del ans['vedio_day_max']\n", " \n", " \"划分粒度\"\n", " #统计用户在前14天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-14)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_14'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_14',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['vedio_count_14'] = ans['vedio_count_14']/ans['day_median_sub']\n", " #统计用户在前10天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-10)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_10'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_10',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['vedio_count_10'] = ans['vedio_count_10']/ans['day_median_sub']\n", " #统计用户在前7天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-7)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_7'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_7',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['vedio_count_7'] = ans['vedio_count_7']/ans['day_median_sub']\n", " #统计用户在前5天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-5)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_5'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_5',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前4天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-4)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_4'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_4',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前3天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-3)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_3'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_3',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前2天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-2)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_2'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_2',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " #统计用户在前1天拍摄视频多少次\n", " vedio = vedio[(vedio.day>=day_min-1)&(vedio.day<=day_max-7)]\n", " vedio['vedio_count_1'] = vedio['user_id']\n", " feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_1',aggfunc='count').reset_index() \n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " \n", " \n", "#%%\n", " \"提取 activity 表特征\"\n", " user_act = user_act[(user_act.day>=day_min-21)&(user_act.day<=day_max-7)]\n", " #统计前21天共活跃多少次\n", " user_act['user_act_count'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_count'] = 
ans['user_act_count']/ans['day_median_sub']\n", "\n", "#%%\n", " \"划分粒度\"\n", "#%% 前18天\n", " '''\n", " user_act = user_act[(user_act.day>=day_min-18)&(user_act.day<=day_max-7)]\n", " #统计前18天共活跃多少次\n", " user_act['user_act_count_18'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_18',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_count_18'] = ans['user_act_count_18']/ans['day_median_sub']\n", " '''\n", " \n", "#%% 前14天\n", " user_act = user_act[(user_act.day>=day_min-14)&(user_act.day<=day_max-7)]\n", " #统计前14天共活跃多少次\n", " user_act['user_act_count_14'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_14',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_count_14'] = ans['user_act_count_14']/ans['day_median_sub']\n", " \n", " \"对page类别统计\"\n", " #统计page为0的次数\n", " page_0_14 = user_act[user_act.page==0]\n", " page_0_14['page_0_14_count'] = page_0_14['user_id']\n", " feat = pd.pivot_table(page_0_14,index=['user_id'],values='page_0_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_0_14_count'] = ans['page_0_14_count']/ans['day_median_sub']\n", " del page_0_14;gc.collect();\n", " #统计page为1的次数\n", " page_1_14 = user_act[user_act.page==1]\n", " page_1_14['page_1_14_count'] = page_1_14['user_id']\n", " feat = pd.pivot_table(page_1_14,index=['user_id'],values='page_1_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_1_14_count'] = ans['page_1_14_count']/ans['day_median_sub']\n", " del page_1_14;gc.collect();\n", " #统计page为2的次数\n", " page_2_14 = user_act[user_act.page==2]\n", " page_2_14['page_2_14_count'] = page_2_14['user_id']\n", " feat = pd.pivot_table(page_2_14,index=['user_id'],values='page_2_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_2_14_count'] = ans['page_2_14_count']/ans['day_median_sub']\n", " del page_2_14;gc.collect();\n", " #统计page为3的次数\n", " page_3_14 = user_act[user_act.page==3]\n", " page_3_14['page_3_14_count'] = page_3_14['user_id']\n", " feat = pd.pivot_table(page_3_14,index=['user_id'],values='page_3_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_3_14_count'] = ans['page_3_14_count']/ans['day_median_sub']\n", " del page_3_14;gc.collect();\n", " #统计page为4的次数\n", " page_4_14 = user_act[user_act.page==4]\n", " page_4_14['page_4_14_count'] = page_4_14['user_id']\n", " feat = pd.pivot_table(page_4_14,index=['user_id'],values='page_4_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_4_14_count'] = ans['page_4_14_count']/ans['day_median_sub']\n", " del page_4_14;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_14 = user_act[user_act.action_type==0]\n", " user_act_0_14['user_act_0_14_count'] = user_act_0_14['user_id']\n", " feat = pd.pivot_table(user_act_0_14,index=['user_id'],values='user_act_0_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_0_14_count'] = ans['user_act_0_14_count']/ans['day_median_sub']\n", " del user_act_0_14;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_14 = user_act[user_act.action_type==1]\n", " 
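# ---- aside (sketch): every page/action_type block in this cell\n",
"    # repeats the same filter -> count -> merge (-> normalize) steps.\n",
"    # A generic helper capturing the pattern (names are hypothetical,\n",
"    # illustrative only; the unrolled code below is what actually runs):\n",
"    def _count_feature(src, ans_df, mask, name, norm_col=None):\n",
"        t = src[mask][['user_id']].copy()\n",
"        t[name] = 1\n",
"        t = t.groupby('user_id', as_index=False)[name].sum()\n",
"        ans_df = pd.merge(ans_df, t, on='user_id', how='left')\n",
"        if norm_col is not None:\n",
"            ans_df[name] = ans_df[name] / ans_df[norm_col]\n",
"        return ans_df\n",
"    # e.g. _count_feature(user_act, ans, user_act.action_type==1,\n",
"    #                     'user_act_1_14_count', 'day_median_sub')\n",
"    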
user_act_1_14['user_act_1_14_count'] = user_act_1_14['user_id']\n", " feat = pd.pivot_table(user_act_1_14,index=['user_id'],values='user_act_1_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_1_14_count'] = ans['user_act_1_14_count']/ans['day_median_sub']\n", " del user_act_1_14;gc.collect(); \n", " #统计action_type为2的次数\n", " user_act_2_14 = user_act[user_act.action_type==2]\n", " user_act_2_14['user_act_2_14_count'] = user_act_2_14['user_id']\n", " feat = pd.pivot_table(user_act_2_14,index=['user_id'],values='user_act_2_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_2_14_count'] = ans['user_act_2_14_count']/ans['day_median_sub']\n", " del user_act_2_14;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_14 = user_act[user_act.action_type==3]\n", " user_act_3_14['user_act_3_14_count'] = user_act_3_14['user_id']\n", " feat = pd.pivot_table(user_act_3_14,index=['user_id'],values='user_act_3_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_3_14_count'] = ans['user_act_3_14_count']/ans['day_median_sub']\n", " del user_act_3_14;gc.collect(); \n", " #统计action_type为4的次数\n", " user_act_4_14 = user_act[user_act.action_type==4]\n", " user_act_4_14['user_act_4_14_count'] = user_act_4_14['user_id']\n", " feat = pd.pivot_table(user_act_4_14,index=['user_id'],values='user_act_4_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_4_14_count'] = ans['user_act_4_14_count']/ans['day_median_sub']\n", " del user_act_4_14;gc.collect(); \n", " #统计action_type为5的次数\n", " user_act_5_14 = user_act[user_act.action_type==5]\n", " user_act_5_14['user_act_5_14_count'] = user_act_5_14['user_id']\n", " feat = pd.pivot_table(user_act_5_14,index=['user_id'],values='user_act_5_14_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_5_14_count'] = ans['user_act_5_14_count']/ans['day_median_sub']\n", " del user_act_5_14;gc.collect();\n", "\n", "#%% 前10天\n", " user_act = user_act[(user_act.day>=day_min-10)&(user_act.day<=day_max-7)]\n", " #统计前10天共活跃多少次\n", " user_act['user_act_count_10'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_10',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_count_10'] = ans['user_act_count_10']/ans['day_median_sub']\n", " \n", " \"对page类别统计\"\n", " #统计page为0的次数\n", " page_0_10 = user_act[user_act.page==0]\n", " page_0_10['page_0_10_count'] = page_0_10['user_id']\n", " feat = pd.pivot_table(page_0_10,index=['user_id'],values='page_0_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_0_10_count'] = ans['page_0_10_count']/ans['day_median_sub']\n", " del page_0_10;gc.collect();\n", " #统计page为1的次数\n", " page_1_10 = user_act[user_act.page==1]\n", " page_1_10['page_1_10_count'] = page_1_10['user_id']\n", " feat = pd.pivot_table(page_1_10,index=['user_id'],values='page_1_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_1_10_count'] = ans['page_1_10_count']/ans['day_median_sub']\n", " del page_1_10;gc.collect();\n", " #统计page为2的次数\n", " page_2_10 = user_act[user_act.page==2]\n", " page_2_10['page_2_10_count'] = page_2_10['user_id']\n", " feat = 
pd.pivot_table(page_2_10,index=['user_id'],values='page_2_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_2_10_count'] = ans['page_2_10_count']/ans['day_median_sub']\n", " del page_2_10;gc.collect();\n", " #统计page为3的次数\n", " page_3_10 = user_act[user_act.page==3]\n", " page_3_10['page_3_10_count'] = page_3_10['user_id']\n", " feat = pd.pivot_table(page_3_10,index=['user_id'],values='page_3_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_3_10_count'] = ans['page_3_10_count']/ans['day_median_sub']\n", " del page_3_10;gc.collect();\n", " #统计page为4的次数\n", " page_4_10 = user_act[user_act.page==4]\n", " page_4_10['page_4_10_count'] = page_4_10['user_id']\n", " feat = pd.pivot_table(page_4_10,index=['user_id'],values='page_4_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_4_10_count'] = ans['page_4_10_count']/ans['day_median_sub']\n", " del page_4_10;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_10 = user_act[user_act.action_type==0]\n", " user_act_0_10['user_act_0_10_count'] = user_act_0_10['user_id']\n", " feat = pd.pivot_table(user_act_0_10,index=['user_id'],values='user_act_0_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_0_10_count'] = ans['user_act_0_10_count']/ans['day_median_sub']\n", " del user_act_0_10;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_10 = user_act[user_act.action_type==1]\n", " user_act_1_10['user_act_1_10_count'] = user_act_1_10['user_id']\n", " feat = pd.pivot_table(user_act_1_10,index=['user_id'],values='user_act_1_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_1_10_count'] = ans['user_act_1_10_count']/ans['day_median_sub']\n", " del user_act_1_10 ;gc.collect();\n", " #统计action_type为2的次数\n", " user_act_2_10 = user_act[user_act.action_type==2]\n", " user_act_2_10['user_act_2_10_count'] = user_act_2_10['user_id']\n", " feat = pd.pivot_table(user_act_2_10,index=['user_id'],values='user_act_2_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_2_10_count'] = ans['user_act_2_10_count']/ans['day_median_sub']\n", " del user_act_2_10 ;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_10 = user_act[user_act.action_type==3]\n", " user_act_3_10['user_act_3_10_count'] = user_act_3_10['user_id']\n", " feat = pd.pivot_table(user_act_3_10,index=['user_id'],values='user_act_3_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_3_10_count'] = ans['user_act_3_10_count']/ans['day_median_sub']\n", " del user_act_3_10 ;gc.collect(); \n", " #统计action_type为4的次数\n", " user_act_4_10 = user_act[user_act.action_type==4]\n", " user_act_4_10['user_act_4_10_count'] = user_act_4_10['user_id']\n", " feat = pd.pivot_table(user_act_4_10,index=['user_id'],values='user_act_4_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_4_10_count'] = ans['user_act_4_10_count']/ans['day_median_sub']\n", " del user_act_4_10 ;gc.collect();\n", " #统计action_type为5的次数\n", " user_act_5_10 = user_act[user_act.action_type==5]\n", " user_act_5_10['user_act_5_10_count'] = user_act_5_10['user_id']\n", " feat = 
pd.pivot_table(user_act_5_10,index=['user_id'],values='user_act_5_10_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_5_10_count'] = ans['user_act_5_10_count']/ans['day_median_sub']\n", " del user_act_5_10;gc.collect();\n", "#%% 前7天\n", " user_act = user_act[(user_act.day>=day_min-7)&(user_act.day<=day_max-7)]\n", " #统计前7天共活跃多少次\n", " user_act['user_act_count_7'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_7',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_count_7'] = ans['user_act_count_7']/ans['day_median_sub']\n", " \n", " \"对page每种进行统计\"\n", " #统计page为0的次数\n", " page_0_7 = user_act[user_act.page==0]\n", " page_0_7['page_0_7_count'] = page_0_7['user_id']\n", " feat = pd.pivot_table(page_0_7,index=['user_id'],values='page_0_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_0_7_count'] = ans['page_0_7_count']/ans['day_median_sub']\n", " del page_0_7;gc.collect();\n", " #统计page为1的次数\n", " page_1_7 = user_act[user_act.page==1]\n", " page_1_7['page_1_7_count'] = page_1_7['user_id']\n", " feat = pd.pivot_table(page_1_7,index=['user_id'],values='page_1_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_1_7_count'] = ans['page_1_7_count']/ans['day_median_sub']\n", " del page_1_7;gc.collect();\n", " #统计page为2的次数\n", " page_2_7 = user_act[user_act.page==2]\n", " page_2_7['page_2_7_count'] = page_2_7['user_id']\n", " feat = pd.pivot_table(page_2_7,index=['user_id'],values='page_2_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_2_7_count'] = ans['page_2_7_count']/ans['day_median_sub']\n", " del page_2_7;gc.collect();\n", " #统计page为3的次数\n", " page_3_7 = user_act[user_act.page==3]\n", " page_3_7['page_3_7_count'] = page_3_7['user_id']\n", " feat = pd.pivot_table(page_3_7,index=['user_id'],values='page_3_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_3_7_count'] = ans['page_3_7_count']/ans['day_median_sub']\n", " del page_3_7;gc.collect();\n", " #统计page为4的次数\n", " page_4_7 = user_act[user_act.page==4]\n", " page_4_7['page_4_7_count'] = page_4_7['user_id']\n", " feat = pd.pivot_table(page_4_7,index=['user_id'],values='page_4_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['page_4_7_count'] = ans['page_4_7_count']/ans['day_median_sub']\n", " del page_4_7;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_7 = user_act[user_act.action_type==0]\n", " user_act_0_7['user_act_0_7_count'] = user_act_0_7['user_id']\n", " feat = pd.pivot_table(user_act_0_7,index=['user_id'],values='user_act_0_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_0_7_count'] = ans['user_act_0_7_count']/ans['day_median_sub']\n", " del user_act_0_7;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_7 = user_act[user_act.action_type==1]\n", " user_act_1_7['user_act_1_7_count'] = user_act_1_7['user_id']\n", " feat = pd.pivot_table(user_act_1_7,index=['user_id'],values='user_act_1_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_1_7_count'] = 
ans['user_act_1_7_count']/ans['day_median_sub']\n", " del user_act_1_7;gc.collect(); \n", " #统计action_type为2的次数\n", " user_act_2_7 = user_act[user_act.action_type==2]\n", " user_act_2_7['user_act_2_7_count'] = user_act_2_7['user_id']\n", " feat = pd.pivot_table(user_act_2_7,index=['user_id'],values='user_act_2_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_2_7_count'] = ans['user_act_2_7_count']/ans['day_median_sub']\n", " del user_act_2_7;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_7 = user_act[user_act.action_type==3]\n", " user_act_3_7['user_act_3_7_count'] = user_act_3_7['user_id']\n", " feat = pd.pivot_table(user_act_3_7,index=['user_id'],values='user_act_3_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_3_7_count'] = ans['user_act_3_7_count']/ans['day_median_sub']\n", " del user_act_3_7;gc.collect(); \n", " #统计action_type为4的次数\n", " user_act_4_7 = user_act[user_act.action_type==4]\n", " user_act_4_7['user_act_4_7_count'] = user_act_4_7['user_id']\n", " feat = pd.pivot_table(user_act_4_7,index=['user_id'],values='user_act_4_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_4_7_count'] = ans['user_act_4_7_count']/ans['day_median_sub']\n", " del user_act_4_7;gc.collect(); \n", " #统计action_type为5的次数\n", " user_act_5_7 = user_act[user_act.action_type==5]\n", " user_act_5_7['user_act_5_7_count'] = user_act_5_7['user_id']\n", " feat = pd.pivot_table(user_act_5_7,index=['user_id'],values='user_act_5_7_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " ans['user_act_5_7_count'] = ans['user_act_5_7_count']/ans['day_median_sub']\n", " del user_act_5_7;gc.collect();\n", " \n", "#%% 前5天\n", " user_act = user_act[(user_act.day>=day_min-5)&(user_act.day<=day_max-7)]\n", " #统计前5天共活跃多少次\n", " user_act['user_act_count_5'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_5',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " \n", " \"对page每种进行统计\"\n", " #统计page为0的次数\n", " page_0_5 = user_act[user_act.page==0]\n", " page_0_5['page_0_5_count'] = page_0_5['user_id']\n", " feat = pd.pivot_table(page_0_5,index=['user_id'],values='page_0_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_0_5;gc.collect();\n", " #统计page为1的次数\n", " page_1_5 = user_act[user_act.page==1]\n", " page_1_5['page_1_5_count'] = page_1_5['user_id']\n", " feat = pd.pivot_table(page_1_5,index=['user_id'],values='page_1_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_1_5;gc.collect();\n", " #统计page为2的次数\n", " page_2_5 = user_act[user_act.page==2]\n", " page_2_5['page_2_5_count'] = page_2_5['user_id']\n", " feat = pd.pivot_table(page_2_5,index=['user_id'],values='page_2_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_2_5;gc.collect();\n", " #统计page为3的次数\n", " page_3_5 = user_act[user_act.page==3]\n", " page_3_5['page_3_5_count'] = page_3_5['user_id']\n", " feat = pd.pivot_table(page_3_5,index=['user_id'],values='page_3_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_3_5;gc.collect();\n", " #统计page为4的次数\n", " page_4_5 = user_act[user_act.page==4]\n", " 
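# ---- aside (note + sketch): from the 5-day window on, the merged\n",
"    # counts are left unnormalized (no division by day_median_sub),\n",
"    # unlike the 21/14/10/7-day windows above. The per-value blocks\n",
"    # could also be produced in one shot via one-hot encoding\n",
"    # (illustrative only, nothing below calls this):\n",
"    def _onehot_counts(frame, col, prefix):\n",
"        d = pd.get_dummies(frame[col], prefix=prefix)\n",
"        d['user_id'] = frame['user_id'].values\n",
"        return d.groupby('user_id', as_index=False).sum()\n",
"    # e.g. _onehot_counts(user_act, 'page', 'page_5') gives one count\n",
"    # column per page value for the current 5-day slice.\n",
"    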
page_4_5['page_4_5_count'] = page_4_5['user_id']\n", " feat = pd.pivot_table(page_4_5,index=['user_id'],values='page_4_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_4_5;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_5 = user_act[user_act.action_type==0]\n", " user_act_0_5['user_act_0_5_count'] = user_act_0_5['user_id']\n", " feat = pd.pivot_table(user_act_0_5,index=['user_id'],values='user_act_0_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_0_5;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_5 = user_act[user_act.action_type==1]\n", " user_act_1_5['user_act_1_5_count'] = user_act_1_5['user_id']\n", " feat = pd.pivot_table(user_act_1_5,index=['user_id'],values='user_act_1_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_1_5 ;gc.collect();\n", " #统计action_type为2的次数\n", " user_act_2_5 = user_act[user_act.action_type==2]\n", " user_act_2_5['user_act_2_5_count'] = user_act_2_5['user_id']\n", " feat = pd.pivot_table(user_act_2_5,index=['user_id'],values='user_act_2_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_2_5 ;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_5 = user_act[user_act.action_type==3]\n", " user_act_3_5['user_act_3_5_count'] = user_act_3_5['user_id']\n", " feat = pd.pivot_table(user_act_3_5,index=['user_id'],values='user_act_3_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_3_5 ;gc.collect();\n", " #统计action_type为4的次数\n", " user_act_4_5 = user_act[user_act.action_type==4]\n", " user_act_4_5['user_act_4_5_count'] = user_act_4_5['user_id']\n", " feat = pd.pivot_table(user_act_4_5,index=['user_id'],values='user_act_4_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_4_5 ;gc.collect();\n", " #统计action_type为5的次数\n", " user_act_5_5 = user_act[user_act.action_type==5]\n", " user_act_5_5['user_act_5_5_count'] = user_act_5_5['user_id']\n", " feat = pd.pivot_table(user_act_5_5,index=['user_id'],values='user_act_5_5_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_5_5;gc.collect();\n", "#%% 前4天\n", " user_act = user_act[(user_act.day>=day_min-4)&(user_act.day<=day_max-7)]\n", " #统计前4天共活跃多少次\n", " user_act['user_act_count_4'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_4',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " \n", " \"对page每种进行统计\"\n", " #统计page为0的次数\n", " page_0_4 = user_act[user_act.page==0]\n", " page_0_4['page_0_4_count'] = page_0_4['user_id']\n", " feat = pd.pivot_table(page_0_4,index=['user_id'],values='page_0_4_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_0_4;gc.collect();\n", " #统计page为1的次数\n", " page_1_4 = user_act[user_act.page==1]\n", " page_1_4['page_1_4_count'] = page_1_4['user_id']\n", " feat = pd.pivot_table(page_1_4,index=['user_id'],values='page_1_4_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_1_4;gc.collect();\n", " #统计page为2的次数\n", " page_2_4 = user_act[user_act.page==2]\n", " page_2_4['page_2_4_count'] = page_2_4['user_id']\n", " 
feat = pd.pivot_table(page_2_4,index=['user_id'],values='page_2_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del page_2_4;gc.collect();\n",
"    # count of page == 3\n",
"    page_3_4 = user_act[user_act.page==3]\n",
"    page_3_4['page_3_4_count'] = page_3_4['user_id']\n",
"    feat = pd.pivot_table(page_3_4,index=['user_id'],values='page_3_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del page_3_4;gc.collect();\n",
"    # count of page == 4\n",
"    page_4_4 = user_act[user_act.page==4]\n",
"    page_4_4['page_4_4_count'] = page_4_4['user_id']\n",
"    feat = pd.pivot_table(page_4_4,index=['user_id'],values='page_4_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del page_4_4;gc.collect();\n",
"    \n",
"    \"per-action_type counts\"\n",
"    # count of action_type == 0\n",
"    user_act_0_4 = user_act[user_act.action_type==0]\n",
"    user_act_0_4['user_act_0_4_count'] = user_act_0_4['user_id']\n",
"    feat = pd.pivot_table(user_act_0_4,index=['user_id'],values='user_act_0_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del user_act_0_4;gc.collect();\n",
"    # count of action_type == 1\n",
"    user_act_1_4 = user_act[user_act.action_type==1]\n",
"    user_act_1_4['user_act_1_4_count'] = user_act_1_4['user_id']\n",
"    feat = pd.pivot_table(user_act_1_4,index=['user_id'],values='user_act_1_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del user_act_1_4;gc.collect();\n",
"    # count of action_type == 2\n",
"    user_act_2_4 = user_act[user_act.action_type==2]\n",
"    user_act_2_4['user_act_2_4_count'] = user_act_2_4['user_id']\n",
"    feat = pd.pivot_table(user_act_2_4,index=['user_id'],values='user_act_2_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del user_act_2_4;gc.collect();\n",
"    # count of action_type == 3\n",
"    user_act_3_4 = user_act[user_act.action_type==3]\n",
"    user_act_3_4['user_act_3_4_count'] = user_act_3_4['user_id']\n",
"    feat = pd.pivot_table(user_act_3_4,index=['user_id'],values='user_act_3_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del user_act_3_4;gc.collect();\n",
"    # count of action_type == 4\n",
"    user_act_4_4 = user_act[user_act.action_type==4]\n",
"    user_act_4_4['user_act_4_4_count'] = user_act_4_4['user_id']\n",
"    feat = pd.pivot_table(user_act_4_4,index=['user_id'],values='user_act_4_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del user_act_4_4;gc.collect();\n",
"    # count of action_type == 5\n",
"    user_act_5_4 = user_act[user_act.action_type==5]\n",
"    user_act_5_4['user_act_5_4_count'] = user_act_5_4['user_id']\n",
"    feat = pd.pivot_table(user_act_5_4,index=['user_id'],values='user_act_5_4_count',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    del user_act_5_4;gc.collect();\n",
"#%% last 3 days\n",
"    user_act = user_act[(user_act.day>=day_min-3)&(user_act.day<=day_max-7)]\n",
"    # total activity count over the last 3 days\n",
"    user_act['user_act_count_3'] = user_act['user_id']\n",
"    feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_3',aggfunc='count').reset_index()\n",
"    ans = pd.merge(ans,feat,on='user_id',how='left')\n",
"    \n",
"    \"per-page counts\"\n",
"    # count of page == 0\n",
"    page_0_3 = user_act[user_act.page==0]\n",
"    page_0_3['page_0_3_count'] = page_0_3['user_id']\n",
"    feat = 
pd.pivot_table(page_0_3,index=['user_id'],values='page_0_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_0_3;gc.collect();\n", " #统计page为1的次数\n", " page_1_3 = user_act[user_act.page==1]\n", " page_1_3['page_1_3_count'] = page_1_3['user_id']\n", " feat = pd.pivot_table(page_1_3,index=['user_id'],values='page_1_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_1_3;gc.collect();\n", " #统计page为2的次数\n", " page_2_3 = user_act[user_act.page==2]\n", " page_2_3['page_2_3_count'] = page_2_3['user_id']\n", " feat = pd.pivot_table(page_2_3,index=['user_id'],values='page_2_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_2_3;gc.collect();\n", " #统计page为3的次数\n", " page_3_3 = user_act[user_act.page==3]\n", " page_3_3['page_3_3_count'] = page_3_3['user_id']\n", " feat = pd.pivot_table(page_3_3,index=['user_id'],values='page_3_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_3_3;gc.collect();\n", " #统计page为4的次数\n", " page_4_3 = user_act[user_act.page==4]\n", " page_4_3['page_4_3_count'] = page_4_3['user_id']\n", " feat = pd.pivot_table(page_4_3,index=['user_id'],values='page_4_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_4_3;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_3 = user_act[user_act.action_type==0]\n", " user_act_0_3['user_act_0_3_count'] = user_act_0_3['user_id']\n", " feat = pd.pivot_table(user_act_0_3,index=['user_id'],values='user_act_0_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_0_3;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_3 = user_act[user_act.action_type==1]\n", " user_act_1_3['user_act_1_3_count'] = user_act_1_3['user_id']\n", " feat = pd.pivot_table(user_act_1_3,index=['user_id'],values='user_act_1_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_1_3;gc.collect();\n", " #统计action_type为2的次数\n", " user_act_2_3 = user_act[user_act.action_type==2]\n", " user_act_2_3['user_act_2_3_count'] = user_act_2_3['user_id']\n", " feat = pd.pivot_table(user_act_2_3,index=['user_id'],values='user_act_2_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_2_3;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_3 = user_act[user_act.action_type==3]\n", " user_act_3_3['user_act_3_3_count'] = user_act_3_3['user_id']\n", " feat = pd.pivot_table(user_act_3_3,index=['user_id'],values='user_act_3_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_3_3;gc.collect(); \n", " #统计action_type为4的次数\n", " user_act_4_3 = user_act[user_act.action_type==4]\n", " user_act_4_3['user_act_4_3_count'] = user_act_4_3['user_id']\n", " feat = pd.pivot_table(user_act_4_3,index=['user_id'],values='user_act_4_3_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_4_3;gc.collect(); \n", " #统计action_type为5的次数\n", " user_act_5_3 = user_act[user_act.action_type==5]\n", " user_act_5_3['user_act_5_3_count'] = user_act_5_3['user_id']\n", " feat = pd.pivot_table(user_act_5_3,index=['user_id'],values='user_act_5_3_count',aggfunc='count').reset_index()\n", " ans = 
pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_5_3;gc.collect();\n", " \n", "#%% 前2天\n", " user_act = user_act[(user_act.day>=day_min-2)&(user_act.day<=day_max-7)]\n", " #统计前2天共活跃多少次\n", " user_act['user_act_count_2'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_2',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " \n", " \"对page每种进行统计\"\n", " #统计page为0的次数\n", " page_0_2 = user_act[user_act.page==0]\n", " page_0_2['page_0_2_count'] = page_0_2['user_id']\n", " feat = pd.pivot_table(page_0_2,index=['user_id'],values='page_0_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_0_2;gc.collect();\n", " #统计page为1的次数\n", " page_1_2 = user_act[user_act.page==1]\n", " page_1_2['page_1_2_count'] = page_1_2['user_id']\n", " feat = pd.pivot_table(page_1_2,index=['user_id'],values='page_1_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_1_2;gc.collect();\n", " #统计page为2的次数\n", " page_2_2 = user_act[user_act.page==2]\n", " page_2_2['page_2_2_count'] = page_2_2['user_id']\n", " feat = pd.pivot_table(page_2_2,index=['user_id'],values='page_2_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_2_2;gc.collect();\n", " #统计page为3的次数\n", " page_3_2 = user_act[user_act.page==3]\n", " page_3_2['page_3_2_count'] = page_3_2['user_id']\n", " feat = pd.pivot_table(page_3_2,index=['user_id'],values='page_3_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_3_2;gc.collect();\n", " #统计page为4的次数\n", " page_4_2 = user_act[user_act.page==4]\n", " page_4_2['page_4_2_count'] = page_4_2['user_id']\n", " feat = pd.pivot_table(page_4_2,index=['user_id'],values='page_4_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_4_2;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_2 = user_act[user_act.action_type==0]\n", " user_act_0_2['user_act_0_2_count'] = user_act_0_2['user_id']\n", " feat = pd.pivot_table(user_act_0_2,index=['user_id'],values='user_act_0_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_0_2;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_2 = user_act[user_act.action_type==1]\n", " user_act_1_2['user_act_1_2_count'] = user_act_1_2['user_id']\n", " feat = pd.pivot_table(user_act_1_2,index=['user_id'],values='user_act_1_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_1_2;gc.collect();\n", " #统计action_type为2的次数\n", " user_act_2_2 = user_act[user_act.action_type==2]\n", " user_act_2_2['user_act_2_2_count'] = user_act_2_2['user_id']\n", " feat = pd.pivot_table(user_act_2_2,index=['user_id'],values='user_act_2_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_2_2 ;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_2 = user_act[user_act.action_type==3]\n", " user_act_3_2['user_act_3_2_count'] = user_act_3_2['user_id']\n", " feat = pd.pivot_table(user_act_3_2,index=['user_id'],values='user_act_3_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_3_2 ;gc.collect(); \n", " #统计action_type为4的次数\n", " user_act_4_2 = 
user_act[user_act.action_type==4]\n", " user_act_4_2['user_act_4_2_count'] = user_act_4_2['user_id']\n", " feat = pd.pivot_table(user_act_4_2,index=['user_id'],values='user_act_4_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_4_2 ;gc.collect();\n", " #统计action_type为5的次数\n", " user_act_5_2 = user_act[user_act.action_type==5]\n", " user_act_5_2['user_act_5_2_count'] = user_act_5_2['user_id']\n", " feat = pd.pivot_table(user_act_5_2,index=['user_id'],values='user_act_5_2_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_5_2;gc.collect();\n", " \n", "#%% 前1天\n", " user_act = user_act[(user_act.day>=day_min-1)&(user_act.day<=day_max-7)]\n", " #统计前1天共活跃多少次\n", " user_act['user_act_count_1'] = user_act['user_id']\n", " feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_1',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " \n", " \"对page每种进行统计\"\n", " #统计page为0的次数\n", " page_0_1 = user_act[user_act.page==0]\n", " page_0_1['page_0_1_count'] = page_0_1['user_id']\n", " feat = pd.pivot_table(page_0_1,index=['user_id'],values='page_0_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_0_1;gc.collect();\n", " #统计page为1的次数\n", " page_1_1 = user_act[user_act.page==1]\n", " page_1_1['page_1_1_count'] = page_1_1['user_id']\n", " feat = pd.pivot_table(page_1_1,index=['user_id'],values='page_1_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_1_1;gc.collect();\n", " #统计page为2的次数\n", " page_2_1 = user_act[user_act.page==2]\n", " page_2_1['page_2_1_count'] = page_2_1['user_id']\n", " feat = pd.pivot_table(page_2_1,index=['user_id'],values='page_2_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_2_1;gc.collect();\n", " #统计page为3的次数\n", " page_3_1 = user_act[user_act.page==3]\n", " page_3_1['page_3_1_count'] = page_3_1['user_id']\n", " feat = pd.pivot_table(page_3_1,index=['user_id'],values='page_3_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_3_1;gc.collect();\n", " #统计page为4的次数\n", " page_4_1 = user_act[user_act.page==4]\n", " page_4_1['page_4_1_count'] = page_4_1['user_id']\n", " feat = pd.pivot_table(page_4_1,index=['user_id'],values='page_4_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del page_4_1;gc.collect();\n", " \n", " \"对action_type进行统计\"\n", " #统计action_type为0的次数\n", " user_act_0_1 = user_act[user_act.action_type==0]\n", " user_act_0_1['user_act_0_1_count'] = user_act_0_1['user_id']\n", " feat = pd.pivot_table(user_act_0_1,index=['user_id'],values='user_act_0_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_0_1;gc.collect();\n", " #统计action_type为1的次数\n", " user_act_1_1 = user_act[user_act.action_type==1]\n", " user_act_1_1['user_act_1_1_count'] = user_act_1_1['user_id']\n", " feat = pd.pivot_table(user_act_1_1,index=['user_id'],values='user_act_1_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_1_1;gc.collect();\n", " #统计action_type为2的次数\n", " user_act_2_1 = user_act[user_act.action_type==2]\n", " user_act_2_1['user_act_2_1_count'] = user_act_2_1['user_id']\n", " feat = 
pd.pivot_table(user_act_2_1,index=['user_id'],values='user_act_2_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_2_1;gc.collect(); \n", " #统计action_type为3的次数\n", " user_act_3_1 = user_act[user_act.action_type==3]\n", " user_act_3_1['user_act_3_1_count'] = user_act_3_1['user_id']\n", " feat = pd.pivot_table(user_act_3_1,index=['user_id'],values='user_act_3_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_3_1;gc.collect(); \n", " #统计action_type为4的次数\n", " user_act_4_1 = user_act[user_act.action_type==4]\n", " user_act_4_1['user_act_4_1_count'] = user_act_4_1['user_id']\n", " feat = pd.pivot_table(user_act_4_1,index=['user_id'],values='user_act_4_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_4_1;gc.collect(); \n", " #统计action_type为5的次数\n", " user_act_5_1 = user_act[user_act.action_type==5]\n", " user_act_5_1['user_act_5_1_count'] = user_act_5_1['user_id']\n", " feat = pd.pivot_table(user_act_5_1,index=['user_id'],values='user_act_5_1_count',aggfunc='count').reset_index()\n", " ans = pd.merge(ans,feat,on='user_id',how='left')\n", " del user_act_5_1;gc.collect();\n", " del feat;gc.collect();\n", " del day_min;gc.collect();\n", " del day_max;gc.collect();\n", " del data;gc.collect();\n", " del app;gc.collect();\n", " del user_act;gc.collect();\n", " del user_reg;gc.collect();\n", " del vedio;gc.collect();\n", " \n", " return ans\n", "\n", "#%%\n", "\n", "def modelXgb(train,test):\n", " \"xgb模型\"\n", " train_y = train['label'].values\n", " \n", " train_x = train.drop(['user_id','register_day','label'],axis=1).values\n", " test_x = test.drop(['user_id','register_day'],axis=1).values \n", " \n", " dtrain = xgb.DMatrix(train_x, label=train_y)\n", " dtest = xgb.DMatrix(test_x)\n", " \n", " del train_x;gc.collect();\n", " del test_x;gc.collect();\n", " \n", " # 模型参数\n", " params = {'booster': 'gbtree',\n", " 'objective':'binary:logistic',\n", " 'eval_metric':'auc',\n", " 'eta': 0.03,\n", " 'max_depth': 5, # 6\n", " 'colsample_bytree': 0.9,#0.8\n", " 'subsample': 0.9,\n", " 'scale_pos_weight': 1,\n", " 'min_child_weight': 18 # 2\n", " }\n", " # 训练\n", " watchlist = [(dtrain,'train')]\n", " bst = xgb.train(params, dtrain, num_boost_round=1500,evals=watchlist)\n", " # 预测\n", " predict = bst.predict(dtest)\n", " \n", " del dtrain;gc.collect();\n", " del dtest;gc.collect();\n", " \n", " test_xy = test[['user_id']]\n", " test_xy['predicted_score'] = predict\n", " test_xy = test_xy.sort_values('predicted_score', ascending=False)\n", " \n", " del predict;gc.collect();\n", " \n", " return test_xy\n", "\n", "#%%\n", "def main():\n", " \"训练模型\"\n", " print('下载数据...')\n", " app,user_act,user_reg,vedio = loadData()\n", " print('打标数据...')\n", " test,train1,train2,train3,train4,train5,train6,train7,train8,train9,train10,train11,train12,train13,train14 = makeLabel(app,user_act,user_reg,vedio)\n", " \n", " print('提取te特征...')\n", " te = genFeature(31,37,test,app,user_act,user_reg,vedio)\n", " del test;gc.collect();\n", " print('测试集提取完成...')\n", "\n", " #训练集提取特征\n", " print('提取tr1特征...')\n", " tr1 = genFeature(24,30,train1,app,user_act,user_reg,vedio)\n", " del train1;gc.collect();\n", " print('提取tr2特征...')\n", " tr2 = genFeature(23,29,train2,app,user_act,user_reg,vedio)\n", " del train2;gc.collect();\n", " print('提取tr3特征...')\n", " tr3 = genFeature(22,28,train3,app,user_act,user_reg,vedio)\n", " del train3;gc.collect();\n", 
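"    # ---- aside (sketch): tr1..tr14 are one 7-day window slid back a\n",
"    # day at a time; the fourteen (day_min, day_max) pairs could be\n",
"    # enumerated as below (illustrative only, not called -- the\n",
"    # unrolled calls are kept as written):\n",
"    def _train_windows(n=14, first=(24, 30)):\n",
"        # -> [(24, 30), (23, 29), ..., (11, 17)]\n",
"        return [(first[0] - i, first[1] - i) for i in range(n)]\n",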
" print('提取tr4特征...')\n", " tr4 = genFeature(21,27,train4,app,user_act,user_reg,vedio)\n", " del train4;gc.collect();\n", " print('提取tr5特征...')\n", " tr5 = genFeature(20,26,train5,app,user_act,user_reg,vedio)\n", " del train5;gc.collect(); \n", " print('提取tr6特征...')\n", " tr6 = genFeature(19,25,train6,app,user_act,user_reg,vedio)\n", " del train6;gc.collect(); \n", " print('提取tr7特征...')\n", " tr7 = genFeature(18,24,train7,app,user_act,user_reg,vedio)\n", " del train7;gc.collect(); \n", " print('提取tr8特征...')\n", " tr8 = genFeature(17,23,train8,app,user_act,user_reg,vedio)\n", " del train8;gc.collect(); \n", " print('提取tr9特征...')\n", " tr9 = genFeature(16,22,train9,app,user_act,user_reg,vedio)\n", " del train9;gc.collect(); \n", " print('提取tr10特征...')\n", " tr10 = genFeature(15,21,train10,app,user_act,user_reg,vedio)\n", " del train10;gc.collect(); \n", " print('提取tr11特征...')\n", " tr11 = genFeature(14,20,train11,app,user_act,user_reg,vedio)\n", " del train11;gc.collect(); \n", " print('提取tr12特征...')\n", " tr12 = genFeature(13,19,train12,app,user_act,user_reg,vedio)\n", " del train12;gc.collect();\n", " print('提取tr13特征...')\n", " tr13 = genFeature(12,18,train13,app,user_act,user_reg,vedio)\n", " del train13;gc.collect();\n", " print('提取tr14特征...')\n", " tr14 = genFeature(11,17,train14,app,user_act,user_reg,vedio)\n", " del train14;gc.collect();\n", " \n", " del app;gc.collect();\n", " del user_act;gc.collect();\n", " del user_reg;gc.collect();\n", " del vedio;gc.collect();\n", " \n", " #合并训练集\n", " tr = pd.concat([tr1,tr2,tr3,tr4,tr5,tr6,tr7,tr8,tr9,tr10,tr11,tr12,tr13,tr14],axis=0)\n", " del tr1;gc.collect();\n", " del tr2;gc.collect();\n", " del tr3;gc.collect();\n", " del tr4;gc.collect();\n", " del tr5;gc.collect();\n", " del tr6;gc.collect();\n", " del tr7;gc.collect();\n", " del tr8;gc.collect();\n", " del tr9;gc.collect();\n", " del tr10;gc.collect();\n", " del tr11;gc.collect();\n", " del tr12;gc.collect();\n", " del tr13;gc.collect();\n", " del tr14;gc.collect();\n", " \n", " print('开始训练模型...')\n", " #训练模型\n", " answer = modelXgb(tr,te)\n", " print('结束训练模型...')\n", " #导出结果\n", " answer.to_csv('/home/kesci/work/yw_model.txt',index=False, header=None)\n", " #提交文件my_submission.txt进行评审;温馨提示:本次比赛提交文件的格式为txt\n", " #!./kesci_submit -token 61c8c5af49a7abd0 -file ans_0723_2.txt \n", " \n", "#%%\n", "if __name__ == '__main__':\n", " \"主函数入口\"\n", "\n", " main()\n", "\n", "#%%\n", " \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0DB0B92AC4BC49A79FFCB4AAD1323449" }, "outputs": [], "source": [ "# 模型3--闵子剑模型" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7527B085E8664D25AB6B22E8BEFFD400" }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import xgboost as xgb\n", "\n", "def readData():\n", " print('reading...')\n", " print('app launch')\n", " user_launch = pd.read_csv(r'/mnt/datasets/fusai/app_launch_log.txt', sep='\\t', header=None)\n", " user_launch.columns = ['user_id', 'day']\n", "\n", " print('user register')\n", " user_register = pd.read_csv(r'/mnt/datasets/fusai/user_register_log.txt', sep='\\t', header=None)\n", " user_register.columns = ['user_id', 'register_day', 'register_type', 'device_type']\n", "\n", " print('video create')\n", " video_create = pd.read_csv(r'/mnt/datasets/fusai/video_create_log.txt', sep='\\t', header=None)\n", " video_create.columns = ['user_id', 'day']\n", "\n", " print('user activity')\n", " user_activity = pd.read_csv(r'/mnt/datasets/fusai/user_activity_log.txt', sep='\\t', 
header=None)\n",
"    user_activity.columns = ['user_id', 'day', 'page', 'video_id', 'author_id', 'action_type']\n",
"    print('reading finished!')\n",
"\n",
"    return user_launch, user_register, video_create, user_activity\n",
"\n",
"def labelRegister(user_launch, user_register, video_create, user_activity, f_start, f_end):\n",
"    print('labeling...')\n",
"    act_user = pd.concat([user_launch, video_create]).reset_index(drop = True)\n",
"    act_user = pd.concat([act_user, user_activity[['user_id', 'day']]]).reset_index(drop = True)\n",
"    act_user = act_user[(act_user.day >= f_start) & (act_user.day <= f_end)]\n",
"    act_user = act_user[['user_id']]\n",
"    act_user.drop_duplicates(inplace = True)\n",
"    act_user['label'] = 1\n",
"\n",
"    user_register = user_register[user_register.register_day <= (f_start - 1)]\n",
"    user_register = pd.merge(user_register, act_user, on = ['user_id'], how = 'left')\n",
"    user_register.fillna(0, inplace = True)\n",
"    print('labeling finished!')\n",
"    return user_register\n",
"\n",
"def getMaxMinNormalization(feature):\n",
"    # min-max scaling; local names avoid shadowing the max/min builtins\n",
"    f_max = np.max(feature)\n",
"    f_min = np.min(feature)\n",
"\n",
"    return (feature - f_min) / (f_max - f_min)\n",
"\n",
"def getLabel(user_launch, user_register, video_create, user_activity):\n",
"    print('label1...')\n",
"    register_have_label1 = labelRegister(user_launch, user_register, video_create, user_activity, 24, 30)\n",
"    print('label2...')\n",
"    register_have_label2 = labelRegister(user_launch, user_register, video_create, user_activity, 17, 23)\n",
"\n",
"    return register_have_label1, register_have_label2\n",
"\n",
"def getLastDayFeature(user_launch, user_register, video_create, user_activity, f_start, f_end):\n",
"    print('get last day feature...')\n",
"    t1 = user_launch[(user_launch.day >= f_start) &\n",
"                     (user_launch.day <= f_end)][['user_id']]\n",
"    t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = 1\n",
"    feat = pd.pivot_table(t1, index=['user_id'],\n",
"                          values='user_%d_before_launch_count' % (f_end - f_start + 1),\n",
"                          aggfunc='sum').reset_index()\n",
"\n",
"    t1_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')\n",
"    t1_1 = t1_1[['register_type', 'device_type', 'user_%d_before_launch_count' % (f_end - f_start + 1)]]\n",
"    feat1 = pd.pivot_table(t1_1, index=['register_type', 'device_type'],\n",
"                           values='user_%d_before_launch_count' % (f_end - f_start + 1),\n",
"                           aggfunc='sum').reset_index()\n",
"    feat1.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):\n",
"                          'device_register_type_%d_before_launch_count'\n",
"                          % (f_end - f_start + 1)}, inplace=True)\n",
"    feat1['device_register_type_%d_before_launch_count' %\n",
"          (f_end - f_start + 1)] = \\\n",
"        getMaxMinNormalization(feat1['device_register_type_%d_before_launch_count' %\n",
"                                     (f_end - f_start + 1)])\n",
"    feat2 = pd.pivot_table(t1_1, index=['device_type'],\n",
"                           values='user_%d_before_launch_count' % (f_end - f_start + 1),\n",
"                           aggfunc='sum').reset_index()\n",
"    feat2.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):\n",
"                          'device_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)\n",
"    feat2['device_type_%d_before_launch_count' %\n",
"          (f_end - f_start + 1)] = \\\n",
"        getMaxMinNormalization(feat2['device_type_%d_before_launch_count' %\n",
"                                     (f_end - f_start + 1)])\n",
"    feat3 = pd.pivot_table(t1_1, index=['register_type'],\n",
"                           values='user_%d_before_launch_count' % (f_end - f_start + 1),\n",
"                           aggfunc='sum').reset_index()\n",
"    
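# ---- aside (worked example): getMaxMinNormalization above is plain\n",
"    # min-max scaling, x' = (x - f_min) / (f_max - f_min); e.g. the\n",
"    # series [2, 4, 10] maps to [0.0, 0.25, 1.0]. A constant column\n",
"    # would divide by zero (not guarded against here).\n",
"    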
feat3.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):\n", " 'register_type_%d_before_launch_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " feat3['register_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat3['register_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)])\n", "\n", " t1 = pd.pivot_table(t1, index=['user_id'],\n", " values='user_%d_before_launch_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(t1['user_%d_before_launch_count' % (f_end - f_start + 1)])\n", " user_register = pd.merge(user_register, t1, on=['user_id'], how='left')\n", " user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')\n", " user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')\n", " user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t2 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id']]\n", " t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = 1\n", " feat = pd.pivot_table(t2, index=['user_id'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " t2_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')\n", " t2_1 = t2_1[['register_type', 'device_type', 'user_%d_before_video_create_count' % (f_end - f_start + 1)]]\n", " feat1 = pd.pivot_table(t2_1, index=['register_type', 'device_type'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat1.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):\n", " 'device_register_type_%d_before_video_create_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " feat1['device_register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat1['device_register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)])\n", " feat2 = pd.pivot_table(t2_1, index=['device_type'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat2.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):\n", " 'device_type_%d_before_video_create_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat2['device_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat2['device_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)])\n", "\n", " feat3 = pd.pivot_table(t2_1, index=['register_type'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat3.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):\n", " 'register_type_%d_before_video_create_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat3['register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat3['register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)])\n", " t2 = pd.pivot_table(t2, index=['user_id'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " t2['user_%d_before_video_create_count' % 
(f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(t2['user_%d_before_video_create_count'\n", " % (f_end - f_start + 1)])\n", " user_register = pd.merge(user_register, t2, on=['user_id'], how='left')\n", " user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')\n", " user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')\n", " user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for i in range(5):\n", " t3 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.page == i)][['user_id']]\n", " t3['user_%d_before_page_count_%d' % (f_end - f_start + 1, i)] = 1\n", " t3 = pd.pivot_table(t3, index=['user_id'],\n", " values='user_%d_before_page_count_%d' % (f_end - f_start + 1, i),\n", " aggfunc='sum').reset_index()\n", " t3['user_%d_before_page_count_%d' % (f_end - f_start + 1, i)] = \\\n", " getMaxMinNormalization(t3['user_%d_before_page_count_%d'\n", " % (f_end - f_start + 1, i)])\n", " user_register = pd.merge(user_register, t3, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for j in range(6):\n", " t4 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.action_type == j)][['user_id']]\n", " t4['user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j)] = 1\n", " t4 = pd.pivot_table(t4, index=['user_id'],\n", " values='user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j),\n", " aggfunc='sum').reset_index()\n", " t4['user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j)] = \\\n", " getMaxMinNormalization(t4['user_%d_before_action_type_count_%d'\n", " % (f_end - f_start + 1, j)])\n", " user_register = pd.merge(user_register, t4, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t5 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'video_id']]\n", " t5.drop_duplicates(inplace = True)\n", " t5['video_id'] = 1\n", " t5 = pd.pivot_table(t5, index=['user_id'],\n", " values='video_id',\n", " aggfunc='sum').reset_index()\n", " t5['video_id'] = getMaxMinNormalization(t5['video_id'])\n", " t5.rename(columns={'video_id': 'user_%d_before_watch_video_type_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " user_register = pd.merge(user_register, t5, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t6 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'author_id']]\n", " t6.drop_duplicates(inplace = True)\n", " t6['author_id'] = 1\n", " t6 = pd.pivot_table(t6, index=['user_id'],\n", " values='author_id',\n", " aggfunc='sum').reset_index()\n", " t6['author_id'] = getMaxMinNormalization(t6['author_id'])\n", " t6.rename(columns={'author_id': 'user_%d_before_watch_video_author_type_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " user_register = pd.merge(user_register, t6, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t7 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id']]\n", " t7['user_%d_before_activity_count' % (f_end - f_start + 1)] = 1\n", " feat = pd.pivot_table(t7, index=['user_id'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", "\n", " t7_1 = pd.merge(feat, 
user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')\n", " t7_1 = t7_1[['register_type', 'device_type', 'user_%d_before_activity_count' % (f_end - f_start + 1)]]\n", " feat1 = pd.pivot_table(t7_1, index=['register_type', 'device_type'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat2 = pd.pivot_table(t7_1, index=['device_type'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat3 = pd.pivot_table(t7_1, index=['register_type'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat1.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):\n", " 'device_register_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat2.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):\n", " 'device_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat3.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):\n", " 'register_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat1['device_register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat1['device_register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)])\n", " feat2['device_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat2['device_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)])\n", " feat3['register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat3['register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)])\n", "\n", " user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')\n", " user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')\n", " user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for i in range(6):\n", " t8 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.action_type == i)][['author_id']]\n", " t8['author_%d_before_action_type_%d_showed_count'\n", " % (f_end - f_start + 1, i)] = 1\n", " t8 = pd.pivot_table(t8, index=['author_id'],\n", " values='author_%d_before_action_type_%d_showed_count' % (f_end - f_start + 1, i),\n", " aggfunc='sum').reset_index()\n", " t8['author_%d_before_action_type_%d_showed_count'\n", " % (f_end - f_start + 1, i)] = \\\n", " getMaxMinNormalization(t8['author_%d_before_action_type_%d_showed_count'\n", " % (f_end - f_start + 1, i)])\n", " t8.rename(columns={'author_id': 'user_id'}, inplace=True)\n", " user_register = pd.merge(user_register, t8, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " print('get last day feature have finished!')\n", " return user_register\n", "\n", "def getAllTimeFeature(user_launch, user_register, video_create, user_activity, f_start, f_end):\n", " print('get all time feature...')\n", " t1 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = 1\n", " feat = pd.pivot_table(t1, index=['user_id'],\n", " values='user_%d_before_launch_count' % (f_end - f_start + 1),\n", " 
aggfunc='sum').reset_index()\n", "\n", " t1_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')\n", " t1_1 = t1_1[['register_type', 'device_type', 'user_%d_before_launch_count' % (f_end - f_start + 1)]]\n", " feat1 = pd.pivot_table(t1_1, index=['register_type', 'device_type'],\n", " values='user_%d_before_launch_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat1.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):\n", " 'device_register_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat1['device_register_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat1['device_register_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)])\n", " feat2 = pd.pivot_table(t1_1, index=['device_type'],\n", " values='user_%d_before_launch_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat2.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):\n", " 'device_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat2['device_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat2['device_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)])\n", "\n", " feat3 = pd.pivot_table(t1_1, index=['register_type'],\n", " values='user_%d_before_launch_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat3.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):\n", " 'register_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat3['register_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat3['register_type_%d_before_launch_count' %\n", " (f_end - f_start + 1)])\n", "\n", " t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = \\\n", " (t1['day'] - f_start) / (f_end - f_start)\n", " t1 = pd.pivot_table(t1, index=['user_id'],\n", " values='user_%d_before_launch_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(t1['user_%d_before_launch_count' % (f_end - f_start + 1)])\n", " user_register = pd.merge(user_register, t1, on=['user_id'], how='left')\n", " user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')\n", " user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')\n", " user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t2 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = 1\n", " feat = pd.pivot_table(t2, index=['user_id'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", "\n", " t2_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')\n", " t2_1 = t2_1[['register_type', 'device_type', 'user_%d_before_video_create_count' % (f_end - f_start + 1)]]\n", " feat1 = pd.pivot_table(t2_1, index=['register_type', 'device_type'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat1.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):\n", 
" 'device_register_type_%d_before_video_create_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " feat1['device_register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat1['device_register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)])\n", " feat2 = pd.pivot_table(t2_1, index=['device_type'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat2.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):\n", " 'device_type_%d_before_video_create_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat2['device_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat2['device_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)])\n", "\n", " feat3 = pd.pivot_table(t2_1, index=['register_type'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat3.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):\n", " 'register_type_%d_before_video_create_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat3['register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat3['register_type_%d_before_video_create_count' %\n", " (f_end - f_start + 1)])\n", "\n", " t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = \\\n", " (t2['day'] - f_start) / (f_end - f_start)\n", " t2 = pd.pivot_table(t2, index=['user_id'],\n", " values='user_%d_before_video_create_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(t2['user_%d_before_video_create_count'\n", " % (f_end - f_start + 1)])\n", " user_register = pd.merge(user_register, t2, on=['user_id'], how='left')\n", " user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')\n", " user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')\n", " user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for i in range(5):\n", " t3 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.page == i)][['user_id', 'day']]\n", " t3['user_%d_before_page_count_%d' % (f_end - f_start + 1, i)] \\\n", " = (t3['day'] - f_start) / (f_end - f_start)\n", " t3 = pd.pivot_table(t3, index=['user_id'],\n", " values='user_%d_before_page_count_%d' % (f_end - f_start + 1, i),\n", " aggfunc='sum').reset_index()\n", " t3['user_%d_before_page_count_%d' % (f_end - f_start + 1, i)] = \\\n", " getMaxMinNormalization(t3['user_%d_before_page_count_%d'\n", " % (f_end - f_start + 1, i)])\n", " user_register = pd.merge(user_register, t3, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for j in range(6):\n", " t4 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.action_type == j)][['user_id', 'day']]\n", " t4['user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j)] = \\\n", " (t4['day'] - f_start) / (f_end - f_start)\n", " t4 = pd.pivot_table(t4, index=['user_id'],\n", " values='user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j),\n", " aggfunc='sum').reset_index()\n", " 
t4['user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j)] = \\\n", " getMaxMinNormalization(t4['user_%d_before_action_type_count_%d'\n", " % (f_end - f_start + 1, j)])\n", " user_register = pd.merge(user_register, t4, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t5 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'video_id', 'day']]\n", " t5 = t5.groupby(['user_id', 'video_id']).agg('max').reset_index()\n", " t5['video_id'] = (t5['day'] - f_start) / (f_end - f_start)\n", " t5 = pd.pivot_table(t5, index=['user_id'],\n", " values='video_id',\n", " aggfunc='sum').reset_index()\n", " t5['video_id'] = getMaxMinNormalization(t5['video_id'])\n", " t5.rename(columns={'video_id': 'user_%d_before_watch_video_type_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " user_register = pd.merge(user_register, t5, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t6 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'author_id', 'day']]\n", " t6 = t6.groupby(['user_id', 'author_id']).agg('max').reset_index()\n", " t6['author_id'] = (t6['day'] - f_start) / (f_end - f_start)\n", " t6 = pd.pivot_table(t6, index=['user_id'],\n", " values='author_id',\n", " aggfunc='sum').reset_index()\n", " t6['author_id'] = getMaxMinNormalization(t6['author_id'])\n", " t6.rename(columns={'author_id': 'user_%d_before_watch_video_author_type_count'\n", " % (f_end - f_start + 1)}, inplace=True)\n", " user_register = pd.merge(user_register, t6, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t7 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t7['user_%d_before_activity_count' % (f_end - f_start + 1)] = 1\n", " feat = pd.pivot_table(t7, index=['user_id'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", "\n", " t7_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')\n", " t7_1 = t7_1[['register_type', 'device_type', 'user_%d_before_activity_count' % (f_end - f_start + 1)]]\n", " feat1 = pd.pivot_table(t7_1, index=['register_type', 'device_type'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat2 = pd.pivot_table(t7_1, index=['device_type'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat3 = pd.pivot_table(t7_1, index=['register_type'],\n", " values='user_%d_before_activity_count' % (f_end - f_start + 1),\n", " aggfunc='sum').reset_index()\n", " feat1.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):\n", " 'device_register_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat1['device_register_type_%d_before_activity_count' % (f_end - f_start + 1)] = \\\n", " feat1['device_register_type_%d_before_activity_count' % (f_end - f_start + 1)] / (f_end - f_start + 1)\n", " feat2.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):\n", " 'device_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat2['device_type_%d_before_activity_count' % (f_end - f_start + 1)] = \\\n", " feat2['device_type_%d_before_activity_count' % (f_end - f_start + 1)] / (f_end - f_start + 1)\n", " 
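# note: unlike the per-user counts, these group totals are first divided by\n",
" # the window length (f_end - f_start + 1), i.e. turned into per-day averages\n",
" # (e.g. 64 actions over a 16-day window -> 4.0 per day), and only then\n",
" # min-max rescaled.\n",
" 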
feat3.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):\n", " 'register_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)\n", " feat3['register_type_%d_before_activity_count' % (f_end - f_start + 1)] = \\\n", " feat3['register_type_%d_before_activity_count' % (f_end - f_start + 1)] / (f_end - f_start + 1)\n", "\n", " feat1['device_register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat1['device_register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)])\n", " feat2['device_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat2['device_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)])\n", " feat3['register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)] = \\\n", " getMaxMinNormalization(feat3['register_type_%d_before_activity_count'\n", " % (f_end - f_start + 1)])\n", "\n", " user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')\n", " user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')\n", " user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t8_1 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t8_1 = t8_1.groupby(['user_id']).agg('max').reset_index()\n", " t8_1.rename(columns={'day': 'max_day'}, inplace=True)\n", "\n", " t8_2 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t8_2.drop_duplicates(inplace=True)\n", " t8_2['day'] = 1\n", " t8_2 = t8_2.groupby(['user_id']).agg('sum').reset_index()\n", " t8_2.rename(columns={'day': 'user_%d_before_launch_day_distance'\n", " % (f_end - f_start + 1)}, inplace=True)\n", "\n", " t8 = pd.merge(t8_1, t8_2, on=['user_id'], how='left')\n", " t8['user_%d_before_launch_day_distance' % (f_end - f_start + 1)] = \\\n", " (t8['user_%d_before_launch_day_distance' % (f_end - f_start + 1)] /\n", " (f_end - f_start + 1)) * (t8['max_day'] - f_start + 1)\n", " t8 = t8[['user_id', 'user_%d_before_launch_day_distance' % (f_end - f_start + 1)]]\n", " user_register = pd.merge(user_register, t8, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t9_1 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t9_1 = t9_1.groupby(['user_id']).agg('max').reset_index()\n", " t9_1.rename(columns={'day': 'max_day'}, inplace=True)\n", "\n", " t9_2 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t9_2.drop_duplicates(inplace=True)\n", " t9_2['day'] = 1\n", " t9_2 = t9_2.groupby(['user_id']).agg('sum').reset_index()\n", " t9_2.rename(columns={'day': 'user_%d_before_video_create_day_distance'\n", " % (f_end - f_start + 1)}, inplace=True)\n", "\n", " t9 = pd.merge(t9_1, t9_2, on=['user_id'], how='left')\n", " t9['user_%d_before_video_create_day_distance' % (f_end - f_start + 1)] = \\\n", " (t9['user_%d_before_video_create_day_distance' % (f_end - f_start + 1)] /\n", " (f_end - f_start + 1)) * (t9['max_day'] - f_start + 1)\n", " t9 = t9[['user_id', 'user_%d_before_video_create_day_distance' % (f_end - f_start + 1)]]\n", " user_register = pd.merge(user_register, t9, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t10_1 = user_activity[(user_activity.day >= f_start) 
&\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t10_1 = t10_1.groupby(['user_id']).agg('max').reset_index()\n", " t10_1.rename(columns={'day': 'max_day'}, inplace=True)\n", "\n", " t10_2 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t10_2.drop_duplicates(inplace=True)\n", " t10_2['day'] = 1\n", " t10_2 = t10_2.groupby(['user_id']).agg('sum').reset_index()\n", " t10_2.rename(columns={'day': 'user_%d_before_activity_day_distance'\n", " % (f_end - f_start + 1)}, inplace=True)\n", "\n", " t10 = pd.merge(t10_1, t10_2, on=['user_id'], how='left')\n", " t10['user_%d_before_activity_day_distance' % (f_end - f_start + 1)] = \\\n", " (t10['user_%d_before_activity_day_distance' % (f_end - f_start + 1)] /\n", " (f_end - f_start + 1)) * (t10['max_day'] - f_start + 1)\n", " t10 = t10[['user_id', 'user_%d_before_activity_day_distance' % (f_end - f_start + 1)]]\n", " user_register = pd.merge(user_register, t10, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for i in range(4):\n", " t13 = user_activity[(user_activity.day >= f_start) & (user_activity.day <= f_end)\n", " & (user_activity.action_type == i)][['author_id', 'day']]\n", " t13['author_%d_before_action_type_%d_showed_count' % (f_end - f_start + 1, i)] = \\\n", " (t13['day'] - f_start) / (f_end - f_start)\n", " t13 = pd.pivot_table(t13, index=['author_id'],\n", " values='author_%d_before_action_type_%d_showed_count'\n", " % (f_end - f_start + 1, i),\n", " aggfunc='sum').reset_index()\n", " t13['author_%d_before_action_type_%d_showed_count'\n", " % (f_end - f_start + 1, i)] = getMaxMinNormalization(t13['author_%d_before_action_type_%d_showed_count'\n", " % (f_end - f_start + 1, i)])\n", " t13.rename(columns={'author_id': 'user_id'}, inplace=True)\n", " user_register = pd.merge(user_register, t13, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t15 = user_launch[(user_launch.day <= f_end) &\n", " (user_launch.day >= f_start)][['user_id', 'day']]\n", " t15 = t15.groupby(['user_id']).agg('max').reset_index()\n", " t15.columns = ['user_id', 'max_day']\n", " user_register = pd.merge(user_register, t15, on=['user_id'], how='left')\n", " user_register['user_launch_max_day_distance'] = f_end - user_register['max_day'] + 1\n", " user_register.fillna(999, inplace=True)\n", " user_register.drop(['max_day'], axis=1, inplace=True)\n", "\n", " t16 = video_create[(video_create.day <= f_end) &\n", " (video_create.day >= f_start)][['user_id', 'day']]\n", " t16 = t16.groupby(['user_id']).agg('max').reset_index()\n", " t16.columns = ['user_id', 'max_day']\n", " user_register = pd.merge(user_register, t16, on=['user_id'], how='left')\n", " user_register['user_video_create_max_day_distance'] = f_end - user_register['max_day'] + 1\n", " user_register.fillna(999, inplace=True)\n", " user_register.drop(['max_day'], axis=1, inplace=True)\n", "\n", " for i in range(4):\n", " t17 = user_activity[(user_activity.day <= f_end) &\n", " (user_activity.day >= f_start) &\n", " (user_activity.action_type == i)][['user_id', 'day']]\n", " t17 = t17.groupby(['user_id']).agg('max').reset_index()\n", " t17.columns = ['user_id', 'max_day']\n", " user_register = pd.merge(user_register, t17, on=['user_id'], how='left')\n", " user_register['user_activity_action_type_%d_max_day_distance' % (i)] \\\n", " = f_end - user_register['max_day'] + 1\n", " user_register.fillna(999, inplace=True)\n", " user_register.drop(['max_day'], axis=1, 
inplace=True)\n",
"\n",
" # mean length of the user's consecutive-day streaks within the window,\n",
" # shared by the launch / video-create / activity logs below\n",
" def calcContinuousMeanDay(s):\n",
" day_list = [int(x) for x in list(set(s.split(':')))]\n",
" day_list.sort()\n",
"\n",
" continuous_day_count = []\n",
" if (len(day_list) == 1):\n",
" continuous_day_count.append(day_list[0] - f_start + 1)\n",
" else:\n",
" count = 0\n",
" for i in range(len(day_list) - 1):\n",
" if (day_list[i + 1] - day_list[i] == 1):\n",
" if (i == len(day_list) - 2):\n",
" count += (day_list[i] - f_start + 1)\n",
" continuous_day_count.append(\n",
" count + (day_list[i + 1] - f_start + 1))\n",
" else:\n",
" count += (day_list[i] - f_start + 1)\n",
" else:\n",
" if (i == len(day_list) - 2):\n",
" continuous_day_count.append(\n",
" count + (day_list[i] - f_start + 1))\n",
" count = 0\n",
" continuous_day_count.append(\n",
" count + (day_list[i + 1] - f_start + 1))\n",
" else:\n",
" continuous_day_count.append(\n",
" count + (day_list[i] - f_start + 1))\n",
" count = 0\n",
"\n",
" continuous_day_count = np.array(continuous_day_count)\n",
"\n",
" return np.mean(continuous_day_count)\n",
"\n",
" t18 = user_launch[(user_launch.day >= f_start) &\n",
" (user_launch.day <= f_end)][['user_id', 'day']]\n",
" t18['day'] = t18['day'].astype('str')\n",
" t18 = t18.groupby(['user_id']).agg(lambda x: ':'.join(x)).reset_index()\n",
" t18['user_launch_number'] = t18['day'].apply(lambda x: len(x.split(':')))\n",
" t18 = t18[t18.user_launch_number >= 1]\n",
" t18 = t18[['user_id', 'day']]\n",
" t18['user_%d_before_continuous_mean_launch_day' % (f_end - f_start + 1)] \\\n",
" = t18.day.apply(calcContinuousMeanDay)\n",
" t18 = t18[['user_id', 'user_%d_before_continuous_mean_launch_day' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t18, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t19 = video_create[(video_create.day >= f_start) &\n",
" (video_create.day <= f_end)][['user_id', 'day']]\n",
" t19['day'] = t19['day'].astype('str')\n",
" t19 = t19.groupby(['user_id']).agg(lambda x: ':'.join(x)).reset_index()\n",
" t19['user_create_video_number'] = t19['day'].apply(lambda x: len(x.split(':')))\n",
" t19 = t19[t19.user_create_video_number >= 1]\n",
" t19 = t19[['user_id', 'day']]\n",
" t19['user_%d_before_continuous_mean_create_video_day' % (f_end - f_start + 1)] \\\n",
" = t19.day.apply(calcContinuousMeanDay)\n",
" t19 = t19[['user_id', 'user_%d_before_continuous_mean_create_video_day' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t19, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t20 = user_activity[(user_activity.day >= f_start) &\n",
" (user_activity.day <= f_end)][['user_id', 'day']]\n",
" t20['day'] = t20['day'].astype('str')\n",
" t20 = t20.groupby(['user_id']).agg(lambda x: ':'.join(x)).reset_index()\n",
" t20['user_activity_number'] = t20['day'].apply(lambda x: len(x.split(':')))\n",
" t20 = t20[t20.user_activity_number >= 1]\n",
" t20 = t20[['user_id', 'day']]\n",
" t20['user_%d_before_continuous_mean_user_activity_day' % (f_end - f_start + 1)] \\\n",
" = t20.day.apply(calcContinuousMeanDay)\n",
" t20 = t20[['user_id', 'user_%d_before_continuous_mean_user_activity_day' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t20, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t24_1 = user_launch[(user_launch.day >= f_start) &\n",
" (user_launch.day <= f_end)][['user_id', 'day']]\n",
" t24_1.drop_duplicates(inplace=True)\n",
" t24_1['day'] = t24_1['day'].astype('str')\n",
" t24_1 = t24_1.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()\n",
" t24_1['user_launch_number'] = t24_1['day'].apply(lambda x: len(x.split(':')))\n",
" t24_1 = t24_1[t24_1.user_launch_number >= 1]\n",
" t24_1 = t24_1[['user_id', 'day']]\n",
" t24_1.columns = ['user_id', 'launch_day']\n",
"\n",
" t24_2 = video_create[(video_create.day >= f_start) &\n",
" (video_create.day <= f_end)][['user_id', 'day']]\n",
" t24_2['day'] = t24_2['day'].astype('str')\n",
" t24_2 = t24_2.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()\n",
" t24_2['user_video_number'] = t24_2['day'].apply(lambda x: len(x.split(':')))\n",
" t24_2 = t24_2[t24_2.user_video_number >= 1]\n",
" t24_2 = t24_2[['user_id', 'day']]\n",
" t24_2.columns = ['user_id', 'video_day']\n",
"\n",
" t24 = pd.merge(t24_1, t24_2, on=['user_id'], how='right')\n",
" t24['day'] = t24['launch_day'] + ',' + t24['video_day']\n",
" t24 = t24[t24.day.notnull()]\n",
" t24 = 
t24[['user_id', 'day']]\n", "\n", " def videoCreateFrequencyAfterLaunch(s):\n", " launch, video = s.split(',')\n", " launch_day = [int(x) for x in list(set(launch.split(':')))]\n", " video_day = [int(x) for x in video.split(':')]\n", " launch_day.sort()\n", " video_day.sort()\n", " gap_list = []\n", " for i in range(len(launch_day) - 1):\n", " gap = 0\n", " for j in range(len(video_day)):\n", " if ((int(video_day[j]) >= int(launch_day[i])) &\n", " (int(video_day[j]) < int(launch_day[i + 1]))):\n", " gap += (int(video_day[j]) - f_start + 1)\n", " gap_list.append(gap)\n", " gap = 0\n", " for j in range(len(video_day)):\n", " if (int(video_day[j]) >= int(launch_day[len(launch_day) - 1])):\n", " gap += (int(video_day[j]) - f_start + 1)\n", " gap_list.append(gap)\n", " gap_array = np.array(gap_list)\n", "\n", " return np.mean(gap_array)\n", "\n", " t24['user_%d_before_create_video_after_launch_frequency' % (f_end - f_start + 1)] = \\\n", " t24.day.apply(videoCreateFrequencyAfterLaunch)\n", " t24 = t24[['user_id', 'user_%d_before_create_video_after_launch_frequency' % (f_end - f_start + 1)]]\n", " user_register = pd.merge(user_register, t24, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t25_1 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t25_1.drop_duplicates(inplace=True)\n", " t25_1['day'] = t25_1['day'].astype('str')\n", " t25_1 = t25_1.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()\n", " t25_1['user_launch_number'] = t25_1['day'].apply(lambda x: len(x.split(':')))\n", " t25_1 = t25_1[t25_1.user_launch_number >= 1]\n", " t25_1 = t25_1[['user_id', 'day']]\n", " t25_1.columns = ['user_id', 'launch_day']\n", "\n", " t25_2 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t25_2['day'] = t25_2['day'].astype('str')\n", " t25_2 = t25_2.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()\n", " t25_2['user_activity_number'] = t25_2['day'].apply(lambda x: len(x.split(':')))\n", " t25_2 = t25_2[t25_2.user_activity_number >= 1]\n", " t25_2 = t25_2[['user_id', 'day']]\n", " t25_2.columns = ['user_id', 'activity_day']\n", "\n", " t25 = pd.merge(t25_1, t25_2, on=['user_id'], how='right')\n", " t25['day'] = t25['launch_day'] + ',' + t25['activity_day']\n", " t25 = t25[t25.day.notnull()]\n", " t25 = t25[['user_id', 'day']]\n", "\n", " def activityFrequencyAfterLaunch(s):\n", " launch, activity = s.split(',')\n", " launch_day = [int(x) for x in list(set(launch.split(':')))]\n", " activity_day = [int(x) for x in activity.split(':')]\n", " launch_day.sort()\n", " activity_day.sort()\n", " gap_list = []\n", " for i in range(len(launch_day) - 1):\n", " gap = 0\n", " for j in range(len(activity_day)):\n", " if ((int(activity_day[j]) >= int(launch_day[i])) &\n", " (int(activity_day[j]) < int(launch_day[i + 1]))):\n", " gap += (int(activity_day[j]) - f_start + 1)\n", " gap_list.append(gap)\n", " gap = 0\n", " for j in range(len(activity_day)):\n", " if (int(activity_day[j]) >= int(launch_day[len(launch_day) - 1])):\n", " gap += (int(activity_day[j]) - f_start + 1)\n", " gap_list.append(gap)\n", " gap_array = np.array(gap_list)\n", "\n", " return np.mean(gap_array)\n", "\n", " t25['user_%d_before_activity_after_launch_frequency' % (f_end - f_start + 1)] = \\\n", " t25.day.apply(activityFrequencyAfterLaunch)\n", " t25 = t25[['user_id', 'user_%d_before_activity_after_launch_frequency' % (f_end - f_start + 1)]]\n", " 
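# note: t24/t25 share one idea: split the window at the user's launch days,\n",
" # sum the day weights (day - f_start + 1) of the video-create / activity\n",
" # events falling into each inter-launch interval, then average over the\n",
" # intervals, so dense recent follow-up behaviour after a launch scores high.\n",
" 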
user_register = pd.merge(user_register, t25, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t26 = user_activity[(user_activity.day <= f_end) &\n",
" (user_activity.day >= f_start)][['user_id', 'day']]\n",
" t26['user_day_count'] = 1\n",
" t26 = t26.groupby(['user_id', 'day']).agg('sum').reset_index()\n",
" t26['day'] = t26['day'].astype('str')\n",
" t26['user_day_count'] = t26['user_day_count'].astype('str')\n",
" t26['user_day_and_day_count'] = t26['day'] + ':' + t26['user_day_count']\n",
" t26 = t26[['user_id', 'user_day_and_day_count']]\n",
" t26 = t26.groupby(['user_id'])['user_day_and_day_count'].agg(lambda x: ','.join(x)).reset_index()\n",
" t26['user_day_number'] = t26.user_day_and_day_count.apply(lambda x: len(x.split(',')))\n",
" t26 = t26[t26.user_day_number > 1]\n",
" t26 = t26[['user_id', 'user_day_and_day_count']]\n",
"\n",
" # mean day-over-day change of the per-day activity count ('acceleration')\n",
" def calculateAcceleration(s):\n",
" day_and_day_count = [x for x in s.split(',')]\n",
" day_list = [int(x.split(':')[0]) for x in day_and_day_count]\n",
" day_list.sort()\n",
" dc_dict = {}\n",
" for dc in day_and_day_count:\n",
" dc_dict[int(dc.split(':')[0])] = int(dc.split(':')[1])\n",
" gap = []\n",
" for i in range(len(day_list) - 1):\n",
" gap.append((dc_dict[day_list[i + 1]] - dc_dict[day_list[i]]) /\n",
" (day_list[i + 1] - day_list[i]))\n",
" gap = np.array(gap)\n",
" return np.mean(gap)\n",
"\n",
" t26['user_whole_day_activity_acceleration'] = \\\n",
" t26.user_day_and_day_count.apply(calculateAcceleration)\n",
" t26 = t26[['user_id', 'user_whole_day_activity_acceleration']]\n",
" user_register = pd.merge(user_register, t26, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" # mean gap between consecutive active days; defined once here and reused\n",
" # for the launch / video-create / activity logs\n",
" def calculateFrequency(s):\n",
" day_list = [int(x) for x in s.split('-')]\n",
" day_list.sort()\n",
" day_array = np.array(day_list)\n",
" gap = np.diff(day_array)\n",
" return np.mean(gap)\n",
"\n",
" t29 = user_launch[(user_launch.day >= f_start) &\n",
" (user_launch.day <= f_end)][['user_id', 'day']]\n",
" t29['day'] = t29['day'].astype('str')\n",
" t29 = t29.groupby(['user_id'])['day'].agg(lambda x: '-'.join(x)).reset_index()\n",
" t29['user_day_number'] = t29.day.apply(lambda x: len(x.split('-')))\n",
" t29 = t29[t29.user_day_number > 1]\n",
" t29 = t29[['user_id', 'day']]\n",
" t29['user_%d_before_launch_day_mean_frequency' % (f_end - f_start + 1)] = t29.day.apply(calculateFrequency)\n",
" t29 = t29[['user_id', 'user_%d_before_launch_day_mean_frequency' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t29, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t30 = video_create[(video_create.day >= f_start) &\n",
" (video_create.day <= f_end)][['user_id', 'day']]\n",
" t30['day'] = t30['day'].astype('str')\n",
" t30 = t30.groupby(['user_id'])['day'].agg(lambda x: '-'.join(x)).reset_index()\n",
" t30['user_day_number'] = t30.day.apply(lambda x: len(x.split('-')))\n",
" t30 = t30[t30.user_day_number > 1]\n",
" t30 = t30[['user_id', 'day']]\n",
" t30['user_%d_before_video_create_day_mean_frequency' % (f_end - f_start + 1)] = t30.day.apply(calculateFrequency)\n",
" t30 = t30[['user_id', 'user_%d_before_video_create_day_mean_frequency' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t30, on=['user_id'], how='left')\n",
" 
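# note: calculateFrequency is the mean gap between consecutive active days,\n",
" # e.g. launch days [2, 3, 7] give np.diff -> [1, 4], a mean gap of 2.5; users\n",
" # with a single active day are dropped by the user_day_number > 1 filter\n",
" # and fall back to 0 through fillna.\n",
" 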
user_register.fillna(0, inplace=True)\n",
"\n",
" t31 = user_activity[(user_activity.day >= f_start) &\n",
" (user_activity.day <= f_end)][['user_id', 'day']]\n",
" t31['day'] = t31['day'].astype('str')\n",
" t31 = t31.groupby(['user_id'])['day'].agg(lambda x: '-'.join(x)).reset_index()\n",
" t31['user_day_number'] = t31.day.apply(lambda x: len(x.split('-')))\n",
" t31 = t31[t31.user_day_number > 1]\n",
" t31 = t31[['user_id', 'day']]\n",
" t31['user_%d_before_activity_day_mean_frequency' % (f_end - f_start + 1)] = t31.day.apply(calculateFrequency)\n",
" t31 = t31[['user_id', 'user_%d_before_activity_day_mean_frequency' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t31, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t9_1 = video_create[(video_create.day >= f_start) &\n",
" (video_create.day <= f_end)][['user_id', 'day']]\n",
" t9_1 = t9_1.groupby(['user_id']).agg('max').reset_index()\n",
" t9_1.rename(columns={'day': 'max_day'}, inplace=True)\n",
"\n",
" t9_2 = video_create[(video_create.day >= f_start) &\n",
" (video_create.day <= f_end)][['user_id', 'day']]\n",
" t9_2 = t9_2.groupby(['user_id']).agg('min').reset_index()\n",
" t9_2.rename(columns={'day': 'min_day'}, inplace=True)\n",
"\n",
" t9 = pd.merge(t9_1, t9_2, on=['user_id'], how='left')\n",
" t9['user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)] = t9['max_day'] - t9['min_day']\n",
" t9['user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)] = \\\n",
" (t9['user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)] /\n",
" (f_end - f_start + 1)) * (t9['max_day'] - f_start)\n",
" t9 = t9[['user_id', 'user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t9, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t10_1 = user_activity[(user_activity.day >= f_start) &\n",
" (user_activity.day <= f_end)][['user_id', 'day']]\n",
" t10_1 = t10_1.groupby(['user_id']).agg('max').reset_index()\n",
" t10_1.rename(columns={'day': 'max_day'}, inplace=True)\n",
"\n",
" t10_2 = user_activity[(user_activity.day >= f_start) &\n",
" (user_activity.day <= f_end)][['user_id', 'day']]\n",
" t10_2 = t10_2.groupby(['user_id']).agg('min').reset_index()\n",
" t10_2.rename(columns={'day': 'min_day'}, inplace=True)\n",
"\n",
" t10 = pd.merge(t10_1, t10_2, on=['user_id'], how='left')\n",
" t10['user_%d_before_activity_day_distance2' % (f_end - f_start + 1)] = t10['max_day'] - t10['min_day']\n",
" t10['user_%d_before_activity_day_distance2' % (f_end - f_start + 1)] = \\\n",
" (t10['user_%d_before_activity_day_distance2' % (f_end - f_start + 1)] /\n",
" (f_end - f_start + 1)) * (t10['max_day'] - f_start)\n",
" t10 = t10[['user_id', 'user_%d_before_activity_day_distance2' % (f_end - f_start + 1)]]\n",
" user_register = pd.merge(user_register, t10, on=['user_id'], how='left')\n",
" user_register.fillna(0, inplace=True)\n",
"\n",
" t8_1 = user_launch[(user_launch.day >= f_start) &\n",
" (user_launch.day <= f_end)][['user_id', 'day']]\n",
" t8_1 = t8_1.groupby(['user_id']).agg('max').reset_index()\n",
" t8_1.rename(columns={'day': 'max_day'}, inplace=True)\n",
"\n",
" user_register = pd.merge(user_register, t8_1, on=['user_id'], how='left')\n",
" 
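# note: the following blocks measure recency relative to registration:\n",
" # last (then mean, then median) event day minus register_day, plus 1.\n",
" # A user who registered on day 5 and last launched on day 16 gets a\n",
" # launch distance of 12; users without events default to 0 via fillna.\n",
" 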
user_register['user_launch_max_day_register_day_distance'] = \\\n", " (user_register['max_day'] - user_register['register_day'] + 1)\n", " user_register.drop(['max_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace = True)\n", "\n", " t9_1 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t9_1 = t9_1.groupby(['user_id']).agg('max').reset_index()\n", " t9_1.rename(columns={'day': 'max_day'}, inplace=True)\n", " user_register = pd.merge(user_register, t9_1, on=['user_id'], how='left')\n", " user_register['user_video_max_day_register_day_distance'] = \\\n", " (user_register['max_day'] - user_register['register_day'] + 1)\n", " user_register.drop(['max_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace = True)\n", "\n", " t10_1 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t10_1 = t10_1.groupby(['user_id']).agg('max').reset_index()\n", " t10_1.rename(columns={'day': 'max_day'}, inplace=True)\n", "\n", " user_register = pd.merge(user_register, t10_1, on=['user_id'], how='left')\n", " user_register['user_activity_max_day_register_day_distance'] = \\\n", " (user_register['max_day'] - user_register['register_day'] + 1)\n", " user_register.drop(['max_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace = True)\n", "\n", " t32 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t32 = pd.merge(t32, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')\n", " t32['user_launch_count_concerned_register_day'] = t32['day'] - t32['register_day'] + 1\n", " t32 = pd.pivot_table(t32, values = 'user_launch_count_concerned_register_day',\n", " index = 'user_id', aggfunc = 'sum').reset_index()\n", " user_register = pd.merge(user_register, t32, on = ['user_id'], how = 'left')\n", " user_register.fillna(0, inplace = True)\n", "\n", " t33 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t33 = pd.merge(t33, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')\n", " t33['user_video_create_count_concerned_register_day'] = t33['day'] - t33['register_day'] + 1\n", " t33 = pd.pivot_table(t33, values = 'user_video_create_count_concerned_register_day',\n", " index = 'user_id', aggfunc = 'sum').reset_index()\n", " user_register = pd.merge(user_register, t33, on = ['user_id'], how = 'left')\n", " user_register.fillna(0, inplace = True)\n", "\n", " for i in range(4):\n", " t33 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.action_type == i)][['user_id', 'day']]\n", " t33 = pd.merge(t33, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')\n", " t33['user_activity_action_type_%d_count_concerned_register_day'%(i)] = \\\n", " t33['day'] - t33['register_day'] + 1\n", " t33 = pd.pivot_table(t33, values = 'user_activity_action_type_%d_count_concerned_register_day'%(i),\n", " index = 'user_id', aggfunc = 'sum').reset_index()\n", " user_register = pd.merge(user_register, t33, on = ['user_id'], how = 'left')\n", " user_register.fillna(0, inplace = True)\n", "\n", " t35 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'author_id', 'day']]\n", " t35 = pd.pivot_table(t35, values = 'day',\n", " index = ['user_id', 'author_id'],\n", " aggfunc = np.max).reset_index()\n", " t35 = pd.merge(t35, 
user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')\n", " t35['author_id'] = t35['day'] - t35['register_day'] + 1\n", " t35 = pd.pivot_table(t35, values = 'author_id',\n", " index = ['user_id'],\n", " aggfunc = 'sum').reset_index()\\\n", " .rename(columns = {'author_id' : 'user_author_type_count_concerned_register_day'})\n", " user_register = pd.merge(user_register, t35, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " t36 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'author_id', 'day']]\n", " t36 = pd.pivot_table(t36, values = 'day',\n", " index = ['user_id', 'author_id'],\n", " aggfunc = np.mean).reset_index()\n", " t36 = pd.merge(t36, user_register[['user_id', 'register_day']], on=['user_id'], how='left')\n", " t36['user_id'] = t36['day'] - t36['register_day'] + 1\n", " t36 = pd.pivot_table(t36, values='user_id',\n", " index=['author_id'],\n", " aggfunc='sum').reset_index() \\\n", " .rename(columns={'user_id': 'author_user_type_mean_count_concerned_register_day'})\n", " t36.rename(columns = {'author_id' : 'user_id'}, inplace = True)\n", " user_register = pd.merge(user_register, t36, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace = True)\n", "\n", " t8_1 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t8_1 = t8_1.groupby(['user_id']).agg('mean').reset_index()\n", " t8_1.rename(columns={'day': 'mean_day'}, inplace=True)\n", "\n", " user_register = pd.merge(user_register, t8_1, on=['user_id'], how='left')\n", " user_register['user_launch_mean_day_register_day_distance'] = \\\n", " (user_register['mean_day'] - user_register['register_day'] + 1) / \\\n", " (f_end - user_register['register_day'] + 1)\n", " user_register.drop(['mean_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace=True)\n", "\n", " t9_1 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t9_1 = t9_1.groupby(['user_id']).agg('mean').reset_index()\n", " t9_1.rename(columns={'day': 'mean_day'}, inplace=True)\n", " user_register = pd.merge(user_register, t9_1, on=['user_id'], how='left')\n", " user_register['user_video_mean_day_register_day_distance'] = \\\n", " (user_register['mean_day'] - user_register['register_day'] + 1) / \\\n", " (f_end - user_register['register_day'] + 1)\n", " user_register.drop(['mean_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace=True)\n", "\n", " t10_1 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t10_1 = t10_1.groupby(['user_id']).agg('mean').reset_index()\n", " t10_1.rename(columns={'day': 'mean_day'}, inplace=True)\n", "\n", " user_register = pd.merge(user_register, t10_1, on=['user_id'], how='left')\n", " user_register['user_activity_mean_day_register_day_distance'] = \\\n", " (user_register['mean_day'] - user_register['register_day'] + 1) / \\\n", " (f_end - user_register['register_day'] + 1)\n", " user_register.drop(['mean_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace=True)\n", "\n", " t8_1 = user_launch[(user_launch.day >= f_start) &\n", " (user_launch.day <= f_end)][['user_id', 'day']]\n", " t8_1 = t8_1.groupby(['user_id']).agg('median').reset_index()\n", " t8_1.rename(columns={'day': 'median_day'}, inplace=True)\n", "\n", " user_register = pd.merge(user_register, t8_1, on=['user_id'], how='left')\n", " 
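# note: the median variants below complement the mean-day features above and\n",
" # are robust to outliers: for launch days [2, 3, 16] the mean day is 7 while\n",
" # the median stays at 3, so one late launch cannot drag the feature.\n",
" 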
user_register['user_launch_median_day_register_day_distance'] = \\\n", " (user_register['median_day'] - user_register['register_day'] + 1)\n", " user_register.drop(['median_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace=True)\n", "\n", " t9_1 = video_create[(video_create.day >= f_start) &\n", " (video_create.day <= f_end)][['user_id', 'day']]\n", " t9_1 = t9_1.groupby(['user_id']).agg('median').reset_index()\n", " t9_1.rename(columns={'day': 'median_day'}, inplace=True)\n", " user_register = pd.merge(user_register, t9_1, on=['user_id'], how='left')\n", " user_register['user_video_median_day_register_day_distance'] = \\\n", " (user_register['median_day'] - user_register['register_day'] + 1)\n", " user_register.drop(['median_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace=True)\n", "\n", " t10_1 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end)][['user_id', 'day']]\n", " t10_1 = t10_1.groupby(['user_id']).agg('median').reset_index()\n", " t10_1.rename(columns={'day': 'median_day'}, inplace=True)\n", "\n", " user_register = pd.merge(user_register, t10_1, on=['user_id'], how='left')\n", " user_register['user_activity_median_day_register_day_distance'] = \\\n", " (user_register['median_day'] - user_register['register_day'] + 1)\n", " user_register.drop(['median_day'], axis=1, inplace=True)\n", " user_register.fillna(0, inplace=True)\n", "\n", " for i in range(6):\n", " t40_1 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.action_type == i)][['user_id']]\n", " t40_1['count1'] = 1\n", " t40_1 = pd.pivot_table(t40_1, index=['user_id'],\n", " values=['count1'], aggfunc = 'sum').reset_index()\n", "\n", " t40_2 = user_activity[(user_activity.day >= f_start)\n", " & (user_activity.day <= f_end)][['user_id']]\n", " t40_2['count2'] = 1\n", " t40_2 = pd.pivot_table(t40_2, index = ['user_id'],\n", " values = ['count2'], aggfunc = 'sum').reset_index()\n", " t40 = pd.merge(t40_2, t40_1, on = ['user_id'], how = 'left')\n", " t40.fillna(0, inplace = True)\n", "\n", " t40['user_%d_before_activity_action_type_%d_rate'\n", " %(f_end - f_start + 1, i)] = t40['count1'] / t40['count2']\n", " t40 = t40[['user_id', 'user_%d_before_activity_action_type_%d_rate'\n", " %(f_end - f_start + 1, i)]]\n", " user_register = pd.merge(user_register, t40, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " for i in range(5):\n", " t40_1 = user_activity[(user_activity.day >= f_start) &\n", " (user_activity.day <= f_end) &\n", " (user_activity.page == i)][['user_id']]\n", " t40_1['count1'] = 1\n", " t40_1 = pd.pivot_table(t40_1, index=['user_id'],\n", " values=['count1'], aggfunc='sum').reset_index()\n", "\n", " t40_2 = user_activity[(user_activity.day >= f_start)\n", " & (user_activity.day <= f_end)][['user_id']]\n", " t40_2['count2'] = 1\n", " t40_2 = pd.pivot_table(t40_2, index=['user_id'],\n", " values=['count2'], aggfunc='sum').reset_index()\n", "\n", " t40 = pd.merge(t40_2, t40_1, on=['user_id'], how='left')\n", " t40.fillna(0, inplace=True)\n", "\n", " t40['user_%d_before_activity_page_%d_rate'\n", " % (f_end - f_start + 1, i)] = t40['count1'] / t40['count2']\n", " t40 = t40[['user_id', 'user_%d_before_activity_page_%d_rate'\n", " % (f_end - f_start + 1, i)]]\n", " user_register = pd.merge(user_register, t40, on=['user_id'], how='left')\n", " user_register.fillna(0, inplace=True)\n", "\n", " print('get all time feature have finished!')\n", "\n", " return 
user_register\n",
"\n",
"def getSlideJoin(user_launch, register_have_label, video_create, user_activity, end):\n",
" # last-day features (window [end, end]) plus 16-day window features ([end - 15, end])\n",
" register_have_feature = getLastDayFeature(user_launch, register_have_label,\n",
" video_create, user_activity, end, end)\n",
"\n",
" register_have_feature = getAllTimeFeature(user_launch, register_have_feature,\n",
" video_create, user_activity, end - 15, end)\n",
"\n",
" return register_have_feature\n",
"\n",
"def getDummiesFeature(user_register):\n",
" register_type_df = pd.get_dummies(user_register['register_type'], prefix='register_type')\n",
" user_register = pd.concat([user_register, register_type_df], axis=1)\n",
"\n",
" # 'flag' marks training rows; split the concatenated frame back into train / test\n",
" return user_register[user_register['flag'].notnull()].reset_index(drop=True), \\\n",
" user_register[user_register['flag'].isnull()].reset_index(drop=True)\n",
"\n",
"def getFeature(user_launch, user_register, video_create, user_activity,\n",
" register_have_label1, register_have_label2):\n",
" # two training slides (feature windows ending on day 23 and day 16) and one\n",
" # test slide (window ending on day 30)\n",
" print('train1')\n",
" register_have_feature_train1 = getSlideJoin(user_launch, register_have_label1,\n",
" video_create, user_activity, 23)\n",
" print(register_have_feature_train1.shape)\n",
"\n",
" print('train2')\n",
" register_have_feature_train2 = getSlideJoin(user_launch, register_have_label2,\n",
" video_create, user_activity, 16)\n",
" print(register_have_feature_train2.shape)\n",
"\n",
" register_have_feature_train = pd.concat([register_have_feature_train1,\n",
" register_have_feature_train2]).reset_index(drop=True)\n",
"\n",
" print('test')\n",
" register_have_feature_test = getSlideJoin(user_launch, user_register,\n",
" video_create, user_activity, 30)\n",
" print(register_have_feature_test.shape)\n",
"\n",
" register_have_feature_train['flag'] = 1\n",
" register_have_feature_train, register_have_feature_test = \\\n",
" getDummiesFeature(pd.concat([register_have_feature_train,\n",
" register_have_feature_test]).reset_index(drop=True))\n",
" print(register_have_feature_train.shape)\n",
" print(register_have_feature_test.shape)\n",
" register_have_feature_train.drop(['user_id', 'register_day', 'flag'], axis=1, inplace=True)\n",
" register_have_feature_test.drop(['register_day', 'flag'], axis=1, inplace=True)\n",
" return register_have_feature_train, register_have_feature_test\n",
"\n",
"def runXGBoost(train, test):\n",
" print('run xgboost...')\n",
" train_feat = [x for x in train.columns if x != 'label']\n",
" test_feat = [x for x in test.columns if x != 'user_id']\n",
" feat = [x for x in train_feat if x in test_feat]\n",
" print('feat:', len(feat))\n",
" train_x = train[feat]\n",
" train_y = train[['label']]\n",
" test_x = test[feat]\n",
" test_pre = test[['user_id']]\n",
"\n",
" train_xgb = xgb.DMatrix(train_x, label=train_y)\n",
" test_xgb = xgb.DMatrix(test_x)\n",
"\n",
" params = {\n",
" 'booster': 'gbtree',\n",
" 'objective': 'rank:pairwise',\n",
" 'eval_metric': 'auc',\n",
" 'gamma': 0.1,\n",
" 'min_child_weight': 1.5,\n",
" 'max_depth': 5,\n",
" 'lambda': 10,\n",
" 'subsample': 0.7,\n",
" 'colsample_bytree': 0.7,\n",
" 'colsample_bylevel': 0.7,\n",
" 'eta': 0.03,\n",
" 'tree_method': 'exact',\n",
" 'seed': 0,\n",
" 'nthread': 12\n",
" }\n",
"\n",
" # the watchlist monitors training AUC only; no separate validation set is held out\n",
" watchlist = [(train_xgb, 'train'), (train_xgb, 'val')]\n",
" model = xgb.train(params, train_xgb, num_boost_round=700, evals=watchlist)\n",
" test_pre['predicted_pro'] = model.predict(test_xgb)\n",
" min_pro = np.min(test_pre.predicted_pro)\n",
" max_pro = np.max(test_pre.predicted_pro)\n",
" test_pre.predicted_pro = \\\n",
" (test_pre.predicted_pro - min_pro) / (max_pro - min_pro)\n",
" 
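# note: rank:pairwise yields ranking scores on an arbitrary scale, not\n",
" # probabilities, so the scores are min-max rescaled to [0, 1] here,\n",
" # presumably to make the bagged runs (and later the three models)\n",
" # comparable when they are blended.\n",
" 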
result = test_pre.sort_values(by=['predicted_pro'], ascending=False).reset_index(drop=True)\n",
" print('run xgboost have finished!')\n",
" return result\n",
"\n",
"def main():\n",
" # read the raw txt logs\n",
" user_launch, user_register, video_create, user_activity = readData()\n",
"\n",
" # label the register log\n",
" print('get label...')\n",
" register_have_label1, register_have_label2 =\\\n",
" getLabel(user_launch, user_register, video_create, user_activity)\n",
" print('get label have finished!')\n",
"\n",
" # get feature\n",
" print('get feature...')\n",
" train, test = \\\n",
" getFeature(user_launch, user_register, video_create, user_activity,\n",
" register_have_label1, register_have_label2)\n",
" print('get feature have finished!')\n",
"\n",
" # run xgboost: average 10 bagged runs, each trained on a random 90% sample\n",
" result_10_cv = test[['user_id']]\n",
" result_10_cv['predicted_pro'] = 0\n",
" for i in range(10):\n",
" train_sample = train.sample(frac=0.9)\n",
" result = runXGBoost(train_sample, test)\n",
" result.rename(columns={'predicted_pro': 'predicted_pro_%d' % (i)}, inplace=True)\n",
" result_10_cv = pd.merge(result_10_cv, result, on=['user_id'], how='left')\n",
" result_10_cv['predicted_pro'] = result_10_cv['predicted_pro'] +\\\n",
" 0.1 * result_10_cv['predicted_pro_%d' % (i)]\n",
" result_10_cv = result_10_cv[['user_id', 'predicted_pro']]\n",
"\n",
" # blend the bagged average 50/50 with a model trained on the full training set\n",
" result = runXGBoost(train, test)\n",
" result.rename(columns={'predicted_pro': 'predicted_pro_all'}, inplace=True)\n",
" result_10_cv = pd.merge(result_10_cv, result, on=['user_id'], how='left')\n",
" result_10_cv['predicted_pro'] = 0.5 * result_10_cv['predicted_pro'] + 0.5 * result_10_cv['predicted_pro_all']\n",
" result_10_cv = result_10_cv[['user_id', 'predicted_pro']]\n",
"\n",
" result_10_cv.to_csv(\"mzj_model.csv\", encoding='utf-8', index=None, header=None)\n",
"\n",
"if __name__ == '__main__':\n",
" main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BB692C648EA6491E9CC66DD1E70DB453" }, "outputs": [], "source": [ "# 模型融合 (model fusion)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "64FFD5D52D4D47198E388E229A64A076" }, "outputs": [], "source": [ "import pandas as pd\n",
"import numpy as np\n",
"\n",
"def getMaxMinNormalization(feature):\n",
" # local names avoid shadowing the built-in max/min\n",
" max_val = np.max(feature)\n",
" min_val = np.min(feature)\n",
"\n",
" return (feature - min_val) / (max_val - min_val)\n",
"\n",
"r1 = pd.read_csv(r'yw_model.txt', header=None)\n",
"r1.columns = ['user_id', 'label1']\n",
"r2 = pd.read_csv(r'xjy_model.txt', header=None)\n",
"r2.columns = ['user_id', 'label2']\n",
"r2['label2'] = getMaxMinNormalization(r2['label2'])\n",
"r3 = pd.read_csv(r'mzj_model.csv', header=None)\n",
"r3.columns = ['user_id', 'label3']\n",
"print(r1.shape)\n",
"print(r2.shape)\n",
"print(r3.shape)\n",
"result = pd.merge(r1, r2, on=['user_id'], how='left')\n",
"result = pd.merge(result, r3, on=['user_id'], how='left')\n",
"# weighted blend of the three models' normalized scores\n",
"result['label'] = 0.3 * result['label1'] + 0.4 * result['label2'] + 0.3 * result['label3']\n",
"\n",
"result = result[['user_id', 'label']]\n",
"result = result.sort_values(by=['label'], ascending=False).reset_index(drop=True)\n",
"print(result.shape)\n",
"result.to_csv(\"fafenlousi_result.csv\", encoding='utf-8', index=None, header=None)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 1 }