{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-11-deepfm-criteo.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P237732%20%7C%20DeepFM%20on%20Criteo%20DAC%20sample%20dataset%20in%20PyTorch.ipynb","timestamp":1644607197734},{"file_id":"1PcrzoopQcJ6T5CwS38RIyYqoayb0ytc7","timestamp":1641537359438},{"file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","timestamp":1640329037065}],"collapsed_sections":[],"mount_file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","authorship_tag":"ABX9TyNHfJtptRpR1n+zn0BOMPle"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# DeepFM on Criteo DAC sample dataset in PyTorch"],"metadata":{"id":"2VZ8fr0qMt5t"}},{"cell_type":"code","source":["import pandas as pd\n","import torch\n","from sklearn.metrics import log_loss, roc_auc_score\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n","import sys\n","import os\n","\n","import torch.nn as nn\n","import numpy as np\n","import torch.utils.data as Data\n","from torch.utils.data import DataLoader\n","import torch.optim as optim\n","import torch.nn.functional as F\n","from sklearn.metrics import log_loss, roc_auc_score\n","from collections import OrderedDict, namedtuple, defaultdict\n","import random"],"metadata":{"id":"EToD4LnRLgyY"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!wget -q --show-progress https://github.com/RecoHut-Datasets/criteo/raw/v1/dac_sample.txt"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"L60wRz3KLutF","executionInfo":{"status":"ok","timestamp":1641536229646,"user_tz":-330,"elapsed":1342,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"1a380f02-5cf1-43d2-8a40-974119bbecdc"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\rdac_sample.txt 0%[ ] 0 --.-KB/s \rdac_sample.txt 100%[===================>] 23.20M --.-KB/s in 0.09s \n"]}]},{"cell_type":"code","source":["class FM(nn.Module):\n"," def __init__(self, p, k):\n"," super(FM, self).__init__()\n"," self.p = p\n"," self.k = k\n"," self.linear = nn.Linear(self.p, 1, bias=True)\n"," self.v = nn.Parameter(torch.Tensor(self.p, self.k), requires_grad=True)\n"," self.v.data.uniform_(-0.01, 0.01)\n"," self.drop = nn.Dropout(0.3)\n","\n"," def forward(self, x):\n"," linear_part = self.linear(x)\n"," inter_part1 = torch.pow(torch.mm(x, self.v), 2)\n"," inter_part2 = torch.mm(torch.pow(x, 2), torch.pow(self.v, 2))\n"," pair_interactions = torch.sum(torch.sub(inter_part1, inter_part2), dim=1)\n"," self.drop(pair_interactions)\n"," output = linear_part.transpose(1, 0) + 0.5 * pair_interactions\n"," return output.view(-1, 1)"],"metadata":{"id":"D_slY7KGN44C"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["class deepfm(nn.Module):\n"," def __init__(self, feat_sizes, sparse_feature_columns, dense_feature_columns,dnn_hidden_units=[400, 400,400], dnn_dropout=0.0, ebedding_size=4,\n"," l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,\n"," device='cpu'):\n"," super(deepfm, self).__init__()\n"," self.feat_sizes = feat_sizes\n"," self.device = device\n"," self.dense_feature_columns = dense_feature_columns\n"," self.sparse_feature_columns = sparse_feature_columns\n"," self.embedding_size = ebedding_size\n"," self.l2_reg_linear = l2_reg_linear\n","\n"," self.bias = nn.Parameter(torch.zeros((1, )))\n"," self.init_std = init_std\n"," self.dnn_dropout = dnn_dropout\n","\n"," self.embedding_dic = nn.ModuleDict({feat:nn.Embedding(self.feat_sizes[feat], self.embedding_size, sparse=False)\n"," for feat in self.sparse_feature_columns})\n"," for tensor in self.embedding_dic.values():\n"," nn.init.normal_(tensor.weight, mean=0, std=self.init_std)\n"," self.embedding_dic.to(self.device)\n","\n"," self.feature_index = defaultdict(int)\n"," start = 0\n"," for feat in self.feat_sizes:\n"," if feat in self.feature_index:\n"," continue\n"," self.feature_index[feat] = start\n"," start += 1\n","\n"," # 输入维度 fm层与DNN层共享嵌入层, 输入维度应该是一样的\n"," self.input_size = self.embedding_size * len(self.sparse_feature_columns)+len(self.dense_feature_columns)\n"," # fm\n"," self.fm = FM(self.input_size, 10)\n","\n"," # DNN\n"," self.dropout = nn.Dropout(self.dnn_dropout)\n"," self.hidden_units = [self.input_size] + dnn_hidden_units\n"," self.Linears = nn.ModuleList([nn.Linear(self.hidden_units[i], self.hidden_units[i+1]) for i in range(len(self.hidden_units)-1)])\n"," self.relus = nn.ModuleList([nn.ReLU() for i in range(len(self.hidden_units)-1)])\n"," for name, tensor in self.Linears.named_parameters():\n"," if 'weight' in name:\n"," nn.init.normal_(tensor, mean=0, std=self.init_std)\n"," self.dnn_outlayer = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(self.device)\n","\n","\n"," def forward(self, x):\n"," # x shape 1024*39\n","\n"," sparse_embedding = [self.embedding_dic[feat](x[:, self.feature_index[feat]].long()) for feat in self.sparse_feature_columns]\n"," sparse_embedding = torch.cat(sparse_embedding, dim=-1)\n"," # print(sparse_embedding.shape) # batch * 208\n","\n"," dense_value = [x[:, self.feature_index[feat]] for feat in\n"," self.dense_feature_columns]\n","\n"," dense_value = torch.cat(dense_value, dim=0)\n"," dense_value = torch.reshape(dense_value, (len(self.dense_feature_columns), -1))\n"," dense_value = dense_value.T\n"," # print(dense_value.shape) # batch * 13\n","\n"," input_x = torch.cat((dense_value, sparse_embedding), dim=1)\n"," # print(input_x.shape) # batch * 221\n","\n"," fm_logit = self.fm(input_x)\n","\n"," for i in range(len(self.Linears)):\n"," fc = self.Linears[i](input_x)\n"," fc = self.relus[i](fc)\n"," fc = self.dropout(fc)\n"," input_x = fc\n"," dnn_logit = self.dnn_outlayer(input_x)\n","\n"," y_pre = torch.sigmoid(fm_logit+dnn_logit+self.bias)\n"," return y_pre"],"metadata":{"id":"ZiDRlS-GO9Y1"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def get_auc(loader, model):\n"," pred, target = [], []\n"," model.eval()\n"," with torch.no_grad():\n"," for x, y in loader:\n"," x = x.to(device).float()\n"," y = y.to(device).float()\n"," y_hat = model(x)\n"," pred += list(y_hat.numpy())\n"," target += list(y.numpy())\n"," auc = roc_auc_score(target, pred)\n"," return auc"],"metadata":{"id":"g-zB2cR0PJQ_"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["batch_size = 1024\n","lr = 0.00005\n","wd = 0.00001\n","epoches = 10\n","\n","seed = 1024\n","torch.manual_seed(seed)\n","torch.cuda.manual_seed(seed)\n","torch.cuda.manual_seed_all(seed)\n","np.random.seed(seed)\n","random.seed(seed)\n","\n","sparse_features = ['C' + str(i) for i in range(1, 27)]\n","dense_features = ['I' + str(i) for i in range(1, 14)]\n","col_names = ['label'] + dense_features + sparse_features\n","df = pd.read_csv('dac_sample.txt', names=col_names, sep='\\t')\n","feature_names = dense_features + sparse_features\n","\n","df[sparse_features] = df[sparse_features].fillna('-1', )\n","df[dense_features] = df[dense_features].fillna(0, )\n","target = ['label']\n","\n","for feat in sparse_features:\n"," lbe = LabelEncoder()\n"," df[feat] = lbe.fit_transform(df[feat])\n","\n","mms = MinMaxScaler(feature_range=(0, 1))\n","df[dense_features] = mms.fit_transform(df[dense_features])\n","\n","feat_size1 = {feat: 1 for feat in dense_features}\n","feat_size2 = {feat: len(df[feat].unique()) for feat in sparse_features}\n","feat_sizes = {}\n","feat_sizes.update(feat_size1)\n","feat_sizes.update(feat_size2)\n","\n","# print(df.head(5))\n","# print(feat_sizes)\n","\n","train, test =train_test_split(df, test_size=0.2, random_state=2021)\n","train_model_input = {name: train[name] for name in feature_names}\n","test_model_input = {name: test[name] for name in feature_names}\n","\n","device = 'cpu'\n","\n","model = deepfm(feat_sizes, sparse_feature_columns=sparse_features, dense_feature_columns=dense_features,\n"," dnn_hidden_units=[1000, 500, 250], dnn_dropout=0.9, ebedding_size=16,\n"," l2_reg_linear=1e-3, device=device)\n","\n","train_label = pd.DataFrame(train['label'])\n","train_data = train.drop(columns=['label'])\n","#print(train.head(5))\n","train_tensor_data = torch.utils.data.TensorDataset(torch.from_numpy(np.array(train_data)), torch.from_numpy(np.array(train_label)))\n","train_loader = DataLoader(dataset=train_tensor_data, shuffle=True, batch_size=batch_size)\n","\n","test_label = pd.DataFrame(test['label'])\n","test_data = test.drop(columns=['label'])\n","test_tensor_data = torch.utils.data.TensorDataset(torch.from_numpy(np.array(test_data)),\n"," torch.from_numpy(np.array(test_label)))\n","test_loader = DataLoader(dataset=test_tensor_data, shuffle=False, batch_size=batch_size)\n","\n","loss_func = nn.BCELoss(reduction='mean')\n","optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)\n","\n","for epoch in range(epoches):\n"," total_loss_epoch = 0.0\n"," total_tmp = 0\n","\n"," model.train()\n"," for index, (x, y) in enumerate(train_loader):\n"," x = x.to(device).float()\n"," y = y.to(device).float()\n","\n"," y_hat = model(x)\n","\n"," optimizer.zero_grad()\n"," loss = loss_func(y_hat, y)\n"," loss.backward()\n"," optimizer.step()\n"," total_loss_epoch += loss.item()\n"," total_tmp += 1\n","\n"," auc = get_auc(test_loader, model)\n"," print('epoch/epoches: {}/{}, train loss: {:.3f}, test auc: {:.3f}'.format(epoch, epoches, total_loss_epoch / total_tmp, auc))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZH69aNy6LmvA","executionInfo":{"status":"ok","timestamp":1641537334473,"user_tz":-330,"elapsed":181659,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"9a504fcb-fe7b-49c4-bf15-2ccece85893d"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["epoch/epoches: 0/10, train loss: 0.667, test auc: 0.569\n","epoch/epoches: 1/10, train loss: 0.564, test auc: 0.681\n","epoch/epoches: 2/10, train loss: 0.531, test auc: 0.710\n","epoch/epoches: 3/10, train loss: 0.507, test auc: 0.720\n","epoch/epoches: 4/10, train loss: 0.482, test auc: 0.727\n","epoch/epoches: 5/10, train loss: 0.455, test auc: 0.735\n","epoch/epoches: 6/10, train loss: 0.425, test auc: 0.740\n","epoch/epoches: 7/10, train loss: 0.393, test auc: 0.742\n","epoch/epoches: 8/10, train loss: 0.363, test auc: 0.739\n","epoch/epoches: 9/10, train loss: 0.337, test auc: 0.733\n"]}]},{"cell_type":"code","source":["!pip install -q watermark\n","%reload_ext watermark\n","%watermark -a \"Sparsh A.\" -m -iv -u -t -d"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tmhWlpwiL0bb","executionInfo":{"status":"ok","timestamp":1641537342905,"user_tz":-330,"elapsed":3754,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"57e12b81-7f55-4ce0-8aa4-b43e46a80887"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Author: Sparsh A.\n","\n","Last updated: 2022-01-07 06:35:43\n","\n","Compiler : GCC 7.5.0\n","OS : Linux\n","Release : 5.4.144+\n","Machine : x86_64\n","Processor : x86_64\n","CPU cores : 2\n","Architecture: 64bit\n","\n","pandas : 1.1.5\n","sys : 3.7.12 (default, Sep 10 2021, 00:21:48) \n","[GCC 7.5.0]\n","torch : 1.10.0+cu111\n","numpy : 1.19.5\n","IPython: 5.5.0\n","\n"]}]}]}