{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-09-dcn-criteo.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/P102393%20%7C%20DCN%20on%20Criteo%20DAC%20sample%20dataset%20in%20PyTorch.ipynb","timestamp":1644598048864},{"file_id":"1PcrzoopQcJ6T5CwS38RIyYqoayb0ytc7","timestamp":1641536551008},{"file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","timestamp":1640329037065}],"collapsed_sections":[],"mount_file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","authorship_tag":"ABX9TyNxKibDIYfmBLdJ9ymWUBTw"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# DCN on Criteo DAC sample dataset in PyTorch"],"metadata":{"id":"2VZ8fr0qMt5t"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import torch\n","import torch.nn as nn\n","from torch.utils.data import Dataset, DataLoader, TensorDataset\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import log_loss, roc_auc_score\n","from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n","from collections import OrderedDict, namedtuple, defaultdict"],"metadata":{"id":"EToD4LnRLgyY"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!wget -q --show-progress https://github.com/RecoHut-Datasets/criteo/raw/v1/dac_sample.txt"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"L60wRz3KLutF","executionInfo":{"status":"ok","timestamp":1641536229646,"user_tz":-330,"elapsed":1342,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"1a380f02-5cf1-43d2-8a40-974119bbecdc"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\rdac_sample.txt 0%[ ] 0 --.-KB/s \rdac_sample.txt 100%[===================>] 23.20M --.-KB/s in 0.09s \n"]}]},{"cell_type":"code","source":["def get_auc(loader, model):\n"," pred, target = [], []\n"," model.eval()\n"," with torch.no_grad():\n"," for x, y in loader:\n"," x, y = x.to(device).float(), y.to(device).float()\n"," y_hat = model(x)\n"," pred += list(y_hat.cpu().numpy())\n"," target += list(y.cpu().numpy())\n"," auc = roc_auc_score(target, pred)\n"," return auc"],"metadata":{"id":"9Rs8-3zMLSCa"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["class DNN(nn.Module):\n"," def __init__(self, inputs_dim, hidden_units, dropout_rate):\n"," super(DNN, self).__init__()\n"," self.inputs_dim = inputs_dim\n"," self.hidden_units = hidden_units\n"," self.dropout = nn.Dropout(dropout_rate)\n","\n"," self.hidden_units = [inputs_dim] + list(self.hidden_units)\n"," self.linear = nn.ModuleList([\n"," nn.Linear(self.hidden_units[i], self.hidden_units[i+1]) for i in range(len(self.hidden_units)-1)\n"," ])\n"," for name, tensor in self.linear.named_parameters():\n"," if 'weight' in name:\n"," nn.init.normal_(tensor, mean=0, std=0.0001)\n","\n"," # self.bn = nn.ModuleList([\n"," # nn.Linear(self.hidden_units[i], self.hidden_units[i + 1]) for i in range(len(self.hidden_units) - 1)\n"," # ])\n"," self.activation = nn.ReLU()\n"," def forward(self, X):\n"," inputs = X\n"," for i in range(len(self.linear)):\n"," fc = self.linear[i](inputs)\n"," fc = self.activation(fc)\n"," fc - self.dropout(fc)\n"," inputs = fc\n"," return inputs"],"metadata":{"id":"4hXMuz8gLirV"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["class CrossNet(nn.Module):\n"," def __init__(self, in_features, 
{"cell_type":"code","source":["class CrossNet(nn.Module):\n","    def __init__(self, in_features, layer_num=2, parameterization='vector', seed=2022):\n","        super(CrossNet, self).__init__()\n","        self.layer_num = layer_num\n","        self.parameterization = parameterization\n","        if self.parameterization == 'vector':\n","            # one weight vector per cross layer\n","            self.kernels = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))\n","        elif self.parameterization == 'matrix':\n","            # one full weight matrix per cross layer\n","            self.kernels = nn.Parameter(torch.Tensor(self.layer_num, in_features, in_features))\n","        else:\n","            raise ValueError(\"parameterization must be 'vector' or 'matrix'\")\n","        self.bias = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))\n","\n","        for i in range(self.kernels.shape[0]):\n","            nn.init.xavier_normal_(self.kernels[i])\n","        for i in range(self.bias.shape[0]):\n","            nn.init.zeros_(self.bias[i])\n","\n","    def forward(self, inputs):\n","        x_0 = inputs.unsqueeze(2)  # (batch, in_features, 1)\n","        x_1 = x_0\n","        for i in range(self.layer_num):\n","            if self.parameterization == 'vector':\n","                x1_w = torch.tensordot(x_1, self.kernels[i], dims=([1], [0]))\n","                dot_ = torch.matmul(x_0, x1_w)\n","                x_1 = dot_ + self.bias[i] + x_1\n","            else:\n","                x1_w = torch.matmul(self.kernels[i], x_1)\n","                dot_ = x1_w + self.bias[i]\n","                x_1 = x_0 * dot_ + x_1\n","        x_1 = torch.squeeze(x_1, dim=2)\n","        return x_1"],"metadata":{"id":"KJTogIIrLkMM"},"execution_count":null,"outputs":[]},
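{"cell_type":"markdown","source":["`CrossNet` implements the cross layer of DCN: with `parameterization='vector'`, $x_{l+1} = x_0 \\, (x_l^\\top w_l) + b_l + x_l$; with `'matrix'`, $x_{l+1} = x_0 \\odot (W_l x_l + b_l) + x_l$. Every layer preserves the input dimension, so stacking layers adds higher-order feature crosses at modest cost. The cell below is a minimal shape check added for illustration; the `_cross` name and all sizes are assumed values."],"metadata":{}},{"cell_type":"code","source":["# illustrative check: a cross layer maps (batch, in_features) back to (batch, in_features)\n","_cross = CrossNet(in_features=16, layer_num=2, parameterization='vector')\n","print(_cross(torch.rand(8, 16)).shape)  # expected: torch.Size([8, 16])"],"metadata":{},"execution_count":null,"outputs":[]},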
{"cell_type":"code","source":["class DCN(nn.Module):\n","    def __init__(self, feat_size, embedding_size, linear_feature_columns, dnn_feature_columns, cross_num=2,\n","                 cross_param='vector', dnn_hidden_units=(128, 128,), init_std=0.0001, seed=2022, l2_reg=0.00001,\n","                 drop_rate=0.5):\n","        super(DCN, self).__init__()\n","        self.feat_size = feat_size\n","        self.embedding_size = embedding_size\n","        self.dnn_hidden_units = dnn_hidden_units\n","        self.cross_num = cross_num\n","        self.cross_param = cross_param\n","        self.drop_rate = drop_rate\n","        self.l2_reg = l2_reg\n","\n","        self.act = nn.ReLU()\n","        self.dropout = nn.Dropout(drop_rate)\n","\n","        self.dense_feature_columns = list(filter(lambda x: x[1] == 'dense', dnn_feature_columns))\n","        self.sparse_feature_columns = list(filter(lambda x: x[1] == 'sparse', dnn_feature_columns))\n","\n","        # one embedding table per sparse (categorical) feature\n","        self.embedding_dic = nn.ModuleDict({feat[0]: nn.Embedding(feat_size[feat[0]], self.embedding_size, sparse=False)\n","                                            for feat in self.sparse_feature_columns})\n","\n","        # map each feature name to its column index in the input matrix\n","        self.feature_index = defaultdict(int)\n","        start = 0\n","        for feat in self.feat_size:\n","            self.feature_index[feat] = start\n","            start += 1\n","\n","        inputs_dim = len(self.dense_feature_columns) + self.embedding_size * len(self.sparse_feature_columns)\n","\n","        self.dnn = DNN(inputs_dim, self.dnn_hidden_units, self.drop_rate)\n","        self.crossnet = CrossNet(inputs_dim, layer_num=self.cross_num, parameterization=self.cross_param)\n","        self.dnn_linear = nn.Linear(inputs_dim + dnn_hidden_units[-1], 1, bias=False)\n","\n","        # linear (wide) part operating directly on the raw feature columns\n","        dnn_hidden_units = [len(feat_size)] + list(dnn_hidden_units) + [1]\n","        self.linear = nn.ModuleList([\n","            nn.Linear(dnn_hidden_units[i], dnn_hidden_units[i+1]) for i in range(len(dnn_hidden_units)-1)\n","        ])\n","        for name, tensor in self.linear.named_parameters():\n","            if 'weight' in name:\n","                nn.init.normal_(tensor, mean=0, std=init_std)\n","\n","    def forward(self, X):\n","        logit = X\n","        for i in range(len(self.linear)):\n","            fc = self.linear[i](logit)\n","            fc = self.act(fc)\n","            fc = self.dropout(fc)\n","            logit = fc\n","\n","        sparse_embedding = [self.embedding_dic[feat[0]](X[:, self.feature_index[feat[0]]].long()).reshape(X.shape[0], 1, -1)\n","                            for feat in self.sparse_feature_columns]\n","        dense_values = [X[:, self.feature_index[feat[0]]].reshape(-1, 1) for feat in self.dense_feature_columns]\n","\n","        dense_input = torch.cat(dense_values, dim=1)\n","        sparse_input = torch.cat(sparse_embedding, dim=1)\n","        # flatten [batch_size, num_sparse_feats, embedding_dim] -> [batch_size, num_sparse_feats * embedding_dim]\n","        sparse_input = torch.flatten(sparse_input, start_dim=1)\n","\n","        dnn_input = torch.cat((dense_input, sparse_input), dim=1)\n","\n","        deep_out = self.dnn(dnn_input)\n","        cross_out = self.crossnet(dnn_input)\n","        stack_out = torch.cat((cross_out, deep_out), dim=-1)\n","\n","        logit = logit + self.dnn_linear(stack_out)\n","        y_pred = torch.sigmoid(logit)\n","        return y_pred"],"metadata":{"id":"-mjdfBpILlWC"},"execution_count":null,"outputs":[]},
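{"cell_type":"markdown","source":["As a hypothetical smoke test (added for illustration, not part of the original notebook), we can instantiate `DCN` on a tiny synthetic schema and confirm it emits one probability per row. All `_demo_*` names, feature names, and sizes below are assumptions."],"metadata":{}},{"cell_type":"code","source":["# two dense and two sparse features; dict order fixes the column order in X\n","_demo_sizes = {'I1': 1, 'I2': 1, 'C1': 10, 'C2': 10}\n","_demo_cols = [('C1', 'sparse'), ('C2', 'sparse'), ('I1', 'dense'), ('I2', 'dense')]\n","_demo_model = DCN(_demo_sizes, embedding_size=4, linear_feature_columns=_demo_cols, dnn_feature_columns=_demo_cols)\n","_dense = torch.rand(8, 2)                       # columns I1, I2 scaled to [0, 1]\n","_sparse = torch.randint(0, 10, (8, 2)).float()  # columns C1, C2 hold category ids\n","_X = torch.cat([_dense, _sparse], dim=1)\n","print(_demo_model(_X).shape)  # expected: torch.Size([8, 1])"],"metadata":{},"execution_count":null,"outputs":[]},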
{"cell_type":"code","source":["batch_size = 1024\n","lr = 1e-2\n","wd = 1e-3\n","epochs = 20\n","seed = 2022\n","embedding_size = 4\n","device = 'cpu'\n","\n","sparse_feature = ['C' + str(i) for i in range(1, 27)]\n","dense_feature = ['I' + str(i) for i in range(1, 14)]\n","col_names = ['label'] + dense_feature + sparse_feature\n","data = pd.read_csv('dac_sample.txt', names=col_names, sep='\\t')\n","\n","data[sparse_feature] = data[sparse_feature].fillna('-1')\n","data[dense_feature] = data[dense_feature].fillna(0)\n","target = ['label']\n","\n","# feature cardinalities: 1 for each dense feature, vocabulary size for each sparse feature\n","feat_sizes = {}\n","feat_sizes_dense = {feat: 1 for feat in dense_feature}\n","feat_sizes_sparse = {feat: len(data[feat].unique()) for feat in sparse_feature}\n","feat_sizes.update(feat_sizes_dense)\n","feat_sizes.update(feat_sizes_sparse)\n","\n","# encode categories as integer ids and scale dense features to [0, 1]\n","for feat in sparse_feature:\n","    lbe = LabelEncoder()\n","    data[feat] = lbe.fit_transform(data[feat])\n","nms = MinMaxScaler(feature_range=(0, 1))\n","data[dense_feature] = nms.fit_transform(data[dense_feature])\n","\n","fixlen_feature_columns = [(feat, 'sparse') for feat in sparse_feature] + [(feat, 'dense') for feat in dense_feature]\n","dnn_feature_columns = fixlen_feature_columns\n","linear_feature_columns = fixlen_feature_columns\n","\n","train, test = train_test_split(data, test_size=0.2, random_state=seed)\n","\n","model = DCN(feat_sizes, embedding_size, linear_feature_columns, dnn_feature_columns).to(device)\n","\n","train_label = pd.DataFrame(train['label'])\n","train = train.drop(columns=['label'])\n","train_tensor_data = TensorDataset(torch.from_numpy(np.array(train)), torch.from_numpy(np.array(train_label)))\n","train_loader = DataLoader(train_tensor_data, shuffle=True, batch_size=batch_size)\n","\n","test_label = pd.DataFrame(test['label'])\n","test = test.drop(columns=['label'])\n","test_tensor_data = TensorDataset(torch.from_numpy(np.array(test)), torch.from_numpy(np.array(test_label)))\n","test_loader = DataLoader(test_tensor_data, batch_size=batch_size)\n","\n","loss_func = nn.BCELoss(reduction='mean')\n","optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)\n","\n","for epoch in range(epochs):\n","    total_loss_epoch = 0.0\n","    total_tmp = 0\n","    model.train()\n","    for index, (x, y) in enumerate(train_loader):\n","        x, y = x.to(device).float(), y.to(device).float()\n","        y_hat = model(x)\n","\n","        optimizer.zero_grad()\n","        loss = loss_func(y_hat, y)\n","        loss.backward()\n","        optimizer.step()\n","        total_loss_epoch += loss.item()\n","        total_tmp += 1\n","    auc = get_auc(test_loader, model)\n","    print('epoch/epochs: {}/{}, train loss: {:.3f}, test auc: {:.3f}'.format(epoch, epochs, total_loss_epoch / total_tmp, auc))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ZH69aNy6LmvA","executionInfo":{"status":"ok","timestamp":1641536386937,"user_tz":-330,"elapsed":111195,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"b4a53767-f829-4ac0-c989-c623dcf6b3e6"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["epoch/epochs: 0/20, train loss: 0.520, test auc: 0.667\n","epoch/epochs: 1/20, train loss: 0.501, test auc: 0.692\n","epoch/epochs: 2/20, train loss: 0.485, test auc: 0.718\n","epoch/epochs: 3/20, train loss: 0.507, test auc: 0.730\n","epoch/epochs: 4/20, train loss: 0.466, test auc: 0.735\n","epoch/epochs: 5/20, train loss: 0.462, test auc: 0.745\n","epoch/epochs: 6/20, train loss: 0.457, test auc: 0.742\n","epoch/epochs: 7/20, train loss: 0.454, test auc: 0.746\n","epoch/epochs: 8/20, train loss: 0.453, test auc: 0.746\n","epoch/epochs: 9/20, train loss: 0.451, test auc: 0.750\n","epoch/epochs: 10/20, train loss: 0.449, test auc: 0.748\n","epoch/epochs: 11/20, train loss: 0.448, test auc: 0.749\n","epoch/epochs: 12/20, train loss: 0.446, test auc: 0.747\n","epoch/epochs: 13/20, train loss: 0.445, test auc: 0.746\n","epoch/epochs: 14/20, train loss: 0.443, test auc: 0.747\n","epoch/epochs: 15/20, train loss: 0.441, test auc: 0.748\n","epoch/epochs: 16/20, train loss: 0.438, test auc: 0.749\n","epoch/epochs: 17/20, train loss: 0.437, test auc: 0.748\n","epoch/epochs: 18/20, train loss: 0.434, test auc: 0.746\n","epoch/epochs: 19/20, train loss: 0.432, test auc: 0.748\n"]}]},{"cell_type":"code","source":["!pip install -q watermark\n","%reload_ext watermark\n","%watermark -a \"Sparsh A.\" -m -iv -u -t -d"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tmhWlpwiL0bb","executionInfo":{"status":"ok","timestamp":1641536460223,"user_tz":-330,"elapsed":3599,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"fac5bee6-aa05-4846-ac1d-c13dd1f05222"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Author: Sparsh A.\n","\n","Last updated: 2022-01-07 06:21:01\n","\n","Compiler : GCC 7.5.0\n","OS : Linux\n","Release : 5.4.144+\n","Machine : x86_64\n","Processor : x86_64\n","CPU cores : 2\n","Architecture: 64bit\n","\n","IPython: 5.5.0\n","torch : 1.10.0+cu111\n","pandas : 1.1.5\n","numpy : 1.19.5\n","\n"]}]}]}