{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-20-fm-criteo.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/T065764%20%7C%20FM%20on%20Criteo%20Ad%20Dataset%20in%20TF%202x.ipynb","timestamp":1644654352245},{"file_id":"1bhRkvGnfmxPWUPFRag57Dl6-4pl1ZEzm","timestamp":1637061905313}],"collapsed_sections":[],"mount_file_id":"1bhRkvGnfmxPWUPFRag57Dl6-4pl1ZEzm","authorship_tag":"ABX9TyMBUyPEeyzg6d0Ip4cgvR7R"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# FM on Criteo Ad Dataset in TF 2.x"],"metadata":{"id":"3JIsowxxCcBJ"}},{"cell_type":"code","metadata":{"id":"0pxSU24FbSAy"},"source":["!pip install tensorflow==2.5.0"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QnFg_wXiclo4","executionInfo":{"status":"ok","timestamp":1637061269255,"user_tz":-330,"elapsed":135691,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"6cc6fdd1-38da-459c-ea28-576e6c5492b1"},"source":["!pip install -q -U kaggle\n","!pip install --upgrade --force-reinstall --no-deps kaggle\n","!mkdir ~/.kaggle\n","!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/\n","!chmod 600 ~/.kaggle/kaggle.json\n","!kaggle datasets download -d mrkmakr/criteo-dataset"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting kaggle\n"," Downloading kaggle-1.5.12.tar.gz (58 kB)\n","\u001b[K |████████████████████████████████| 58 kB 2.6 MB/s \n","\u001b[?25hBuilding wheels for collected packages: kaggle\n"," Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=d0e4b97f111ac4c64a7c2fead5edb3baa0f1f4f21eca031e865b06b3f40f3bbb\n"," Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5\n","Successfully built kaggle\n","Installing collected packages: kaggle\n"," Attempting uninstall: kaggle\n"," Found existing installation: kaggle 1.5.12\n"," Uninstalling kaggle-1.5.12:\n"," Successfully uninstalled kaggle-1.5.12\n","Successfully installed kaggle-1.5.12\n","Downloading criteo-dataset.zip to /content\n","100% 4.31G/4.31G [02:05<00:00, 44.7MB/s]\n","100% 4.31G/4.31G [02:05<00:00, 36.8MB/s]\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"PZYBD38Ad5j2","executionInfo":{"status":"ok","timestamp":1637061804413,"user_tz":-330,"elapsed":325474,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"efff747a-58f1-49d5-bb09-51875616aaa0"},"source":["!unzip criteo-dataset.zip"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Archive: criteo-dataset.zip\n"," inflating: dac/readme.txt \n"," inflating: dac/test.txt \n"," inflating: dac/train.txt \n"]}]},{"cell_type":"code","metadata":{"id":"BZwknC3Gd8Qg"},"source":["import os\n","import pandas as pd\n","import numpy as np\n","from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer\n","from sklearn.model_selection import train_test_split\n","\n","import tensorflow as tf\n","from tensorflow.keras import Model\n","from tensorflow.keras.layers import Layer, Input\n","from tensorflow.keras.regularizers import l2\n","from tensorflow.keras.losses import binary_crossentropy\n","from tensorflow.keras.callbacks import EarlyStopping\n","from tensorflow.keras.optimizers import Adam\n","from tensorflow.keras.metrics import AUC"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"GDRfYvu4e4mO"},"source":["os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'\n","os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n","\n","file = 'dac/train.txt'\n","read_part = True\n","sample_num = 100000\n","test_size = 0.2\n","\n","k = 8\n","\n","learning_rate = 0.001\n","batch_size = 4096\n","epochs = 10"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"9zzb1WXIet8A"},"source":["def sparseFeature(feat, feat_num, embed_dim=4):\n"," \"\"\"\n"," create dictionary for sparse feature\n"," :param feat: feature name\n"," :param feat_num: the total number of sparse features that do not repeat\n"," :param embed_dim: embedding dimension\n"," :return:\n"," \"\"\"\n"," return {'feat_name': feat, 'feat_num': feat_num, 'embed_dim': embed_dim}\n","\n","\n","def denseFeature(feat):\n"," \"\"\"\n"," create dictionary for dense feature\n"," :param feat: dense feature name\n"," :return:\n"," \"\"\"\n"," return {'feat_name': feat}"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"5NnOfRIQerQh"},"source":["def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2):\n"," \"\"\"\n"," a example about creating criteo dataset\n"," :param file: dataset's path\n"," :param embed_dim: the embedding dimension of sparse features\n"," :param read_part: whether to read part of it\n"," :param sample_num: the number of instances if read_part is True\n"," :param test_size: ratio of test dataset\n"," :return: feature columns, train, test\n"," \"\"\"\n"," names = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11',\n"," 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11',\n"," 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22',\n"," 'C23', 'C24', 'C25', 'C26']\n","\n"," if read_part:\n"," data_df = pd.read_csv(file, sep='\\t', iterator=True, header=None,\n"," names=names)\n"," data_df = data_df.get_chunk(sample_num)\n","\n"," else:\n"," data_df = pd.read_csv(file, sep='\\t', header=None, names=names)\n","\n"," sparse_features = ['C' + str(i) for i in range(1, 27)]\n"," dense_features = ['I' + str(i) for i in range(1, 14)]\n"," features = sparse_features + dense_features\n","\n"," data_df[sparse_features] = data_df[sparse_features].fillna('-1')\n"," data_df[dense_features] = data_df[dense_features].fillna(0)\n","\n"," # Bin continuous data into intervals.\n"," est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')\n"," data_df[dense_features] = est.fit_transform(data_df[dense_features])\n","\n"," for feat in sparse_features:\n"," le = LabelEncoder()\n"," data_df[feat] = le.fit_transform(data_df[feat])\n","\n"," # ==============Feature Engineering===================\n","\n"," # ====================================================\n"," feature_columns = [sparseFeature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)\n"," for feat in features]\n"," train, test = train_test_split(data_df, test_size=test_size)\n","\n"," train_X = train[features].values.astype('int32')\n"," train_y = train['label'].values.astype('int32')\n"," test_X = test[features].values.astype('int32')\n"," test_y = test['label'].values.astype('int32')\n","\n"," return feature_columns, (train_X, train_y), (test_X, test_y)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"m6L1wMGGeCGE"},"source":["class FM_Layer(Layer):\n"," def __init__(self, feature_columns, k, w_reg=1e-6, v_reg=1e-6):\n"," \"\"\"\n"," Factorization Machines\n"," :param feature_columns: A list. sparse column feature information.\n"," :param k: the latent vector\n"," :param w_reg: the regularization coefficient of parameter w\n"," :param v_reg: the regularization coefficient of parameter v\n"," \"\"\"\n"," super(FM_Layer, self).__init__()\n"," self.sparse_feature_columns = feature_columns\n"," self.index_mapping = []\n"," self.feature_length = 0\n"," for feat in self.sparse_feature_columns:\n"," self.index_mapping.append(self.feature_length)\n"," self.feature_length += feat['feat_num']\n"," self.k = k\n"," self.w_reg = w_reg\n"," self.v_reg = v_reg\n","\n"," def build(self, input_shape):\n"," self.w0 = self.add_weight(name='w0', shape=(1,),\n"," initializer=tf.zeros_initializer(),\n"," trainable=True)\n"," self.w = self.add_weight(name='w', shape=(self.feature_length, 1),\n"," initializer=tf.random_normal_initializer(),\n"," regularizer=l2(self.w_reg),\n"," trainable=True)\n"," self.V = self.add_weight(name='V', shape=(self.feature_length, self.k),\n"," initializer=tf.random_normal_initializer(),\n"," regularizer=l2(self.v_reg),\n"," trainable=True)\n","\n"," def call(self, inputs, **kwargs):\n"," # mapping\n"," inputs = inputs + tf.convert_to_tensor(self.index_mapping)\n"," # first order\n"," first_order = self.w0 + tf.reduce_sum(tf.nn.embedding_lookup(self.w, inputs), axis=1) # (batch_size, 1)\n"," # second order\n"," second_inputs = tf.nn.embedding_lookup(self.V, inputs) # (batch_size, fields, embed_dim)\n"," square_sum = tf.square(tf.reduce_sum(second_inputs, axis=1, keepdims=True)) # (batch_size, 1, embed_dim)\n"," sum_square = tf.reduce_sum(tf.square(second_inputs), axis=1, keepdims=True) # (batch_size, 1, embed_dim)\n"," second_order = 0.5 * tf.reduce_sum(square_sum - sum_square, axis=2) # (batch_size, 1)\n"," # outputs\n"," outputs = first_order + second_order\n"," return outputs\n","\n","\n","class FM(Model):\n"," def __init__(self, feature_columns, k, w_reg=1e-6, v_reg=1e-6):\n"," \"\"\"\n"," Factorization Machines\n"," :param feature_columns: A list. sparse column feature information.\n"," :param k: the latent vector\n"," :param w_reg: the regularization coefficient of parameter w\n","\t\t:param v_reg: the regularization coefficient of parameter v\n"," \"\"\"\n"," super(FM, self).__init__()\n"," self.sparse_feature_columns = feature_columns\n"," self.fm = FM_Layer(feature_columns, k, w_reg, v_reg)\n","\n"," def call(self, inputs, **kwargs):\n"," fm_outputs = self.fm(inputs)\n"," outputs = tf.nn.sigmoid(fm_outputs)\n"," return outputs\n","\n"," def summary(self, **kwargs):\n"," sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)\n"," Model(inputs=sparse_inputs, outputs=self.call(sparse_inputs)).summary()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dHIfDXsXePmP","executionInfo":{"status":"ok","timestamp":1637061897728,"user_tz":-330,"elapsed":18358,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"a8dc1bd2-4a01-4b35-ddfd-d05abd1fbf08"},"source":["# ========================== Create dataset =======================\n","feature_columns, train, test = create_criteo_dataset(file=file,\n"," read_part=read_part,\n"," sample_num=sample_num,\n"," test_size=test_size)\n","train_X, train_y = train\n","test_X, test_y = test\n","# ============================Build Model==========================\n","mirrored_strategy = tf.distribute.MirroredStrategy()\n","with mirrored_strategy.scope():\n"," model = FM(feature_columns=feature_columns, k=k)\n"," model.summary()\n"," # ============================Compile============================\n"," model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate),\n"," metrics=[AUC()])\n","# ============================model checkpoint======================\n","# check_path = '../save/fm_weights.epoch_{epoch:04d}.val_loss_{val_loss:.4f}.ckpt'\n","# checkpoint = tf.keras.callbacks.ModelCheckpoint(check_path, save_weights_only=True,\n","# verbose=1, period=5)\n","# ==============================Fit==============================\n","model.fit(\n"," train_X,\n"," train_y,\n"," epochs=epochs,\n"," callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)], # checkpoint\n"," batch_size=batch_size,\n"," validation_split=0.1\n",")\n","# ===========================Test==============================\n","print('test AUC: %f' % model.evaluate(test_X, test_y, batch_size=batch_size)[1])"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n","WARNING:tensorflow:Collective ops is not configured at program startup. Some performance features may not be enabled.\n","INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)\n","Model: \"model\"\n","_________________________________________________________________\n","Layer (type) Output Shape Param # \n","=================================================================\n","input_1 (InputLayer) [(None, 39)] 0 \n","_________________________________________________________________\n","fm__layer (FM_Layer) (None, 1) 2183743 \n","_________________________________________________________________\n","tf.math.sigmoid (TFOpLambda) (None, 1) 0 \n","=================================================================\n","Total params: 2,183,743\n","Trainable params: 2,183,743\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/10\n","18/18 [==============================] - 6s 131ms/step - loss: 0.6923 - auc: 0.5181 - val_loss: 0.5869 - val_auc: 0.5628\n","Epoch 2/10\n","18/18 [==============================] - 1s 51ms/step - loss: 0.5463 - auc: 0.6363 - val_loss: 0.5256 - val_auc: 0.6269\n","Epoch 3/10\n","18/18 [==============================] - 1s 60ms/step - loss: 0.4988 - auc: 0.7473 - val_loss: 0.5023 - val_auc: 0.6838\n","Epoch 4/10\n","18/18 [==============================] - 1s 49ms/step - loss: 0.4652 - auc: 0.8144 - val_loss: 0.4846 - val_auc: 0.7263\n","Epoch 5/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.4313 - auc: 0.8495 - val_loss: 0.4705 - val_auc: 0.7455\n","Epoch 6/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.3992 - auc: 0.8743 - val_loss: 0.4625 - val_auc: 0.7561\n","Epoch 7/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.3699 - auc: 0.8936 - val_loss: 0.4567 - val_auc: 0.7618\n","Epoch 8/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.3429 - auc: 0.9103 - val_loss: 0.4531 - val_auc: 0.7646\n","Epoch 9/10\n","18/18 [==============================] - 1s 51ms/step - loss: 0.3183 - auc: 0.9247 - val_loss: 0.4533 - val_auc: 0.7657\n","Epoch 10/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.2960 - auc: 0.9358 - val_loss: 0.4533 - val_auc: 0.7654\n","5/5 [==============================] - 0s 24ms/step - loss: 0.4618 - auc: 0.7561\n","test AUC: 0.756149\n"]}]}]}