{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-20-fm-criteo.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/T065764%20%7C%20FM%20on%20Criteo%20Ad%20Dataset%20in%20TF%202x.ipynb","timestamp":1644654352245},{"file_id":"1bhRkvGnfmxPWUPFRag57Dl6-4pl1ZEzm","timestamp":1637061905313}],"collapsed_sections":[],"mount_file_id":"1bhRkvGnfmxPWUPFRag57Dl6-4pl1ZEzm","authorship_tag":"ABX9TyMBUyPEeyzg6d0Ip4cgvR7R"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# FM on Criteo Ad Dataset in TF 2.x"],"metadata":{"id":"3JIsowxxCcBJ"}},{"cell_type":"code","metadata":{"id":"0pxSU24FbSAy"},"source":["!pip install tensorflow==2.5.0"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QnFg_wXiclo4","executionInfo":{"status":"ok","timestamp":1637061269255,"user_tz":-330,"elapsed":135691,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"6cc6fdd1-38da-459c-ea28-576e6c5492b1"},"source":["!pip install -q -U kaggle\n","!pip install --upgrade --force-reinstall --no-deps kaggle\n","!mkdir ~/.kaggle\n","!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/\n","!chmod 600 ~/.kaggle/kaggle.json\n","!kaggle datasets download -d mrkmakr/criteo-dataset"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting kaggle\n","  Downloading kaggle-1.5.12.tar.gz (58 kB)\n","\u001b[K     |████████████████████████████████| 58 kB 2.6 MB/s \n","\u001b[?25hBuilding wheels for collected packages: kaggle\n","  Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n","  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=d0e4b97f111ac4c64a7c2fead5edb3baa0f1f4f21eca031e865b06b3f40f3bbb\n","  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5\n","Successfully built kaggle\n","Installing collected packages: kaggle\n","  Attempting uninstall: kaggle\n","    Found existing installation: kaggle 1.5.12\n","    Uninstalling kaggle-1.5.12:\n","      Successfully uninstalled kaggle-1.5.12\n","Successfully installed kaggle-1.5.12\n","Downloading criteo-dataset.zip to /content\n","100% 4.31G/4.31G [02:05<00:00, 44.7MB/s]\n","100% 4.31G/4.31G [02:05<00:00, 36.8MB/s]\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"PZYBD38Ad5j2","executionInfo":{"status":"ok","timestamp":1637061804413,"user_tz":-330,"elapsed":325474,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"efff747a-58f1-49d5-bb09-51875616aaa0"},"source":["!unzip criteo-dataset.zip"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Archive:  criteo-dataset.zip\n","  inflating: dac/readme.txt          \n","  inflating: dac/test.txt            \n","  inflating: dac/train.txt           \n"]}]},{"cell_type":"code","metadata":{"id":"BZwknC3Gd8Qg"},"source":["import os\n","import pandas as pd\n","import numpy as np\n","from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer\n","from sklearn.model_selection import train_test_split\n","\n","import tensorflow as tf\n","from tensorflow.keras import Model\n","from tensorflow.keras.layers import Layer, Input\n","from tensorflow.keras.regularizers import l2\n","from tensorflow.keras.losses import binary_crossentropy\n","from tensorflow.keras.callbacks import EarlyStopping\n","from tensorflow.keras.optimizers import Adam\n","from tensorflow.keras.metrics import AUC"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"GDRfYvu4e4mO"},"source":["os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'\n","os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n","\n","file = 'dac/train.txt'\n","read_part = True\n","sample_num = 100000\n","test_size = 0.2\n","\n","k = 8\n","\n","learning_rate = 0.001\n","batch_size = 4096\n","epochs = 10"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"9zzb1WXIet8A"},"source":["def sparseFeature(feat, feat_num, embed_dim=4):\n","    \"\"\"\n","    create dictionary for sparse feature\n","    :param feat: feature name\n","    :param feat_num: the total number of sparse features that do not repeat\n","    :param embed_dim: embedding dimension\n","    :return:\n","    \"\"\"\n","    return {'feat_name': feat, 'feat_num': feat_num, 'embed_dim': embed_dim}\n","\n","\n","def denseFeature(feat):\n","    \"\"\"\n","    create dictionary for dense feature\n","    :param feat: dense feature name\n","    :return:\n","    \"\"\"\n","    return {'feat_name': feat}"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"5NnOfRIQerQh"},"source":["def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2):\n","    \"\"\"\n","    a example about creating criteo dataset\n","    :param file: dataset's path\n","    :param embed_dim: the embedding dimension of sparse features\n","    :param read_part: whether to read part of it\n","    :param sample_num: the number of instances if read_part is True\n","    :param test_size: ratio of test dataset\n","    :return: feature columns, train, test\n","    \"\"\"\n","    names = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11',\n","             'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11',\n","             'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22',\n","             'C23', 'C24', 'C25', 'C26']\n","\n","    if read_part:\n","        data_df = pd.read_csv(file, sep='\\t', iterator=True, header=None,\n","                          names=names)\n","        data_df = data_df.get_chunk(sample_num)\n","\n","    else:\n","        data_df = pd.read_csv(file, sep='\\t', header=None, names=names)\n","\n","    sparse_features = ['C' + str(i) for i in range(1, 27)]\n","    dense_features = ['I' + str(i) for i in range(1, 14)]\n","    features = sparse_features + dense_features\n","\n","    data_df[sparse_features] = data_df[sparse_features].fillna('-1')\n","    data_df[dense_features] = data_df[dense_features].fillna(0)\n","\n","    # Bin continuous data into intervals.\n","    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')\n","    data_df[dense_features] = est.fit_transform(data_df[dense_features])\n","\n","    for feat in sparse_features:\n","        le = LabelEncoder()\n","        data_df[feat] = le.fit_transform(data_df[feat])\n","\n","    # ==============Feature Engineering===================\n","\n","    # ====================================================\n","    feature_columns = [sparseFeature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)\n","                        for feat in features]\n","    train, test = train_test_split(data_df, test_size=test_size)\n","\n","    train_X = train[features].values.astype('int32')\n","    train_y = train['label'].values.astype('int32')\n","    test_X = test[features].values.astype('int32')\n","    test_y = test['label'].values.astype('int32')\n","\n","    return feature_columns, (train_X, train_y), (test_X, test_y)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"m6L1wMGGeCGE"},"source":["class FM_Layer(Layer):\n","    def __init__(self, feature_columns, k, w_reg=1e-6, v_reg=1e-6):\n","        \"\"\"\n","        Factorization Machines\n","        :param feature_columns: A list. sparse column feature information.\n","        :param k: the latent vector\n","        :param w_reg: the regularization coefficient of parameter w\n","        :param v_reg: the regularization coefficient of parameter v\n","        \"\"\"\n","        super(FM_Layer, self).__init__()\n","        self.sparse_feature_columns = feature_columns\n","        self.index_mapping = []\n","        self.feature_length = 0\n","        for feat in self.sparse_feature_columns:\n","            self.index_mapping.append(self.feature_length)\n","            self.feature_length += feat['feat_num']\n","        self.k = k\n","        self.w_reg = w_reg\n","        self.v_reg = v_reg\n","\n","    def build(self, input_shape):\n","        self.w0 = self.add_weight(name='w0', shape=(1,),\n","                                  initializer=tf.zeros_initializer(),\n","                                  trainable=True)\n","        self.w = self.add_weight(name='w', shape=(self.feature_length, 1),\n","                                 initializer=tf.random_normal_initializer(),\n","                                 regularizer=l2(self.w_reg),\n","                                 trainable=True)\n","        self.V = self.add_weight(name='V', shape=(self.feature_length, self.k),\n","                                 initializer=tf.random_normal_initializer(),\n","                                 regularizer=l2(self.v_reg),\n","                                 trainable=True)\n","\n","    def call(self, inputs, **kwargs):\n","        # mapping\n","        inputs = inputs + tf.convert_to_tensor(self.index_mapping)\n","        # first order\n","        first_order = self.w0 + tf.reduce_sum(tf.nn.embedding_lookup(self.w, inputs), axis=1)  # (batch_size, 1)\n","        # second order\n","        second_inputs = tf.nn.embedding_lookup(self.V, inputs)  # (batch_size, fields, embed_dim)\n","        square_sum = tf.square(tf.reduce_sum(second_inputs, axis=1, keepdims=True))  # (batch_size, 1, embed_dim)\n","        sum_square = tf.reduce_sum(tf.square(second_inputs), axis=1, keepdims=True)  # (batch_size, 1, embed_dim)\n","        second_order = 0.5 * tf.reduce_sum(square_sum - sum_square, axis=2)  # (batch_size, 1)\n","        # outputs\n","        outputs = first_order + second_order\n","        return outputs\n","\n","\n","class FM(Model):\n","    def __init__(self, feature_columns, k, w_reg=1e-6, v_reg=1e-6):\n","        \"\"\"\n","        Factorization Machines\n","        :param feature_columns: A list. sparse column feature information.\n","        :param k: the latent vector\n","        :param w_reg: the regularization coefficient of parameter w\n","\t\t:param v_reg: the regularization coefficient of parameter v\n","        \"\"\"\n","        super(FM, self).__init__()\n","        self.sparse_feature_columns = feature_columns\n","        self.fm = FM_Layer(feature_columns, k, w_reg, v_reg)\n","\n","    def call(self, inputs, **kwargs):\n","        fm_outputs = self.fm(inputs)\n","        outputs = tf.nn.sigmoid(fm_outputs)\n","        return outputs\n","\n","    def summary(self, **kwargs):\n","        sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)\n","        Model(inputs=sparse_inputs, outputs=self.call(sparse_inputs)).summary()"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dHIfDXsXePmP","executionInfo":{"status":"ok","timestamp":1637061897728,"user_tz":-330,"elapsed":18358,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"a8dc1bd2-4a01-4b35-ddfd-d05abd1fbf08"},"source":["# ========================== Create dataset =======================\n","feature_columns, train, test = create_criteo_dataset(file=file,\n","                                        read_part=read_part,\n","                                        sample_num=sample_num,\n","                                        test_size=test_size)\n","train_X, train_y = train\n","test_X, test_y = test\n","# ============================Build Model==========================\n","mirrored_strategy = tf.distribute.MirroredStrategy()\n","with mirrored_strategy.scope():\n","    model = FM(feature_columns=feature_columns, k=k)\n","    model.summary()\n","    # ============================Compile============================\n","    model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate),\n","                    metrics=[AUC()])\n","# ============================model checkpoint======================\n","# check_path = '../save/fm_weights.epoch_{epoch:04d}.val_loss_{val_loss:.4f}.ckpt'\n","# checkpoint = tf.keras.callbacks.ModelCheckpoint(check_path, save_weights_only=True,\n","#                                                 verbose=1, period=5)\n","# ==============================Fit==============================\n","model.fit(\n","    train_X,\n","    train_y,\n","    epochs=epochs,\n","    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],  # checkpoint\n","    batch_size=batch_size,\n","    validation_split=0.1\n",")\n","# ===========================Test==============================\n","print('test AUC: %f' % model.evaluate(test_X, test_y, batch_size=batch_size)[1])"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n","WARNING:tensorflow:Collective ops is not configured at program startup. Some performance features may not be enabled.\n","INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)\n","Model: \"model\"\n","_________________________________________________________________\n","Layer (type)                 Output Shape              Param #   \n","=================================================================\n","input_1 (InputLayer)         [(None, 39)]              0         \n","_________________________________________________________________\n","fm__layer (FM_Layer)         (None, 1)                 2183743   \n","_________________________________________________________________\n","tf.math.sigmoid (TFOpLambda) (None, 1)                 0         \n","=================================================================\n","Total params: 2,183,743\n","Trainable params: 2,183,743\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/10\n","18/18 [==============================] - 6s 131ms/step - loss: 0.6923 - auc: 0.5181 - val_loss: 0.5869 - val_auc: 0.5628\n","Epoch 2/10\n","18/18 [==============================] - 1s 51ms/step - loss: 0.5463 - auc: 0.6363 - val_loss: 0.5256 - val_auc: 0.6269\n","Epoch 3/10\n","18/18 [==============================] - 1s 60ms/step - loss: 0.4988 - auc: 0.7473 - val_loss: 0.5023 - val_auc: 0.6838\n","Epoch 4/10\n","18/18 [==============================] - 1s 49ms/step - loss: 0.4652 - auc: 0.8144 - val_loss: 0.4846 - val_auc: 0.7263\n","Epoch 5/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.4313 - auc: 0.8495 - val_loss: 0.4705 - val_auc: 0.7455\n","Epoch 6/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.3992 - auc: 0.8743 - val_loss: 0.4625 - val_auc: 0.7561\n","Epoch 7/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.3699 - auc: 0.8936 - val_loss: 0.4567 - val_auc: 0.7618\n","Epoch 8/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.3429 - auc: 0.9103 - val_loss: 0.4531 - val_auc: 0.7646\n","Epoch 9/10\n","18/18 [==============================] - 1s 51ms/step - loss: 0.3183 - auc: 0.9247 - val_loss: 0.4533 - val_auc: 0.7657\n","Epoch 10/10\n","18/18 [==============================] - 1s 50ms/step - loss: 0.2960 - auc: 0.9358 - val_loss: 0.4533 - val_auc: 0.7654\n","5/5 [==============================] - 0s 24ms/step - loss: 0.4618 - auc: 0.7561\n","test AUC: 0.756149\n"]}]}]}