{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test the NCF modules under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import numpy as np \n", "import pandas as pd\n", "import keras\n", "from keras import Model\n", "from keras.regularizers import l2\n", "from keras.optimizers import (\n", " Adam,\n", " Adamax,\n", " Adagrad,\n", " SGD,\n", " RMSprop\n", ")\n", "from keras.layers import (\n", " Embedding, \n", " Input,\n", " Flatten, \n", " Multiply, \n", " Concatenate,\n", " Dense\n", ")\n", "\n", "import sys\n", "sys.path.append('../')\n", "from cf_ec2 import (\n", " GMF,\n", " MLP,\n", " NCF,\n", " Data,\n", " evaluation\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 1: load the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv('../data/ml-1m.train.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])\n", "test = pd.read_csv('../data/ml-1m.test.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingevent_ts
00324978824330
10344978824330
2045978824291
\n", "
" ], "text/plain": [ " user item rating event_ts\n", "0 0 32 4 978824330\n", "1 0 34 4 978824330\n", "2 0 4 5 978824291" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingevent_ts
00255978824351
111333978300174
222074978298504
\n", "
" ], "text/plain": [ " user item rating event_ts\n", "0 0 25 5 978824351\n", "1 1 133 3 978300174\n", "2 2 207 4 978298504" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head(3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, (6040, 4))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.user.nunique(), test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 2: prepare the data for ncf model training" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dataset = Data(\n", " train=train,\n", " test=test,\n", " col_user='user',\n", " col_item='item',\n", " col_rating='rating',\n", " col_time='event_ts',\n", " binary=True,\n", " n_neg=4,\n", " n_neg_test=100\n", ")\n", "dataset.prepTrainDNN()\n", "dataset.prepTestDNN()\n", "dataset.negativeSampling()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4970845, (994169, 6))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset.users),train.shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(610040, (6040, 6))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset.users_test),test.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, 6040)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.user.nunique(), test.user.nunique()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3704, 1921)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"train.item.nunique(), test.item.nunique()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritem_interacteditem_negative
00{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...
11{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
22{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15...
\n", "
" ], "text/plain": [ " user item_interacted \\\n", "0 0 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n", "1 1 {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... \n", "2 2 {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... \n", "\n", " item_negative \n", "0 {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... \n", "1 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n", "2 {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.interaction_train.head(3)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, 3704)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(set(dataset.users)), len(set(dataset.items))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, 3706)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(set(dataset.users_test)), len(set(dataset.items_test))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### prepare the test dataset" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "newItems = set(dataset.items_test)-set(dataset.items)\n", "idx2del = []\n", "for idx,item in enumerate(dataset.items_test):\n", " if item in newItems:\n", " idx2del.append(idx)\n", "\n", "length_test_original = len(dataset.users_test)\n", "dataset.users_test = [\n", " dataset.users_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]\n", "dataset.items_test = [\n", " dataset.items_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]\n", "dataset.ratings_test = [\n", " dataset.ratings_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 3: 
create the model architecture" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [], "source": [ "n_users = 6040\n", "n_items = 3704\n", "n_factors_gmf = 32\n", "layers_mlp = [64,32,16,8]\n", "reg_gmf = 0.\n", "reg_layers_mlp = [0.,0.,0.,0.]\n", "learning_rate = 0.01\n", "flg_pretrain = ''\n", "filepath = ''\n", "filepath_gmf_pretrain = ''\n", "filepath_mlp_pretrain = ''\n", "num_epochs = 20\n", "batch_size = 100\n", "\n", "ncf = NCF(\n", " n_users=n_users,\n", " n_items=n_items,\n", " n_factors_gmf=n_factors_gmf,\n", " layers_mlp=layers_mlp,\n", " reg_gmf=reg_gmf,\n", " reg_layers_mlp=reg_layers_mlp\n", ")\n", "model = ncf.create_model()\n", "#### compile the model\n", "model.compile(\n", " optimizer=Adam(lr=learning_rate),\n", " loss='binary_crossentropy',\n", " metrics=['accuracy']\n", ")\n", "#### create the callback metrics\n", "filepath=\"../metadata/ncf/ncf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5\"\n", "checkpoint = keras.callbacks.ModelCheckpoint(\n", " filepath= filepath, \n", " verbose=1, \n", " save_best_only=True\n", ")\n", "csvlog = keras.callbacks.CSVLogger(\n", " '../metadata/ncf/ncf_log.csv', \n", " separator=',', \n", " append=False\n", ")\n", "earlystop = keras.callbacks.EarlyStopping(patience=12)\n", "lrreduce = keras.callbacks.ReduceLROnPlateau(\n", " monitor=\"val_loss\", \n", " factor=0.3, \n", " patience=4, \n", " verbose=1\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 4: train the model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### define customized metrics" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "class newMetrics(keras.callbacks.Callback):\n", " def on_epoch_end(self, epoch, logs):\n", "# print(len(self.validation_data))\n", "# print(self.validation_data[0][:5])\n", "# print(self.validation_data[1][:5]) \n", "# print(self.validation_data[2][:5])\n", "# 
print(self.validation_data[3][:5]) \n", "# X_val, y_val = self.validation_data[0], self.validation_data[1]\n", " X_val = [self.validation_data[0],self.validation_data[1]]\n", " y_val = self.validation_data[2]\n", " y_predict = self.model.predict(x = X_val)\n", " logs['val_auc'] = evaluation.auc(y_val, y_predict)\n", "\n", "metrics2 = newMetrics()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train on 4970845 samples, validate on 610038 samples\n", "Epoch 1/20\n", " - 260s - loss: 0.3409 - accuracy: 0.8465 - val_loss: 0.1485 - val_accuracy: 0.9459\n", "\n", "Epoch 00001: val_loss improved from inf to 0.14850, saving model to ../metadata/ncf/ncf-weights-improvement-01-0.1485.hdf5\n", "Epoch 2/20\n", " - 252s - loss: 0.3123 - accuracy: 0.8611 - val_loss: 0.1842 - val_accuracy: 0.9258\n", "\n", "Epoch 00002: val_loss did not improve from 0.14850\n", "Epoch 3/20\n", " - 272s - loss: 0.3037 - accuracy: 0.8661 - val_loss: 0.1595 - val_accuracy: 0.9413\n", "\n", "Epoch 00003: val_loss did not improve from 0.14850\n", "Epoch 4/20\n", " - 264s - loss: 0.2990 - accuracy: 0.8687 - val_loss: 0.1650 - val_accuracy: 0.9344\n", "\n", "Epoch 00004: val_loss did not improve from 0.14850\n", "Epoch 5/20\n", " - 263s - loss: 0.2942 - accuracy: 0.8719 - val_loss: 0.1507 - val_accuracy: 0.9395\n", "\n", "Epoch 00005: val_loss did not improve from 0.14850\n", "\n", "Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0029999999329447745.\n", "Epoch 6/20\n", " - 247s - loss: 0.2747 - accuracy: 0.8808 - val_loss: 
0.1807 - val_accuracy: 0.9248\n", "\n", "Epoch 00006: val_loss did not improve from 0.14850\n", "Epoch 7/20\n", " - 247s - loss: 0.2698 - accuracy: 0.8837 - val_loss: 0.1595 - val_accuracy: 0.9363\n", "\n", "Epoch 00007: val_loss did not improve from 0.14850\n", "Epoch 8/20\n", " - 237s - loss: 0.2627 - accuracy: 0.8878 - val_loss: 0.1495 - val_accuracy: 0.9404\n", "\n", "Epoch 00008: val_loss did not improve from 0.14850\n", "Epoch 9/20\n", " - 237s - loss: 0.2538 - accuracy: 0.8925 - val_loss: 0.1833 - val_accuracy: 0.9252\n", "\n", "Epoch 00009: val_loss did not improve from 0.14850\n", "\n", "Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0009000000078231095.\n", "Epoch 10/20\n", " - 233s - loss: 0.2358 - accuracy: 0.9011 - val_loss: 0.1506 - val_accuracy: 0.9392\n", "\n", "Epoch 00010: val_loss did not improve from 0.14850\n", "Epoch 11/20\n", " - 234s - loss: 0.2321 - accuracy: 0.9031 - val_loss: 0.1592 - val_accuracy: 0.9351\n", "\n", "Epoch 00011: val_loss did not improve from 0.14850\n", "Epoch 12/20\n", " - 235s - loss: 0.2291 - accuracy: 0.9046 - val_loss: 0.1607 - val_accuracy: 0.9341\n", "\n", "Epoch 00012: val_loss did not improve from 0.14850\n", "Epoch 13/20\n", " - 254s - loss: 0.2264 - accuracy: 0.9060 - val_loss: 0.1729 - val_accuracy: 0.9296\n", "\n", "Epoch 00013: val_loss did not improve from 0.14850\n", "\n", "Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00026999999536201356.\n" ] } ], "source": [ "#### train\n", "hist = model.fit(\n", " x = [\n", " np.array(dataset.users),\n", " np.array(dataset.items)\n", " ],\n", " y = np.array(dataset.ratings),\n", " batch_size=batch_size,\n", " epochs=num_epochs,\n", " verbose=2,\n", " shuffle=True,\n", " callbacks=[metrics2,checkpoint,csvlog,earlystop,lrreduce],\n", " validation_data=(\n", " [\n", " np.array(dataset.users_test),\n", " np.array(dataset.items_test)\n", " ],\n", " np.array(dataset.ratings_test)\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 18, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "([0, 0, 0, 0, 0],\n", " [398, 2310, 2068, 2263, 1366],\n", " [1.0, 0.0, 0.0, 0.0, 0.0],\n", " array([1., 0., 0., 0., 0.]))" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.users_test[:5], dataset.items_test[:5], dataset.ratings_test[:5], dataset.ratings[:5]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"model_1\"\n", "__________________________________________________________________________________________________\n", "Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", "user_input (InputLayer) (None, 1) 0 \n", "__________________________________________________________________________________________________\n", "item_input (InputLayer) (None, 1) 0 \n", "__________________________________________________________________________________________________\n", "embedding_mlp_User (Embedding) (None, 1, 32) 193280 user_input[0][0] \n", "__________________________________________________________________________________________________\n", "embedding_mlp_Item (Embedding) (None, 1, 32) 118528 item_input[0][0] \n", "__________________________________________________________________________________________________\n", "flatten_mlp_User (Flatten) (None, 32) 0 embedding_mlp_User[0][0] \n", "__________________________________________________________________________________________________\n", "flatten_mlp_Item (Flatten) (None, 32) 0 embedding_mlp_Item[0][0] \n", "__________________________________________________________________________________________________\n", "concat_mlp_UserItem (Concatenat (None, 64) 0 flatten_mlp_User[0][0] \n", " flatten_mlp_Item[0][0] \n", "__________________________________________________________________________________________________\n", 
"embedding_gmf_User (Embedding) (None, 1, 32) 193280 user_input[0][0] \n", "__________________________________________________________________________________________________\n", "embedding_gmf_Item (Embedding) (None, 1, 32) 118528 item_input[0][0] \n", "__________________________________________________________________________________________________\n", "mlp_layer_1 (Dense) (None, 32) 2080 concat_mlp_UserItem[0][0] \n", "__________________________________________________________________________________________________\n", "flatten_gmf_User (Flatten) (None, 32) 0 embedding_gmf_User[0][0] \n", "__________________________________________________________________________________________________\n", "flatten_gmf_Item (Flatten) (None, 32) 0 embedding_gmf_Item[0][0] \n", "__________________________________________________________________________________________________\n", "mlp_layer_2 (Dense) (None, 16) 528 mlp_layer_1[0][0] \n", "__________________________________________________________________________________________________\n", "multiply_gmf_UserItem (Multiply (None, 32) 0 flatten_gmf_User[0][0] \n", " flatten_gmf_Item[0][0] \n", "__________________________________________________________________________________________________\n", "mlp_layer_3 (Dense) (None, 8) 136 mlp_layer_2[0][0] \n", "__________________________________________________________________________________________________\n", "concat_gmf_mlp (Concatenate) (None, 40) 0 multiply_gmf_UserItem[0][0] \n", " mlp_layer_3[0][0] \n", "__________________________________________________________________________________________________\n", "output (Dense) (None, 1) 41 concat_gmf_mlp[0][0] \n", "==================================================================================================\n", "Total params: 626,401\n", "Trainable params: 626,401\n", "Non-trainable params: 0\n", "__________________________________________________________________________________________________\n" ] } ], "source": [ 
"model.summary()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'val_loss': [0.1485011165742999,\n", " 0.18416567678242549,\n", " 0.15954290449423783,\n", " 0.1650358262217424,\n", " 0.1506881339903949,\n", " 0.18065066654141143,\n", " 0.1595049133731821,\n", " 0.14945811447602397,\n", " 0.18333839380089495,\n", " 0.15058260036174537,\n", " 0.15917451103293503,\n", " 0.16069093156649547,\n", " 0.1729039898706394],\n", " 'val_accuracy': [0.945934534072876,\n", " 0.9257521629333496,\n", " 0.9413331747055054,\n", " 0.9343729615211487,\n", " 0.9395332932472229,\n", " 0.9247997403144836,\n", " 0.9362990260124207,\n", " 0.9404119253158569,\n", " 0.9251571297645569,\n", " 0.939182460308075,\n", " 0.9351253509521484,\n", " 0.9341335892677307,\n", " 0.9295814633369446],\n", " 'loss': [0.34090790990596076,\n", " 0.31232653234047203,\n", " 0.3036990459802174,\n", " 0.2990263803599324,\n", " 0.2942262135450036,\n", " 0.27472649822994305,\n", " 0.2697909015162971,\n", " 0.2626914049020689,\n", " 0.2537869813717614,\n", " 0.2357732586596515,\n", " 0.2320512784263774,\n", " 0.22910663956301205,\n", " 0.22642496287824582],\n", " 'accuracy': [0.84647256,\n", " 0.8611172,\n", " 0.86606985,\n", " 0.86869276,\n", " 0.8718952,\n", " 0.8808474,\n", " 0.8836669,\n", " 0.8877706,\n", " 0.89248186,\n", " 0.90110457,\n", " 0.9031352,\n", " 0.90458083,\n", " 0.90598017],\n", " 'val_auc': [0.8447472837591501,\n", " 0.858340589621141,\n", " 0.8635911596862258,\n", " 0.8709023831407706,\n", " 0.8599701511015224,\n", " 0.8788743092862202,\n", " 0.8787586388578734,\n", " 0.8794580647071855,\n", " 0.8790272227328466,\n", " 0.8798275979502883,\n", " 0.8773889560378091,\n", " 0.8815762746534642,\n", " 0.8802717551533443],\n", " 'lr': [0.01,\n", " 0.01,\n", " 0.01,\n", " 0.01,\n", " 0.01,\n", " 0.003,\n", " 0.003,\n", " 0.003,\n", " 0.003,\n", " 0.0009,\n", " 0.0009,\n", " 0.0009,\n", " 0.0009]}" ] }, "execution_count": 20, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ "hist.history" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }