{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test the GMF modules under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import numpy as np \n", "import pandas as pd\n", "import keras\n", "from keras import Model\n", "from keras.regularizers import l2\n", "from keras.optimizers import (\n", " Adam,\n", " Adamax,\n", " Adagrad,\n", " SGD,\n", " RMSprop\n", ")\n", "from keras.layers import (\n", " Embedding, \n", " Input,\n", " Flatten, \n", " Multiply, \n", " Concatenate,\n", " Dense\n", ")\n", "\n", "import sys\n", "sys.path.append('../')\n", "from cf_ec2 import (\n", " GMF,\n", " MLP,\n", " NCF,\n", " Data,\n", " evaluation\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 1: load the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv('../data/ml-1m.train.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])\n", "test = pd.read_csv('../data/ml-1m.test.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingevent_ts
00324978824330
10344978824330
2045978824291
\n", "
" ], "text/plain": [ " user item rating event_ts\n", "0 0 32 4 978824330\n", "1 0 34 4 978824330\n", "2 0 4 5 978824291" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratingevent_ts
00255978824351
111333978300174
222074978298504
\n", "
" ], "text/plain": [ " user item rating event_ts\n", "0 0 25 5 978824351\n", "1 1 133 3 978300174\n", "2 2 207 4 978298504" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head(3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, (6040, 4))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.user.nunique(), test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 2: prepare the data for gmf model training" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dataset = Data(\n", " train=train,\n", " test=test,\n", " col_user='user',\n", " col_item='item',\n", " col_rating='rating',\n", " col_time='event_ts',\n", " binary=True,\n", " n_neg=4,\n", " n_neg_test=100\n", ")\n", "dataset.prepTrainDNN(negSample=True)\n", "dataset.prepTestDNN(group=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Method to save python object to disk for later use\n", "\n", "```python\n", "import pickle\n", "## pickle data\n", "with open('../metadata/datasetGmf','wb') as fp:\n", " pickle.dump(dataset, fp)\n", "## pickle data with compression\n", "import bz2\n", "with bz2.BZ2File('datasetGmfSmaller', 'w') as fp:\n", " pickle.dump(dataset, fp)\n", " \n", "## unpickle data\n", "with open('../metadata/datasetGmf','rb') as fp:\n", " dataset2 = pickle.load(fp)\n", "with bz2.BZ2File('../metadata/datasetGmfSmaller', 'r') as fp:\n", " dataset2 = pickle.load(fp) \n", "```" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4970845, (994169, 6))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset.users),train.shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(610040, (6040, 6))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset.users_test),test.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, 6040)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.user.nunique(), test.user.nunique()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3704, 1921)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.item.nunique(), test.item.nunique()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritem_interacteditem_negative
00{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...
11{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
22{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15...
\n", "
" ], "text/plain": [ " user item_interacted \\\n", "0 0 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n", "1 1 {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... \n", "2 2 {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... \n", "\n", " item_negative \n", "0 {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... \n", "1 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n", "2 {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.interaction_train.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### prepare the test dataset" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "newItems = set(dataset.items_test)-set(dataset.items)\n", "idx2del = []\n", "for idx,item in enumerate(dataset.items_test):\n", " if item in newItems:\n", " idx2del.append(idx)\n", "\n", "length_test_original = len(dataset.users_test)\n", "dataset.users_test = [\n", " dataset.users_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]\n", "dataset.items_test = [\n", " dataset.items_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]\n", "dataset.ratings_test = [\n", " dataset.ratings_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 3: create the model architecture" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [], "source": [ "n_users = 6040\n", "n_items = 3704\n", "n_factors_gmf = 32\n", "layers_mlp = [64,32,16,8]\n", "reg_gmf = 0.\n", "reg_layers_mlp = [0.,0.,0.,0.]\n", "learning_rate = 0.01\n", "flg_pretrain = ''\n", "filepath = ''\n", "filepath_gmf_pretrain = ''\n", "filepath_mlp_pretrain = ''\n", "num_epochs = 20\n", "batch_size = 100\n", "\n", "\n", "gmf = GMF(\n", " n_users=n_users,\n", " n_items=n_items,\n", " n_factors_gmf=n_factors_gmf\n", ")\n", "\n", "model = gmf.create_model()\n", "#### compile the model\n", "model.compile(\n", " optimizer=Adam(lr=learning_rate),\n", " loss='binary_crossentropy',\n", " metrics=['accuracy']\n", ")\n", "#### create the callback metrics\n", "filepath=\"../metadata/gmf/gmf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5\"\n", "checkpoint = keras.callbacks.ModelCheckpoint(\n", " filepath=filepath, \n", " verbose=1, \n", " save_best_only=True\n", ")\n", "csvlog = keras.callbacks.CSVLogger(\n", " '../metadata/gmf/gmf_log.csv', \n", " separator=',', \n", " append=False\n", ")\n", "earlystop = keras.callbacks.EarlyStopping(patience=12)\n", "lrreduce = keras.callbacks.ReduceLROnPlateau(\n", " monitor=\"val_loss\", \n", " factor=0.3, \n", " patience=4, \n", " verbose=1\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 4: train the model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### define customized metrics" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "class newMetrics(keras.callbacks.Callback):\n", " def on_epoch_end(self, epoch, logs):\n", "# print(len(self.validation_data))\n", "# print(self.validation_data[0][:5])\n", "# print(self.validation_data[1][:5]) \n", "# print(self.validation_data[2][:5])\n", "# print(self.validation_data[3][:5]) \n", "# X_val, y_val = self.validation_data[0], self.validation_data[1]\n", " X_val = [self.validation_data[0],self.validation_data[1]]\n", " y_val = self.validation_data[2]\n", " y_predict = model.predict(x = X_val)\n", " logs['val_auc'] = evaluation.auc(y_val, y_predict)\n", "\n", "metrics2 = newMetrics()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 4970845 samples, validate on 610038 samples\n", "Epoch 1/20\n", " - 141s - loss: 0.3309 - accuracy: 0.8533 - val_loss: 0.1642 - val_accuracy: 0.9342\n", "\n", "Epoch 00001: val_loss improved from inf to 0.16419, saving model to ../metadata/gmf/gmf-weights-improvement-01-0.1642.hdf5\n", "Epoch 2/20\n", " - 127s - loss: 0.2979 - accuracy: 0.8706 - val_loss: 0.1629 - val_accuracy: 0.9331\n", "\n", "Epoch 00002: val_loss improved from 0.16419 to 0.16291, saving model to ../metadata/gmf/gmf-weights-improvement-02-0.1629.hdf5\n", "Epoch 3/20\n", " - 133s - loss: 0.2978 - accuracy: 0.8721 - val_loss: 0.1971 - val_accuracy: 0.9157\n", "\n", "Epoch 00003: val_loss did not improve from 0.16291\n", "Epoch 4/20\n", " - 141s - loss: 0.3012 - accuracy: 0.8717 - val_loss: 0.1345 - val_accuracy: 0.9442\n", "\n", "Epoch 00004: val_loss improved from 0.16291 to 0.13455, saving model to ../metadata/gmf/gmf-weights-improvement-04-0.1345.hdf5\n", "Epoch 5/20\n", " - 138s - loss: 0.3032 - accuracy: 0.8722 - val_loss: 0.1558 - val_accuracy: 0.9334\n", "\n", "Epoch 00005: val_loss did not improve from 0.13455\n", "Epoch 6/20\n", " - 138s - loss: 0.3065 - accuracy: 0.8717 - val_loss: 0.1512 - val_accuracy: 0.9353\n", "\n", "Epoch 00006: val_loss did not improve from 0.13455\n", "Epoch 7/20\n", " - 140s - loss: 0.3101 - accuracy: 0.8711 - val_loss: 0.2357 - val_accuracy: 0.8980\n", "\n", "Epoch 00007: val_loss did not improve from 0.13455\n", "Epoch 8/20\n", " - 136s - loss: 0.3128 - accuracy: 0.8706 - val_loss: 0.1754 - val_accuracy: 0.9287\n", "\n", "Epoch 00008: val_loss did not improve from 0.13455\n", "\n", "Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0029999999329447745.\n", "Epoch 9/20\n", " - 142s - loss: 0.2770 - accuracy: 0.8824 - val_loss: 0.1284 - val_accuracy: 0.9473\n", "\n", "Epoch 00009: val_loss improved from 0.13455 to 0.12836, saving model to ../metadata/gmf/gmf-weights-improvement-09-0.1284.hdf5\n", "Epoch 10/20\n", " - 143s - loss: 0.2738 - accuracy: 0.8839 - val_loss: 0.1577 - val_accuracy: 0.9319\n", "\n", "Epoch 00010: val_loss did not improve from 0.12836\n", "Epoch 11/20\n", " - 146s - loss: 0.2700 - accuracy: 0.8859 - val_loss: 0.1619 - val_accuracy: 0.9309\n", "\n", "Epoch 00011: val_loss did not improve from 0.12836\n", "Epoch 12/20\n", " - 143s - loss: 0.2650 - accuracy: 0.8885 - val_loss: 0.1763 - val_accuracy: 0.9251\n", "\n", "Epoch 00012: val_loss did not improve from 0.12836\n", "Epoch 13/20\n", " - 177s - loss: 0.2601 - accuracy: 0.8912 - val_loss: 0.1537 - val_accuracy: 0.9356\n", "\n", "Epoch 00013: val_loss did not improve from 0.12836\n", "\n", "Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0009000000078231095.\n", "Epoch 14/20\n", " - 135s - loss: 0.2380 - accuracy: 0.9007 - val_loss: 0.1668 - val_accuracy: 0.9293\n", "\n", "Epoch 00014: val_loss did not improve from 0.12836\n", "Epoch 15/20\n", " - 137s - loss: 0.2358 - accuracy: 0.9017 - val_loss: 0.1608 - val_accuracy: 0.9320\n", "\n", "Epoch 00015: val_loss did not improve from 0.12836\n", "Epoch 16/20\n", " - 131s - loss: 0.2340 - accuracy: 0.9026 - val_loss: 0.1760 - val_accuracy: 0.9249\n", "\n", "Epoch 00016: val_loss did not improve from 0.12836\n", "Epoch 17/20\n", " - 125s - loss: 0.2325 - accuracy: 0.9033 - val_loss: 0.1782 - val_accuracy: 0.9243\n", "\n", "Epoch 00017: val_loss did not improve from 0.12836\n", "\n", "Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00026999999536201356.\n", "Epoch 18/20\n", " - 133s - loss: 0.2248 - accuracy: 0.9068 - val_loss: 0.1582 - val_accuracy: 0.9338\n", "\n", "Epoch 00018: val_loss did not improve from 0.12836\n", "Epoch 19/20\n", " - 143s - loss: 0.2243 - accuracy: 0.9070 - val_loss: 0.1655 - val_accuracy: 0.9303\n", "\n", "Epoch 00019: val_loss did not improve from 0.12836\n", "Epoch 20/20\n", " - 143s - loss: 0.2238 - accuracy: 0.9072 - val_loss: 0.1588 - val_accuracy: 0.9338\n", "\n", "Epoch 00020: val_loss did not improve from 0.12836\n" ] } ], "source": [ "#### train\n", "hist = model.fit(\n", " x = [\n", " np.array(dataset.users),\n", " np.array(dataset.items)\n", " ],\n", " y = np.array(dataset.ratings),\n", " batch_size=batch_size,\n", " epochs=num_epochs,\n", " verbose=2,\n", " shuffle=True,\n", " callbacks=[metrics2,checkpoint,csvlog,earlystop,lrreduce],\n", " validation_data=(\n", " [\n", " np.array(dataset.users_test),\n", " np.array(dataset.items_test)\n", " ],\n", " np.array(dataset.ratings_test)\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([0, 0, 0, 0, 0],\n", " [398, 1981, 873, 752, 1481],\n", " [1.0, 0.0, 0.0, 0.0, 0.0],\n", " array([1., 0., 0., 0., 0.]))" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.users_test[:5], dataset.items_test[:5], dataset.ratings_test[:5], dataset.ratings[:5]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"model_1\"\n", "__________________________________________________________________________________________________\n", "Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", "user_input (InputLayer) (None, 1) 0 \n", "__________________________________________________________________________________________________\n", "item_input (InputLayer) (None, 1) 0 \n", "__________________________________________________________________________________________________\n", "embedding_gmf_User (Embedding) (None, 1, 32) 193280 user_input[0][0] \n", "__________________________________________________________________________________________________\n", "embedding_gmf_Item (Embedding) (None, 1, 32) 118528 item_input[0][0] \n", "__________________________________________________________________________________________________\n", "flatten_gmf_User (Flatten) (None, 32) 0 embedding_gmf_User[0][0] \n", "__________________________________________________________________________________________________\n", "flatten_gmf_Item (Flatten) (None, 32) 0 embedding_gmf_Item[0][0] \n", "__________________________________________________________________________________________________\n", "multiply_gmf_UserItem (Multiply (None, 32) 0 flatten_gmf_User[0][0] \n", " flatten_gmf_Item[0][0] \n", "__________________________________________________________________________________________________\n", "output (Dense) (None, 1) 33 multiply_gmf_UserItem[0][0] \n", "==================================================================================================\n", "Total params: 311,841\n", "Trainable params: 311,841\n", "Non-trainable params: 0\n", "__________________________________________________________________________________________________\n" ] } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }