{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test the GMF module under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model (using integrated modules with compile and fit components)\n", "\n", "#### 3/18/2020" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import numpy as np \n", "import pandas as pd\n", "import keras\n", "from keras import Model\n", "from keras.regularizers import l2\n", "from keras.optimizers import (\n", " Adam,\n", " Adamax,\n", " Adagrad,\n", " SGD,\n", " RMSprop\n", ")\n", "from keras.layers import (\n", " Embedding, \n", " Input,\n", " Flatten, \n", " Multiply, \n", " Concatenate,\n", " Dense\n", ")\n", "\n", "import sys\n", "sys.path.append('../')\n", "from cf_ec2 import (\n", " GMF,\n", " MLP,\n", " NCF,\n", " Data,\n", " evaluation,\n", " evaluation_grouped\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 1: load the data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv('../data/ml-1m.train.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])\n", "test = pd.read_csv('../data/ml-1m.test.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(6040, (6040, 4))" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.user.nunique(), test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 2: prepare the data for gmf model training" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "dataset = Data(\n", " train=train,\n", " test=test,\n", " col_user='user',\n", " col_item='item',\n", " col_rating='rating',\n", " col_time='event_ts',\n", " binary=True,\n", " n_neg=4,\n", " n_neg_test=100\n", ")\n", "dataset.prepTrainDNN(negSample=True)\n", "dataset.prepTestDNN(group=False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritem_interacteditem_negative
00{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...
11{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
22{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15...
\n", "
" ], "text/plain": [ " user item_interacted \\\n", "0 0 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n", "1 1 {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... \n", "2 2 {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... \n", "\n", " item_negative \n", "0 {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... \n", "1 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n", "2 {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.interaction_train.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### prepare the test dataset" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "newItems = set(dataset.items_test)-set(dataset.items)\n", "idx2del = []\n", "for idx,item in enumerate(dataset.items_test):\n", " if item in newItems:\n", " idx2del.append(idx)\n", "\n", "length_test_original = len(dataset.users_test)\n", "dataset.users_test = [\n", " dataset.users_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]\n", "dataset.items_test = [\n", " dataset.items_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]\n", "dataset.ratings_test = [\n", " dataset.ratings_test[idx]\n", " for idx in range(length_test_original) if idx not in idx2del\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## step 3: create the model architecture and fit model with training data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": false }, "outputs": [], "source": [ "n_users = 6040\n", "n_items = 3704\n", "n_factors_gmf = 32\n", "layers_mlp = [64,32,16,8]\n", "reg_gmf = 0.\n", "reg_layers_mlp = [0.,0.,0.,0.]\n", "learning_rate = 0.01\n", "flg_pretrain = ''\n", "filepath = ''\n", "filepath_gmf_pretrain = ''\n", "filepath_mlp_pretrain = ''\n", "num_epochs = 20\n", "batch_size = 100\n", "\n", "\n", "gmf = GMF(\n", " n_users=n_users,\n", " n_items=n_items,\n", " n_factors_gmf=n_factors_gmf\n", ")\n", "gmf.create_model()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "gmf.compile(learning_rate=0.01)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/xyin/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train on 4970845 samples, validate on 610038 samples\n", "Epoch 1/20\n", " - 140s - loss: 0.3344 - accuracy: 0.8515 - val_loss: 0.1743 - val_accuracy: 0.9315\n", "\n", "Epoch 00001: val_loss improved from inf to 0.17435, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-01-0.1743.hdf5\n", "Epoch 2/20\n", " - 134s - loss: 0.3026 - accuracy: 0.8685 - val_loss: 0.1670 - val_accuracy: 0.9302\n", "\n", "Epoch 00002: val_loss improved from 0.17435 to 0.16697, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-02-0.1670.hdf5\n", "Epoch 3/20\n", " - 139s - loss: 0.3021 - accuracy: 0.8699 - val_loss: 0.1932 - val_accuracy: 0.9173\n", "\n", "Epoch 00003: val_loss did not improve from 0.16697\n", "Epoch 4/20\n", " - 133s - loss: 0.3051 - accuracy: 0.8699 - val_loss: 0.1588 - val_accuracy: 0.9335\n", "\n", "Epoch 00004: val_loss improved from 0.16697 to 0.15884, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-04-0.1588.hdf5\n", "Epoch 5/20\n", " - 131s - loss: 0.3075 - accuracy: 0.8702 - val_loss: 0.1795 - val_accuracy: 0.9208\n", "\n", "Epoch 00005: val_loss did not improve from 0.15884\n", "Epoch 6/20\n", " - 132s - loss: 0.3101 - accuracy: 0.8698 - val_loss: 0.1624 - val_accuracy: 0.9293\n", "\n", "Epoch 00006: val_loss did not improve from 0.15884\n", "Epoch 7/20\n", " - 127s - loss: 0.3130 - accuracy: 0.8695 - val_loss: 0.2141 - val_accuracy: 0.9046\n", "\n", "Epoch 00007: val_loss did not improve from 0.15884\n", "Epoch 8/20\n", " - 126s - loss: 0.3158 - accuracy: 0.8690 - val_loss: 0.2003 - val_accuracy: 0.9094\n", "\n", "Epoch 00008: val_loss did not improve from 0.15884\n", "\n", "Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0029999999329447745.\n", "Epoch 9/20\n", " - 135s - loss: 0.2802 - accuracy: 0.8809 - val_loss: 0.1479 - val_accuracy: 0.9366\n", "\n", "Epoch 00009: val_loss improved from 0.15884 to 0.14790, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-09-0.1479.hdf5\n", "Epoch 10/20\n", " - 128s - loss: 0.2759 - accuracy: 0.8829 - val_loss: 0.1420 - val_accuracy: 0.9403\n", "\n", "Epoch 00010: val_loss improved from 0.14790 to 0.14199, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-10-0.1420.hdf5\n", "Epoch 11/20\n", " - 126s - loss: 0.2711 - accuracy: 0.8854 - val_loss: 0.1357 - val_accuracy: 0.9434\n", "\n", "Epoch 00011: val_loss improved from 0.14199 to 0.13571, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-11-0.1357.hdf5\n", "Epoch 12/20\n", " - 137s - loss: 0.2657 - accuracy: 0.8884 - val_loss: 0.1365 - val_accuracy: 0.9438\n", "\n", "Epoch 00012: val_loss did not improve from 0.13571\n", "Epoch 13/20\n", " - 126s - loss: 0.2604 - accuracy: 0.8912 - val_loss: 0.1747 - val_accuracy: 0.9252\n", "\n", "Epoch 00013: val_loss did not improve from 0.13571\n", "Epoch 14/20\n", " - 124s - loss: 0.2561 - accuracy: 0.8932 - val_loss: 0.1606 - val_accuracy: 0.9323\n", "\n", "Epoch 00014: val_loss did not improve from 0.13571\n", "Epoch 15/20\n", " - 133s - loss: 0.2533 - accuracy: 0.8949 - val_loss: 0.1698 - val_accuracy: 0.9282\n", "\n", "Epoch 00015: val_loss did not improve from 0.13571\n", "\n", "Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0009000000078231095.\n", "Epoch 16/20\n", " - 130s - 
loss: 0.2304 - accuracy: 0.9041 - val_loss: 0.1834 - val_accuracy: 0.9216\n", "\n", "Epoch 00016: val_loss did not improve from 0.13571\n", "Epoch 17/20\n", " - 128s - loss: 0.2288 - accuracy: 0.9049 - val_loss: 0.1515 - val_accuracy: 0.9374\n", "\n", "Epoch 00017: val_loss did not improve from 0.13571\n", "Epoch 18/20\n", " - 127s - loss: 0.2277 - accuracy: 0.9056 - val_loss: 0.1380 - val_accuracy: 0.9438\n", "\n", "Epoch 00018: val_loss did not improve from 0.13571\n", "Epoch 19/20\n", " - 128s - loss: 0.2268 - accuracy: 0.9060 - val_loss: 0.2042 - val_accuracy: 0.9120\n", "\n", "Epoch 00019: val_loss did not improve from 0.13571\n", "\n", "Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.00026999999536201356.\n", "Epoch 20/20\n", " - 132s - loss: 0.2190 - accuracy: 0.9094 - val_loss: 0.1693 - val_accuracy: 0.9287\n", "\n", "Epoch 00020: val_loss did not improve from 0.13571\n" ] } ], "source": [ "hist = gmf.fit(\n", "    dataset=dataset,\n", "    batch_size=batch_size,\n", "    num_epochs=num_epochs,\n", "    path_model_weights='/Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5',\n", "    path_csvlog='/Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf_log.csv'\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### double-check the current state of the trained model" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<module 'cf_ec2.evaluation_grouped' from '../cf_ec2/evaluation_grouped.py'>" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import importlib\n", "importlib.reload(evaluation_grouped)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 6040/6040 [02:11<00:00, 46.00it/s]\n" ] } ], "source": [ "evaluator = evaluation_grouped.metricsEval(\n", "    model=gmf.model,\n", "    users=dataset.users,\n", "    items=dataset.items\n", ")\n", "evaluator.getRecs()" ] },
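{ "cell_type": "markdown", "metadata": {}, "source": [ "`getRecs` comes from `evaluation_grouped`; judging by the per-user progress bar, it presumably scores every (user, item) pair and stacks the results into `all_predictions`. A rough equivalent under that assumption (`dataset.users` / `dataset.items` taken to be the unique IDs; `all_predictions_sketch` is a hypothetical name):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# rough equivalent of the assumed getRecs behavior (sketch, not the real code)\n", "from tqdm import tqdm\n", "\n", "items_arr = np.array(dataset.items)\n", "frames = []\n", "for u in tqdm(dataset.users):\n", "    # score all items for this user in one batched predict call\n", "    scores = gmf.model.predict(\n", "        [np.full_like(items_arr, u), items_arr],\n", "        batch_size=len(items_arr)\n", "    ).ravel()\n", "    frames.append(pd.DataFrame(\n", "        {'userID': u, 'itemID': items_arr, 'prediction': scores}))\n", "all_predictions_sketch = pd.concat(frames, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": {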
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIDitemIDprediction
0000.956932
1010.991923
2020.988913
\n", "
" ], "text/plain": [ " userID itemID prediction\n", "0 0 0 0.956932\n", "1 0 1 0.991923\n", "2 0 2 0.988913" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "evaluator.all_predictions.head(3)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n", " dataset.users_test,\n", " dataset.items_test,\n", " dataset.ratings_test\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.22678508081596974, 0.8910193560266216, 0.16930384419089145)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rmse,auc,logloss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "it proves that the model is still at the state of last epoch !!!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can also do something like this" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "scores = gmf.model.evaluate(\n", " x = [\n", " np.array(dataset.users_test),\n", " np.array(dataset.items_test)\n", " ],\n", " y = np.array(dataset.ratings_test),\n", " verbose=0\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0.16930384472775314, 0.9286995530128479]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['loss', 'accuracy']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gmf.model.metrics_names" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### try to load the parameters from the best model" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "gmf.model.load_weights('../metadata/gmf/gmf-weights-improvement-11-0.1357.hdf5')" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "scores = gmf.model.evaluate(\n", " x = [\n", " np.array(dataset.users_test),\n", " np.array(dataset.items_test)\n", " ],\n", " y = np.array(dataset.ratings_test),\n", " verbose=0\n", ")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0.13571090596280566, 0.9433576464653015]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 6040/6040 [02:15<00:00, 44.54it/s]\n" ] }, { "data": { "text/plain": [ "(0.20208704972160227, 0.8827614048663103, 0.13571090694790497)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "evaluator = evaluation_grouped.metricsEval(\n", " model=gmf.model,\n", " users=dataset.users,\n", " items=dataset.items\n", ")\n", "evaluator.getRecs()\n", "rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n", " dataset.users_test,\n", " dataset.items_test,\n", " dataset.ratings_test\n", ")\n", "rmse,auc,logloss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### try create a new model by loading pre-trained parameters" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "gmf2 = GMF(\n", " 
"    n_users=n_users,\n", "    n_items=n_items,\n", "    n_factors_gmf=n_factors_gmf\n", ")\n", "gmf2.create_model(path_pretrain='../metadata/gmf/gmf-weights-improvement-11-0.1357.hdf5')\n", "gmf2.compile(learning_rate=learning_rate)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0.13571090596280566, 0.9433576464653015]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores = gmf2.model.evaluate(\n", "    x = [\n", "        np.array(dataset.users_test),\n", "        np.array(dataset.items_test)\n", "    ],\n", "    y = np.array(dataset.ratings_test),\n", "    verbose=0\n", ")\n", "scores" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 6040/6040 [02:18<00:00, 43.61it/s]\n" ] }, { "data": { "text/plain": [ "(0.20208704972160227, 0.8827614048663103, 0.13571090694790497)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "evaluator = evaluation_grouped.metricsEval(\n", "    model=gmf2.model,\n", "    users=dataset.users,\n", "    items=dataset.items\n", ")\n", "evaluator.getRecs()\n", "rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n", "    dataset.users_test,\n", "    dataset.items_test,\n", "    dataset.ratings_test\n", ")\n", "rmse,auc,logloss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Confirmed that both loading approaches produce identical results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### try to save/load the complete model" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "gmf.model.save('../metadata/gmf/gmf-best.hdf5')" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/xyin/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n" ] } ], "source": [ "model3 = keras.models.load_model('../metadata/gmf/gmf-best.hdf5')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 6040/6040 [02:17<00:00, 44.01it/s]\n" ] }, { "data": { "text/plain": [ "(0.20208704972160227, 0.8827614048663103, 0.13571090694790497)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "evaluator = evaluation_grouped.metricsEval(\n", "    model=model3,\n", "    users=dataset.users,\n", "    items=dataset.items\n", ")\n", "evaluator.getRecs()\n", "rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n", "    dataset.users_test,\n", "    dataset.items_test,\n", "    dataset.ratings_test\n", ")\n", "rmse,auc,logloss" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }