{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test the GMF module under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model (using integrated modules with compile and fit components)\n",
"\n",
"#### 3/18/2020"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import numpy as np \n",
"import pandas as pd\n",
"import keras\n",
"from keras import Model\n",
"from keras.regularizers import l2\n",
"from keras.optimizers import (\n",
" Adam,\n",
" Adamax,\n",
" Adagrad,\n",
" SGD,\n",
" RMSprop\n",
")\n",
"from keras.layers import (\n",
" Embedding, \n",
" Input,\n",
" Flatten, \n",
" Multiply, \n",
" Concatenate,\n",
" Dense\n",
")\n",
"\n",
"import sys\n",
"sys.path.append('../')\n",
"from cf_ec2 import (\n",
" GMF,\n",
" MLP,\n",
" NCF,\n",
" Data,\n",
" evaluation,\n",
" evaluation_grouped\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 1: load the data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('../data/ml-1m.train.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])\n",
"test = pd.read_csv('../data/ml-1m.test.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6040, (6040, 4))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.user.nunique(), test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 2: prepare the data for gmf model training"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dataset = Data(\n",
" train=train,\n",
" test=test,\n",
" col_user='user',\n",
" col_item='item',\n",
" col_rating='rating',\n",
" col_time='event_ts',\n",
" binary=True,\n",
" n_neg=4,\n",
" n_neg_test=100\n",
")\n",
"dataset.prepTrainDNN(negSample=True)\n",
"dataset.prepTestDNN(group=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user | \n",
" item_interacted | \n",
" item_negative | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | \n",
" {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... | \n",
" {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... | \n",
" {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user item_interacted \\\n",
"0 0 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
"1 1 {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... \n",
"2 2 {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... \n",
"\n",
" item_negative \n",
"0 {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... \n",
"1 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
"2 {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.interaction_train.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### prepare the test dataset"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"newItems = set(dataset.items_test)-set(dataset.items)\n",
"idx2del = []\n",
"for idx,item in enumerate(dataset.items_test):\n",
" if item in newItems:\n",
" idx2del.append(idx)\n",
"\n",
"length_test_original = len(dataset.users_test)\n",
"dataset.users_test = [\n",
" dataset.users_test[idx]\n",
" for idx in range(length_test_original) if idx not in idx2del\n",
"]\n",
"dataset.items_test = [\n",
" dataset.items_test[idx]\n",
" for idx in range(length_test_original) if idx not in idx2del\n",
"]\n",
"dataset.ratings_test = [\n",
" dataset.ratings_test[idx]\n",
" for idx in range(length_test_original) if idx not in idx2del\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 3: create the model architecture and fit model with training data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"n_users = 6040\n",
"n_items = 3704\n",
"n_factors_gmf = 32\n",
"layers_mlp = [64,32,16,8]\n",
"reg_gmf = 0.\n",
"reg_layers_mlp = [0.,0.,0.,0.]\n",
"learning_rate = 0.01\n",
"flg_pretrain = ''\n",
"filepath = ''\n",
"filepath_gmf_pretrain = ''\n",
"filepath_mlp_pretrain = ''\n",
"num_epochs = 20\n",
"batch_size = 100\n",
"\n",
"\n",
"gmf = GMF(\n",
" n_users=n_users,\n",
" n_items=n_items,\n",
" n_factors_gmf=n_factors_gmf\n",
")\n",
"gmf.create_model()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"gmf.compile(learning_rate=0.01)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/xyin/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 4970845 samples, validate on 610038 samples\n",
"Epoch 1/20\n",
" - 140s - loss: 0.3344 - accuracy: 0.8515 - val_loss: 0.1743 - val_accuracy: 0.9315\n",
"\n",
"Epoch 00001: val_loss improved from inf to 0.17435, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-01-0.1743.hdf5\n",
"Epoch 2/20\n",
" - 134s - loss: 0.3026 - accuracy: 0.8685 - val_loss: 0.1670 - val_accuracy: 0.9302\n",
"\n",
"Epoch 00002: val_loss improved from 0.17435 to 0.16697, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-02-0.1670.hdf5\n",
"Epoch 3/20\n",
" - 139s - loss: 0.3021 - accuracy: 0.8699 - val_loss: 0.1932 - val_accuracy: 0.9173\n",
"\n",
"Epoch 00003: val_loss did not improve from 0.16697\n",
"Epoch 4/20\n",
" - 133s - loss: 0.3051 - accuracy: 0.8699 - val_loss: 0.1588 - val_accuracy: 0.9335\n",
"\n",
"Epoch 00004: val_loss improved from 0.16697 to 0.15884, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-04-0.1588.hdf5\n",
"Epoch 5/20\n",
" - 131s - loss: 0.3075 - accuracy: 0.8702 - val_loss: 0.1795 - val_accuracy: 0.9208\n",
"\n",
"Epoch 00005: val_loss did not improve from 0.15884\n",
"Epoch 6/20\n",
" - 132s - loss: 0.3101 - accuracy: 0.8698 - val_loss: 0.1624 - val_accuracy: 0.9293\n",
"\n",
"Epoch 00006: val_loss did not improve from 0.15884\n",
"Epoch 7/20\n",
" - 127s - loss: 0.3130 - accuracy: 0.8695 - val_loss: 0.2141 - val_accuracy: 0.9046\n",
"\n",
"Epoch 00007: val_loss did not improve from 0.15884\n",
"Epoch 8/20\n",
" - 126s - loss: 0.3158 - accuracy: 0.8690 - val_loss: 0.2003 - val_accuracy: 0.9094\n",
"\n",
"Epoch 00008: val_loss did not improve from 0.15884\n",
"\n",
"Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0029999999329447745.\n",
"Epoch 9/20\n",
" - 135s - loss: 0.2802 - accuracy: 0.8809 - val_loss: 0.1479 - val_accuracy: 0.9366\n",
"\n",
"Epoch 00009: val_loss improved from 0.15884 to 0.14790, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-09-0.1479.hdf5\n",
"Epoch 10/20\n",
" - 128s - loss: 0.2759 - accuracy: 0.8829 - val_loss: 0.1420 - val_accuracy: 0.9403\n",
"\n",
"Epoch 00010: val_loss improved from 0.14790 to 0.14199, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-10-0.1420.hdf5\n",
"Epoch 11/20\n",
" - 126s - loss: 0.2711 - accuracy: 0.8854 - val_loss: 0.1357 - val_accuracy: 0.9434\n",
"\n",
"Epoch 00011: val_loss improved from 0.14199 to 0.13571, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-11-0.1357.hdf5\n",
"Epoch 12/20\n",
" - 137s - loss: 0.2657 - accuracy: 0.8884 - val_loss: 0.1365 - val_accuracy: 0.9438\n",
"\n",
"Epoch 00012: val_loss did not improve from 0.13571\n",
"Epoch 13/20\n",
" - 126s - loss: 0.2604 - accuracy: 0.8912 - val_loss: 0.1747 - val_accuracy: 0.9252\n",
"\n",
"Epoch 00013: val_loss did not improve from 0.13571\n",
"Epoch 14/20\n",
" - 124s - loss: 0.2561 - accuracy: 0.8932 - val_loss: 0.1606 - val_accuracy: 0.9323\n",
"\n",
"Epoch 00014: val_loss did not improve from 0.13571\n",
"Epoch 15/20\n",
" - 133s - loss: 0.2533 - accuracy: 0.8949 - val_loss: 0.1698 - val_accuracy: 0.9282\n",
"\n",
"Epoch 00015: val_loss did not improve from 0.13571\n",
"\n",
"Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0009000000078231095.\n",
"Epoch 16/20\n",
" - 130s - loss: 0.2304 - accuracy: 0.9041 - val_loss: 0.1834 - val_accuracy: 0.9216\n",
"\n",
"Epoch 00016: val_loss did not improve from 0.13571\n",
"Epoch 17/20\n",
" - 128s - loss: 0.2288 - accuracy: 0.9049 - val_loss: 0.1515 - val_accuracy: 0.9374\n",
"\n",
"Epoch 00017: val_loss did not improve from 0.13571\n",
"Epoch 18/20\n",
" - 127s - loss: 0.2277 - accuracy: 0.9056 - val_loss: 0.1380 - val_accuracy: 0.9438\n",
"\n",
"Epoch 00018: val_loss did not improve from 0.13571\n",
"Epoch 19/20\n",
" - 128s - loss: 0.2268 - accuracy: 0.9060 - val_loss: 0.2042 - val_accuracy: 0.9120\n",
"\n",
"Epoch 00019: val_loss did not improve from 0.13571\n",
"\n",
"Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.00026999999536201356.\n",
"Epoch 20/20\n",
" - 132s - loss: 0.2190 - accuracy: 0.9094 - val_loss: 0.1693 - val_accuracy: 0.9287\n",
"\n",
"Epoch 00020: val_loss did not improve from 0.13571\n"
]
}
],
"source": [
"hist = gmf.fit(\n",
" dataset=dataset,\n",
" batch_size=batch_size,\n",
" num_epochs=num_epochs,\n",
" path_model_weights='/Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5',\n",
" path_csvlog='/Users/xyin/Documents/work/projects/rec_utils/metadata/gmf/gmf_log.csv'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### double check the current state of the trained model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import imp\n",
"imp.reload(evaluation_grouped)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 6040/6040 [02:11<00:00, 46.00it/s]\n"
]
}
],
"source": [
"evaluator = evaluation_grouped.metricsEval(\n",
" model=gmf.model,\n",
" users=dataset.users,\n",
" items=dataset.items\n",
")\n",
"evaluator.getRecs()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userID | \n",
" itemID | \n",
" prediction | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.956932 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1 | \n",
" 0.991923 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 2 | \n",
" 0.988913 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userID itemID prediction\n",
"0 0 0 0.956932\n",
"1 0 1 0.991923\n",
"2 0 2 0.988913"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.all_predictions.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n",
" dataset.users_test,\n",
" dataset.items_test,\n",
" dataset.ratings_test\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.22678508081596974, 0.8910193560266216, 0.16930384419089145)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse,auc,logloss"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"it proves that the model is still at the state of last epoch !!!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also do something like this"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"scores = gmf.model.evaluate(\n",
" x = [\n",
" np.array(dataset.users_test),\n",
" np.array(dataset.items_test)\n",
" ],\n",
" y = np.array(dataset.ratings_test),\n",
" verbose=0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.16930384472775314, 0.9286995530128479]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['loss', 'accuracy']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gmf.model.metrics_names"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### try to load the parameters from the best model"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"gmf.model.load_weights('../metadata/gmf/gmf-weights-improvement-11-0.1357.hdf5')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"scores = gmf.model.evaluate(\n",
" x = [\n",
" np.array(dataset.users_test),\n",
" np.array(dataset.items_test)\n",
" ],\n",
" y = np.array(dataset.ratings_test),\n",
" verbose=0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.13571090596280566, 0.9433576464653015]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 6040/6040 [02:15<00:00, 44.54it/s]\n"
]
},
{
"data": {
"text/plain": [
"(0.20208704972160227, 0.8827614048663103, 0.13571090694790497)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = evaluation_grouped.metricsEval(\n",
" model=gmf.model,\n",
" users=dataset.users,\n",
" items=dataset.items\n",
")\n",
"evaluator.getRecs()\n",
"rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n",
" dataset.users_test,\n",
" dataset.items_test,\n",
" dataset.ratings_test\n",
")\n",
"rmse,auc,logloss"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### try create a new model by loading pre-trained parameters"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"gmf2 = GMF(\n",
" n_users=n_users,\n",
" n_items=n_items,\n",
" n_factors_gmf=n_factors_gmf\n",
")\n",
"gmf2.create_model(path_pretrain='../metadata/gmf/gmf-weights-improvement-11-0.1357.hdf5')\n",
"gmf2.compile(learning_rate=learning_rate)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.13571090596280566, 0.9433576464653015]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores = gmf2.model.evaluate(\n",
" x = [\n",
" np.array(dataset.users_test),\n",
" np.array(dataset.items_test)\n",
" ],\n",
" y = np.array(dataset.ratings_test),\n",
" verbose=0\n",
")\n",
"scores"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 6040/6040 [02:18<00:00, 43.61it/s]\n"
]
},
{
"data": {
"text/plain": [
"(0.20208704972160227, 0.8827614048663103, 0.13571090694790497)"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = evaluation_grouped.metricsEval(\n",
" model=gmf2.model,\n",
" users=dataset.users,\n",
" items=dataset.items\n",
")\n",
"evaluator.getRecs()\n",
"rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n",
" dataset.users_test,\n",
" dataset.items_test,\n",
" dataset.ratings_test\n",
")\n",
"rmse,auc,logloss"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Confirmed that results from both ways are the same!!!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### try to save/load the complete model"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"gmf.model.save('../metadata/gmf/gmf-best.hdf5')"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/xyin/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
}
],
"source": [
"model3 = keras.models.load_model('../metadata/gmf/gmf-best.hdf5')"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 6040/6040 [02:17<00:00, 44.01it/s]\n"
]
},
{
"data": {
"text/plain": [
"(0.20208704972160227, 0.8827614048663103, 0.13571090694790497)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = evaluation_grouped.metricsEval(\n",
" model=model3,\n",
" users=dataset.users,\n",
" items=dataset.items\n",
")\n",
"evaluator.getRecs()\n",
"rmse,auc,logloss = evaluator.getOverlapBasedMetrics(\n",
" dataset.users_test,\n",
" dataset.items_test,\n",
" dataset.ratings_test\n",
")\n",
"rmse,auc,logloss"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}