{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test the GMF modules under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import numpy as np \n",
"import pandas as pd\n",
"import keras\n",
"from keras import Model\n",
"from keras.regularizers import l2\n",
"from keras.optimizers import (\n",
" Adam,\n",
" Adamax,\n",
" Adagrad,\n",
" SGD,\n",
" RMSprop\n",
")\n",
"from keras.layers import (\n",
" Embedding, \n",
" Input,\n",
" Flatten, \n",
" Multiply, \n",
" Concatenate,\n",
" Dense\n",
")\n",
"\n",
"import sys\n",
"sys.path.append('../')\n",
"from cf_ec2 import (\n",
" GMF,\n",
" MLP,\n",
" NCF,\n",
" Data,\n",
" evaluation\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 1: load the data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('../data/ml-1m.train.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])\n",
"test = pd.read_csv('../data/ml-1m.test.rating',sep='\\t',header=None,names=['user','item','rating','event_ts'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user | \n",
" item | \n",
" rating | \n",
" event_ts | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 32 | \n",
" 4 | \n",
" 978824330 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 34 | \n",
" 4 | \n",
" 978824330 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 4 | \n",
" 5 | \n",
" 978824291 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user item rating event_ts\n",
"0 0 32 4 978824330\n",
"1 0 34 4 978824330\n",
"2 0 4 5 978824291"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user | \n",
" item | \n",
" rating | \n",
" event_ts | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 25 | \n",
" 5 | \n",
" 978824351 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 133 | \n",
" 3 | \n",
" 978300174 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 207 | \n",
" 4 | \n",
" 978298504 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user item rating event_ts\n",
"0 0 25 5 978824351\n",
"1 1 133 3 978300174\n",
"2 2 207 4 978298504"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6040, (6040, 4))"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.user.nunique(), test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 2: prepare the data for gmf model training"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"dataset = Data(\n",
" train=train,\n",
" test=test,\n",
" col_user='user',\n",
" col_item='item',\n",
" col_rating='rating',\n",
" col_time='event_ts',\n",
" binary=True,\n",
" n_neg=4,\n",
" n_neg_test=100\n",
")\n",
"dataset.prepTrainDNN(negSample=True)\n",
"dataset.prepTestDNN(group=True)"
]
},
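{
"cell_type": "markdown",
"metadata": {},
"source": [
"`prepTrainDNN(negSample=True)` pairs every observed interaction with `n_neg=4` randomly sampled unobserved items, and `prepTestDNN(group=True)` groups each test positive with its `n_neg_test=100` sampled negatives. Roughly what the training-side sampling does (a sketch, not the actual `Data` implementation):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"def sample_negatives(interaction, all_items, n_neg=4, seed=42):\n",
"    ## interaction: dict mapping user -> set of items the user interacted with\n",
"    ## all_items: set of all item ids\n",
"    ## returns aligned (users, items, labels) with n_neg negatives per positive\n",
"    rng = np.random.RandomState(seed)\n",
"    users, items, labels = [], [], []\n",
"    for user, pos_items in interaction.items():\n",
"        negatives = list(all_items - pos_items)\n",
"        for pos in pos_items:\n",
"            users.append(user); items.append(pos); labels.append(1.0)\n",
"            for neg in rng.choice(negatives, size=n_neg, replace=False):\n",
"                users.append(user); items.append(neg); labels.append(0.0)\n",
"    return users, items, labels\n",
"```\n",
"\n",
"With `n_neg=4`, each of the 994,169 training rows yields 1 positive plus 4 negatives, matching `len(dataset.users) = 4,970,845` two cells below; likewise 6,040 test users times 101 candidates give the 610,040 test rows checked below."
]
},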
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Method to save python object to disk for later use\n",
"\n",
"```python\n",
"import pickle\n",
"## pickle data\n",
"with open('../metadata/datasetGmf','wb') as fp:\n",
" pickle.dump(dataset, fp)\n",
"## pickle data with compression\n",
"import bz2\n",
"with bz2.BZ2File('datasetGmfSmaller', 'w') as fp:\n",
" pickle.dump(dataset, fp)\n",
" \n",
"## unpickle data\n",
"with open('../metadata/datasetGmf','rb') as fp:\n",
" dataset2 = pickle.load(fp)\n",
"with bz2.BZ2File('../metadata/datasetGmfSmaller', 'r') as fp:\n",
" dataset2 = pickle.load(fp) \n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4970845, (994169, 6))"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dataset.users),train.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(610040, (6040, 6))"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dataset.users_test),test.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6040, 6040)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.user.nunique(), test.user.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3704, 1921)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.item.nunique(), test.item.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user | \n",
" item_interacted | \n",
" item_negative | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | \n",
" {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... | \n",
" {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... | \n",
" {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user item_interacted \\\n",
"0 0 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
"1 1 {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... \n",
"2 2 {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... \n",
"\n",
" item_negative \n",
"0 {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... \n",
"1 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... \n",
"2 {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.interaction_train.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### prepare the test dataset"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"newItems = set(dataset.items_test)-set(dataset.items)\n",
"idx2del = []\n",
"for idx,item in enumerate(dataset.items_test):\n",
" if item in newItems:\n",
" idx2del.append(idx)\n",
"\n",
"length_test_original = len(dataset.users_test)\n",
"dataset.users_test = [\n",
" dataset.users_test[idx]\n",
" for idx in range(length_test_original) if idx not in idx2del\n",
"]\n",
"dataset.items_test = [\n",
" dataset.items_test[idx]\n",
" for idx in range(length_test_original) if idx not in idx2del\n",
"]\n",
"dataset.ratings_test = [\n",
" dataset.ratings_test[idx]\n",
" for idx in range(length_test_original) if idx not in idx2del\n",
"]"
]
},
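{
"cell_type": "markdown",
"metadata": {},
"source": [
"An equivalent vectorized filter with numpy boolean masks (a sketch; it assumes the three test lists stay index-aligned, which they are by construction):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"keep = ~np.isin(dataset.items_test, list(newItems))\n",
"dataset.users_test = list(np.array(dataset.users_test)[keep])\n",
"dataset.items_test = list(np.array(dataset.items_test)[keep])\n",
"dataset.ratings_test = list(np.array(dataset.ratings_test)[keep])\n",
"```\n",
"\n",
"Only a couple of rows are affected here: the training log below validates on 610,038 samples instead of the original 610,040."
]
},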
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 3: create the model architecture"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"n_users = 6040\n",
"n_items = 3704\n",
"n_factors_gmf = 32\n",
"layers_mlp = [64,32,16,8]\n",
"reg_gmf = 0.\n",
"reg_layers_mlp = [0.,0.,0.,0.]\n",
"learning_rate = 0.01\n",
"flg_pretrain = ''\n",
"filepath = ''\n",
"filepath_gmf_pretrain = ''\n",
"filepath_mlp_pretrain = ''\n",
"num_epochs = 20\n",
"batch_size = 100\n",
"\n",
"\n",
"gmf = GMF(\n",
" n_users=n_users,\n",
" n_items=n_items,\n",
" n_factors_gmf=n_factors_gmf\n",
")\n",
"\n",
"model = gmf.create_model()\n",
"#### compile the model\n",
"model.compile(\n",
" optimizer=Adam(lr=learning_rate),\n",
" loss='binary_crossentropy',\n",
" metrics=['accuracy']\n",
")\n",
"#### create the callback metrics\n",
"filepath=\"../metadata/gmf/gmf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5\"\n",
"checkpoint = keras.callbacks.ModelCheckpoint(\n",
" filepath=filepath, \n",
" verbose=1, \n",
" save_best_only=True\n",
")\n",
"csvlog = keras.callbacks.CSVLogger(\n",
" '../metadata/gmf/gmf_log.csv', \n",
" separator=',', \n",
" append=False\n",
")\n",
"earlystop = keras.callbacks.EarlyStopping(patience=12)\n",
"lrreduce = keras.callbacks.ReduceLROnPlateau(\n",
" monitor=\"val_loss\", \n",
" factor=0.3, \n",
" patience=4, \n",
" verbose=1\n",
")"
]
},
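{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the network that `GMF.create_model()` returns (per the `model.summary()` at the end of this notebook) can be sketched in plain Keras, using the layers already imported above. Layer names and shapes are taken from that summary; the exact internals of the `GMF` class are an assumption:\n",
"\n",
"```python\n",
"def create_gmf_model(n_users, n_items, n_factors):\n",
"    ## two integer inputs: a user id and an item id\n",
"    user_input = Input(shape=(1,), dtype='int32', name='user_input')\n",
"    item_input = Input(shape=(1,), dtype='int32', name='item_input')\n",
"    ## one latent-factor table per side\n",
"    user_emb = Embedding(n_users, n_factors, name='embedding_gmf_User')(user_input)\n",
"    item_emb = Embedding(n_items, n_factors, name='embedding_gmf_Item')(item_input)\n",
"    user_vec = Flatten(name='flatten_gmf_User')(user_emb)\n",
"    item_vec = Flatten(name='flatten_gmf_Item')(item_emb)\n",
"    ## generalized matrix factorization: element-wise product of the two vectors\n",
"    product = Multiply(name='multiply_gmf_UserItem')([user_vec, item_vec])\n",
"    ## a single sigmoid unit turns the product into an interaction probability\n",
"    output = Dense(1, activation='sigmoid', name='output')(product)\n",
"    return Model(inputs=[user_input, item_input], outputs=output)\n",
"```"
]
},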
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## step 4: train the model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### define customized metrics"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"class newMetrics(keras.callbacks.Callback):\n",
" def on_epoch_end(self, epoch, logs):\n",
"# print(len(self.validation_data))\n",
"# print(self.validation_data[0][:5])\n",
"# print(self.validation_data[1][:5]) \n",
"# print(self.validation_data[2][:5])\n",
"# print(self.validation_data[3][:5]) \n",
"# X_val, y_val = self.validation_data[0], self.validation_data[1]\n",
" X_val = [self.validation_data[0],self.validation_data[1]]\n",
" y_val = self.validation_data[2]\n",
" y_predict = model.predict(x = X_val)\n",
" logs['val_auc'] = evaluation.auc(y_val, y_predict)\n",
"\n",
"metrics2 = newMetrics()"
]
},
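{
"cell_type": "markdown",
"metadata": {},
"source": [
"`evaluation.auc` is provided by the `cf_ec2` package; if it follows the standard definition it behaves like scikit-learn's ROC AUC. A hedged stand-in, not necessarily the package's implementation:\n",
"\n",
"```python\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"def auc(y_true, y_score):\n",
"    ## area under the ROC curve for binary labels vs. predicted scores\n",
"    return roc_auc_score(np.asarray(y_true), np.asarray(y_score).ravel())\n",
"```"
]
},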
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 4970845 samples, validate on 610038 samples\n",
"Epoch 1/20\n",
" - 141s - loss: 0.3309 - accuracy: 0.8533 - val_loss: 0.1642 - val_accuracy: 0.9342\n",
"\n",
"Epoch 00001: val_loss improved from inf to 0.16419, saving model to ../metadata/gmf/gmf-weights-improvement-01-0.1642.hdf5\n",
"Epoch 2/20\n",
" - 127s - loss: 0.2979 - accuracy: 0.8706 - val_loss: 0.1629 - val_accuracy: 0.9331\n",
"\n",
"Epoch 00002: val_loss improved from 0.16419 to 0.16291, saving model to ../metadata/gmf/gmf-weights-improvement-02-0.1629.hdf5\n",
"Epoch 3/20\n",
" - 133s - loss: 0.2978 - accuracy: 0.8721 - val_loss: 0.1971 - val_accuracy: 0.9157\n",
"\n",
"Epoch 00003: val_loss did not improve from 0.16291\n",
"Epoch 4/20\n",
" - 141s - loss: 0.3012 - accuracy: 0.8717 - val_loss: 0.1345 - val_accuracy: 0.9442\n",
"\n",
"Epoch 00004: val_loss improved from 0.16291 to 0.13455, saving model to ../metadata/gmf/gmf-weights-improvement-04-0.1345.hdf5\n",
"Epoch 5/20\n",
" - 138s - loss: 0.3032 - accuracy: 0.8722 - val_loss: 0.1558 - val_accuracy: 0.9334\n",
"\n",
"Epoch 00005: val_loss did not improve from 0.13455\n",
"Epoch 6/20\n",
" - 138s - loss: 0.3065 - accuracy: 0.8717 - val_loss: 0.1512 - val_accuracy: 0.9353\n",
"\n",
"Epoch 00006: val_loss did not improve from 0.13455\n",
"Epoch 7/20\n",
" - 140s - loss: 0.3101 - accuracy: 0.8711 - val_loss: 0.2357 - val_accuracy: 0.8980\n",
"\n",
"Epoch 00007: val_loss did not improve from 0.13455\n",
"Epoch 8/20\n",
" - 136s - loss: 0.3128 - accuracy: 0.8706 - val_loss: 0.1754 - val_accuracy: 0.9287\n",
"\n",
"Epoch 00008: val_loss did not improve from 0.13455\n",
"\n",
"Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0029999999329447745.\n",
"Epoch 9/20\n",
" - 142s - loss: 0.2770 - accuracy: 0.8824 - val_loss: 0.1284 - val_accuracy: 0.9473\n",
"\n",
"Epoch 00009: val_loss improved from 0.13455 to 0.12836, saving model to ../metadata/gmf/gmf-weights-improvement-09-0.1284.hdf5\n",
"Epoch 10/20\n",
" - 143s - loss: 0.2738 - accuracy: 0.8839 - val_loss: 0.1577 - val_accuracy: 0.9319\n",
"\n",
"Epoch 00010: val_loss did not improve from 0.12836\n",
"Epoch 11/20\n",
" - 146s - loss: 0.2700 - accuracy: 0.8859 - val_loss: 0.1619 - val_accuracy: 0.9309\n",
"\n",
"Epoch 00011: val_loss did not improve from 0.12836\n",
"Epoch 12/20\n",
" - 143s - loss: 0.2650 - accuracy: 0.8885 - val_loss: 0.1763 - val_accuracy: 0.9251\n",
"\n",
"Epoch 00012: val_loss did not improve from 0.12836\n",
"Epoch 13/20\n",
" - 177s - loss: 0.2601 - accuracy: 0.8912 - val_loss: 0.1537 - val_accuracy: 0.9356\n",
"\n",
"Epoch 00013: val_loss did not improve from 0.12836\n",
"\n",
"Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0009000000078231095.\n",
"Epoch 14/20\n",
" - 135s - loss: 0.2380 - accuracy: 0.9007 - val_loss: 0.1668 - val_accuracy: 0.9293\n",
"\n",
"Epoch 00014: val_loss did not improve from 0.12836\n",
"Epoch 15/20\n",
" - 137s - loss: 0.2358 - accuracy: 0.9017 - val_loss: 0.1608 - val_accuracy: 0.9320\n",
"\n",
"Epoch 00015: val_loss did not improve from 0.12836\n",
"Epoch 16/20\n",
" - 131s - loss: 0.2340 - accuracy: 0.9026 - val_loss: 0.1760 - val_accuracy: 0.9249\n",
"\n",
"Epoch 00016: val_loss did not improve from 0.12836\n",
"Epoch 17/20\n",
" - 125s - loss: 0.2325 - accuracy: 0.9033 - val_loss: 0.1782 - val_accuracy: 0.9243\n",
"\n",
"Epoch 00017: val_loss did not improve from 0.12836\n",
"\n",
"Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.00026999999536201356.\n",
"Epoch 18/20\n",
" - 133s - loss: 0.2248 - accuracy: 0.9068 - val_loss: 0.1582 - val_accuracy: 0.9338\n",
"\n",
"Epoch 00018: val_loss did not improve from 0.12836\n",
"Epoch 19/20\n",
" - 143s - loss: 0.2243 - accuracy: 0.9070 - val_loss: 0.1655 - val_accuracy: 0.9303\n",
"\n",
"Epoch 00019: val_loss did not improve from 0.12836\n",
"Epoch 20/20\n",
" - 143s - loss: 0.2238 - accuracy: 0.9072 - val_loss: 0.1588 - val_accuracy: 0.9338\n",
"\n",
"Epoch 00020: val_loss did not improve from 0.12836\n"
]
}
],
"source": [
"#### train\n",
"hist = model.fit(\n",
" x = [\n",
" np.array(dataset.users),\n",
" np.array(dataset.items)\n",
" ],\n",
" y = np.array(dataset.ratings),\n",
" batch_size=batch_size,\n",
" epochs=num_epochs,\n",
" verbose=2,\n",
" shuffle=True,\n",
" callbacks=[metrics2,checkpoint,csvlog,earlystop,lrreduce],\n",
" validation_data=(\n",
" [\n",
" np.array(dataset.users_test),\n",
" np.array(dataset.items_test)\n",
" ],\n",
" np.array(dataset.ratings_test)\n",
" )\n",
")"
]
},
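{
"cell_type": "markdown",
"metadata": {},
"source": [
"`val_loss` bottomed out at epoch 9 (0.1284), so that checkpoint holds the best model. Since `ModelCheckpoint` saves the full model by default (`save_weights_only=False`), either reload route below works; the filename assumes the run above:\n",
"\n",
"```python\n",
"## restore the complete best model from disk\n",
"best_model = keras.models.load_model('../metadata/gmf/gmf-weights-improvement-09-0.1284.hdf5')\n",
"## or just reload the weights into the existing architecture\n",
"model.load_weights('../metadata/gmf/gmf-weights-improvement-09-0.1284.hdf5')\n",
"```"
]
},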
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([0, 0, 0, 0, 0],\n",
" [398, 1981, 873, 752, 1481],\n",
" [1.0, 0.0, 0.0, 0.0, 0.0],\n",
" array([1., 0., 0., 0., 0.]))"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.users_test[:5], dataset.items_test[:5], dataset.ratings_test[:5], dataset.ratings[:5]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"model_1\"\n",
"__________________________________________________________________________________________________\n",
"Layer (type) Output Shape Param # Connected to \n",
"==================================================================================================\n",
"user_input (InputLayer) (None, 1) 0 \n",
"__________________________________________________________________________________________________\n",
"item_input (InputLayer) (None, 1) 0 \n",
"__________________________________________________________________________________________________\n",
"embedding_gmf_User (Embedding) (None, 1, 32) 193280 user_input[0][0] \n",
"__________________________________________________________________________________________________\n",
"embedding_gmf_Item (Embedding) (None, 1, 32) 118528 item_input[0][0] \n",
"__________________________________________________________________________________________________\n",
"flatten_gmf_User (Flatten) (None, 32) 0 embedding_gmf_User[0][0] \n",
"__________________________________________________________________________________________________\n",
"flatten_gmf_Item (Flatten) (None, 32) 0 embedding_gmf_Item[0][0] \n",
"__________________________________________________________________________________________________\n",
"multiply_gmf_UserItem (Multiply (None, 32) 0 flatten_gmf_User[0][0] \n",
" flatten_gmf_Item[0][0] \n",
"__________________________________________________________________________________________________\n",
"output (Dense) (None, 1) 33 multiply_gmf_UserItem[0][0] \n",
"==================================================================================================\n",
"Total params: 311,841\n",
"Trainable params: 311,841\n",
"Non-trainable params: 0\n",
"__________________________________________________________________________________________________\n"
]
}
],
"source": [
"model.summary()"
]
},
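{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the grouped test set (one positive plus `n_neg_test=100` negatives per user), the natural next step is leave-one-out ranking metrics: HR@K and NDCG@K over each user's candidate list. A sketch under the assumption that the test arrays remain grouped per user; note `group_size=101` only holds exactly before the unseen-item filtering above, so grouping by user id is safer in general:\n",
"\n",
"```python\n",
"def hit_ratio_ndcg(model, users, items, ratings, k=10, group_size=101):\n",
"    ## users/items/ratings are concatenated per-user blocks of size group_size,\n",
"    ## each block holding exactly one positive (rating 1.0)\n",
"    scores = model.predict([np.array(users), np.array(items)]).ravel()\n",
"    hits, ndcgs = [], []\n",
"    for start in range(0, len(users), group_size):\n",
"        group_scores = scores[start:start + group_size]\n",
"        group_labels = np.array(ratings[start:start + group_size])\n",
"        topk = np.argsort(group_scores)[::-1][:k]   ## indices of the k best scores\n",
"        pos = int(np.argmax(group_labels))          ## index of the held-out positive\n",
"        rank = np.where(topk == pos)[0]\n",
"        hits.append(1.0 if rank.size else 0.0)\n",
"        ndcgs.append(1.0 / np.log2(rank[0] + 2) if rank.size else 0.0)\n",
"    return np.mean(hits), np.mean(ndcgs)\n",
"```"
]
},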
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}