{ "cells": [ { "cell_type": "raw", "metadata": {}, "source": [ "%env MKL_NUM_THREADS=12\n", "%env OMP_NUM_THREADS=12" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "import numpy as np\n", "import scipy as sp\n", "import pandas as pd\n", "from ipypb import track\n", "\n", "from polara.evaluation import evaluation_engine as ee\n", "from polara.evaluation.pipelines import random_grid, find_optimal_config\n", "\n", "from lce import LCEModel, LCEModelItemColdStart\n", "from data_preprocessing import (get_amazon_data,\n", " get_similarity_data,\n", " prepare_data_model,\n", " prepare_cold_start_data_model)\n", "from utils import (report_results, save_results,\n", " apply_config, print_data_stats,\n", " save_training_time, save_cv_training_time)\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from polara.recommender import defaults\n", "defaults.memory_hard_limit = 15 # allowed memory usage during recommendations generationa\n", "max_test_workers = 6 # use this manyparallel thread for evaluation each using up to {memory_hard_limit} Gb of RAM" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "seed = 42" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "experiment_name = 'lce'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Experiment setup" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "data_labels = ['AMZe', 'AMZvg']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "init_config = dict(seed = seed,\n", " max_iterations = 75,\n", " alpha = 0.1,\n", " beta = 0.05,\n", " max_neighbours=10,\n", " )\n", "lce_init_config = dict.fromkeys(data_labels, {'LCE': init_config, # standard scenario\n", " 'LCE(cs)': init_config}) # cold start" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "params = {\n", " 'regularization': [1, 3, 10, 30],\n", " 'rank': [40] # for initial tuning (exploration)\n", " }" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "coeffs = {\n", " 'alpha': [0.1, 0.3, 0.5, 0.7, 0.9],\n", " 'beta': [0, 0.05, 0.1, 0.3]\n", "}" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "ranks_grid = [1, 30, 50, 100, 150, 200, 300, 400, 500,\n", " 750, 1000, 1250, 1500, 2000, 2500, 3000]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "lce_ranks = {'AMZe': ranks_grid,\n", " 'AMZvg': ranks_grid\n", " }" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "topk_values = [1, 3, 10, 20, 30]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "target_metric = 'mrr'" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "data_dict = dict.fromkeys(data_labels)\n", "meta_dict = dict.fromkeys(data_labels)\n", "similarities = dict.fromkeys(data_labels)\n", "feature_idx = dict.fromkeys(data_labels)\n", "sim_indices = dict.fromkeys(data_labels)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "all_data = [data_dict, similarities, sim_indices, meta_dict]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"## Amazon Electronics" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "lbl = 'AMZe'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "data_dict[lbl], meta_dict[lbl] = get_amazon_data('/mnt/bulky/datasets/recsys/amazon/ratings_Electronics.csv',\n", " meta_path='/mnt/bulky/datasets/recsys/amazon/meta/meta_Electronics.json.gz',\n", " implicit=True,\n", " pcore=5,\n", " filter_no_meta=True,\n", " flat_categories=True) # take only bottom level categories" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "similarities[lbl], sim_indices[lbl], feature_idx[lbl] = get_similarity_data(meta_dict[lbl])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Amazon Video Games" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "lbl = 'AMZvg'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_dict[lbl], meta_dict[lbl] = get_amazon_data('/mnt/bulky/datasets/recsys/amazon/ratings_Video_Games.csv',\n", " meta_path='/mnt/bulky/datasets/recsys/amazon/meta/meta_Video_Games.json.gz',\n", " implicit=True,\n", " pcore=5,\n", " filter_data={'categories': ['Games']}, # filter uniformative category\n", " filter_no_meta=True,\n", " flat_categories=True) # take only bottom level categories" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "similarities[lbl], sim_indices[lbl], feature_idx[lbl] = get_similarity_data(meta_dict[lbl])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data stats" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AMZe\n", "{'userid': 124895, 'asin': 44843}\n", "density 0.019153791836615672\n", "similarity matrix density 1.1054998336712965\n", "AMZvg\n", "{'userid': 14251, 'asin': 6858}\n", "density 0.13281340440589384\n", "similarity matrix density 9.081814734274188\n" ] } ], "source": [ "print_data_stats(data_labels, all_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Standard experiment" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def prepare_recommender_models(data_label, data_models, config):\n", " data_model = data_models[data_label]\n", " lce = LCEModel(data_model, item_features=meta_dict[data_label])\n", " lce.method = 'LCE'\n", " models = [lce]\n", " apply_config(models, config, data_label)\n", " return models\n", "\n", "def fine_tune_lce(model, params, label, ntrials=60, record_time_as=None):\n", " param_grid, param_names = random_grid(params, n=ntrials)\n", " best_lce_config, lce_scores = find_optimal_config(model, param_grid, param_names,\n", " target_metric,\n", " return_scores=True,\n", " force_build=True,\n", " iterator=lambda x: track(x, label=label))\n", " model_config = {model.method: 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Tuning" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "config = {}\n", "scores = {}\n", "data_models = {}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'seed': 42,\n", " 'max_iterations': 75,\n", " 'alpha': 0.1,\n", " 'beta': 0.05,\n", " 'max_neighbours': 10}" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lce_init_config['AMZe']['LCE']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Regularization" ] },
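{ "cell_type": "markdown", "metadata": {}, "source": [ "A hedged sketch of the driver loop behind this sweep (illustrative, not the original tuning cells; it assumes `data_models` gets populated via `prepare_data_model` for each label, and the `record_time_as` tag is hypothetical):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# illustrative sketch of the regularization sweep driver:\n", "for lbl in data_labels:\n", " if lbl not in data_models: # data models are prepared elsewhere\n", " continue\n", " models = prepare_recommender_models(lbl, data_models, lce_init_config)\n", " config[lbl], scores[lbl] = fine_tune_lce(models[0], params, lbl,\n", " record_time_as='param_tuning')" ] },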
| \n", " | \n", " | AMZe | \n", "AMZvg | \n", "
|---|---|---|---|
| type | \n", "metric | \n", "\n", " | \n", " |
| relevance | \n", "hr | \n", "0.030930 | \n", "0.116132 | \n", "
| ranking | \n", "mrr | \n", "0.014787 | \n", "0.051645 | \n", "