seed = 42  # single seed for the whole experiment; also forwarded to Turi through `init_config`

experiment_name = 'fm'


# ---------------------------------------------------------------------------
# Experiment setup
# ---------------------------------------------------------------------------
data_labels = ['ML1M', 'ML10M', 'BX']

# according to https://apple.github.io/turicreate/docs/api/generated/turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender.html
init_config = dict(with_data_feedback = False, # implicit case
                   ranking_optimization = True,
                   solver = 'adagrad',
                   sgd_step_size = 0, # let Turi autotune it
                   seed = seed,
                   max_iterations = 25,
                   other_tc_params = {}
                   )


def _fresh_init_config():
    """Return an independent copy of `init_config`, including its nested dict."""
    fresh = dict(init_config)
    fresh['other_tc_params'] = dict(init_config['other_tc_params'])
    return fresh


# NOTE: `dict.fromkeys(data_labels, {...})` would bind ONE and the same dict to
# every dataset label (and the same `init_config` object to both scenarios), so
# changing a setting for one dataset/scenario would silently change all of them.
# Build a separate config object for each (dataset, scenario) pair instead.
fm_init_config = {label: {'FM': _fresh_init_config(),      # standard scenario
                          'FM(cs)': _fresh_init_config()}  # cold start
                  for label in data_labels}

# Hyper-parameter search space; each key maps to the list of candidate values.
params = {
    'regularization': [1e-10, 3e-10, 1e-9, 3e-9, 1e-8, 3e-8, 1e-7, 3e-7, 1e-6, 3e-6],
    'linear_regularization': [1e-10, 3e-10, 1e-9, 3e-9, 1e-8, 3e-8, 1e-7, 3e-7, 1e-6, 3e-6],
    'rank': [40] # for initial tuning (exploration)
}

# the momentum weighting is only added to the search space for the adagrad solver
if init_config['solver'] == 'adagrad':
    params.update({
        'adagrad_momentum_weighting': [0.9, 0.95, 0.99]
    })

ranks_grid = [1, 5, 10, 15, 20, 30, 50, 60, 75, 100, 125, 150, 200, 250, 300, 350, 400,
              500, 750, 1000, 1250, 1500, 1750, 2000, 2500, 3000]

# per-dataset upper bound on the rank values taken from `ranks_grid`
_max_rank = {'ML1M': 1000, 'ML10M': 1000, 'BX': 2000}
fm_ranks = {label: [r for r in ranks_grid if r <= max_rank]
            for label, max_rank in _max_rank.items()}

topk_values = [1, 3, 10, 20, 30]

target_metric = 'mrr'

# Per-dataset containers; every label starts as None and is filled in by the
# data loading cells.
data_dict = dict.fromkeys(data_labels)
meta_dict = dict.fromkeys(data_labels)
similarities = dict.fromkeys(data_labels)
sim_indices = dict.fromkeys(data_labels)

all_data = [data_dict, similarities, sim_indices, meta_dict]
# Root folder of the raw datasets on the machine the experiments were run on;
# change it here (one place) when running elsewhere.
DATA_ROOT = '/mnt/bulky/datasets/recsys'


def set_identity_similarity(label):
    """Register a placeholder item similarity for dataset `label`.

    Fills `sim_indices[label]` with the item index of `meta_dict[label]` and
    `similarities[label]` with a sparse identity matrix of matching size, both
    keyed by the name of that index.

    Returns the index name so the caller can keep it in `itemid`.
    """
    item_index = meta_dict[label].index
    item_field = item_index.name
    sim_indices[label] = {item_field: item_index}
    similarities[label] = {item_field: sp.sparse.eye(len(item_index))}
    return item_field


# ---------------------------------------------------------------------------
# Movielens1M
# ---------------------------------------------------------------------------
lbl = 'ML1M'

data_dict[lbl], meta_dict[lbl] = get_movielens_data(f'{DATA_ROOT}/movielens/ml-1m.zip',
                                                    meta_path='data/meta_info_ml1m.csv',
                                                    implicit=True,
                                                    filter_no_meta=True)

# not used actually, simply to conform with general pipeline
itemid = set_identity_similarity(lbl)

# ---------------------------------------------------------------------------
# Movielens10M
# ---------------------------------------------------------------------------
lbl = 'ML10M'

data_dict[lbl], meta_dict[lbl] = get_movielens_data(f'{DATA_ROOT}/movielens/ml-10m.zip',
                                                    meta_path='data/meta_info_ml10m.csv',
                                                    implicit=True,
                                                    filter_no_meta=True)

# not used actually, simply to conform with general pipeline
itemid = set_identity_similarity(lbl)

# share of items whose meta fields all have zero length (the saved output shows 0.0)
(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()

# ---------------------------------------------------------------------------
# BookCrossing
# ---------------------------------------------------------------------------
lbl = 'BX'

data_dict[lbl], meta_dict[lbl] = get_bookcrossing_data(f'{DATA_ROOT}/bookcrossing/BX-CSV-Dump.zip',
                                                       get_books_meta=True,
                                                       implicit=True,
                                                       pcore=5,
                                                       filter_no_meta=True)

# not used actually, simply to conform with general pipeline
itemid = set_identity_similarity(lbl)

# share of items whose meta fields all have zero length (the saved output shows 0.0)
(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()
# Data stats
print_data_stats(data_labels, all_data)


# ---------------------------------------------------------------------------
# Standard experiment
# ---------------------------------------------------------------------------
def prepare_recommender_models(data_label, data_models, config):
    """Create the FM model for one dataset and apply `config` to it.

    Parameters
    ----------
    data_label : key selecting the dataset in `data_models` and in the
        notebook-level `meta_dict` (used as item side information).
    data_models : mapping from dataset label to its data model.
    config : configuration forwarded to `apply_config` together with
        `data_label`.

    Returns
    -------
    list
        A one-element list holding the configured FM model.
    """
    data_model = data_models[data_label]
    fm = TuriFactorizationRecommender(data_model, item_side_info=meta_dict[data_label])
    fm.method = 'FM'
    models = [fm]
    apply_config(models, config, data_label)
    return models


def fine_tune_fm(model, params, label, ntrials=60, record_time_as=None):
    """Tune `model` on a random sample of the `params` grid.

    Parameters
    ----------
    model : model handed to `find_optimal_config`; its `method` attribute
        is used as the key of both returned dicts.
    params : mapping of hyper-parameter names to candidate values, forwarded
        to `random_grid`.
    label : text shown on the progress bar; also passed to
        `save_training_time`.
    ntrials : number of configurations requested from `random_grid`.
    record_time_as : optional suffix; when truthy, training time is saved
        under the name f'{experiment_name}_{record_time_as}'.

    Returns
    -------
    (model_config, model_scores)
        `model_config` maps `model.method` to the best configuration found
        (parameter name -> value); `model_scores` maps `model.method` to the
        scores returned by `find_optimal_config`.
    """
    import warnings  # local import so that this cell does not depend on an edited import cell

    param_grid, param_names = random_grid(params, n=ntrials)
    best_fm_config, fm_scores = find_optimal_config(model, param_grid, param_names,
                                                    target_metric,
                                                    return_scores=True,
                                                    force_build=True,
                                                    iterator=lambda x: track(x, label=label))
    model_config = {model.method: dict(zip(param_names, best_fm_config))}
    model_scores = {model.method: fm_scores}

    if record_time_as:
        # Recording time is best-effort: a failure here must not throw away
        # the tuning results. The previous `finally: return ...` achieved that
        # by silently discarding *every* exception (KeyboardInterrupt
        # included); now the failure is reported and only ordinary errors are
        # tolerated.
        try:
            save_training_time(f'{experiment_name}_{record_time_as}', model, fm_scores.index, label)
        except Exception as exc:
            warnings.warn(f'Could not save training time for {label}: {exc!r}', RuntimeWarning)
    return model_config, model_scores


# ---------------------------------------------------------------------------
# tuning
# ---------------------------------------------------------------------------
config = {}
scores = {}
times = {}
data_models = {}

# display the initial FM configuration used for ML1M
fm_init_config['ML1M']['FM']
| \n", " | \n", " | BX | \n", "ML10M | \n", "ML1M | \n", "
|---|---|---|---|---|
| type | \n", "metric | \n", "\n", " | \n", " | \n", " |
| relevance | \n", "hr | \n", "0.061034 | \n", "0.257876 | \n", "0.190189 | \n", "
| ranking | \n", "mrr | \n", "0.024515 | \n", "0.109986 | \n", "0.082559 | \n", "