{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%config Completer.use_jedi = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "from optuna.exceptions import ExperimentalWarning\n",
    "\n",
    "# Ignore UserWarning and optuna's ExperimentalWarning for the rest of the session.\n",
    "for warning_category in (UserWarning, ExperimentalWarning):\n",
    "    warnings.filterwarnings(\"ignore\", category=warning_category)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import IntegerType\n",
    "from pyspark.sql.functions import array_contains, col, explode, split, substring\n",
    "\n",
    "from replay.data_preparator import DataPreparator\n",
    "from replay.experiment import Experiment\n",
    "from replay.metrics import HitRate, NDCG, MAP, Coverage\n",
    "from replay.models import LightFMWrap\n",
    "from replay.session_handler import State\n",
    "from replay.splitters import UserSplitter\n",
    "from rs_datasets import MovieLens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "K=10\n",
    "SEED=1234"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LightFM model usage and dataset preprocessing with RePlay\n",
    "\n",
    "This notebook contains an example of LightFM model usage and dataset preprocessing with RePlay, including:\n",
    "\n",
    "1. Data loading\n",
    "2. Features preprocessing with pyspark\n",
    "3. Building LightFM model based on interaction matrix and features\n",
    "4. Model evaluation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1) Data loading"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use the MovieLens 10M dataset from the `rs_datasets` package, which provides a collection of recommender-system datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type='text/css'>\n",
       ".datatable table.frame { margin-bottom: 0; }\n",
       ".datatable table.frame thead { border-bottom: none; }\n",
       ".datatable table.frame tr.coltypes td {  color: #FFFFFF;  line-height: 6px;  padding: 0 0.5em;}\n",
       ".datatable .bool    { background: #DDDD99; }\n",
       ".datatable .object  { background: #565656; }\n",
       ".datatable .int     { background: #5D9E5D; }\n",
       ".datatable .float   { background: #4040CC; }\n",
       ".datatable .str     { background: #CC4040; }\n",
       ".datatable .time    { background: #40CC40; }\n",
       ".datatable .row_index {  background: var(--jp-border-color3);  border-right: 1px solid var(--jp-border-color0);  color: var(--jp-ui-font-color3);  font-size: 9px;}\n",
       ".datatable .frame tbody td { text-align: left; }\n",
       ".datatable .frame tr.coltypes .row_index {  background: var(--jp-border-color0);}\n",
       ".datatable th:nth-child(2) { padding-left: 12px; }\n",
       ".datatable .hellipsis {  color: var(--jp-cell-editor-border-color);}\n",
       ".datatable .vellipsis {  background: var(--jp-layout-color0);  color: var(--jp-cell-editor-border-color);}\n",
       ".datatable .na {  color: var(--jp-cell-editor-border-color);  font-size: 80%;}\n",
       ".datatable .sp {  opacity: 0.25;}\n",
       ".datatable .footer { font-size: 9px; }\n",
       ".datatable .frame_dimensions {  background: var(--jp-border-color3);  border-top: 1px solid var(--jp-border-color0);  color: var(--jp-ui-font-color3);  display: inline-block;  opacity: 0.6;  padding: 1px 10px 1px 5px;}\n",
       "</style>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ratings\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>122</td>\n",
       "      <td>5.0</td>\n",
       "      <td>838985046</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>185</td>\n",
       "      <td>5.0</td>\n",
       "      <td>838983525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>231</td>\n",
       "      <td>5.0</td>\n",
       "      <td>838983392</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  item_id  rating  timestamp\n",
       "0        1      122     5.0  838985046\n",
       "1        1      185     5.0  838983525\n",
       "2        1      231     5.0  838983392"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "items\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>item_id</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   item_id                    title  \\\n",
       "0        1         Toy Story (1995)   \n",
       "1        2           Jumanji (1995)   \n",
       "2        3  Grumpier Old Men (1995)   \n",
       "\n",
       "                                        genres  \n",
       "0  Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                   Adventure|Children|Fantasy  \n",
       "2                               Comedy|Romance  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "tags\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>tag</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15</td>\n",
       "      <td>4973</td>\n",
       "      <td>excellent!</td>\n",
       "      <td>1215184630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>1747</td>\n",
       "      <td>politics</td>\n",
       "      <td>1188263867</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>1747</td>\n",
       "      <td>satire</td>\n",
       "      <td>1188263867</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  item_id         tag   timestamp\n",
       "0       15     4973  excellent!  1215184630\n",
       "1       20     1747    politics  1188263867\n",
       "2       20     1747      satire  1188263867"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the MovieLens \"10m\" release through rs_datasets and preview its tables.\n",
    "dataset_version = \"10m\"\n",
    "data = MovieLens(dataset_version)\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert interaction log to RePlay format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: An illegal reflective access operation has occurred\n",
      "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/u19893556/miniconda3/envs/replay/lib/python3.7/site-packages/pyspark/jars/spark-unsafe_2.12-3.1.2.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
      "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
      "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
      "WARNING: All illegal access operations will be denied in a future release\n",
      "22/02/27 22:14:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "22/02/27 22:14:17 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).\n",
      "22/02/27 22:14:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
      "22/02/27 22:14:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n",
      "22/02/27 22:14:23 WARN TaskSetManager: Stage 0 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:14:27 WARN TaskSetManager: Stage 2 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:14:30 WARN TaskSetManager: Stage 4 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:14:38 WARN TaskSetManager: Stage 6 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Pass the ratings and item table through the preparator; the \"relevance\"\n",
    "# field is mapped to the `rating` column. The second returned frame is unused here.\n",
    "preparator = DataPreparator()\n",
    "log, _, item_features = preparator(\n",
    "    data.ratings,\n",
    "    item_features=data.items,\n",
    "    mapping={\"relevance\": \"rating\"},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One splitter instance, reused for both the train/test and the\n",
    "# train_opt/val_opt splits further down.\n",
    "user_random_splitter = UserSplitter(\n",
    "    user_test_size=500,\n",
    "    item_test_size=K,\n",
    "    shuffle=True,\n",
    "    seed=SEED,\n",
    "    drop_cold_users=True,\n",
    "    drop_cold_items=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/02/27 22:14:47 WARN DAGScheduler: Broadcasting large task binary with size 2004.4 KiB\n",
      "22/02/27 22:14:47 WARN TaskSetManager: Stage 10 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:14:49 WARN DAGScheduler: Broadcasting large task binary with size 2011.1 KiB\n",
      "22/02/27 22:14:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.\n",
      "22/02/27 22:14:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.\n",
      "22/02/27 22:14:51 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB\n",
      "22/02/27 22:14:51 WARN TaskSetManager: Stage 15 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:14:51 WARN DAGScheduler: Broadcasting large task binary with size 2004.7 KiB\n",
      "22/02/27 22:14:52 WARN TaskSetManager: Stage 13 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:14:55 WARN DAGScheduler: Broadcasting large task binary with size 2008.8 KiB\n",
      "22/02/27 22:14:56 WARN DAGScheduler: Broadcasting large task binary with size 2013.9 KiB\n",
      "22/02/27 22:14:56 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2004.8 KiB\n",
      "22/02/27 22:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB\n",
      "22/02/27 22:15:02 WARN TaskSetManager: Stage 19 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:15:03 WARN TaskSetManager: Stage 22 contains a task of very large size (4073 KiB). The maximum recommended task size is 1000 KiB.\n",
      "22/02/27 22:15:05 WARN DAGScheduler: Broadcasting large task binary with size 2009.1 KiB\n",
      "22/02/27 22:15:08 WARN DAGScheduler: Broadcasting large task binary with size 2014.1 KiB\n",
      "22/02/27 22:15:10 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB\n",
      "22/02/27 22:15:15 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:17 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(9995054, 5000)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the full log, then report the row counts of both parts.\n",
    "train, test = user_random_splitter.split(log)\n",
    "tuple(part.count() for part in (train, test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/02/27 22:15:18 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.\n",
      "22/02/27 22:15:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.\n",
      "22/02/27 22:15:20 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:21 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:21 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:26 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:26 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:27 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:34 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:39 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:41 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(9990054, 5000)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Second split, applied to `train` only. NOTE: `train_opt` / `val_opt` are not\n",
    "# referenced by any later cell in this notebook.\n",
    "train_opt, val_opt = user_random_splitter.split(train)\n",
    "tuple(part.count() for part in (train_opt, val_opt))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2) Features preprocessing with pyspark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+----+\n",
      "|item_idx|year|\n",
      "+--------+----+\n",
      "|      11|1995|\n",
      "|     117|1995|\n",
      "+--------+----+\n",
      "only showing top 2 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Titles look like \"Toy Story (1995)\": the 4 characters starting 5 from the end\n",
    "# are cast to an integer `year`.\n",
    "year = item_features.select(\n",
    "    'item_idx',\n",
    "    substring(col('title'), -5, 4).astype(IntegerType()).alias('year'),\n",
    ")\n",
    "year.show(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the genre lists from the preparator's `item_features`, so that `item_idx`\n",
    "# is the same key that `year` (and the log) use. Relabelling the raw\n",
    "# `data.items[\"item_id\"]` as `item_idx` made the later inner join with `year`\n",
    "# drop rows (10681 -> 8316 in the recorded run) and, presumably, attach genres\n",
    "# to the wrong items.\n",
    "# NOTE(review): assumes the preparator keeps the `genres` column next to `title` - confirm.\n",
    "# Run this cell before `item_features` is reassigned further down.\n",
    "genres = item_features.select(\n",
    "    \"item_idx\",\n",
    "    # Raw string: the pattern is a regex and `\\|` is not a valid Python escape.\n",
    "    split(\"genres\", r\"\\|\").alias(\"genres\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+--------------------+\n",
      "|item_idx|              genres|\n",
      "+--------+--------------------+\n",
      "|       1|[Adventure, Anima...|\n",
      "|       2|[Adventure, Child...|\n",
      "|       3|   [Comedy, Romance]|\n",
      "|       4|[Comedy, Drama, R...|\n",
      "|       5|            [Comedy]|\n",
      "|       6|[Action, Crime, T...|\n",
      "|       7|   [Comedy, Romance]|\n",
      "|       8|[Adventure, Child...|\n",
      "|       9|            [Action]|\n",
      "|      10|[Action, Adventur...|\n",
      "|      11|[Comedy, Drama, R...|\n",
      "|      12|    [Comedy, Horror]|\n",
      "|      13|[Animation, Child...|\n",
      "|      14|             [Drama]|\n",
      "|      15|[Action, Adventur...|\n",
      "|      16|      [Crime, Drama]|\n",
      "|      17|[Comedy, Drama, R...|\n",
      "|      18|[Comedy, Drama, T...|\n",
      "|      19|            [Comedy]|\n",
      "|      20|[Action, Comedy, ...|\n",
      "+--------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "genres.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the distinct genre names, skipping the \"(no genres listed)\" placeholder.\n",
    "# Sorted because the row order of a Spark `distinct()` result is not guaranteed,\n",
    "# and this list fixes the order of the genre feature columns built from it.\n",
    "genres_list = sorted(\n",
    "    genres.select(explode(\"genres\").alias(\"genre\"))\n",
    "    .distinct()\n",
    "    .filter('genre <> \"(no genres listed)\"')\n",
    "    .toPandas()[\"genre\"]\n",
    "    .tolist()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Documentary',\n",
       " 'IMAX',\n",
       " 'Adventure',\n",
       " 'Animation',\n",
       " 'Comedy',\n",
       " 'Thriller',\n",
       " 'Sci-Fi',\n",
       " 'Musical',\n",
       " 'Horror',\n",
       " 'Action',\n",
       " 'Fantasy',\n",
       " 'War',\n",
       " 'Mystery',\n",
       " 'Drama',\n",
       " 'Film-Noir',\n",
       " 'Crime',\n",
       " 'Western',\n",
       " 'Romance',\n",
       " 'Children']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "genres_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10681"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One integer indicator column per genre, built in a single projection over\n",
    "# `genres`; the array column itself is not carried over.\n",
    "genre_flags = [\n",
    "    array_contains(col(\"genres\"), genre).astype(IntegerType()).alias(genre)\n",
    "    for genre in genres_list\n",
    "]\n",
    "item_features = genres.select(\"item_idx\", *genre_flags).cache()\n",
    "item_features.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8316"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inner join: only items present in both the genre and the year frames are kept.\n",
    "item_features = item_features.join(year, on='item_idx', how='inner').cache()\n",
    "item_features.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3) Building LightFM model based on interaction matrix and features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_feat = LightFMWrap(\n",
    "    no_components=128,\n",
    "    loss='warp',\n",
    "    random_state=SEED,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/02/27 22:15:44 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:45 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:49 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:49 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:50 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:50 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:52 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:54 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:55 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:57 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:15:58 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:16:00 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:16:00 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:16:02 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:16:03 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:16:10 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "                                                                                \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13h 4min 1s, sys: 53.2 s, total: 13h 4min 54s\n",
      "Wall time: 17min 29s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Train on the interaction log together with the item feature columns.\n",
    "# The recorded run took about 17 minutes of wall time.\n",
    "model_feat.fit(log=train, item_features=item_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "27-Feb-22 22:33:13, replay, WARNING: This model can't predict cold users, they will be ignored\n",
      "27-Feb-22 22:33:13, replay, WARNING: This model can't predict cold users, they will be ignored\n",
      "22/02/27 22:33:14 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:14 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:14 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:15 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:16 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:17 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:17 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:17 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:19 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:20 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:20 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:25 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:26 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:26 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:27 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:28 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:29 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:29 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:30 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:33 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:35 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:35 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:35 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:36 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 205 ms, sys: 247 ms, total: 452 ms\n",
      "Wall time: 23.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Request K recommendations for every distinct user that appears in `test`.\n",
    "test_users = test.select('user_idx').distinct()\n",
    "recs = model_feat.predict(\n",
    "    log=train,\n",
    "    k=K,\n",
    "    users=test_users,\n",
    "    item_features=item_features,\n",
    "    filter_seen_items=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4) Model evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-02-10T16:07:28.942205Z",
     "start_time": "2020-02-10T16:07:26.281475Z"
    },
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/02/27 22:33:37 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:39 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Metric objects mapped to the cut-off(s) they are computed at.\n",
    "metric_cutoffs = {\n",
    "    NDCG(): K,\n",
    "    MAP(): K,\n",
    "    HitRate(): [1, K],\n",
    "    Coverage(train): K,\n",
    "}\n",
    "metrics = Experiment(test, metric_cutoffs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/02/27 22:33:40 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:40 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:40 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:40 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:42 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:42 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:42 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:43 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:47 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:55 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:57 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:57 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:58 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:33:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:33:59 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:34:01 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:01 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:01 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:03 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:06 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:07 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:08 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:08 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:34:08 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:34:08 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:34:09 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB\n",
      "22/02/27 22:34:10 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:10 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:11 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:13 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:15 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:16 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB\n",
      "22/02/27 22:34:19 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB\n",
      "22/02/27 22:34:20 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB\n",
      "22/02/27 22:34:21 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB\n",
      "22/02/27 22:34:21 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Coverage@10</th>\n",
       "      <th>HitRate@1</th>\n",
       "      <th>HitRate@10</th>\n",
       "      <th>MAP@10</th>\n",
       "      <th>NDCG@10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>LightFM_item_features</th>\n",
       "      <td>0.066404</td>\n",
       "      <td>0.348</td>\n",
       "      <td>0.784</td>\n",
       "      <td>0.121321</td>\n",
       "      <td>0.229198</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       Coverage@10  HitRate@1  HitRate@10    MAP@10   NDCG@10\n",
       "LightFM_item_features     0.066404      0.348       0.784  0.121321  0.229198"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score the recommendations and show the accumulated results table.\n",
    "model_name = \"LightFM_item_features\"\n",
    "metrics.add_result(model_name, recs)\n",
    "metrics.results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}