{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "

SparkSession - hive

\n", " \n", "
\n", "

SparkContext

\n", "\n", "

Spark UI

\n", "\n", "
\n", "
Version
\n", "
v3.0.0
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", "
pyspark-shell
\n", "
\n", "
\n", " \n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from replay.session_handler import State\n", "\n", "spark = State().session\n", "spark" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_iditem_idrelevancetimestamp
0111935978300760
116613978302109
219143978301968
3134084978300275
4123555978824291
\n", "
" ], "text/plain": [ " user_id item_id relevance timestamp\n", "0 1 1193 5 978300760\n", "1 1 661 3 978302109\n", "2 1 914 3 978301968\n", "3 1 3408 4 978300275\n", "4 1 2355 5 978824291" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"data/ml1m_ratings.dat\", sep=\"\\t\", names=[\"user_id\", \"item_id\", \"relevance\", \"timestamp\"])\n", "items = pd.read_csv(\"data/ml1m_items.dat\", sep=\"\\t\", names=[\"item_id\", \"titile\", \"genres\"])\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from replay.data_preparator import DataPreparator\n", "\n", "log = DataPreparator().transform(\n", " data=df,\n", " columns_names={\n", " \"user_id\": \"user_id\",\n", " \"item_id\": \"item_id\",\n", " \"relevance\": \"relevance\",\n", " \"timestamp\": \"timestamp\"\n", " }\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer\n", "\n", "mlb = MultiLabelBinarizer()\n", "lb = LabelBinarizer()\n", "item_features = pd.DataFrame(mlb.fit_transform(items.genres.apply(lambda x: x.split(\"|\"))),\n", " columns=list(map(lambda x: f\"genre_{x}\",mlb.classes_)),\n", " index=items.item_id).reset_index()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "item_features_spark = DataPreparator().transform(\n", " data=item_features,\n", " columns_names={\n", " \"item_id\": \"item_id\"\n", " }\n", ").drop(\"timestamp\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from replay.splitters import UserSplitter\n", "\n", "second_stage_splitter = UserSplitter(\n", " drop_cold_items=True,\n", " drop_cold_users=True,\n", " item_test_size=10,\n", " seed=1234,\n", " shuffle=True\n", ")\n", "\n", "first_stage_splitter = 
UserSplitter(\n", " drop_cold_items=False, item_test_size=0.5, shuffle=True, seed=42\n", ")\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/darel/python/sponge-bob-magic/.new_env2/lib/python3.7/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used.\n", " warnings.warn('LightFM was compiled without OpenMP support. '\n" ] } ], "source": [ "from replay.models import ALSWrap\n", "# при 98 все падает с Java heap space error\n", "first_model = ALSWrap(rank=40)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from replay.models import ClassifierRec\n", "from pyspark.ml.classification import RandomForestClassifier\n", "second_model = ClassifierRec(RandomForestClassifier(seed=47), use_recs_value=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Двухуровневый сценарий со статистическими фичами" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from replay.scenarios import TwoStagesScenario\n", "from replay.metrics import NDCG, HitRate, Precision, Recall, RocAuc\n", "\n", "two_stages_with_stat = TwoStagesScenario(\n", " second_stage_splitter=second_stage_splitter,\n", " second_model=second_model,\n", " first_model=first_model,\n", " metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n", " stat_features=True\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02-Mar-21 18:18:31, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "02-Mar-21 18:18:34, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n", "DEBUG:replay:test stat: total 
lines: 60393, total users: 6040, total items: 3051\n", "02-Mar-21 18:18:37, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "02-Mar-21 18:18:39, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "02-Mar-21 18:18:39, replay, DEBUG: Начало обучения ALSWrap\n", "DEBUG:replay:Начало обучения ALSWrap\n", "02-Mar-21 18:18:39, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:18:40, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:18:51, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:19:42, replay, DEBUG: баланс классов: положительных 164401 из 604000\n", "DEBUG:replay:баланс классов: положительных 164401 из 604000\n", "02-Mar-21 18:19:42, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:19:44, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. 
Результат предсказания будет не полным.\n", "02-Mar-21 18:19:45, replay, DEBUG: Начало обучения ClassifierRec\n", "DEBUG:replay:Начало обучения ClassifierRec\n", "02-Mar-21 18:19:45, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:20:27, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:23:16, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8018\n", "DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8018\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.69 s, sys: 674 ms, total: 4.37 s\n", "Wall time: 5min 47s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
two_stages_scenario0.2135760.5488410.7069540.2135760.1743690.148692
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n", "two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n", "\n", " NDCG@10 \n", "two_stages_scenario 0.148692 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time \n", "recs_with_stat = two_stages_with_stat.get_recs(log, 10, item_features=item_features_spark)\n", "two_stages_with_stat.experiment.results\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Двухуровневый сценарий без статистических фичей" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02-Mar-21 18:25:17, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "02-Mar-21 18:25:17, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n", "DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n", "02-Mar-21 18:25:18, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "02-Mar-21 18:25:18, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "02-Mar-21 18:25:18, replay, DEBUG: Начало обучения ALSWrap\n", "DEBUG:replay:Начало обучения ALSWrap\n", "02-Mar-21 18:25:18, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:25:18, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:25:26, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 
18:26:08, replay, DEBUG: баланс классов: положительных 164401 из 604000\n", "DEBUG:replay:баланс классов: положительных 164401 из 604000\n", "02-Mar-21 18:26:08, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:26:10, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "02-Mar-21 18:26:11, replay, DEBUG: Начало обучения ClassifierRec\n", "DEBUG:replay:Начало обучения ClassifierRec\n", "02-Mar-21 18:26:11, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:27:03, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:29:48, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8006\n", "DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8006\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.63 s, sys: 629 ms, total: 4.26 s\n", "Wall time: 5min 22s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
two_stages_scenario0.2236750.5607620.7213580.2236750.1800950.153606
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n", "two_stages_scenario 0.223675 0.560762 0.721358 0.223675 0.180095 \n", "\n", " NDCG@10 \n", "two_stages_scenario 0.153606 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "two_stages_without_stat = TwoStagesScenario(\n", " second_stage_splitter=second_stage_splitter,\n", " second_model=second_model,\n", " first_model=first_model,\n", " metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n", " stat_features=False\n", ")\n", "recs_without_stat = two_stages_without_stat.get_recs(log, 10, item_features=item_features_spark)\n", "two_stages_without_stat.experiment.results\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
two_stages_scenario0.2135760.5488410.7069540.2135760.1743690.148692
two_stages_without_stat0.2236750.5607620.7213580.2236750.1800950.153606
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n", "two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n", "two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 0.180095 \n", "\n", " NDCG@10 \n", "two_stages_scenario 0.148692 \n", "two_stages_without_stat 0.153606 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "two_stages_with_stat.experiment.add_result(\"two_stages_without_stat\", recs_without_stat)\n", "two_stages_with_stat.experiment.results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Модель первого уровня, обученная на всем train" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [], "source": [ "train, test = second_stage_splitter.split(log)\n", "first_train, first_test = first_stage_splitter.split(train)\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02-Mar-21 18:31:25, replay, DEBUG: Начало обучения ALSWrap\n", "DEBUG:replay:Начало обучения ALSWrap\n", "02-Mar-21 18:31:25, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:31:25, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:31:35, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.14 s, sys: 179 ms, total: 1.32 s\n", "Wall time: 13.1 s\n" ] } ], "source": [ "%%time\n", "first_recs_all = first_model.fit_predict(\n", " log=train,\n", " k=10,\n", " users=test.select(\"user_id\").distinct().cache(),\n", " items=train.select(\"item_id\").distinct().cache(),\n", ")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
two_stages_scenario0.2135760.5488410.7069540.2135760.1743690.148692
two_stages_without_stat0.2236750.5607620.7213580.2236750.1800950.153606
first_stage_all0.3370860.7259930.8706950.3370860.2656480.224414
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n", "two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n", "two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 0.180095 \n", "first_stage_all 0.337086 0.725993 0.870695 0.337086 0.265648 \n", "\n", " NDCG@10 \n", "two_stages_scenario 0.148692 \n", "two_stages_without_stat 0.153606 \n", "first_stage_all 0.224414 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "two_stages_with_stat.experiment.add_result(\"first_stage_all\", first_recs_all)\n", "two_stages_with_stat.experiment.results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Модель первого уровня, обученная на половине train (как в двухуровневом сценарии)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02-Mar-21 18:32:42, replay, DEBUG: Начало обучения ALSWrap\n", "DEBUG:replay:Начало обучения ALSWrap\n", "02-Mar-21 18:32:42, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:32:43, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:32:50, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:32:50, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "02-Mar-21 18:32:51, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. 
Результат предсказания будет не полным.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.22 s, sys: 498 ms, total: 1.72 s\n", "Wall time: 1min 33s\n" ] } ], "source": [ "%%time\n", "first_model.fit(log=first_train)\n", "first_model_half = first_model.predict(\n", " log=train,\n", " k=10,\n", " users=test.select(\"user_id\").distinct().cache(),\n", " items=train.select(\"item_id\").distinct().cache(),\n", ")\n", "\n", "two_stages_with_stat.experiment.add_result(\"first_stage_half\", first_model_half)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
two_stages_scenario0.2135760.5488410.7069540.2135760.1743690.148692
two_stages_without_stat0.2236750.5607620.7213580.2236750.1800950.153606
first_stage_all0.3370860.7259930.8706950.3370860.2656480.224414
first_stage_half0.2758280.6524830.8109270.2758280.2200980.187830
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n", "two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n", "two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 0.180095 \n", "first_stage_all 0.337086 0.725993 0.870695 0.337086 0.265648 \n", "first_stage_half 0.275828 0.652483 0.810927 0.275828 0.220098 \n", "\n", " NDCG@10 \n", "two_stages_scenario 0.148692 \n", "two_stages_without_stat 0.153606 \n", "first_stage_all 0.224414 \n", "first_stage_half 0.187830 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "two_stages_with_stat.experiment.results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Двухуровневый сценарий с усиленным классификатором" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "second_model = ClassifierRec(spark_classifier=RandomForestClassifier(numTrees=100, seed=47), use_recs_value=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Двухуровневый сценарий со статистическими фичами" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "two_stages_with_stat_strong = TwoStagesScenario(\n", " second_stage_splitter=second_stage_splitter,\n", " second_model=second_model,\n", " first_model=first_model,\n", " metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n", " stat_features=True\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02-Mar-21 18:46:21, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "02-Mar-21 18:46:21, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n", "DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n", "02-Mar-21 18:46:22, 
replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "02-Mar-21 18:46:22, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "02-Mar-21 18:46:22, replay, DEBUG: Начало обучения ALSWrap\n", "DEBUG:replay:Начало обучения ALSWrap\n", "02-Mar-21 18:46:22, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:46:23, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:46:31, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:47:16, replay, DEBUG: баланс классов: положительных 164401 из 604000\n", "DEBUG:replay:баланс классов: положительных 164401 из 604000\n", "02-Mar-21 18:47:16, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:47:18, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. 
Результат предсказания будет не полным.\n", "02-Mar-21 18:47:19, replay, DEBUG: Начало обучения ClassifierRec\n", "DEBUG:replay:Начало обучения ClassifierRec\n", "02-Mar-21 18:47:19, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:48:12, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:52:10, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8058\n", "DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8058\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.75 s, sys: 802 ms, total: 4.55 s\n", "Wall time: 7min 54s\n" ] } ], "source": [ "%%time \n", "recs_with_stat = two_stages_with_stat_strong.get_recs(log, 10, item_features=item_features_spark)\n", "two_stages_with_stat.experiment.add_result(\"two_stages_with_stat_strong\", recs_with_stat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Двухуровневый сценарий без статистических фичей" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "02-Mar-21 18:54:16, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n", "02-Mar-21 18:54:16, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n", "DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n", "02-Mar-21 18:54:16, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n", "02-Mar-21 18:54:17, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n", "DEBUG:replay:first_test stat: total lines: 468423, 
total users: 6040, total items: 3611\n", "02-Mar-21 18:54:17, replay, DEBUG: Начало обучения ALSWrap\n", "DEBUG:replay:Начало обучения ALSWrap\n", "02-Mar-21 18:54:17, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:54:17, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:54:29, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:55:25, replay, DEBUG: баланс классов: положительных 164401 из 604000\n", "DEBUG:replay:баланс классов: положительных 164401 из 604000\n", "02-Mar-21 18:55:25, replay, DEBUG: Начало предикта ALSWrap\n", "DEBUG:replay:Начало предикта ALSWrap\n", "02-Mar-21 18:55:27, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n", "02-Mar-21 18:55:29, replay, DEBUG: Начало обучения ClassifierRec\n", "DEBUG:replay:Начало обучения ClassifierRec\n", "02-Mar-21 18:55:29, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n", "DEBUG:replay:Предварительная стадия обучения (pre-fit)\n", "02-Mar-21 18:56:11, replay, DEBUG: Основная стадия обучения (fit)\n", "DEBUG:replay:Основная стадия обучения (fit)\n", "02-Mar-21 18:59:32, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8053\n", "DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8053\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 6.23 s, sys: 1.24 s, total: 7.46 s\n", "Wall time: 6min 57s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
two_stages_scenario0.2307950.5640730.719040.2307950.1837480.155696
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n", "two_stages_scenario 0.230795 0.564073 0.71904 0.230795 0.183748 \n", "\n", " NDCG@10 \n", "two_stages_scenario 0.155696 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "two_stages_without_stat_strong = TwoStagesScenario(\n", " second_stage_splitter=second_stage_splitter,\n", " second_model=second_model,\n", " first_model=first_model,\n", " metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n", " stat_features=False\n", ")\n", "recs_without_stat = two_stages_without_stat_strong.get_recs(log, 10, item_features=item_features_spark)\n", "two_stages_with_stat.experiment.add_result(\"two_stages_without_stat_strong\", recs_without_stat)\n", "two_stages_without_stat_strong.experiment.results" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "two_stages_with_stat.experiment.add_result(\"two_stages_without_stat_strong\", recs_without_stat)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
HitRate@1HitRate@5HitRate@10NDCG@1NDCG@5NDCG@10
first_stage_all0.3370860.7259930.8706950.3370860.2656480.224414
first_stage_half0.2758280.6524830.8109270.2758280.2200980.187830
two_stages_with_stat_strong0.2471850.5899010.7509930.2471850.1946240.164429
two_stages_without_stat_strong0.2307950.5640730.7190400.2307950.1837480.155696
two_stages_without_stat0.2236750.5607620.7213580.2236750.1800950.153606
two_stages_scenario0.2135760.5488410.7069540.2135760.1743690.148692
\n", "
" ], "text/plain": [ " HitRate@1 HitRate@5 HitRate@10 NDCG@1 \\\n", "first_stage_all 0.337086 0.725993 0.870695 0.337086 \n", "first_stage_half 0.275828 0.652483 0.810927 0.275828 \n", "two_stages_with_stat_strong 0.247185 0.589901 0.750993 0.247185 \n", "two_stages_without_stat_strong 0.230795 0.564073 0.719040 0.230795 \n", "two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 \n", "two_stages_scenario 0.213576 0.548841 0.706954 0.213576 \n", "\n", " NDCG@5 NDCG@10 \n", "first_stage_all 0.265648 0.224414 \n", "first_stage_half 0.220098 0.187830 \n", "two_stages_with_stat_strong 0.194624 0.164429 \n", "two_stages_without_stat_strong 0.183748 0.155696 \n", "two_stages_without_stat 0.180095 0.153606 \n", "two_stages_scenario 0.174369 0.148692 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "two_stages_with_stat.experiment.results.sort_values('NDCG@10', ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Модель первого уровня работает лучше, чем двухуровневый сценарий. С усиленным классификатором (numTrees=100) двухуровневый сценарий, использующий статистические признаки, работает лучше, чем без них; с классификатором по умолчанию — немного хуже." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" }, "name": "two_levels.ipynb" }, "nbformat": 4, "nbformat_minor": 4 }