{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"
SparkSession - hive
\n",
" \n",
"
\n",
"
SparkContext
\n",
"\n",
"
Spark UI
\n",
"\n",
"
\n",
" - Version
\n",
" v3.0.0
\n",
" - Master
\n",
" local[*]
\n",
" - AppName
\n",
" pyspark-shell
\n",
"
\n",
"
\n",
" \n",
"
\n",
" "
],
"text/plain": [
""
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from replay.session_handler import State\n",
"\n",
"spark = State().session\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" item_id | \n",
" relevance | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 661 | \n",
" 3 | \n",
" 978302109 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 914 | \n",
" 3 | \n",
" 978301968 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 3408 | \n",
" 4 | \n",
" 978300275 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2355 | \n",
" 5 | \n",
" 978824291 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id item_id relevance timestamp\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"data/ml1m_ratings.dat\", sep=\"\\t\", names=[\"user_id\", \"item_id\", \"relevance\", \"timestamp\"])\n",
"items = pd.read_csv(\"data/ml1m_items.dat\", sep=\"\\t\", names=[\"item_id\", \"titile\", \"genres\"])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from replay.data_preparator import DataPreparator\n",
"\n",
"log = DataPreparator().transform(\n",
" data=df,\n",
" columns_names={\n",
" \"user_id\": \"user_id\",\n",
" \"item_id\": \"item_id\",\n",
" \"relevance\": \"relevance\",\n",
" \"timestamp\": \"timestamp\"\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer\n",
"\n",
"mlb = MultiLabelBinarizer()\n",
"lb = LabelBinarizer()\n",
"item_features = pd.DataFrame(mlb.fit_transform(items.genres.apply(lambda x: x.split(\"|\"))),\n",
" columns=list(map(lambda x: f\"genre_{x}\",mlb.classes_)),\n",
" index=items.item_id).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"item_features_spark = DataPreparator().transform(\n",
" data=item_features,\n",
" columns_names={\n",
" \"item_id\": \"item_id\"\n",
" }\n",
").drop(\"timestamp\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from replay.splitters import UserSplitter\n",
"\n",
"second_stage_splitter = UserSplitter(\n",
" drop_cold_items=True,\n",
" drop_cold_users=True,\n",
" item_test_size=10,\n",
" seed=1234,\n",
" shuffle=True\n",
")\n",
"\n",
"first_stage_splitter = UserSplitter(\n",
" drop_cold_items=False, item_test_size=0.5, shuffle=True, seed=42\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/darel/python/sponge-bob-magic/.new_env2/lib/python3.7/site-packages/lightfm/_lightfm_fast.py:9: UserWarning: LightFM was compiled without OpenMP support. Only a single thread will be used.\n",
" warnings.warn('LightFM was compiled without OpenMP support. '\n"
]
}
],
"source": [
"from replay.models import ALSWrap\n",
"# at 98 (presumably rank=98 -- confirm) everything crashes with a Java heap space error\n",
"first_model = ALSWrap(rank=40)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from replay.models import ClassifierRec\n",
"from pyspark.ml.classification import RandomForestClassifier\n",
"second_model = ClassifierRec(RandomForestClassifier(seed=47), use_recs_value=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Двухуровневый сценарий со статистическими фичами"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from replay.scenarios import TwoStagesScenario\n",
"from replay.metrics import NDCG, HitRate, Precision, Recall, RocAuc\n",
"\n",
"two_stages_with_stat = TwoStagesScenario(\n",
" second_stage_splitter=second_stage_splitter,\n",
" second_model=second_model,\n",
" first_model=first_model,\n",
" metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n",
" stat_features=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"02-Mar-21 18:18:31, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"02-Mar-21 18:18:34, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"02-Mar-21 18:18:37, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"02-Mar-21 18:18:39, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"02-Mar-21 18:18:39, replay, DEBUG: Начало обучения ALSWrap\n",
"DEBUG:replay:Начало обучения ALSWrap\n",
"02-Mar-21 18:18:39, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:18:40, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:18:51, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:19:42, replay, DEBUG: баланс классов: положительных 164401 из 604000\n",
"DEBUG:replay:баланс классов: положительных 164401 из 604000\n",
"02-Mar-21 18:19:42, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:19:44, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"02-Mar-21 18:19:45, replay, DEBUG: Начало обучения ClassifierRec\n",
"DEBUG:replay:Начало обучения ClassifierRec\n",
"02-Mar-21 18:19:45, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:20:27, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:23:16, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8018\n",
"DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8018\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.69 s, sys: 674 ms, total: 4.37 s\n",
"Wall time: 5min 47s\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" two_stages_scenario | \n",
" 0.213576 | \n",
" 0.548841 | \n",
" 0.706954 | \n",
" 0.213576 | \n",
" 0.174369 | \n",
" 0.148692 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n",
"two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n",
"\n",
" NDCG@10 \n",
"two_stages_scenario 0.148692 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time \n",
"recs_with_stat = two_stages_with_stat.get_recs(log, 10, item_features=item_features_spark)\n",
"two_stages_with_stat.experiment.results\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Двухуровневый сценарий без статистических фич"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"02-Mar-21 18:25:17, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"02-Mar-21 18:25:17, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"02-Mar-21 18:25:18, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"02-Mar-21 18:25:18, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"02-Mar-21 18:25:18, replay, DEBUG: Начало обучения ALSWrap\n",
"DEBUG:replay:Начало обучения ALSWrap\n",
"02-Mar-21 18:25:18, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:25:18, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:25:26, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:26:08, replay, DEBUG: баланс классов: положительных 164401 из 604000\n",
"DEBUG:replay:баланс классов: положительных 164401 из 604000\n",
"02-Mar-21 18:26:08, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:26:10, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"02-Mar-21 18:26:11, replay, DEBUG: Начало обучения ClassifierRec\n",
"DEBUG:replay:Начало обучения ClassifierRec\n",
"02-Mar-21 18:26:11, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:27:03, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:29:48, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8006\n",
"DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8006\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.63 s, sys: 629 ms, total: 4.26 s\n",
"Wall time: 5min 22s\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" two_stages_scenario | \n",
" 0.223675 | \n",
" 0.560762 | \n",
" 0.721358 | \n",
" 0.223675 | \n",
" 0.180095 | \n",
" 0.153606 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n",
"two_stages_scenario 0.223675 0.560762 0.721358 0.223675 0.180095 \n",
"\n",
" NDCG@10 \n",
"two_stages_scenario 0.153606 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"two_stages_without_stat = TwoStagesScenario(\n",
" second_stage_splitter=second_stage_splitter,\n",
" second_model=second_model,\n",
" first_model=first_model,\n",
" metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n",
" stat_features=False\n",
")\n",
"recs_without_stat = two_stages_without_stat.get_recs(log, 10, item_features=item_features_spark)\n",
"two_stages_without_stat.experiment.results\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" two_stages_scenario | \n",
" 0.213576 | \n",
" 0.548841 | \n",
" 0.706954 | \n",
" 0.213576 | \n",
" 0.174369 | \n",
" 0.148692 | \n",
"
\n",
" \n",
" two_stages_without_stat | \n",
" 0.223675 | \n",
" 0.560762 | \n",
" 0.721358 | \n",
" 0.223675 | \n",
" 0.180095 | \n",
" 0.153606 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n",
"two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n",
"two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 0.180095 \n",
"\n",
" NDCG@10 \n",
"two_stages_scenario 0.148692 \n",
"two_stages_without_stat 0.153606 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"two_stages_with_stat.experiment.add_result(\"two_stages_without_stat\", recs_without_stat)\n",
"two_stages_with_stat.experiment.results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Модель первого уровня, обученная на всем train"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"train, test = second_stage_splitter.split(log)\n",
"first_train, first_test = first_stage_splitter.split(train)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"02-Mar-21 18:31:25, replay, DEBUG: Начало обучения ALSWrap\n",
"DEBUG:replay:Начало обучения ALSWrap\n",
"02-Mar-21 18:31:25, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:31:25, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:31:35, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.14 s, sys: 179 ms, total: 1.32 s\n",
"Wall time: 13.1 s\n"
]
}
],
"source": [
"%%time\n",
"first_recs_all = first_model.fit_predict(\n",
" log=train,\n",
" k=10,\n",
" users=test.select(\"user_id\").distinct().cache(),\n",
" items=train.select(\"item_id\").distinct().cache(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" two_stages_scenario | \n",
" 0.213576 | \n",
" 0.548841 | \n",
" 0.706954 | \n",
" 0.213576 | \n",
" 0.174369 | \n",
" 0.148692 | \n",
"
\n",
" \n",
" two_stages_without_stat | \n",
" 0.223675 | \n",
" 0.560762 | \n",
" 0.721358 | \n",
" 0.223675 | \n",
" 0.180095 | \n",
" 0.153606 | \n",
"
\n",
" \n",
" first_stage_all | \n",
" 0.337086 | \n",
" 0.725993 | \n",
" 0.870695 | \n",
" 0.337086 | \n",
" 0.265648 | \n",
" 0.224414 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n",
"two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n",
"two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 0.180095 \n",
"first_stage_all 0.337086 0.725993 0.870695 0.337086 0.265648 \n",
"\n",
" NDCG@10 \n",
"two_stages_scenario 0.148692 \n",
"two_stages_without_stat 0.153606 \n",
"first_stage_all 0.224414 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"two_stages_with_stat.experiment.add_result(\"first_stage_all\", first_recs_all)\n",
"two_stages_with_stat.experiment.results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Модель первого уровня, обученная на половине train (как в двухуровневом сценарии)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"02-Mar-21 18:32:42, replay, DEBUG: Начало обучения ALSWrap\n",
"DEBUG:replay:Начало обучения ALSWrap\n",
"02-Mar-21 18:32:42, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:32:43, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:32:50, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:32:50, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"02-Mar-21 18:32:51, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.22 s, sys: 498 ms, total: 1.72 s\n",
"Wall time: 1min 33s\n"
]
}
],
"source": [
"%%time\n",
"first_model.fit(log=first_train)\n",
"first_model_half = first_model.predict(\n",
" log=train,\n",
" k=10,\n",
" users=test.select(\"user_id\").distinct().cache(),\n",
" items=train.select(\"item_id\").distinct().cache(),\n",
")\n",
"\n",
"two_stages_with_stat.experiment.add_result(\"first_stage_half\", first_model_half)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" two_stages_scenario | \n",
" 0.213576 | \n",
" 0.548841 | \n",
" 0.706954 | \n",
" 0.213576 | \n",
" 0.174369 | \n",
" 0.148692 | \n",
"
\n",
" \n",
" two_stages_without_stat | \n",
" 0.223675 | \n",
" 0.560762 | \n",
" 0.721358 | \n",
" 0.223675 | \n",
" 0.180095 | \n",
" 0.153606 | \n",
"
\n",
" \n",
" first_stage_all | \n",
" 0.337086 | \n",
" 0.725993 | \n",
" 0.870695 | \n",
" 0.337086 | \n",
" 0.265648 | \n",
" 0.224414 | \n",
"
\n",
" \n",
" first_stage_half | \n",
" 0.275828 | \n",
" 0.652483 | \n",
" 0.810927 | \n",
" 0.275828 | \n",
" 0.220098 | \n",
" 0.187830 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n",
"two_stages_scenario 0.213576 0.548841 0.706954 0.213576 0.174369 \n",
"two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 0.180095 \n",
"first_stage_all 0.337086 0.725993 0.870695 0.337086 0.265648 \n",
"first_stage_half 0.275828 0.652483 0.810927 0.275828 0.220098 \n",
"\n",
" NDCG@10 \n",
"two_stages_scenario 0.148692 \n",
"two_stages_without_stat 0.153606 \n",
"first_stage_all 0.224414 \n",
"first_stage_half 0.187830 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"two_stages_with_stat.experiment.results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Двухуровневый сценарий с усиленным классификатором"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"second_model = ClassifierRec(spark_classifier=RandomForestClassifier(numTrees=100, seed=47), use_recs_value=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Двухуровневый сценарий со статистическими фичами"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"two_stages_with_stat_strong = TwoStagesScenario(\n",
" second_stage_splitter=second_stage_splitter,\n",
" second_model=second_model,\n",
" first_model=first_model,\n",
" metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n",
" stat_features=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"02-Mar-21 18:46:21, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"02-Mar-21 18:46:21, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"02-Mar-21 18:46:22, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"02-Mar-21 18:46:22, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"02-Mar-21 18:46:22, replay, DEBUG: Начало обучения ALSWrap\n",
"DEBUG:replay:Начало обучения ALSWrap\n",
"02-Mar-21 18:46:22, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:46:23, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:46:31, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:47:16, replay, DEBUG: баланс классов: положительных 164401 из 604000\n",
"DEBUG:replay:баланс классов: положительных 164401 из 604000\n",
"02-Mar-21 18:47:16, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:47:18, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"02-Mar-21 18:47:19, replay, DEBUG: Начало обучения ClassifierRec\n",
"DEBUG:replay:Начало обучения ClassifierRec\n",
"02-Mar-21 18:47:19, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:48:12, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:52:10, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8058\n",
"DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8058\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.75 s, sys: 802 ms, total: 4.55 s\n",
"Wall time: 7min 54s\n"
]
}
],
"source": [
"%%time \n",
"recs_with_stat = two_stages_with_stat_strong.get_recs(log, 10, item_features=item_features_spark)\n",
"two_stages_with_stat.experiment.add_result(\"two_stages_with_stat_strong\", recs_with_stat)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Двухуровневый сценарий без статистических фич"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"02-Mar-21 18:54:16, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699\n",
"02-Mar-21 18:54:16, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051\n",
"02-Mar-21 18:54:16, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604\n",
"02-Mar-21 18:54:17, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611\n",
"02-Mar-21 18:54:17, replay, DEBUG: Начало обучения ALSWrap\n",
"DEBUG:replay:Начало обучения ALSWrap\n",
"02-Mar-21 18:54:17, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:54:17, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:54:29, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:55:25, replay, DEBUG: баланс классов: положительных 164401 из 604000\n",
"DEBUG:replay:баланс классов: положительных 164401 из 604000\n",
"02-Mar-21 18:55:25, replay, DEBUG: Начало предикта ALSWrap\n",
"DEBUG:replay:Начало предикта ALSWrap\n",
"02-Mar-21 18:55:27, replay, WARNING: Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"WARNING:replay:Список item содержит элементы, которые отсутствовали при обучении. Результат предсказания будет не полным.\n",
"02-Mar-21 18:55:29, replay, DEBUG: Начало обучения ClassifierRec\n",
"DEBUG:replay:Начало обучения ClassifierRec\n",
"02-Mar-21 18:55:29, replay, DEBUG: Предварительная стадия обучения (pre-fit)\n",
"DEBUG:replay:Предварительная стадия обучения (pre-fit)\n",
"02-Mar-21 18:56:11, replay, DEBUG: Основная стадия обучения (fit)\n",
"DEBUG:replay:Основная стадия обучения (fit)\n",
"02-Mar-21 18:59:32, replay, DEBUG: ROC AUC модели второго уровня (как классификатора): 0.8053\n",
"DEBUG:replay:ROC AUC модели второго уровня (как классификатора): 0.8053\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.23 s, sys: 1.24 s, total: 7.46 s\n",
"Wall time: 6min 57s\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" two_stages_scenario | \n",
" 0.230795 | \n",
" 0.564073 | \n",
" 0.71904 | \n",
" 0.230795 | \n",
" 0.183748 | \n",
" 0.155696 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 NDCG@5 \\\n",
"two_stages_scenario 0.230795 0.564073 0.71904 0.230795 0.183748 \n",
"\n",
" NDCG@10 \n",
"two_stages_scenario 0.155696 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"two_stages_without_stat_strong = TwoStagesScenario(\n",
" second_stage_splitter=second_stage_splitter,\n",
" second_model=second_model,\n",
" first_model=first_model,\n",
" metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},\n",
" stat_features=False\n",
")\n",
"recs_without_stat = two_stages_without_stat_strong.get_recs(log, 10, item_features=item_features_spark)\n",
"two_stages_with_stat.experiment.add_result(\"two_stages_without_stat_strong\", recs_without_stat)\n",
"two_stages_without_stat_strong.experiment.results"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"two_stages_with_stat.experiment.add_result(\"two_stages_without_stat_strong\", recs_without_stat)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" HitRate@1 | \n",
" HitRate@5 | \n",
" HitRate@10 | \n",
" NDCG@1 | \n",
" NDCG@5 | \n",
" NDCG@10 | \n",
"
\n",
" \n",
" \n",
" \n",
" first_stage_all | \n",
" 0.337086 | \n",
" 0.725993 | \n",
" 0.870695 | \n",
" 0.337086 | \n",
" 0.265648 | \n",
" 0.224414 | \n",
"
\n",
" \n",
" first_stage_half | \n",
" 0.275828 | \n",
" 0.652483 | \n",
" 0.810927 | \n",
" 0.275828 | \n",
" 0.220098 | \n",
" 0.187830 | \n",
"
\n",
" \n",
" two_stages_with_stat_strong | \n",
" 0.247185 | \n",
" 0.589901 | \n",
" 0.750993 | \n",
" 0.247185 | \n",
" 0.194624 | \n",
" 0.164429 | \n",
"
\n",
" \n",
" two_stages_without_stat_strong | \n",
" 0.230795 | \n",
" 0.564073 | \n",
" 0.719040 | \n",
" 0.230795 | \n",
" 0.183748 | \n",
" 0.155696 | \n",
"
\n",
" \n",
" two_stages_without_stat | \n",
" 0.223675 | \n",
" 0.560762 | \n",
" 0.721358 | \n",
" 0.223675 | \n",
" 0.180095 | \n",
" 0.153606 | \n",
"
\n",
" \n",
" two_stages_scenario | \n",
" 0.213576 | \n",
" 0.548841 | \n",
" 0.706954 | \n",
" 0.213576 | \n",
" 0.174369 | \n",
" 0.148692 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" HitRate@1 HitRate@5 HitRate@10 NDCG@1 \\\n",
"first_stage_all 0.337086 0.725993 0.870695 0.337086 \n",
"first_stage_half 0.275828 0.652483 0.810927 0.275828 \n",
"two_stages_with_stat_strong 0.247185 0.589901 0.750993 0.247185 \n",
"two_stages_without_stat_strong 0.230795 0.564073 0.719040 0.230795 \n",
"two_stages_without_stat 0.223675 0.560762 0.721358 0.223675 \n",
"two_stages_scenario 0.213576 0.548841 0.706954 0.213576 \n",
"\n",
" NDCG@5 NDCG@10 \n",
"first_stage_all 0.265648 0.224414 \n",
"first_stage_half 0.220098 0.187830 \n",
"two_stages_with_stat_strong 0.194624 0.164429 \n",
"two_stages_without_stat_strong 0.183748 0.155696 \n",
"two_stages_without_stat 0.180095 0.153606 \n",
"two_stages_scenario 0.174369 0.148692 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"two_stages_with_stat.experiment.results.sort_values('NDCG@10', ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Модель первого уровня работает лучше, чем двухуровневый сценарий. С усиленным классификатором (numTrees=100) двухуровневый сценарий, использующий статистические признаки, работает лучше, чем без них; с классификатором по умолчанию — немного хуже."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"name": "two_levels.ipynb"
},
"nbformat": 4,
"nbformat_minor": 4
}