In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded automatically before each cell executes, so edits to library
# code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from replay.session_handler import State

# Take the session object exposed by replay's State helper (presumably the
# shared Spark session — TODO confirm) and echo it as the cell output.
spark = State().session
spark

In [3]:
import pandas as pd

# Load the ratings log and the item catalogue from tab-separated files.
# Column labels are supplied through `names=`, so pandas treats the first
# line of each file as data rather than as a header row.
df = pd.read_csv("data/ml1m_ratings.dat", sep="\t", names=["user_id", "item_id", "relevance", "timestamp"])
# The second column label was misspelled as "titile"; it is now "title".
items = pd.read_csv("data/ml1m_items.dat", sep="\t", names=["item_id", "title", "genres"])
# Preview the first rows of the ratings frame.
df.head()

Unnamed: 0,user_id,item_id,relevance,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
from replay.data_preparator import DataPreparator

# Convert the pandas ratings frame into the log object used by the rest of
# the notebook. Every source column already carries its target name, so the
# column mapping is an identity mapping.
log = DataPreparator().transform(
    data=df,
    columns_names={
        column: column
        for column in ("user_id", "item_id", "relevance", "timestamp")
    },
)

In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer

# Build a binary indicator table of genres: each item's pipe-separated genre
# string is split into a list, and the binarizer yields one 0/1 column per
# distinct genre, labelled "genre_<name>".
mlb = MultiLabelBinarizer()
# NOTE(review): `lb` is instantiated but not referenced in any other visible cell.
lb = LabelBinarizer()
item_features = pd.DataFrame(
    mlb.fit_transform([genres.split("|") for genres in items.genres]),
    # `classes_` is only populated once fit_transform above has run.
    columns=[f"genre_{genre}" for genre in mlb.classes_],
    index=items.item_id,
).reset_index()  # move item_id from the index back into a regular column

In [7]:
# Pass the genre indicator table through the preparator, mapping only the
# item identifier, then drop the `timestamp` column from the result
# (presumably added by the preparator by default — TODO confirm).
item_features_spark = (
    DataPreparator()
    .transform(data=item_features, columns_names={"item_id": "item_id"})
    .drop("timestamp")
)

In [8]:
from replay.splitters import UserSplitter

# Outer split: item_test_size=10 (the logged test set holds about 10 rows per
# user). Cold users and cold items are dropped; shuffling is seeded.
second_stage_splitter = UserSplitter(
    item_test_size=10,
    shuffle=True,
    seed=1234,
    drop_cold_users=True,
    drop_cold_items=True,
)

# Inner split: item_test_size=0.5 (the logged first_train / first_test sizes
# are roughly equal). Cold items are kept here.
first_stage_splitter = UserSplitter(
    item_test_size=0.5,
    shuffle=True,
    seed=42,
    drop_cold_items=False,
)


In [9]:
from replay.models import ALSWrap
# With 98 everything crashes with a Java heap space error
# (presumably refers to the rank value — TODO confirm).
first_model = ALSWrap(rank=40)



In [10]:
from replay.models import ClassifierRec
from pyspark.ml.classification import RandomForestClassifier
# Second-stage model: a Spark random forest with a fixed seed and otherwise
# default settings, wrapped in ClassifierRec.
second_model = ClassifierRec(
    RandomForestClassifier(seed=47),
    use_recs_value=True,
)

## Двухуровневый сценарий со статистическими фичами

In [12]:
from replay.scenarios import TwoStagesScenario
from replay.metrics import NDCG, HitRate, Precision, Recall, RocAuc

# Two-stage scenario with statistical features switched on. NDCG and HitRate
# are each evaluated at cut-offs 1, 5 and 10.
two_stages_with_stat = TwoStagesScenario(
    first_model=first_model,
    second_model=second_model,
    second_stage_splitter=second_stage_splitter,
    stat_features=True,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
)

In [13]:
%%time 
# Run the scenario on the full log with the genre item features.
# The positional 10 is presumably the number of recommendations per user (k)
# — confirm against TwoStagesScenario.get_recs.
recs_with_stat = two_stages_with_stat.get_recs(log, 10, item_features=item_features_spark)
# Echo the metrics table collected by the scenario's experiment.
two_stages_with_stat.experiment.results


02-Mar-21 18:18:31, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
02-Mar-21 18:18:34, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
02-Mar-21 18:18:37, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
02-Mar-21 18:18:39, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
02-Mar-21 18:18:39, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:18:39, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 3.69 s, sys: 674 ms, total: 4.37 s
Wall time: 5min 47s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.213576,0.548841,0.706954,0.213576,0.174369,0.148692


## Двухуровневый сценарий без статистических фич

In [16]:
%%time
# Same configuration as the scenario with statistical features, except that
# stat_features is switched off.
# NOTE(review): the same first_model / second_model objects are shared with
# the previous scenario — confirm that get_recs refits them from scratch.
two_stages_without_stat = TwoStagesScenario(
    second_stage_splitter=second_stage_splitter,
    second_model=second_model,
    first_model=first_model,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
    stat_features=False
)
# The positional 10 is presumably the number of recommendations per user (k).
recs_without_stat = two_stages_without_stat.get_recs(log, 10, item_features=item_features_spark)
# Echo this scenario's own metrics table.
two_stages_without_stat.experiment.results


02-Mar-21 18:25:17, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
02-Mar-21 18:25:17, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
02-Mar-21 18:25:18, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
02-Mar-21 18:25:18, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
02-Mar-21 18:25:18, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:25:18, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 3.63 s, sys: 629 ms, total: 4.26 s
Wall time: 5min 22s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.223675,0.560762,0.721358,0.223675,0.180095,0.153606


In [17]:
# Register the recommendations produced without statistical features in the
# first scenario's experiment, so both runs appear in a single results table.
two_stages_with_stat.experiment.add_result("two_stages_without_stat", recs_without_stat)
two_stages_with_stat.experiment.results

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.213576,0.548841,0.706954,0.213576,0.174369,0.148692
two_stages_without_stat,0.223675,0.560762,0.721358,0.223675,0.180095,0.153606


## Модель первого уровня, обученная на всем train

In [18]:
# Split the log by hand with the splitter that was passed to the scenarios,
# then split the resulting train part again with the first-stage splitter.
# Presumably this mirrors the scenario's internal split — TODO confirm.
train, test = second_stage_splitter.split(log)
first_train, first_test = first_stage_splitter.split(train)


In [19]:
%%time
# Fit the first-stage model on the whole train part and produce 10
# recommendations for every user that occurs in test, choosing among the
# items that occur in train. Both id frames are de-duplicated and cached.
first_recs_all = first_model.fit_predict(
    log=train,
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=train.select("item_id").distinct().cache(),
)

02-Mar-21 18:31:25, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:31:25, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
02-Mar-21 18:31:25, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
02-Mar-21 18:31:35, replay, DEBUG: Начало предикта ALSWrap
DEBUG:replay:Начало предикта ALSWrap


CPU times: user 1.14 s, sys: 179 ms, total: 1.32 s
Wall time: 13.1 s


In [20]:
# Add the first-stage-only recommendations (model fitted on all of train) to
# the shared results table and display it.
two_stages_with_stat.experiment.add_result("first_stage_all", first_recs_all)
two_stages_with_stat.experiment.results

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.213576,0.548841,0.706954,0.213576,0.174369,0.148692
two_stages_without_stat,0.223675,0.560762,0.721358,0.223675,0.180095,0.153606
first_stage_all,0.337086,0.725993,0.870695,0.337086,0.265648,0.224414


## Модель первого уровня, обученная на половине train (как в двухуровневом сценарии)

In [21]:
%%time
# Refit the same first-stage model object on first_train only, then predict
# with the full train log passed as `log`.
first_model.fit(log=first_train)
# NOTE(review): despite its name, `first_model_half` holds the output of
# predict (recommendations), not a model; compare `first_recs_all`.
first_model_half = first_model.predict(
    log=train,
    k=10,
    users=test.select("user_id").distinct().cache(),
    items=train.select("item_id").distinct().cache(),
)

# Register these recommendations in the shared results table.
two_stages_with_stat.experiment.add_result("first_stage_half", first_model_half)

02-Mar-21 18:32:42, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:32:42, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
02-Mar-21 18:32:43, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
02-Mar-21 18:32:50, replay, DEBUG: Начало предикта ALSWrap
DEBUG:replay:Начало предикта ALSWrap


CPU times: user 1.22 s, sys: 498 ms, total: 1.72 s
Wall time: 1min 33s


In [22]:
# Display the shared results table, which now includes "first_stage_half".
two_stages_with_stat.experiment.results

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.213576,0.548841,0.706954,0.213576,0.174369,0.148692
two_stages_without_stat,0.223675,0.560762,0.721358,0.223675,0.180095,0.153606
first_stage_all,0.337086,0.725993,0.870695,0.337086,0.265648,0.224414
first_stage_half,0.275828,0.652483,0.810927,0.275828,0.220098,0.18783


## Двухуровневый сценарий с усиленным классификатором

In [23]:
# Stronger second-stage model: the same wrapper around a random forest with
# numTrees=100. This rebinds `second_model`; scenarios constructed earlier
# were given the previous object.
second_model = ClassifierRec(
    spark_classifier=RandomForestClassifier(numTrees=100, seed=47),
    use_recs_value=True,
)

### Двухуровневый сценарий со статистическими фичами

In [25]:
# Two-stage scenario with statistical features, built around whatever
# `second_model` is currently bound to. Metrics match the earlier scenarios.
two_stages_with_stat_strong = TwoStagesScenario(
    first_model=first_model,
    second_model=second_model,
    second_stage_splitter=second_stage_splitter,
    stat_features=True,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
)

In [26]:
%%time 
# Run the strong-classifier scenario with statistical features.
# NOTE(review): this rebinds `recs_with_stat`, which an earlier cell assigned
# from the default-classifier scenario.
recs_with_stat = two_stages_with_stat_strong.get_recs(log, 10, item_features=item_features_spark)
# Record the result in the shared table kept by the first scenario's experiment.
two_stages_with_stat.experiment.add_result("two_stages_with_stat_strong", recs_with_stat)

02-Mar-21 18:46:21, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
02-Mar-21 18:46:21, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
02-Mar-21 18:46:22, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
02-Mar-21 18:46:22, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
02-Mar-21 18:46:22, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:46:22, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 3.75 s, sys: 802 ms, total: 4.55 s
Wall time: 7min 54s


### Двухуровневый сценарий без статистических фич

In [27]:
%%time
# Strong-classifier scenario with statistical features switched off.
two_stages_without_stat_strong = TwoStagesScenario(
    second_stage_splitter=second_stage_splitter,
    second_model=second_model,
    first_model=first_model,
    metrics={NDCG(): [1, 5, 10], HitRate(): [1, 5, 10]},
    stat_features=False
)
# NOTE(review): this rebinds `recs_without_stat`, which an earlier cell
# assigned from the default-classifier scenario.
recs_without_stat = two_stages_without_stat_strong.get_recs(log, 10, item_features=item_features_spark)
# Record the result in the shared table kept by the first scenario's experiment.
two_stages_with_stat.experiment.add_result("two_stages_without_stat_strong", recs_without_stat)
# Echo this scenario's own metrics table.
two_stages_without_stat_strong.experiment.results

02-Mar-21 18:54:16, replay, DEBUG: mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
DEBUG:replay:mixed_train stat: total lines: 939809, total users: 6040, total items: 3699
02-Mar-21 18:54:16, replay, DEBUG: test stat: total lines: 60393, total users: 6040, total items: 3051
DEBUG:replay:test stat: total lines: 60393, total users: 6040, total items: 3051
02-Mar-21 18:54:16, replay, DEBUG: first_train stat: total lines: 471386, total users: 6040, total items: 3604
DEBUG:replay:first_train stat: total lines: 471386, total users: 6040, total items: 3604
02-Mar-21 18:54:17, replay, DEBUG: first_test stat: total lines: 468423, total users: 6040, total items: 3611
DEBUG:replay:first_test stat: total lines: 468423, total users: 6040, total items: 3611
02-Mar-21 18:54:17, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
02-Mar-21 18:54:17, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения 

CPU times: user 6.23 s, sys: 1.24 s, total: 7.46 s
Wall time: 6min 57s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
two_stages_scenario,0.230795,0.564073,0.71904,0.230795,0.183748,0.155696


In [28]:
# NOTE(review): the previous cell already makes this exact add_result call
# with the same name and recommendations, so this one appears redundant.
# The final table shows a single "two_stages_without_stat_strong" row.
two_stages_with_stat.experiment.add_result("two_stages_without_stat_strong", recs_without_stat)

In [29]:
# Final comparison: all registered runs ordered by NDCG@10, best first.
two_stages_with_stat.experiment.results.sort_values('NDCG@10', ascending=False)

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,NDCG@1,NDCG@5,NDCG@10
first_stage_all,0.337086,0.725993,0.870695,0.337086,0.265648,0.224414
first_stage_half,0.275828,0.652483,0.810927,0.275828,0.220098,0.18783
two_stages_with_stat_strong,0.247185,0.589901,0.750993,0.247185,0.194624,0.164429
two_stages_without_stat_strong,0.230795,0.564073,0.71904,0.230795,0.183748,0.155696
two_stages_without_stat,0.223675,0.560762,0.721358,0.223675,0.180095,0.153606
two_stages_scenario,0.213576,0.548841,0.706954,0.213576,0.174369,0.148692


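Модель первого уровня работает лучше, чем двухуровневый сценарий. Двухуровневый сценарий, использующий статистические признаки, работает лучше, чем без них, только с усиленным классификатором (numTrees=100); с классификатором по умолчанию статистические признаки немного ухудшают все метрики.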