{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Itempop and two-stage recommender on MTS data"]},{"cell_type":"markdown","metadata":{"id":"Ey05k9RtFXlQ"},"source":["## Setup"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"OZHUcZZCmCxf"},"outputs":[],"source":["!pip install --upgrade pip setuptools wheel\n","!git clone https://github.com/benfred/implicit\n","!cd implicit && pip install .\n","!pip install -q catboost\n","!pip install recohut"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"EJ_UIHjq9NnK"},"outputs":[],"source":["import os\n","import numpy as np\n","import pandas as pd\n","import scipy.sparse as sp\n","\n","import random\n","import datetime\n","\n","import pickle\n","from sklearn.model_selection import train_test_split\n","from sklearn.utils import shuffle\n","\n","from implicit import nearest_neighbours as NN\n","from implicit.nearest_neighbours import TFIDFRecommender\n","\n","from catboost import CatBoostClassifier\n","\n","from recohut.datasets.mts import MTSDataset\n","from recohut.utils.common_utils import get_coo_matrix\n","from recohut.transforms.splitting import TimeRangeSplit\n","from recohut.models.itempop import ItemPop as PopularRecommender"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"nmhnVyko6Ynx"},"outputs":[],"source":["ds = MTSDataset(data_dir='/content/data', sample_frac=0.1)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"UJwOSLrd9Tmg"},"outputs":[],"source":["users_df = pd.read_csv(os.path.join(ds.processed_dir, 'users_processed.csv'))\n","items_df = pd.read_csv(os.path.join(ds.processed_dir, 'items_processed.csv'))\n","interactions_df = pd.read_csv(os.path.join(ds.processed_dir, 'interactions_processed.csv'))"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"lQiBlArS-IVP"},"outputs":[],"source":["interactions_df['last_watch_dt'] = pd.to_datetime(interactions_df['last_watch_dt'])\n","interactions_df.sort_values(by='last_watch_dt', 
inplace=True)"]},{"cell_type":"markdown","metadata":{"id":"NJhm264qm293"},"source":["## Winning Solution\n","\n","This solution includes a two-stage model. I used item-item CF from implicit library to generate candidates with their scores and Catboost classifier to predict final ranks with classification objective. Recommendations for cold users were made with Popular items.\n","\n","Implicit model parameters were chosen on sliding time window cross validation. The best scores were achieved by Cosine recommender model, taking only last 20 interactions for each user. 100 candidates with their scores were generated for each user, filtering all items that user had interactions with.\n","\n","Implicit candidates were calculated for the last 14 days of the interactions. Then catboost model was trained on positive interactions from the candidates list on last 14 days. Random negative sampling was applied.\n","\n","For final submission implicit candidates and catboost predictions were recalculated on the whole dataset.\n","\n","Ref: [Daria](https://github.com/blondered/ods_MTS_RecSys_Challenge_solution)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Oa4FR6zv_lsB"},"outputs":[],"source":["# Creating items and users mapping\n","users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))\n","users_mapping = {v: k for k, v in users_inv_mapping.items()}\n","items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))\n","items_mapping = {v: k for k, v in items_inv_mapping.items()}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"FbsUVNeImJrm"},"outputs":[],"source":["# Preparing data\n","last_date_df = interactions_df['last_watch_dt'].max()\n","boosting_split_date = last_date_df - pd.Timedelta(days=14)\n","boosting_data = interactions_df[(interactions_df['last_watch_dt'] >\n"," boosting_split_date)].copy()\n","boost_idx = boosting_data['user_id'].unique() \n","before_boosting = 
interactions_df[(interactions_df['last_watch_dt'] <=\n"," boosting_split_date)].copy()\n","before_boosting_known_items = before_boosting.groupby(\n"," 'user_id')['item_id'].apply(list).to_dict()\n","\n","before_boosting_known_items_mapped = {}\n","for user, recommend in before_boosting_known_items.items():\n"," before_boosting_known_items_mapped[user] = list(map(lambda x:\n"," items_mapping[x],\n"," recommend))\n","before_boosting['order_from_recent'] = before_boosting.sort_values(\n"," by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1\n","boost_warm_idx = np.intersect1d(before_boosting['user_id'].unique(),\n"," boosting_data['user_id'].unique())"]},{"cell_type":"markdown","metadata":{"id":"70FwEuTwIYD4"},"source":[" Calculates top candidates from implicit model with their scores. Implicit parameters were chosen on time range split cross-validation. History offset stands for taking only last X items from user history. Day offset stands for taking items from last X days of user history."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9X0MRF9TBLvs"},"outputs":[],"source":["k_neighbours = 200\n","day_offset = 170\n","history_offset = 20\n","distance = 'Cosine'\n","num_candidates = 100"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-S2vvbAWBTAZ"},"outputs":[],"source":["before_boosting['order_from_recent'] = before_boosting.sort_values(\n"," by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1\n","train = before_boosting.copy()\n","date_window = train['last_watch_dt'].max() - pd.DateOffset(days=day_offset)\n","train = train[train['last_watch_dt'] >= date_window]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"b1sORvcLCc2V"},"outputs":[],"source":["# order_from_recent is 1-based, so <= keeps exactly the last history_offset interactions\n","if history_offset:\n"," train = train[train['order_from_recent'] <= history_offset]\n"," \n","if distance == 'Cosine':\n"," model = NN.CosineRecommender(K=k_neighbours)\n"," weights = None\n","else:\n"," model = 
NN.TFIDFRecommender(K=k_neighbours)\n"," weights = None"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XL39gZ51CnWc"},"outputs":[],"source":["train_mat = get_coo_matrix(\n"," train,\n"," users_mapping=users_mapping,\n"," items_mapping=items_mapping,\n"," weight_col=weights\n",").tocsr()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["1150ae9b77d04ee089151cdf9b3c97fd","6404fcf9360b4c8bafac3d0c62dc7d58","cd203aa954364d05a2d9197f04bac18a","7d5a993575214d4189c013bb12fc9080","552747c596b440929459610765a70c67","6b9c40da36c746169708e1251c893b47","e6fb3757a3db480aa1be1f0a91e19f4d","163ead0dd46c434ea3412e462a0938db","b6183320801a4148b75f6b36b65a9b13","6773a61987794d45bf453ac8a8f78a34","1cfcd7deb21741cc95481b1ece102ced"]},"executionInfo":{"elapsed":29070,"status":"ok","timestamp":1642187764981,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"sjI4cw1ZCpoz","outputId":"6660fc68-d9fa-4477-8a4a-10b9073d4b0f"},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"1150ae9b77d04ee089151cdf9b3c97fd","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/266854 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"}],"source":["# implicit>=0.5 fits on the user-by-item matrix (older releases took item-by-user)\n","model.fit(train_mat, show_progress=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"gnnc6uFECxM5"},"outputs":[],"source":["def generate_implicit_recs_mapper(\n","    model,\n","    train_matrix,\n","    top_N,\n","    user_mapping,\n","    item_inv_mapping,\n","    filter_already_liked_items,\n","    known_items=None,\n","    filter_items=None,\n","    return_scores=False\n","):\n","    \"\"\"Build a mapper from a raw user id to that user's implicit recommendations.\n","\n","    train_matrix is the user-by-item CSR matrix the model was fitted on.\n","    known_items maps raw user ids to internal item indices to exclude for that\n","    user; filter_items lists internal item indices to exclude for every user.\n","    The mapper returns raw item ids (translated through item_inv_mapping), or\n","    an (item_ids, scores) pair when return_scores is True.\n","    \"\"\"\n","    def _recs_mapper(user):\n","        user_id = user_mapping[user]\n","        # Merge the global and per-user exclusions into one list of indices.\n","        excluded = set(filter_items or [])\n","        if known_items and user in known_items:\n","            excluded.update(known_items[user])\n","        filtering = list(excluded) if excluded else None\n","        # implicit>=0.5 expects only this user's row of the user-item matrix.\n","        ids, scores = model.recommend(\n","            user_id,\n","            train_matrix[user_id],\n","            N=top_N,\n","            filter_already_liked_items=filter_already_liked_items,\n","            filter_items=filtering)\n","        # The model works on internal indices; translate back to raw item ids.\n","        item_ids = [item_inv_mapping[idx] for idx in ids[:top_N]]\n","        if return_scores:\n","            return item_ids, scores[:top_N]\n","        return item_ids\n","\n","    return _recs_mapper"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"GhInkhX_C17s"},"outputs":[],"source":["mapper = generate_implicit_recs_mapper(\n"," model,\n"," train_mat,\n"," num_candidates,\n"," users_mapping,\n"," items_inv_mapping,\n"," filter_already_liked_items=False,\n"," known_items=before_boosting_known_items_mapped,\n"," filter_items=None,\n"," return_scores=True\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":6395,"status":"ok","timestamp":1642188036867,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"pWPnNFW-4phP","outputId":"0b6f43f5-0329-4622-fec5-82d438260602"},"outputs":[{"data":{"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," \n"," \n"," \n"," \n"," 0 \n"," 30 \n"," 199262.0 \n"," 0.707107 \n"," \n"," \n"," 0 \n"," 30 \n"," 203105.0 \n"," 0.707107 \n"," \n"," \n"," 0 \n"," 30 \n"," 199886.0 \n"," 0.707107 \n"," \n"," \n"," 0 \n"," 30 \n"," 219904.0 \n"," 0.707107 \n"," \n"," \n"," 0 \n"," 30 \n"," 203206.0 \n"," 0.707107 \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 22231 \n"," 1097544 \n"," 263721.0 \n"," 0.577350 \n"," \n"," \n"," 22231 \n"," 1097544 \n"," 227113.0 \n"," 0.577350 \n"," \n"," \n"," 22231 \n"," 1097544 \n"," 239830.0 \n"," 0.577350 \n"," \n"," \n"," 22231 \n"," 1097544 \n"," 139002.0 \n"," 0.577350 \n"," \n"," \n"," 22231 \n"," 1097544 \n"," 243127.0 \n"," 0.577350 \n"," \n"," \n","
\n","
2109153 rows × 3 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id implicit_score\n","0 30 199262.0 0.707107\n","0 30 203105.0 0.707107\n","0 30 199886.0 0.707107\n","0 30 219904.0 0.707107\n","0 30 203206.0 0.707107\n","... ... ... ...\n","22231 1097544 263721.0 0.577350\n","22231 1097544 227113.0 0.577350\n","22231 1097544 239830.0 0.577350\n","22231 1097544 139002.0 0.577350\n","22231 1097544 243127.0 0.577350\n","\n","[2109153 rows x 3 columns]"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["recs = pd.DataFrame({'user_id': boost_warm_idx})\n","recs['item_id_score'] = recs['user_id'].map(mapper)\n","recs['item_id'] = recs['item_id_score'].apply(lambda x: x[0])\n","recs['implicit_score'] = recs['item_id_score'].apply(lambda x: x[1])\n","recs['tmp'] = recs.apply(lambda row: list(zip(row['item_id'], row['implicit_score'])), axis=1) \n","recs = recs.explode('tmp')\n","recs[['item_id','implicit_score']] = pd.DataFrame(recs['tmp'].tolist(), index=recs.index)\n","recs.drop(columns='tmp', inplace=True)\n","recs.drop(['item_id_score'], axis=1, inplace=True)\n","recs"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"roQ4due5DKVl"},"outputs":[],"source":["recs.to_csv(os.path.join(ds.processed_dir, 'impl_scores.csv'), index=False)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Xje9T6CtI7kM"},"outputs":[],"source":["# taking candidates from implicit model and generating positive samples\n","candidates = pd.read_csv(os.path.join(ds.processed_dir, 'impl_scores.csv'))\n","candidates['item_id'] = candidates['item_id'].fillna(0.).astype('int64')\n","candidates['id'] = candidates.index\n","pos = candidates.merge(boosting_data[['user_id', 'item_id']], \n"," on=['user_id', 'item_id'], how='inner')\n","pos['target'] = 1"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":677},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1642188049285,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"XmD3UFnzK4el","outputId":"b11e5bae-1d3b-41f9-af99-6a0c4c532065"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," id \n"," target \n"," \n"," \n"," \n"," \n"," 0 \n"," 109925 \n"," 5543 \n"," 1.000000 \n"," 211288 \n"," 1 \n"," \n"," \n"," 1 \n"," 126087 \n"," 5518 \n"," 1.000000 \n"," 240448 \n"," 1 \n"," \n"," \n"," 2 \n"," 131803 \n"," 7807 \n"," 0.707107 \n"," 250989 \n"," 1 \n"," \n"," \n"," 3 \n"," 140179 \n"," 5011 \n"," 0.707107 \n"," 264967 \n"," 1 \n"," \n"," \n"," 4 \n"," 223763 \n"," 2780 \n"," 1.000000 \n"," 425032 \n"," 1 \n"," \n"," \n"," 5 \n"," 316074 \n"," 7033 \n"," 1.000000 \n"," 604543 \n"," 1 \n"," \n"," \n"," 6 \n"," 419536 \n"," 10267 \n"," 1.000000 \n"," 806723 \n"," 1 \n"," \n"," \n"," 7 \n"," 482854 \n"," 13237 \n"," 1.000000 \n"," 923066 \n"," 1 \n"," \n"," \n"," 8 \n"," 484834 \n"," 7558 \n"," 0.500000 \n"," 927130 \n"," 1 \n"," \n"," \n"," 9 \n"," 487160 \n"," 3784 \n"," 1.000000 \n"," 931333 \n"," 1 \n"," \n"," \n"," 10 \n"," 522481 \n"," 13787 \n"," 0.516398 \n"," 995099 \n"," 1 \n"," \n"," \n"," 11 \n"," 616140 \n"," 8254 \n"," 0.169031 \n"," 1176238 \n"," 1 \n"," \n"," \n"," 12 \n"," 626147 \n"," 5216 \n"," 1.000000 \n"," 1193879 \n"," 1 \n"," \n"," \n"," 13 \n"," 779743 \n"," 10971 \n"," 0.353553 \n"," 1494276 \n"," 1 \n"," \n"," \n"," 14 \n"," 860928 \n"," 14431 \n"," 0.500000 \n"," 1650890 \n"," 1 \n"," \n"," \n"," 15 \n"," 928023 \n"," 9113 \n"," 0.500000 \n"," 1784750 \n"," 1 \n"," \n"," \n"," 16 \n"," 947916 \n"," 1173 \n"," 1.000000 \n"," 1822962 \n"," 1 \n"," \n"," \n"," 17 \n"," 1030860 \n"," 657 \n"," 0.333333 \n"," 1983602 \n"," 1 \n"," \n"," \n"," 18 \n"," 1043861 \n"," 15384 \n"," 1.000000 \n"," 2006821 \n"," 1 \n"," \n"," \n"," 19 \n"," 1093253 \n"," 11769 \n"," 1.000000 \n"," 2101280 \n"," 1 \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id implicit_score id target\n","0 109925 5543 1.000000 211288 1\n","1 126087 5518 1.000000 240448 1\n","2 131803 7807 0.707107 250989 1\n","3 140179 5011 0.707107 264967 1\n","4 223763 2780 1.000000 425032 1\n","5 316074 7033 1.000000 604543 1\n","6 419536 10267 1.000000 806723 1\n","7 482854 13237 1.000000 923066 1\n","8 484834 7558 0.500000 927130 1\n","9 487160 3784 1.000000 931333 1\n","10 522481 13787 0.516398 995099 1\n","11 616140 8254 0.169031 1176238 1\n","12 626147 5216 1.000000 1193879 1\n","13 779743 10971 0.353553 1494276 1\n","14 860928 14431 0.500000 1650890 1\n","15 928023 9113 0.500000 1784750 1\n","16 947916 1173 1.000000 1822962 1\n","17 1030860 657 0.333333 1983602 1\n","18 1043861 15384 1.000000 2006821 1\n","19 1093253 11769 1.000000 2101280 1"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["pos"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"xnuzDSyeKhEa"},"outputs":[],"source":["# Generating negative samples\n","num_negatives = 3\n","# Fixed seed so a fresh Restart and Run All draws the same negatives\n","neg_rng = np.random.default_rng(345)\n","pos_group = pos.groupby('user_id')['item_id'].count()\n","neg = candidates[~candidates['id'].isin(pos['id'])].copy()\n","neg_sampling = pd.DataFrame(neg.groupby('user_id')['id'].apply(\n"," list)).join(pos_group, on='user_id', rsuffix='p', how='right')\n","neg_sampling['num_choices'] = np.clip(neg_sampling['item_id'] * num_negatives, \n"," a_min=0, a_max=25)\n","\n","def sample_negative_ids(row):\n","    \"\"\"Draw up to row['num_choices'] candidate ids without replacement.\"\"\"\n","    pool = row['id']\n","    # The right join leaves NaN instead of a list for users without negatives\n","    if not isinstance(pool, list):\n","        return []\n","    # Never request more ids than the pool holds (replace=False would raise)\n","    size = min(int(row['num_choices']), len(pool))\n","    return neg_rng.choice(pool, size=size, replace=False)\n","\n","neg_sampling['sample_idx'] = neg_sampling.apply(sample_negative_ids, axis=1)\n","idx_chosen = neg_sampling['sample_idx'].explode().dropna().values\n","neg = neg[neg['id'].isin(idx_chosen)].copy()\n","neg['target'] = 0"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1642188051315,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"iReJGyJNMpVg","outputId":"adcff675-6618-47f3-a534-67ef145b3ccb"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," id \n"," target \n"," \n"," \n"," \n"," \n"," 211232 \n"," 109925 \n"," 12948 \n"," 1.000000 \n"," 211232 \n"," 0 \n"," \n"," \n"," 211234 \n"," 109925 \n"," 31205 \n"," 1.000000 \n"," 211234 \n"," 0 \n"," \n"," \n"," 211287 \n"," 109925 \n"," 251132 \n"," 1.000000 \n"," 211287 \n"," 0 \n"," \n"," \n"," 240482 \n"," 126087 \n"," 38859 \n"," 1.000000 \n"," 240482 \n"," 0 \n"," \n"," \n"," 240493 \n"," 126087 \n"," 65257 \n"," 1.000000 \n"," 240493 \n"," 0 \n"," \n"," \n"," 240494 \n"," 126087 \n"," 41067 \n"," 1.000000 \n"," 240494 \n"," 0 \n"," \n"," \n"," 250980 \n"," 131803 \n"," 207587 \n"," 0.577350 \n"," 250980 \n"," 0 \n"," \n"," \n"," 250988 \n"," 131803 \n"," 6113 \n"," 0.707107 \n"," 250988 \n"," 0 \n"," \n"," \n"," 251041 \n"," 131803 \n"," 107381 \n"," 1.000000 \n"," 251041 \n"," 0 \n"," \n"," \n"," 265003 \n"," 140179 \n"," 30433 \n"," 1.000000 \n"," 265003 \n"," 0 \n"," \n"," \n"," 265014 \n"," 140179 \n"," 21064 \n"," 1.000000 \n"," 265014 \n"," 0 \n"," \n"," \n"," 265031 \n"," 140179 \n"," 16373 \n"," 1.000000 \n"," 265031 \n"," 0 \n"," \n"," \n"," 425049 \n"," 223763 \n"," 77169 \n"," 1.000000 \n"," 425049 \n"," 0 \n"," \n"," \n"," 425052 \n"," 223763 \n"," 12948 \n"," 1.000000 \n"," 425052 \n"," 0 \n"," \n"," \n"," 425074 \n"," 223763 \n"," 109280 \n"," 1.000000 \n"," 425074 \n"," 0 \n"," \n"," \n"," 604542 \n"," 316074 \n"," 7107 \n"," 1.000000 \n"," 604542 \n"," 0 \n"," \n"," \n"," 604554 \n"," 316074 \n"," 11829 \n"," 1.000000 \n"," 604554 \n"," 0 \n"," \n"," \n"," 604556 \n"," 316074 \n"," 73997 \n"," 1.000000 \n"," 604556 \n"," 0 \n"," \n"," \n"," 806662 \n"," 419536 \n"," 12854 \n"," 1.000000 \n"," 806662 \n"," 0 \n"," \n"," \n"," 806668 \n"," 419536 \n"," 13076 \n"," 1.000000 \n"," 806668 \n"," 0 \n"," \n"," \n"," 806742 \n"," 419536 \n"," 9204 \n"," 1.000000 \n"," 806742 \n"," 0 \n"," \n"," \n"," 923002 \n"," 482854 \n"," 34763 \n"," 1.000000 \n"," 923002 \n"," 0 \n"," 
\n"," \n"," 923018 \n"," 482854 \n"," 11361 \n"," 1.000000 \n"," 923018 \n"," 0 \n"," \n"," \n"," 923061 \n"," 482854 \n"," 12965 \n"," 1.000000 \n"," 923061 \n"," 0 \n"," \n"," \n"," 927151 \n"," 484834 \n"," 30217 \n"," 0.707107 \n"," 927151 \n"," 0 \n"," \n"," \n"," 927180 \n"," 484834 \n"," 12652 \n"," 0.500000 \n"," 927180 \n"," 0 \n"," \n"," \n"," 927201 \n"," 484834 \n"," 65037 \n"," 0.707107 \n"," 927201 \n"," 0 \n"," \n"," \n"," 931310 \n"," 487160 \n"," 7616 \n"," 0.707107 \n"," 931310 \n"," 0 \n"," \n"," \n"," 931343 \n"," 487160 \n"," 7107 \n"," 1.000000 \n"," 931343 \n"," 0 \n"," \n"," \n"," 931370 \n"," 487160 \n"," 21317 \n"," 1.000000 \n"," 931370 \n"," 0 \n"," \n"," \n"," 995170 \n"," 522481 \n"," 120210 \n"," 0.707107 \n"," 995170 \n"," 0 \n"," \n"," \n"," 995173 \n"," 522481 \n"," 33260 \n"," 1.000000 \n"," 995173 \n"," 0 \n"," \n"," \n"," 995183 \n"," 522481 \n"," 176089 \n"," 1.000000 \n"," 995183 \n"," 0 \n"," \n"," \n"," 1176201 \n"," 616140 \n"," 40776 \n"," 0.133631 \n"," 1176201 \n"," 0 \n"," \n"," \n"," 1176203 \n"," 616140 \n"," 35411 \n"," 0.133631 \n"," 1176203 \n"," 0 \n"," \n"," \n"," 1176252 \n"," 616140 \n"," 75552 \n"," 0.267261 \n"," 1176252 \n"," 0 \n"," \n"," \n"," 1193820 \n"," 626147 \n"," 118355 \n"," 0.707107 \n"," 1193820 \n"," 0 \n"," \n"," \n"," 1193836 \n"," 626147 \n"," 245945 \n"," 0.707107 \n"," 1193836 \n"," 0 \n"," \n"," \n"," 1193877 \n"," 626147 \n"," 2239 \n"," 1.000000 \n"," 1193877 \n"," 0 \n"," \n"," \n"," 1494296 \n"," 779743 \n"," 73398 \n"," 0.353553 \n"," 1494296 \n"," 0 \n"," \n"," \n"," 1494302 \n"," 779743 \n"," 2209 \n"," 0.500000 \n"," 1494302 \n"," 0 \n"," \n"," \n"," 1494338 \n"," 779743 \n"," 88020 \n"," 0.500000 \n"," 1494338 \n"," 0 \n"," \n"," \n"," 1650910 \n"," 860928 \n"," 22021 \n"," 0.577350 \n"," 1650910 \n"," 0 \n"," \n"," \n"," 1650945 \n"," 860928 \n"," 43326 \n"," 1.000000 \n"," 1650945 \n"," 0 \n"," \n"," \n"," 1650951 \n"," 860928 \n"," 38736 \n"," 0.577350 \n"," 1650951 \n"," 0 
\n"," \n"," \n"," 1784731 \n"," 928023 \n"," 11357 \n"," 0.447214 \n"," 1784731 \n"," 0 \n"," \n"," \n"," 1784770 \n"," 928023 \n"," 220290 \n"," 1.000000 \n"," 1784770 \n"," 0 \n"," \n"," \n"," 1784781 \n"," 928023 \n"," 30716 \n"," 0.324443 \n"," 1784781 \n"," 0 \n"," \n"," \n"," 1822946 \n"," 947916 \n"," 8637 \n"," 1.000000 \n"," 1822946 \n"," 0 \n"," \n"," \n"," 1822954 \n"," 947916 \n"," 226402 \n"," 0.707107 \n"," 1822954 \n"," 0 \n"," \n"," \n"," 1822957 \n"," 947916 \n"," 21772 \n"," 0.707107 \n"," 1822957 \n"," 0 \n"," \n"," \n"," 1983609 \n"," 1030860 \n"," 7223 \n"," 0.500000 \n"," 1983609 \n"," 0 \n"," \n"," \n"," 1983645 \n"," 1030860 \n"," 110861 \n"," 0.707107 \n"," 1983645 \n"," 0 \n"," \n"," \n"," 1983663 \n"," 1030860 \n"," 16007 \n"," 0.408248 \n"," 1983663 \n"," 0 \n"," \n"," \n"," 2006811 \n"," 1043861 \n"," 108934 \n"," 0.707107 \n"," 2006811 \n"," 0 \n"," \n"," \n"," 2006872 \n"," 1043861 \n"," 38242 \n"," 1.000000 \n"," 2006872 \n"," 0 \n"," \n"," \n"," 2006889 \n"," 1043861 \n"," 57489 \n"," 1.000000 \n"," 2006889 \n"," 0 \n"," \n"," \n"," 2101213 \n"," 1093253 \n"," 194512 \n"," 0.707107 \n"," 2101213 \n"," 0 \n"," \n"," \n"," 2101240 \n"," 1093253 \n"," 150161 \n"," 0.707107 \n"," 2101240 \n"," 0 \n"," \n"," \n"," 2101261 \n"," 1093253 \n"," 225008 \n"," 0.707107 \n"," 2101261 \n"," 0 \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id implicit_score id target\n","211232 109925 12948 1.000000 211232 0\n","211234 109925 31205 1.000000 211234 0\n","211287 109925 251132 1.000000 211287 0\n","240482 126087 38859 1.000000 240482 0\n","240493 126087 65257 1.000000 240493 0\n","240494 126087 41067 1.000000 240494 0\n","250980 131803 207587 0.577350 250980 0\n","250988 131803 6113 0.707107 250988 0\n","251041 131803 107381 1.000000 251041 0\n","265003 140179 30433 1.000000 265003 0\n","265014 140179 21064 1.000000 265014 0\n","265031 140179 16373 1.000000 265031 0\n","425049 223763 77169 1.000000 425049 0\n","425052 223763 12948 1.000000 425052 0\n","425074 223763 109280 1.000000 425074 0\n","604542 316074 7107 1.000000 604542 0\n","604554 316074 11829 1.000000 604554 0\n","604556 316074 73997 1.000000 604556 0\n","806662 419536 12854 1.000000 806662 0\n","806668 419536 13076 1.000000 806668 0\n","806742 419536 9204 1.000000 806742 0\n","923002 482854 34763 1.000000 923002 0\n","923018 482854 11361 1.000000 923018 0\n","923061 482854 12965 1.000000 923061 0\n","927151 484834 30217 0.707107 927151 0\n","927180 484834 12652 0.500000 927180 0\n","927201 484834 65037 0.707107 927201 0\n","931310 487160 7616 0.707107 931310 0\n","931343 487160 7107 1.000000 931343 0\n","931370 487160 21317 1.000000 931370 0\n","995170 522481 120210 0.707107 995170 0\n","995173 522481 33260 1.000000 995173 0\n","995183 522481 176089 1.000000 995183 0\n","1176201 616140 40776 0.133631 1176201 0\n","1176203 616140 35411 0.133631 1176203 0\n","1176252 616140 75552 0.267261 1176252 0\n","1193820 626147 118355 0.707107 1193820 0\n","1193836 626147 245945 0.707107 1193836 0\n","1193877 626147 2239 1.000000 1193877 0\n","1494296 779743 73398 0.353553 1494296 0\n","1494302 779743 2209 0.500000 1494302 0\n","1494338 779743 88020 0.500000 1494338 0\n","1650910 860928 22021 0.577350 1650910 0\n","1650945 860928 43326 1.000000 1650945 0\n","1650951 860928 38736 0.577350 1650951 0\n","1784731 928023 
11357 0.447214 1784731 0\n","1784770 928023 220290 1.000000 1784770 0\n","1784781 928023 30716 0.324443 1784781 0\n","1822946 947916 8637 1.000000 1822946 0\n","1822954 947916 226402 0.707107 1822954 0\n","1822957 947916 21772 0.707107 1822957 0\n","1983609 1030860 7223 0.500000 1983609 0\n","1983645 1030860 110861 0.707107 1983645 0\n","1983663 1030860 16007 0.408248 1983663 0\n","2006811 1043861 108934 0.707107 2006811 0\n","2006872 1043861 38242 1.000000 2006872 0\n","2006889 1043861 57489 1.000000 2006889 0\n","2101213 1093253 194512 0.707107 2101213 0\n","2101240 1093253 150161 0.707107 2101240 0\n","2101261 1093253 225008 0.707107 2101261 0"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["neg"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"B3h45yA7MzMe"},"outputs":[],"source":["# Creating training data sample and early stopping data sample\n","boost_idx_train = np.intersect1d(boost_idx, pos['user_id'].unique())\n","boost_train_users, boost_eval_users = train_test_split(boost_idx_train, \n"," test_size=0.1,\n"," random_state=345)\n","select_col = ['user_id', 'item_id', 'implicit_score', 'target']\n","boost_train = shuffle(\n"," pd.concat([\n"," pos[pos['user_id'].isin(boost_train_users)],\n"," neg[neg['user_id'].isin(boost_train_users)]\n"," ])[select_col]\n",")\n","boost_eval = shuffle(\n"," pd.concat([\n"," pos[pos['user_id'].isin(boost_eval_users)],\n"," neg[neg['user_id'].isin(boost_eval_users)]\n"," ])[select_col]\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ojeLRc9iM-LQ"},"outputs":[],"source":["user_col = ['user_id','age','income','sex','kids_flg','boost_user_watch_cnt_all',\n"," 'boost_user_watch_cnt_last_14']\n","\n","item_col = ['item_id','content_type','countries_max','for_kids','age_rating',\n"," 'studios_max','genres_max','genres_min','genres_med','release_novelty']\n","\n","item_stats_col = ['item_id','watched_in_7_days','watch_ts_std','trend_slope',\n"," 
'watch_ts_quantile_95_diff','watch_ts_median_diff',\n"," 'watched_in_all_time','male_watchers_fraction',\n"," 'female_watchers_fraction','younger_35_fraction','older_35_fraction']\n"," \n","cat_col = ['age','income','sex','content_type']"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":364},"executionInfo":{"elapsed":610,"status":"ok","timestamp":1642188056980,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"-gP7TfKGNMba","outputId":"e9484e51-24c6-4da8-e152-b51c1c68bc9e"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," target \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," content_type \n"," countries_max \n"," for_kids \n"," age_rating \n"," studios_max \n"," genres_max \n"," genres_min \n"," genres_med \n"," release_novelty \n"," \n"," \n"," \n"," \n"," 0 \n"," 316074 \n"," 7033 \n"," 1.000000 \n"," 1 \n"," age_18_24 \n"," income_20_40 \n"," F \n"," False \n"," 4.0 \n"," 0.0 \n"," series \n"," 4340.0 \n"," False \n"," 16.0 \n"," 14898.0 \n"," 3858.0 \n"," 2778.0 \n"," 3318.0 \n"," 5.0 \n"," \n"," \n"," 1 \n"," 131803 \n"," 6113 \n"," 0.707107 \n"," 0 \n"," age_35_44 \n"," income_20_40 \n"," M \n"," False \n"," 0.0 \n"," 0.0 \n"," film \n"," 5065.0 \n"," False \n"," 12.0 \n"," 14898.0 \n"," 3503.0 \n"," 1820.0 \n"," 1877.0 \n"," 1.0 \n"," \n"," \n"," 2 \n"," 316074 \n"," 11829 \n"," 1.000000 \n"," 0 \n"," age_18_24 \n"," income_20_40 \n"," F \n"," False \n"," 4.0 \n"," 0.0 \n"," film \n"," 5065.0 \n"," False \n"," 18.0 \n"," 14898.0 \n"," 1820.0 \n"," 1033.0 \n"," 1426.5 \n"," 6.0 \n"," \n"," \n"," 3 \n"," 131803 \n"," 207587 \n"," 0.577350 \n"," 0 \n"," age_35_44 \n"," income_20_40 \n"," M \n"," False \n"," 0.0 \n"," 0.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 4 \n"," 316074 \n"," 7107 \n"," 1.000000 \n"," 0 \n"," age_18_24 \n"," income_20_40 \n"," F \n"," False \n"," 4.0 \n"," 0.0 \n"," series \n"," 4340.0 \n"," False \n"," 12.0 \n"," 14898.0 \n"," 5431.0 \n"," 626.0 \n"," 1877.0 \n"," 6.0 \n"," \n"," \n"," 5 \n"," 131803 \n"," 7807 \n"," 0.707107 \n"," 1 \n"," age_35_44 \n"," income_20_40 \n"," M \n"," False \n"," 0.0 \n"," 0.0 \n"," film \n"," 4340.0 \n"," False \n"," 16.0 \n"," 14898.0 \n"," 3858.0 \n"," 3858.0 \n"," 3858.0 \n"," 5.0 \n"," \n"," \n"," 6 \n"," 316074 \n"," 73997 \n"," 1.000000 \n"," 0 \n"," age_18_24 \n"," income_20_40 \n"," F \n"," False \n"," 4.0 
\n"," 0.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 7 \n"," 131803 \n"," 107381 \n"," 1.000000 \n"," 0 \n"," age_35_44 \n"," income_20_40 \n"," M \n"," False \n"," 0.0 \n"," 0.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id implicit_score ... genres_min genres_med release_novelty\n","0 316074 7033 1.000000 ... 2778.0 3318.0 5.0\n","1 131803 6113 0.707107 ... 1820.0 1877.0 1.0\n","2 316074 11829 1.000000 ... 1033.0 1426.5 6.0\n","3 131803 207587 0.577350 ... NaN NaN NaN\n","4 316074 7107 1.000000 ... 626.0 1877.0 6.0\n","5 131803 7807 0.707107 ... 3858.0 3858.0 5.0\n","6 316074 73997 1.000000 ... NaN NaN NaN\n","7 131803 107381 1.000000 ... NaN NaN NaN\n","\n","[8 rows x 19 columns]"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["train_feat = boost_train.merge(users_df[user_col],\n"," on=['user_id'],\n"," how='left')\\\n"," .merge(items_df[item_col],\n"," on=['item_id'],\n"," how='left')\n"," \n","eval_feat = boost_eval.merge(users_df[user_col],\n"," on=['user_id'],\n"," how='left') \\\n"," .merge(items_df[item_col],\n"," on=['item_id'],\n"," how='left')\n"," \n","eval_feat"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":488},"executionInfo":{"elapsed":623,"status":"ok","timestamp":1642188058306,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"ECh2RhahNSsQ","outputId":"2cafea2d-7165-47ce-a45b-cc5acd6840e6"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," implicit_score \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," content_type \n"," countries_max \n"," for_kids \n"," age_rating \n"," studios_max \n"," genres_max \n"," genres_min \n"," genres_med \n"," release_novelty \n"," watched_in_7_days \n"," watch_ts_std \n"," trend_slope \n"," watch_ts_quantile_95_diff \n"," watch_ts_median_diff \n"," watched_in_all_time \n"," male_watchers_fraction \n"," female_watchers_fraction \n"," younger_35_fraction \n"," older_35_fraction \n"," \n"," \n"," \n"," \n"," 0 \n"," 1.000000 \n"," age_35_44 \n"," income_20_40 \n"," F \n"," False \n"," 2 \n"," 1 \n"," film \n"," 5065 \n"," False \n"," 16 \n"," 14898 \n"," 2418 \n"," 1820 \n"," 2119 \n"," 3 \n"," 46 \n"," 0.787585 \n"," 0.195783 \n"," 0 \n"," 1 \n"," 46 \n"," 0.422222 \n"," 0.355556 \n"," 0.311111 \n"," 0.466667 \n"," \n"," \n"," 1 \n"," 1.000000 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 1 \n"," 1 \n"," series \n"," 4340 \n"," False \n"," 12 \n"," 14898 \n"," 1339 \n"," 1339 \n"," 1339 \n"," 4 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," \n"," \n"," 2 \n"," 0.500000 \n"," age_18_24 \n"," income_20_40 \n"," M \n"," False \n"," 2 \n"," 1 \n"," film \n"," 5065 \n"," False \n"," 18 \n"," 14898 \n"," 5431 \n"," 1224 \n"," 2418 \n"," 5 \n"," 5 \n"," 46.3813 \n"," -0.0692771 \n"," 5 \n"," 74 \n"," 89 \n"," 0.431818 \n"," 0.409091 \n"," 0.420455 \n"," 0.420455 \n"," \n"," \n"," 3 \n"," 1.000000 \n"," age_25_34 \n"," income_20_40 \n"," M \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," \n"," \n"," 4 \n"," 0.707107 \n"," age_18_24 \n"," income_20_40 \n"," M \n"," False \n"," 3 \n"," 1 \n"," None \n"," None \n"," None \n"," None 
\n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 67 \n"," 0.707107 \n"," age_35_44 \n"," income_20_40 \n"," F \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," \n"," \n"," 68 \n"," 0.447214 \n"," age_18_24 \n"," income_20_40 \n"," M \n"," False \n"," 2 \n"," 1 \n"," film \n"," 295 \n"," False \n"," 18 \n"," 14898 \n"," 3858 \n"," 31 \n"," 3140.5 \n"," 4 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," \n"," \n"," 69 \n"," 0.500000 \n"," age_25_34 \n"," income_40_60 \n"," M \n"," True \n"," 1 \n"," 1 \n"," film \n"," 1272 \n"," False \n"," 18 \n"," 14898 \n"," 5431 \n"," 254 \n"," 3503 \n"," 5 \n"," 0 \n"," 0 \n"," 0 \n"," 68 \n"," 68 \n"," 1 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," \n"," \n"," 70 \n"," 1.000000 \n"," age_45_54 \n"," income_40_60 \n"," M \n"," True \n"," 2 \n"," 0 \n"," film \n"," 5065 \n"," False \n"," 16 \n"," 14898 \n"," 3858 \n"," 2778 \n"," 3503 \n"," 4 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," 0 \n"," \n"," \n"," 71 \n"," 0.707107 \n"," age_65_inf \n"," income_20_40 \n"," F \n"," False \n"," 5 \n"," 5 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," \n"," \n","
\n","
72 rows × 26 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" implicit_score age ... younger_35_fraction older_35_fraction\n","0 1.000000 age_35_44 ... 0.311111 0.466667\n","1 1.000000 age_unknown ... 0 0\n","2 0.500000 age_18_24 ... 0.420455 0.420455\n","3 1.000000 age_25_34 ... None None\n","4 0.707107 age_18_24 ... None None\n",".. ... ... ... ... ...\n","67 0.707107 age_35_44 ... None None\n","68 0.447214 age_18_24 ... 0 0\n","69 0.500000 age_25_34 ... 0 0\n","70 1.000000 age_45_54 ... 0 0\n","71 0.707107 age_65_inf ... None None\n","\n","[72 rows x 26 columns]"]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["item_stats = pd.read_csv(os.path.join(ds.processed_dir, 'item_stats.csv'))\n","item_stats = item_stats[item_stats_col]\n","train_feat = train_feat.join(item_stats.set_index('item_id'), \n"," on='item_id', how='left')\n","eval_feat = eval_feat.join(item_stats.set_index('item_id'), \n"," on='item_id', how='left')\n","drop_col = ['user_id', 'item_id']\n","target_col = ['target']\n","\n","X_train = train_feat.drop(drop_col + target_col, axis=1)\n","y_train = train_feat[target_col]\n","X_val = eval_feat.drop(drop_col + target_col, axis=1)\n","y_val = eval_feat[target_col]\n","X_train.fillna('None', inplace=True)\n","X_val.fillna('None', inplace=True)\n","X_train[cat_col] = X_train[cat_col].astype('category')\n","X_val[cat_col] = X_val[cat_col].astype('category')\n","\n","X_train"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3256,"status":"ok","timestamp":1642188064223,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"NPARxAndNoNV","outputId":"7d21dbf9-a091-40c6-990e-84db9fb7f9a1"},"outputs":[{"name":"stdout","output_type":"stream","text":["0:\tlearn: 0.6814278\ttest: 0.6853672\tbest: 0.6853672 (0)\ttotal: 57.5ms\tremaining: 1m 54s\n","200:\tlearn: 0.1793975\ttest: 
0.5471784\tbest: 0.5422113 (146)\ttotal: 1.19s\tremaining: 10.7s\n","Stopped by overfitting detector (200 iterations wait)\n","\n","bestTest = 0.5422113159\n","bestIteration = 146\n","\n","Shrink model to first 147 iterations.\n"]},{"data":{"text/plain":[""]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["# Training CatBoost classifier with parameters previously chosen on cross validation\n","params = {\n"," 'subsample': 0.97, \n"," 'max_depth': 9,\n"," 'n_estimators': 2000,\n"," 'learning_rate': 0.03, \n"," 'scale_pos_weight': num_negatives, \n"," 'l2_leaf_reg': 27, \n"," 'thread_count': -1,\n"," 'verbose': 200,\n"," 'task_type': \"CPU\",\n"," 'devices': '0:1',\n"," # 'bootstrap_type': 'Poisson'\n","}\n","boost_model = CatBoostClassifier(**params)\n","boost_model.fit(X_train,\n"," y_train,\n"," eval_set=(X_val, y_val),\n"," early_stopping_rounds=200,\n"," cat_features=cat_col,\n"," plot=False)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"KH3ZUwyqmS-f"},"outputs":[],"source":["with open(\"catboost_trained.pkl\", 'wb') as f:\n"," pickle.dump(boost_model, f)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1642188107612,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"NRchFAmAPdBh","outputId":"b3675f02-b241-4c38-8d08-095ffc172f50"},"outputs":[{"data":{"text/plain":[""]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["# with open(\"catboost_trained.pkl\", 'rb') as f:\n","# boost_model = pickle.load(f)\n","boost_model"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"X0yG3zmjPqC1"},"outputs":[],"source":["random_items = list(np.random.choice(interactions_df['user_id'], size=5, replace=False))\n","cold_items = [10000, 
20000]\n","random_items.extend(cold_items)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":15,"status":"ok","timestamp":1642188281959,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"JBqnIdlKQFIy","outputId":"6e1a8e42-562f-4ad9-a45f-f196fc4e5f74"},"outputs":[{"data":{"text/plain":["array([ 20000, 133452, 332832, 341075, 622570, 728808])"]},"execution_count":31,"metadata":{},"output_type":"execute_result"}],"source":["warm_idx = np.intersect1d(random_items, interactions_df['user_id'].unique())\n","warm_idx"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"xvkws1XHQP1O"},"outputs":[],"source":["_candidates = candidates.copy()\n","_candidates.dropna(subset=['item_id'], axis=0, inplace=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":488},"executionInfo":{"elapsed":1745,"status":"ok","timestamp":1642188284831,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"DmhCuWoOQgA4","outputId":"38d1ca58-104c-4aca-c15f-3c5f70bc9510"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," id \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," content_type \n"," countries_max \n"," for_kids \n"," age_rating \n"," studios_max \n"," genres_max \n"," genres_min \n"," genres_med \n"," release_novelty \n"," \n"," \n"," \n"," \n"," 0 \n"," 30 \n"," 199262 \n"," 0.707107 \n"," 0 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 1 \n"," 30 \n"," 203105 \n"," 0.707107 \n"," 1 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2 \n"," 30 \n"," 199886 \n"," 0.707107 \n"," 2 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 3 \n"," 30 \n"," 219904 \n"," 0.707107 \n"," 3 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 4 \n"," 30 \n"," 203206 \n"," 0.707107 \n"," 4 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... 
\n"," \n"," \n"," 2109148 \n"," 1097544 \n"," 263721 \n"," 0.577350 \n"," 2109148 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109149 \n"," 1097544 \n"," 227113 \n"," 0.577350 \n"," 2109149 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109150 \n"," 1097544 \n"," 239830 \n"," 0.577350 \n"," 2109150 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109151 \n"," 1097544 \n"," 139002 \n"," 0.577350 \n"," 2109151 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109152 \n"," 1097544 \n"," 243127 \n"," 0.577350 \n"," 2109152 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1.0 \n"," 1.0 \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n","
\n","
2109153 rows × 19 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id ... genres_med release_novelty\n","0 30 199262 ... NaN NaN\n","1 30 203105 ... NaN NaN\n","2 30 199886 ... NaN NaN\n","3 30 219904 ... NaN NaN\n","4 30 203206 ... NaN NaN\n","... ... ... ... ... ...\n","2109148 1097544 263721 ... NaN NaN\n","2109149 1097544 227113 ... NaN NaN\n","2109150 1097544 239830 ... NaN NaN\n","2109151 1097544 139002 ... NaN NaN\n","2109152 1097544 243127 ... NaN NaN\n","\n","[2109153 rows x 19 columns]"]},"execution_count":33,"metadata":{},"output_type":"execute_result"}],"source":["submit_feat = _candidates.merge(users_df[user_col],\n"," on=['user_id'],\n"," how='left') \\\n"," .merge(items_df[item_col],\n"," on=['item_id'],\n"," how='left')\n","submit_feat"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"P00tQC_ZQrYm"},"outputs":[],"source":["full_train = submit_feat.fillna('None')\n","full_train[cat_col] = full_train[cat_col].astype('category')\n","# item_stats = pd.read_csv('data/item_stats_for_submit.csv')\n","full_train = full_train.join(item_stats.set_index('item_id'),\n"," on='item_id', how='left')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":488},"executionInfo":{"elapsed":733,"status":"ok","timestamp":1642188360258,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"mwa-Fp1wDbW-","outputId":"a80d2a9c-dba5-4d17-f339-dea2e3894792"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," id \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," content_type \n"," countries_max \n"," for_kids \n"," age_rating \n"," studios_max \n"," genres_max \n"," genres_min \n"," genres_med \n"," release_novelty \n"," watched_in_7_days \n"," watch_ts_std \n"," trend_slope \n"," watch_ts_quantile_95_diff \n"," watch_ts_median_diff \n"," watched_in_all_time \n"," male_watchers_fraction \n"," female_watchers_fraction \n"," younger_35_fraction \n"," older_35_fraction \n"," \n"," \n"," \n"," \n"," 0 \n"," 30 \n"," 199262 \n"," 0.707107 \n"," 0 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 1 \n"," 30 \n"," 203105 \n"," 0.707107 \n"," 1 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2 \n"," 30 \n"," 199886 \n"," 0.707107 \n"," 2 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 3 \n"," 30 \n"," 219904 \n"," 0.707107 \n"," 3 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN 
\n"," NaN \n"," NaN \n"," \n"," \n"," 4 \n"," 30 \n"," 203206 \n"," 0.707107 \n"," 4 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 2109148 \n"," 1097544 \n"," 263721 \n"," 0.57735 \n"," 2109148 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109149 \n"," 1097544 \n"," 227113 \n"," 0.57735 \n"," 2109149 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109150 \n"," 1097544 \n"," 239830 \n"," 0.57735 \n"," 2109150 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109151 \n"," 1097544 \n"," 139002 \n"," 0.57735 \n"," 2109151 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," 
NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109152 \n"," 1097544 \n"," 243127 \n"," 0.57735 \n"," 2109152 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n","
\n","
2109153 rows × 29 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id ... younger_35_fraction older_35_fraction\n","0 30 199262 ... NaN NaN\n","1 30 203105 ... NaN NaN\n","2 30 199886 ... NaN NaN\n","3 30 219904 ... NaN NaN\n","4 30 203206 ... NaN NaN\n","... ... ... ... ... ...\n","2109148 1097544 263721 ... NaN NaN\n","2109149 1097544 227113 ... NaN NaN\n","2109150 1097544 239830 ... NaN NaN\n","2109151 1097544 139002 ... NaN NaN\n","2109152 1097544 243127 ... NaN NaN\n","\n","[2109153 rows x 29 columns]"]},"execution_count":38,"metadata":{},"output_type":"execute_result"}],"source":["full_train"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":892,"status":"ok","timestamp":1642188385461,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"5vqmW3AADiFL","outputId":"72dc7b75-7c6a-4d58-f554-01f8a3a76b78"},"outputs":[{"data":{"text/plain":["['user_id',\n"," 'item_id',\n"," 'implicit_score',\n"," 'age',\n"," 'income',\n"," 'sex',\n"," 'kids_flg',\n"," 'user_watch_cnt_all',\n"," 'user_watch_cnt_last_14',\n"," 'content_type',\n"," 'countries_max',\n"," 'for_kids',\n"," 'age_rating',\n"," 'studios_max',\n"," 'genres_max',\n"," 'genres_min',\n"," 'genres_med',\n"," 'release_novelty',\n"," 'watched_in_7_days',\n"," 'watch_ts_std',\n"," 'trend_slope',\n"," 'watch_ts_quantile_95_diff',\n"," 'watch_ts_median_diff',\n"," 'watched_in_all_time',\n"," 'male_watchers_fraction',\n"," 'female_watchers_fraction',\n"," 'younger_35_fraction',\n"," 'older_35_fraction']"]},"execution_count":39,"metadata":{},"output_type":"execute_result"}],"source":["# Columns the classifier expects, preceded by the two id columns.\n","# Built from the model directly so this cell does not depend on a variable\n","# that is only assigned further down the notebook.\n","['user_id', 'item_id'] + boost_model.feature_names_"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":488},"executionInfo":{"elapsed":1394,"status":"ok","timestamp":1642188508132,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"i96FiqviQu42","outputId":"679f1b7a-7de9-46b7-a850-ddeeeedcf102"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," implicit_score \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," content_type \n"," countries_max \n"," for_kids \n"," age_rating \n"," studios_max \n"," genres_max \n"," genres_min \n"," genres_med \n"," release_novelty \n"," watched_in_7_days \n"," watch_ts_std \n"," trend_slope \n"," watch_ts_quantile_95_diff \n"," watch_ts_median_diff \n"," watched_in_all_time \n"," male_watchers_fraction \n"," female_watchers_fraction \n"," younger_35_fraction \n"," older_35_fraction \n"," \n"," \n"," \n"," \n"," 0 \n"," 30 \n"," 199262 \n"," 0.707107 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 1 \n"," 30 \n"," 203105 \n"," 0.707107 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2 \n"," 30 \n"," 199886 \n"," 0.707107 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 3 \n"," 30 \n"," 219904 \n"," 0.707107 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 4 
\n"," 30 \n"," 203206 \n"," 0.707107 \n"," age_unknown \n"," income_unknown \n"," sex_unknown \n"," False \n"," 2 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 2109148 \n"," 1097544 \n"," 263721 \n"," 0.57735 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109149 \n"," 1097544 \n"," 227113 \n"," 0.57735 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109150 \n"," 1097544 \n"," 239830 \n"," 0.57735 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109151 \n"," 1097544 \n"," 139002 \n"," 0.57735 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n"," 2109152 \n"," 
1097544 \n"," 243127 \n"," 0.57735 \n"," age_25_34 \n"," income_20_40 \n"," F \n"," True \n"," 1 \n"," 1 \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," None \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," NaN \n"," \n"," \n","
\n","
2109153 rows × 28 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id ... younger_35_fraction older_35_fraction\n","0 30 199262 ... NaN NaN\n","1 30 203105 ... NaN NaN\n","2 30 199886 ... NaN NaN\n","3 30 219904 ... NaN NaN\n","4 30 203206 ... NaN NaN\n","... ... ... ... ... ...\n","2109148 1097544 263721 ... NaN NaN\n","2109149 1097544 227113 ... NaN NaN\n","2109150 1097544 239830 ... NaN NaN\n","2109151 1097544 139002 ... NaN NaN\n","2109152 1097544 243127 ... NaN NaN\n","\n","[2109153 rows x 28 columns]"]},"execution_count":40,"metadata":{},"output_type":"execute_result"}],"source":["# Renaming columns to match classifier feature names\n","cols = ['user_id', 'item_id']\n","cols.extend(boost_model.feature_names_)\n","cols = cols[:7] + ['boost_user_watch_cnt_all', 'boost_user_watch_cnt_last_14'] + cols[9:]\n","full_train = full_train[cols]\n","full_train_new_names = ['user_id', 'item_id'] + boost_model.feature_names_\n","full_train.columns = full_train_new_names\n","full_train"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":7830,"status":"ok","timestamp":1642188520391,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"PdLtTGROQ11b","outputId":"5e6dc810-1edb-4565-88af-1b9487ee9372"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," \n"," \n"," \n"," \n"," 0 \n"," 30 \n"," [16986, 199262, 203105, 199886, 219904, 203206... \n"," \n"," \n"," 1 \n"," 55 \n"," [12232, 7634, 6489, 15987, 14556, 5573, 15058,... \n"," \n"," \n"," 2 \n"," 106 \n"," [8821, 10700, 10497, 3399, 9154, 3629, 12189, ... \n"," \n"," \n"," 3 \n"," 144 \n"," [79668, 85771, 79780, 100360, 87071, 80158, 14... \n"," \n"," \n"," 4 \n"," 155 \n"," [10747, 2236, 67784, 78954, 139975, 137705, 22... \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 22227 \n"," 1097444 \n"," [7300, 16181, 110702, 114582, 113097, 86716, 1... \n"," \n"," \n"," 22228 \n"," 1097459 \n"," [68578, 71663, 68642, 74552, 71682, 68811, 777... \n"," \n"," \n"," 22229 \n"," 1097470 \n"," [196242, 201115, 196364, 201461, 203105, 19904... \n"," \n"," \n"," 22230 \n"," 1097508 \n"," [207809, 210545, 208388, 212164, 213627, 21296... \n"," \n"," \n"," 22231 \n"," 1097544 \n"," [71485, 75317, 72714, 94880, 75852, 72851, 112... \n"," \n"," \n","
\n","
22232 rows × 2 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id\n","0 30 [16986, 199262, 203105, 199886, 219904, 203206...\n","1 55 [12232, 7634, 6489, 15987, 14556, 5573, 15058,...\n","2 106 [8821, 10700, 10497, 3399, 9154, 3629, 12189, ...\n","3 144 [79668, 85771, 79780, 100360, 87071, 80158, 14...\n","4 155 [10747, 2236, 67784, 78954, 139975, 137705, 22...\n","... ... ...\n","22227 1097444 [7300, 16181, 110702, 114582, 113097, 86716, 1...\n","22228 1097459 [68578, 71663, 68642, 74552, 71682, 68811, 777...\n","22229 1097470 [196242, 201115, 196364, 201461, 203105, 19904...\n","22230 1097508 [207809, 210545, 208388, 212164, 213627, 21296...\n","22231 1097544 [71485, 75317, 72714, 94880, 75852, 72851, 112...\n","\n","[22232 rows x 2 columns]"]},"execution_count":41,"metadata":{},"output_type":"execute_result"}],"source":["# Making predictions for warm users\n","y_pred_all = boost_model.predict_proba(full_train.drop(\n"," ['user_id', 'item_id'], axis=1))\n","full_train['boost_pred'] = y_pred_all[:, 1]\n","full_train = full_train[['user_id', 'item_id', 'boost_pred']]\n","full_train = full_train.sort_values(by=['user_id', 'boost_pred'],\n"," ascending=[True, False])\n","full_train['rank'] = full_train.groupby('user_id').cumcount() + 1\n","full_train = full_train[full_train['rank'] <= 10].drop('boost_pred', axis=1)\n","full_train['item_id'] = full_train['item_id'].astype('int64')\n","boost_recs = full_train.groupby('user_id')['item_id'].apply(list)\n","boost_recs = pd.DataFrame(boost_recs)\n","boost_recs.reset_index(inplace=True)\n","boost_recs"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1642188521208,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"2bCBSBe_57JM","outputId":"5328db55-be6d-42fd-c8a3-4857f13c1e32"},"outputs":[{"data":{"text/plain":["[20000, 728808, 
622570, 133452, 10000, 341075]"]},"execution_count":42,"metadata":{},"output_type":"execute_result"}],"source":["# Making predictions for cold users with Popular Recommender\n","idx_for_popular = list(set(pd.Series(random_items).unique()).difference(\n"," set(boost_recs['user_id'].unique())))\n","idx_for_popular"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1642188521956,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"yCctjmQT6AJV","outputId":"b2273d78-9891-4580-d238-1598366c3bb7"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," last_watch_dt \n"," total_dur \n"," watched_pct \n"," \n"," \n"," \n"," \n"," 0 \n"," 917575 \n"," 10353 \n"," 2021-03-13 \n"," 11131 \n"," 58 \n"," \n"," \n"," 1060 \n"," 275080 \n"," 15574 \n"," 2021-03-13 \n"," 670 \n"," 11 \n"," \n"," \n"," 1059 \n"," 120517 \n"," 9550 \n"," 2021-03-13 \n"," 32456 \n"," 100 \n"," \n"," \n"," 1058 \n"," 15045 \n"," 6115 \n"," 2021-03-13 \n"," 22830 \n"," 100 \n"," \n"," \n"," 1057 \n"," 92904 \n"," 10135 \n"," 2021-03-13 \n"," 3709 \n"," 71 \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 542914 \n"," 484870 \n"," 9157 \n"," 2021-08-22 \n"," 9435 \n"," 6 \n"," \n"," \n"," 542913 \n"," 8428 \n"," 5732 \n"," 2021-08-22 \n"," 6570 \n"," 100 \n"," \n"," \n"," 542912 \n"," 818134 \n"," 11505 \n"," 2021-08-22 \n"," 60 \n"," 0 \n"," \n"," \n"," 542923 \n"," 314358 \n"," 14111 \n"," 2021-08-22 \n"," 2590 \n"," 35 \n"," \n"," \n"," 547624 \n"," 755517 \n"," 5693 \n"," 2021-08-22 \n"," 6174 \n"," 88 \n"," \n"," \n","
\n","
547625 rows × 5 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id last_watch_dt total_dur watched_pct\n","0 917575 10353 2021-03-13 11131 58\n","1060 275080 15574 2021-03-13 670 11\n","1059 120517 9550 2021-03-13 32456 100\n","1058 15045 6115 2021-03-13 22830 100\n","1057 92904 10135 2021-03-13 3709 71\n","... ... ... ... ... ...\n","542914 484870 9157 2021-08-22 9435 6\n","542913 8428 5732 2021-08-22 6570 100\n","542912 818134 11505 2021-08-22 60 0\n","542923 314358 14111 2021-08-22 2590 35\n","547624 755517 5693 2021-08-22 6174 88\n","\n","[547625 rows x 5 columns]"]},"execution_count":43,"metadata":{},"output_type":"execute_result"}],"source":["interactions_df"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"tQKdXPRl6C5f"},"outputs":[],"source":["pop_model = PopularRecommender(days=30, dt_column='last_watch_dt',\n"," with_filter=True)\n","pop_model.fit(interactions_df)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":238},"executionInfo":{"elapsed":8734,"status":"ok","timestamp":1642188532800,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"ec2JpRUH6LZv","outputId":"66f7fa3a-78dc-4051-b1f4-59a7b374e4dd"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," \n"," \n"," \n"," \n"," 4 \n"," 10000 \n"," [10440, 9728, 15297, 13865, 3734, 12192, 4151,... \n"," \n"," \n"," 0 \n"," 20000 \n"," [10440, 9728, 15297, 13865, 3734, 12192, 4151,... \n"," \n"," \n"," 1 \n"," 728808 \n"," [10440, 9728, 15297, 13865, 12192, 4151, 11863... \n"," \n"," \n"," 2 \n"," 622570 \n"," [10440, 9728, 15297, 13865, 12192, 4151, 11863... \n"," \n"," \n"," 3 \n"," 133452 \n"," [10440, 9728, 15297, 13865, 3734, 12192, 4151,... \n"," \n"," \n"," 5 \n"," 341075 \n"," [10440, 9728, 15297, 13865, 3734, 12192, 4151,... \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id\n","4 10000 [10440, 9728, 15297, 13865, 3734, 12192, 4151,...\n","0 20000 [10440, 9728, 15297, 13865, 3734, 12192, 4151,...\n","1 728808 [10440, 9728, 15297, 13865, 12192, 4151, 11863...\n","2 622570 [10440, 9728, 15297, 13865, 12192, 4151, 11863...\n","3 133452 [10440, 9728, 15297, 13865, 3734, 12192, 4151,...\n","5 341075 [10440, 9728, 15297, 13865, 3734, 12192, 4151,..."]},"execution_count":45,"metadata":{},"output_type":"execute_result"}],"source":["recs_popular = pop_model.recommend_with_filter(interactions_df, idx_for_popular, top_K=10)\n","recs_popular"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"KTbSaiIyRBgu"},"outputs":[],"source":["all_recs = pd.concat([boost_recs, recs_popular], axis=0)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"rLyH2YZaShKv"},"outputs":[],"source":["def fill_with_popular(recs, pop_model_fitted, interactions_df, top_K=10):\n","    \"\"\"\n","    Brings every recommendation list to top_K items.\n","\n","    Lists longer than top_K are cut to their first top_K items; shorter\n","    (or empty) lists are padded with recommendations from the popular\n","    model, skipping items the list already contains.\n","\n","    Parameters\n","    ----------\n","    recs : pd.DataFrame\n","        Columns 'user_id' and 'item_id', where 'item_id' holds a list\n","        of items per user. The frame is not modified.\n","    pop_model_fitted : object\n","        Must provide recommend_with_filter(interactions_df, user_ids,\n","        top_K=...) returning a frame with 'user_id' and 'item_id' (list)\n","        columns.\n","    interactions_df : pd.DataFrame\n","        Passed through to recommend_with_filter.\n","    top_K : int\n","        Required length of each recommendation list.\n","\n","    Returns\n","    -------\n","    pd.DataFrame\n","        Columns 'user_id' and 'item_id'.\n","    \"\"\"\n","    # Work on a copy so the helper 'len' column does not leak to the caller\n","    recs = recs.copy()\n","    recs['len'] = recs['item_id'].apply(len)\n","\n","    # Long enough lists only need trimming; use top_K rather than a fixed 10\n","    recs_good = recs[recs['len'] >= top_K].copy()\n","    recs_good['item_id'] = recs_good['item_id'].apply(lambda x: x[:top_K])\n","\n","    recs_bad = recs[recs['len'] < top_K].copy()\n","    if recs_bad.empty:\n","        # Nothing to pad, so the popular model is not queried at all\n","        return recs_good[['user_id', 'item_id']]\n","\n","    idx_for_filling = recs_bad['user_id'].unique()\n","    filling_recs = pop_model_fitted.recommend_with_filter(\n","        interactions_df, idx_for_filling, top_K=top_K)\n","    recs_bad = recs_bad.join(filling_recs.set_index('user_id'),\n","                             on='user_id', how='left', rsuffix='1')\n","    # Append popular items that are not in the list yet, then cut to top_K.\n","    # The Series reuses recs_bad's own index so the assignment is positional\n","    # even when index labels repeat (e.g. after pd.concat of two frames).\n","    recs_bad['item_id'] = pd.Series(\n","        [(own + [item for item in popular if item not in own])[:top_K]\n","         for own, popular in zip(recs_bad['item_id'], recs_bad['item_id1'])],\n","        index=recs_bad.index)\n","    total_recs = pd.concat([recs_good[['user_id', 'item_id']],\n","                            recs_bad[['user_id', 'item_id']]], axis=0)\n","    return total_recs"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":8980,"status":"ok","timestamp":1642188541766,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"397i5e_fmiHS","outputId":"eb66e6ee-3a57-451c-9c22-07a629ee8f4b"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," \n"," \n"," \n"," \n"," 0 \n"," 30 \n"," [16986, 199262, 203105, 199886, 219904, 203206... \n"," \n"," \n"," 1 \n"," 55 \n"," [12232, 7634, 6489, 15987, 14556, 5573, 15058,... \n"," \n"," \n"," 2 \n"," 106 \n"," [8821, 10700, 10497, 3399, 9154, 3629, 12189, ... \n"," \n"," \n"," 3 \n"," 144 \n"," [79668, 85771, 79780, 100360, 87071, 80158, 14... \n"," \n"," \n"," 4 \n"," 155 \n"," [10747, 2236, 67784, 78954, 139975, 137705, 22... \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 22054 \n"," 1087746 \n"," [366, 4784, 33316, 63977, 10440, 9728, 15297, ... \n"," \n"," \n"," 22137 \n"," 1092833 \n"," [15355, 198132, 191636, 50599, 177761, 10440, ... \n"," \n"," \n"," 22159 \n"," 1093784 \n"," [296, 124311, 20002, 219743, 10440, 9728, 1529... \n"," \n"," \n"," 22160 \n"," 1093836 \n"," [1343, 11710, 3254, 1967, 3356, 5292, 70331, 2... \n"," \n"," \n"," 22171 \n"," 1094683 \n"," [15355, 198132, 191636, 50599, 177761, 10440, ... \n"," \n"," \n","
\n","
22238 rows × 2 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id\n","0 30 [16986, 199262, 203105, 199886, 219904, 203206...\n","1 55 [12232, 7634, 6489, 15987, 14556, 5573, 15058,...\n","2 106 [8821, 10700, 10497, 3399, 9154, 3629, 12189, ...\n","3 144 [79668, 85771, 79780, 100360, 87071, 80158, 14...\n","4 155 [10747, 2236, 67784, 78954, 139975, 137705, 22...\n","... ... ...\n","22054 1087746 [366, 4784, 33316, 63977, 10440, 9728, 15297, ...\n","22137 1092833 [15355, 198132, 191636, 50599, 177761, 10440, ...\n","22159 1093784 [296, 124311, 20002, 219743, 10440, 9728, 1529...\n","22160 1093836 [1343, 11710, 3254, 1967, 3356, 5292, 70331, 2...\n","22171 1094683 [15355, 198132, 191636, 50599, 177761, 10440, ...\n","\n","[22238 rows x 2 columns]"]},"execution_count":48,"metadata":{},"output_type":"execute_result"}],"source":["# Filling short recommendations with popular items\n","all_recs = fill_with_popular(all_recs, pop_model, interactions_df)\n","all_recs"]},{"cell_type":"markdown","metadata":{"id":"unhZ55xCzSII"},"source":["## Baseline\n","\n","Popularity based model\n","\n","Ref: [Official baseline tutorial](https://github.com/recohut/notebooks/blob/main/extras/mts_baseline.ipynb)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LT8NuO96TICh"},"outputs":[],"source":["def calculate_novelty(train_interactions, recommendations, top_n): \n"," users = recommendations['user_id'].unique()\n"," n_users = train_interactions['user_id'].nunique()\n"," n_users_per_item = train_interactions.groupby('item_id')['user_id'].nunique()\n","\n"," recommendations = recommendations.loc[recommendations['rank'] <= top_n].copy()\n"," recommendations['n_users_per_item'] = recommendations['item_id'].map(n_users_per_item)\n"," recommendations['n_users_per_item'] = recommendations['n_users_per_item'].fillna(1)\n"," recommendations['item_novelty'] = -np.log2(recommendations['n_users_per_item'] / n_users)\n","\n"," item_novelties = recommendations[['user_id', 'rank', 'item_novelty']]\n"," \n","
miuf_at_k = item_novelties.loc[item_novelties['rank'] <= top_n, ['user_id', 'item_novelty']]\n"," miuf_at_k = miuf_at_k.groupby('user_id').agg('mean').squeeze()\n","\n"," return miuf_at_k.reindex(users).mean()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"MujfY8TjTICi"},"outputs":[],"source":["def compute_metrics(train, test, recs, top_N):\n"," result = {}\n"," test_recs = test.set_index(['user_id', 'item_id']).join(recs.set_index(['user_id', 'item_id']))\n"," test_recs = test_recs.sort_values(by=['user_id', 'rank'])\n","\n"," test_recs['users_item_count'] = test_recs.groupby(level='user_id')['rank'].transform(np.size)\n"," test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)\n"," test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1\n"," test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']\n"," \n"," users_count = test_recs.index.get_level_values('user_id').nunique()\n","\n"," for k in range(1, top_N + 1):\n"," hit_k = f'hit@{k}'\n"," test_recs[hit_k] = test_recs['rank'] <= k\n"," result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count\n"," result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count\n"," \n"," result[f'MAP@{top_N}'] = (test_recs['cumulative_rank'] / test_recs['users_item_count']).sum() / users_count\n"," result[f'Novelty@{top_N}'] = calculate_novelty(train, recs, top_N)\n"," \n"," return pd.Series(result)"]},{"cell_type":"markdown","metadata":{"id":"P28xd48xTICz"},"source":["### Example on one fold"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"uJVqVnskTIC0"},"outputs":[],"source":["test = interactions_df[interactions_df['last_watch_dt'] == interactions_df['last_watch_dt'].max()]\n","train = interactions_df[interactions_df['last_watch_dt'] < interactions_df['last_watch_dt'].max()]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"yWAZei5ETIC1"},"outputs":[],"source":["pop_model = 
PopularRecommender(days=7, dt_column='last_watch_dt')\n","pop_model.fit(train)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":14,"status":"ok","timestamp":1642188574386,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"t9hL7kFfTIC2","outputId":"fbf7c514-37eb-4f69-8510-b71704236b99"},"outputs":[{"data":{"text/plain":["array([ 9728, 15297, 10440, 13865, 12360, 14488, 12192, 512, 341,\n"," 3734])"]},"execution_count":54,"metadata":{},"output_type":"execute_result"}],"source":["top10_recs = pop_model.recommend()\n","top10_recs"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"TKmJ8SydTIC3"},"outputs":[],"source":["item_titles = pd.Series(items_df['title'].values, index=items_df['item_id']).to_dict()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":15,"status":"ok","timestamp":1642188574389,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"3n6RAbh6TIC4","outputId":"6b113f1b-2cbc-4d0f-9c81-bf20fcb16787"},"outputs":[{"data":{"text/plain":["['гнев человеческий',\n"," 'клиника счастья',\n"," 'хрустальный',\n"," 'девятаев',\n"," 'круэлла',\n"," 'мастер меча',\n"," 'фемида видит',\n"," 'рядовой чээрин',\n"," 'лето - это море',\n"," 'прабабушка легкого поведения']"]},"execution_count":56,"metadata":{},"output_type":"execute_result"}],"source":["list(map(item_titles.get, top10_recs))"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":11,"status":"ok","timestamp":1642188574390,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"lVk4_qpvTIC4","outputId":"87000191-3012-4a90-e132-7969e06fa538"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," \n"," \n"," \n"," \n"," 0 \n"," 936370 \n"," [9728, 15297, 10440, 13865, 12360, 14488, 1219... \n"," \n"," \n"," 1 \n"," 279776 \n"," [9728, 15297, 10440, 13865, 12360, 14488, 1219... \n"," \n"," \n"," 2 \n"," 321739 \n"," [9728, 15297, 10440, 13865, 12360, 14488, 1219... \n"," \n"," \n"," 3 \n"," 98693 \n"," [9728, 15297, 10440, 13865, 12360, 14488, 1219... \n"," \n"," \n"," 4 \n"," 267998 \n"," [9728, 15297, 10440, 13865, 12360, 14488, 1219... \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id\n","0 936370 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n","1 279776 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n","2 321739 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n","3 98693 [9728, 15297, 10440, 13865, 12360, 14488, 1219...\n","4 267998 [9728, 15297, 10440, 13865, 12360, 14488, 1219..."]},"execution_count":57,"metadata":{},"output_type":"execute_result"}],"source":["recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n","top_N = 10\n","recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n","recs.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"UPjTuarETIC5"},"outputs":[],"source":["recs = recs.explode('item_id')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":426},"executionInfo":{"elapsed":9,"status":"ok","timestamp":1642188576837,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"XqjGRg15TIC6","outputId":"25c15140-b328-4cb9-d875-8f6733f89229"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," rank \n"," \n"," \n"," \n"," \n"," 0 \n"," 936370 \n"," 9728 \n"," 1 \n"," \n"," \n"," 0 \n"," 936370 \n"," 15297 \n"," 2 \n"," \n"," \n"," 0 \n"," 936370 \n"," 10440 \n"," 3 \n"," \n"," \n"," 0 \n"," 936370 \n"," 13865 \n"," 4 \n"," \n"," \n"," 0 \n"," 936370 \n"," 12360 \n"," 5 \n"," \n"," \n"," 0 \n"," 936370 \n"," 14488 \n"," 6 \n"," \n"," \n"," 0 \n"," 936370 \n"," 12192 \n"," 7 \n"," \n"," \n"," 0 \n"," 936370 \n"," 512 \n"," 8 \n"," \n"," \n"," 0 \n"," 936370 \n"," 341 \n"," 9 \n"," \n"," \n"," 0 \n"," 936370 \n"," 3734 \n"," 10 \n"," \n"," \n"," 1 \n"," 279776 \n"," 9728 \n"," 1 \n"," \n"," \n"," 1 \n"," 279776 \n"," 15297 \n"," 2 \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id rank\n","0 936370 9728 1\n","0 936370 15297 2\n","0 936370 10440 3\n","0 936370 13865 4\n","0 936370 12360 5\n","0 936370 14488 6\n","0 936370 12192 7\n","0 936370 512 8\n","0 936370 341 9\n","0 936370 3734 10\n","1 279776 9728 1\n","1 279776 15297 2"]},"execution_count":59,"metadata":{},"output_type":"execute_result"}],"source":["recs['rank'] = recs.groupby('user_id').cumcount() + 1\n","recs.head(top_N + 2)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2593,"status":"ok","timestamp":1642188579423,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"Yp74HHqnTIC6","outputId":"5a94f5ac-7cdd-4bc4-a50e-3ce1d97bdc7a"},"outputs":[{"data":{"text/plain":["Precision@1 0.034862\n","Recall@1 0.033231\n","Precision@2 0.033945\n","Recall@2 0.065418\n","Precision@3 0.032875\n","Recall@3 0.095387\n","Precision@4 0.029128\n","Recall@4 0.112564\n","Precision@5 0.023425\n","Recall@5 0.113175\n","Precision@6 0.022273\n","Recall@6 0.128721\n","Precision@7 0.021669\n","Recall@7 0.145846\n","Precision@8 0.019897\n","Recall@8 0.152727\n","Precision@9 0.018926\n","Recall@9 0.163532\n","Precision@10 0.018211\n","Recall@10 0.174618\n","MAP@10 0.071974\n","Novelty@10 6.242784\n","dtype: float64"]},"execution_count":60,"metadata":{},"output_type":"execute_result"}],"source":["compute_metrics(train, test, recs, 10)"]},{"cell_type":"markdown","metadata":{"id":"_3I9v8q7UlYk"},"source":["### Fold validation\n","\n","Let's take the last 3 weeks of our data and validate on them sequentially (one week per test fold).
Don't forget about the cold start problem."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":688,"status":"ok","timestamp":1642188582610,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"8VkbYiXhTIC8","outputId":"19421d57-7726-4866-e826-ec8a7298d36b"},"outputs":[{"data":{"text/plain":["(Timestamp('2021-08-01 00:00:00'), Timestamp('2021-08-22 00:00:00'))"]},"execution_count":61,"metadata":{},"output_type":"execute_result"}],"source":["last_date = interactions_df['last_watch_dt'].max().normalize()\n","folds = 3\n","start_date = last_date - pd.Timedelta(days=folds*7)\n","start_date, last_date"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1642188583154,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"1ByOzE3HTIC9","outputId":"573e4e37-8d82-4e81-fe1c-986cf0e7a466"},"outputs":[{"data":{"text/plain":["(3, 3)"]},"execution_count":62,"metadata":{},"output_type":"execute_result"}],"source":["cv = TimeRangeSplit(start_date=start_date, periods=folds+1, freq='W')\n","\n","cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='last_watch_dt')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1642188585451,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"cwvzIFYcTIC9","outputId":"9edcb93e-acb9-406b-9e10-4e019c598275"},"outputs":[{"data":{"text/plain":["DatetimeIndex(['2021-08-01', '2021-08-08', '2021-08-15', '2021-08-22'], 
dtype='datetime64[ns]', freq='W-SUN')"]},"execution_count":63,"metadata":{},"output_type":"execute_result"}],"source":["cv.date_range"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2877,"status":"ok","timestamp":1642188588772,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"6-Ridv5hTIC-","outputId":"b3ad2cd1-5a25-4a09-8c61-0bab9774fe19"},"outputs":[{"name":"stdout","output_type":"stream","text":["Already seen number: 0\n","Already seen number: 0\n","Already seen number: 0\n"]}],"source":["folds_with_stats = list(cv.split(\n"," interactions_df, \n"," user_column='user_id',\n"," item_column='item_id',\n"," datetime_column='last_watch_dt',\n"," fold_stats=True\n","))\n","\n","folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":257},"executionInfo":{"elapsed":13,"status":"ok","timestamp":1642188588773,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"pPHoISGQTIC_","outputId":"e3595ffb-eaee-4cc4-ad09-e400a4f71543"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," Start date \n"," End date \n"," Train \n"," New users \n"," New users interactions \n"," New items \n"," New items interactions \n"," Known interactions \n"," Test \n"," \n"," \n"," \n"," \n"," 0 \n"," 2021-08-01 \n"," 2021-08-08 \n"," 420915 \n"," 19360 \n"," 22608 \n"," 166 \n"," 907 \n"," 0 \n"," 14717 \n"," \n"," \n"," 1 \n"," 2021-08-08 \n"," 2021-08-15 \n"," 459147 \n"," 19615 \n"," 22955 \n"," 136 \n"," 609 \n"," 0 \n"," 15979 \n"," \n"," \n"," 2 \n"," 2021-08-15 \n"," 2021-08-22 \n"," 498690 \n"," 20501 \n"," 24032 \n"," 99 \n"," 476 \n"," 0 \n"," 17371 \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" Start date End date ... Known interactions Test\n","0 2021-08-01 2021-08-08 ... 0 14717\n","1 2021-08-08 2021-08-15 ... 0 15979\n","2 2021-08-15 2021-08-22 ... 0 17371\n","\n","[3 rows x 9 columns]"]},"execution_count":65,"metadata":{},"output_type":"execute_result"}],"source":["folds_info_with_stats"]},{"cell_type":"markdown","metadata":{"id":"oMuGmqVBTIC_"},"source":["### Popular on folds"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Q1bXpzaMTIC_"},"outputs":[],"source":["top_N = 10\n","last_n_days = 7"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Y9guCzRqTIDA","scrolled":true},"outputs":[],"source":["final_results = []\n","validation_results = pd.DataFrame()\n","\n","for train_idx, test_idx, info in folds_with_stats:\n"," train = interactions_df.loc[train_idx]\n"," test = interactions_df.loc[test_idx]\n"," \n"," pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')\n"," pop_model.fit(train)\n","\n"," recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n"," recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n"," recs = recs.explode('item_id')\n"," recs['rank'] = recs.groupby('user_id').cumcount() + 1\n","\n"," fold_result = compute_metrics(train, test, recs, top_N)\n","\n"," validation_results = validation_results.append(fold_result, ignore_index=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":30,"status":"ok","timestamp":1642188603077,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"cyFK7eHhTIDA","outputId":"fc4c35a5-3c20-400f-c48d-4690dee2f79b"},"outputs":[{"data":{"text/plain":["MAP@10 0.039814\n","Novelty@10 5.778481\n","dtype: float64"]},"execution_count":68,"metadata":{},"output_type":"execute_result"}],"source":["validation_results.agg({'MAP@10':'mean', 
'Novelty@10':'mean'})"]},{"cell_type":"markdown","metadata":{"id":"hcFFZpFATIDA"},"source":["### Popular Prediction\n","\n","Let's see whether it makes sense to recommend popular items separately for each socio-demographic group."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"sfqxhgZuTIDB"},"outputs":[],"source":["train_idx, test_idx, info = folds_with_stats[0]\n","train = interactions_df.loc[train_idx]\n","test = interactions_df.loc[test_idx]\n","date_window_for_popular = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)\n","train_slice = pd.merge(train[train['last_watch_dt'] >= date_window_for_popular], users_df, on='user_id', how='left')"]},{"cell_type":"markdown","metadata":{"id":"ydpUgqh6TIDB"},"source":["Some users have no features, so we need to define placeholder values for them."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":270},"executionInfo":{"elapsed":27,"status":"ok","timestamp":1642188603078,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"tH4n5EAnTIDC","outputId":"e8d1f2bd-23ef-4b83-b0bb-b91a08f4eac5"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," last_watch_dt \n"," total_dur \n"," watched_pct \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," user_watch_cnt_all \n"," user_watch_cnt_last_14 \n"," \n"," \n"," \n"," \n"," 0 \n"," 689871 \n"," 6404 \n"," 2021-07-24 \n"," 905 \n"," 16 \n"," age_45_54 \n"," income_20_40 \n"," M \n"," False \n"," 1.0 \n"," 0.0 \n"," 1.0 \n"," 0.0 \n"," \n"," \n"," 1 \n"," 482718 \n"," 2624 \n"," 2021-07-24 \n"," 1898 \n"," 25 \n"," age_18_24 \n"," income_40_60 \n"," F \n"," False \n"," 1.0 \n"," 0.0 \n"," 4.0 \n"," 3.0 \n"," \n"," \n"," 2 \n"," 183195 \n"," 11239 \n"," 2021-07-24 \n"," 1037 \n"," 14 \n"," age_35_44 \n"," income_20_40 \n"," F \n"," True \n"," 5.0 \n"," 0.0 \n"," 5.0 \n"," 0.0 \n"," \n"," \n"," 3 \n"," 1077534 \n"," 4457 \n"," 2021-07-24 \n"," 151 \n"," 2 \n"," age_25_34 \n"," income_20_40 \n"," M \n"," False \n"," 0.0 \n"," 0.0 \n"," 0.0 \n"," 0.0 \n"," \n"," \n"," 4 \n"," 274241 \n"," 16228 \n"," 2021-07-24 \n"," 19306 \n"," 18 \n"," age_65_inf \n"," income_20_40 \n"," F \n"," False \n"," 4.0 \n"," 0.0 \n"," 4.0 \n"," 0.0 \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id ... user_watch_cnt_all user_watch_cnt_last_14\n","0 689871 6404 ... 1.0 0.0\n","1 482718 2624 ... 4.0 3.0\n","2 183195 11239 ... 5.0 0.0\n","3 1077534 4457 ... 0.0 0.0\n","4 274241 16228 ... 4.0 0.0\n","\n","[5 rows x 13 columns]"]},"execution_count":70,"metadata":{},"output_type":"execute_result"}],"source":["train_slice.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"b8VbywpUTIDC"},"outputs":[],"source":["train_slice.fillna({'age':'age_unknown',\n"," 'sex':'sex_unknown',\n"," 'income': 'income_unknown',\n"," 'kids_flg': False\n"," }, inplace=True)"]},{"cell_type":"markdown","metadata":{"id":"X8edftA_TIDD"},"source":["For example, you can compute popular items separately by age, gender and presence of children (the cells below group by age, sex and income)."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":270},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1642188605384,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"D-q6NC2ZTIDD","outputId":"57c555d5-544d-4e78-f25b-e13d5b58436f"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," last_watch_dt \n"," total_dur \n"," watched_pct \n"," age \n"," income \n"," sex \n"," kids_flg \n"," boost_user_watch_cnt_all \n"," boost_user_watch_cnt_last_14 \n"," user_watch_cnt_all \n"," user_watch_cnt_last_14 \n"," \n"," \n"," \n"," \n"," 0 \n"," 689871 \n"," 6404 \n"," 2021-07-24 \n"," 905 \n"," 16 \n"," age_45_54 \n"," income_20_40 \n"," M \n"," False \n"," 1.0 \n"," 0.0 \n"," 1.0 \n"," 0.0 \n"," \n"," \n"," 1 \n"," 482718 \n"," 2624 \n"," 2021-07-24 \n"," 1898 \n"," 25 \n"," age_18_24 \n"," income_40_60 \n"," F \n"," False \n"," 1.0 \n"," 0.0 \n"," 4.0 \n"," 3.0 \n"," \n"," \n"," 2 \n"," 183195 \n"," 11239 \n"," 2021-07-24 \n"," 1037 \n"," 14 \n"," age_35_44 \n"," income_20_40 \n"," F \n"," True \n"," 5.0 \n"," 0.0 \n"," 5.0 \n"," 0.0 \n"," \n"," \n"," 3 \n"," 1077534 \n"," 4457 \n"," 2021-07-24 \n"," 151 \n"," 2 \n"," age_25_34 \n"," income_20_40 \n"," M \n"," False \n"," 0.0 \n"," 0.0 \n"," 0.0 \n"," 0.0 \n"," \n"," \n"," 4 \n"," 274241 \n"," 16228 \n"," 2021-07-24 \n"," 19306 \n"," 18 \n"," age_65_inf \n"," income_20_40 \n"," F \n"," False \n"," 4.0 \n"," 0.0 \n"," 4.0 \n"," 0.0 \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id ... user_watch_cnt_all user_watch_cnt_last_14\n","0 689871 6404 ... 1.0 0.0\n","1 482718 2624 ... 4.0 3.0\n","2 183195 11239 ... 5.0 0.0\n","3 1077534 4457 ... 0.0 0.0\n","4 274241 16228 ... 4.0 0.0\n","\n","[5 rows x 13 columns]"]},"execution_count":72,"metadata":{},"output_type":"execute_result"}],"source":["train_slice.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"zFTxZZNBTIDD"},"outputs":[],"source":["soc_dem_recommendations = train_slice.groupby(\n"," ['age', 'sex', 'income', 'item_id']\n",").size().to_frame().reset_index()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1642188606950,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"66aG48_cTIDD","outputId":"d8da5e9a-c3c5-4b4c-fbaf-c971ec676690"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," age \n"," sex \n"," income \n"," item_id \n"," 0 \n"," \n"," \n"," \n"," \n"," 0 \n"," age_18_24 \n"," F \n"," income_0_20 \n"," 14 \n"," 1 \n"," \n"," \n"," 1 \n"," age_18_24 \n"," F \n"," income_0_20 \n"," 111 \n"," 1 \n"," \n"," \n"," 2 \n"," age_18_24 \n"," F \n"," income_0_20 \n"," 162 \n"," 1 \n"," \n"," \n"," 3 \n"," age_18_24 \n"," F \n"," income_0_20 \n"," 288 \n"," 1 \n"," \n"," \n"," 4 \n"," age_18_24 \n"," F \n"," income_0_20 \n"," 334 \n"," 1 \n"," \n"," \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," ... \n"," \n"," \n"," 18651 \n"," age_unknown \n"," sex_unknown \n"," income_unknown \n"," 16488 \n"," 1 \n"," \n"," \n"," 18652 \n"," age_unknown \n"," sex_unknown \n"," income_unknown \n"," 16498 \n"," 1 \n"," \n"," \n"," 18653 \n"," age_unknown \n"," sex_unknown \n"," income_unknown \n"," 16499 \n"," 3 \n"," \n"," \n"," 18654 \n"," age_unknown \n"," sex_unknown \n"," income_unknown \n"," 16509 \n"," 21 \n"," \n"," \n"," 18655 \n"," age_unknown \n"," sex_unknown \n"," income_unknown \n"," 16516 \n"," 1 \n"," \n"," \n","
\n","
18656 rows × 5 columns
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" age sex income item_id 0\n","0 age_18_24 F income_0_20 14 1\n","1 age_18_24 F income_0_20 111 1\n","2 age_18_24 F income_0_20 162 1\n","3 age_18_24 F income_0_20 288 1\n","4 age_18_24 F income_0_20 334 1\n","... ... ... ... ... ..\n","18651 age_unknown sex_unknown income_unknown 16488 1\n","18652 age_unknown sex_unknown income_unknown 16498 1\n","18653 age_unknown sex_unknown income_unknown 16499 3\n","18654 age_unknown sex_unknown income_unknown 16509 21\n","18655 age_unknown sex_unknown income_unknown 16516 1\n","\n","[18656 rows x 5 columns]"]},"execution_count":74,"metadata":{},"output_type":"execute_result"}],"source":["soc_dem_recommendations"]},{"cell_type":"markdown","metadata":{"id":"MMRvQGlxTIDE"},"source":["Now you just need to select for each user the most popular top_n objects in his group"]},{"cell_type":"markdown","metadata":{"id":"qxY3Q_uETIDE"},"source":["We can check this option on folds\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-Nm6HPYhTIDF"},"outputs":[],"source":["validation_results = pd.DataFrame()\n","\n","for train_idx, test_idx, info in folds_with_stats:\n"," train = interactions_df.loc[train_idx]\n"," test = interactions_df.loc[test_idx]\n"," date_window = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)\n"," train_slice = pd.merge(train[train['last_watch_dt'] >= date_window], users_df, on='user_id', how='left')\n"," \n"," train_slice.fillna({\n"," 'age':'age_unknown',\n"," 'sex':'sex_unknown',\n"," 'income': 'income_unknown',\n"," 'kids_flg': False\n"," },inplace=True)\n"," \n"," soc_dem_recommendations = train_slice.groupby(\n"," ['age', 'sex', 'income', 'item_id']\n"," ).size().to_frame().reset_index()\n"," \n"," top_soc_dem = []\n","\n"," for age in soc_dem_recommendations.age.unique():\n"," for income in soc_dem_recommendations.income.unique():\n"," for sex in soc_dem_recommendations.sex.unique():\n"," top_items = soc_dem_recommendations[\n"," (soc_dem_recommendations.age 
== age)\n"," & (soc_dem_recommendations.income == income)\n"," & (soc_dem_recommendations.sex == sex)].sort_values(0, ascending=False).head(10).item_id.values\n"," top_soc_dem.append([age, income, sex, top_items])\n","\n"," top_soc_dem = pd.DataFrame(top_soc_dem, columns = ['age', 'income', 'sex', 'item_id'])\n"," \n"," recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n"," recs = pd.merge(recs[['user_id']], users_df, on='user_id', how='left')\n"," recs.fillna({\n"," 'age':'age_unknown',\n"," 'sex':'sex_unknown',\n"," 'income': 'income_unknown',\n"," 'kids_flg': False\n"," }, inplace=True)\n"," \n"," recs = pd.merge(recs, top_soc_dem, on = ['age', 'sex', 'income'], how = 'left')\n"," recs = recs.drop(columns = ['age', 'sex', 'income'])\n"," \n"," recs = recs.explode('item_id')\n"," recs['rank'] = recs.groupby('user_id').cumcount() + 1\n"," fold_result = compute_metrics(train, test, recs, top_N)\n"," \n"," validation_results = validation_results.append(fold_result, ignore_index=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1642188624221,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"k81s04vuTIDF","outputId":"b6d5d848-4ce9-4b88-aa4f-294094d0396c"},"outputs":[{"data":{"text/plain":["MAP@10 0.040677\n","Novelty@10 6.050588\n","dtype: float64"]},"execution_count":76,"metadata":{},"output_type":"execute_result"}],"source":["validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})"]},{"cell_type":"markdown","metadata":{"id":"8vbHhlUiTIDG"},"source":["In this approach, the choices to tune are the user features used to define the groups and the number of days over which popularity is computed."]},{"cell_type":"markdown","metadata":{"id":"OPv2gBKETIDG"},"source":["###
Tfidf"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"pitmMjB2TIDH"},"outputs":[],"source":["users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))\n","users_mapping = {v: k for k, v in users_inv_mapping.items()}\n","\n","items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))\n","items_mapping = {v: k for k, v in items_inv_mapping.items()}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"MQtorexITIDH"},"outputs":[],"source":["validation_results = pd.DataFrame()\n","\n","for train_idx, test_idx, info in folds_with_stats:\n"," train = interactions_df.loc[train_idx]\n","\n"," date_window = train['last_watch_dt'].max() - pd.DateOffset(days=60)\n"," train = train[train['last_watch_dt'] >= date_window]\n","\n"," test = interactions_df.loc[test_idx]\n","\n"," train_mat = get_coo_matrix(\n"," train,\n"," users_mapping=users_mapping,\n"," items_mapping=items_mapping,\n"," ).tocsr()\n","\n"," model = TFIDFRecommender(K=top_N)\n"," model.fit(train_mat.T, show_progress=False) \n","\n"," mapper = generate_implicit_recs_mapper( \n"," model,\n"," train_mat,\n"," top_N,\n"," users_mapping,\n"," items_inv_mapping,\n"," filter_already_liked_items=True\n"," )\n","\n"," recs = pd.DataFrame({'user_id': test['user_id'].unique()})\n"," recs['item_id'] = recs['user_id'].map(mapper)\n"," recs = recs.explode('item_id')\n"," recs['rank'] = recs.groupby('user_id').cumcount() + 1\n"," fold_result = compute_metrics(train, test, recs, top_N)\n","\n"," validation_results = validation_results.append(fold_result, ignore_index=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":27,"status":"ok","timestamp":1642188699563,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"tm0_BSCLTIDI","outputId":"2aa86238-a2d5-4f3a-9190-ea77c87dc56b"},"outputs":[{"data":{"text/plain":["MAP@10 0.698575\n","Novelty@10 17.440547\n","dtype: float64"]},"execution_count":81,"metadata":{},"output_type":"execute_result"}],"source":["validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean',})"]},{"cell_type":"markdown","metadata":{"id":"dzI0rVytTIDI"},"source":["Simply using the code above for submission won't work due to cold users. We'll have to figure out how to process them."]},{"cell_type":"markdown","metadata":{"id":"4d54eqKGTIDI"},"source":["### Predictions"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"OOswWzXxWxGK"},"outputs":[],"source":["# NOTE: despite the names, `random_items` and `cold_items` hold user ids.\n","# Draw from the distinct user ids with a fixed seed so the sample is\n","# reproducible and cannot contain the same user twice.\n","random_items = list(\n","    np.random.default_rng(42).choice(\n","        interactions_df['user_id'].unique(), size=5, replace=False\n","    )\n",")\n","# presumably ids with no interactions, used to exercise the cold-user path -- verify\n","cold_items = [10000, 20000]\n","random_items.extend(cold_items)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":25,"status":"ok","timestamp":1642188699565,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"SAbO_8EFXAgY","outputId":"11260a68-a229-48e2-a724-5e5d52019917"},"outputs":[{"data":{"text/plain":["[754950, 758416, 83485, 636568, 669127, 10000, 20000]"]},"execution_count":83,"metadata":{},"output_type":"execute_result"}],"source":["random_items"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"bZd04bj0TIDJ"},"outputs":[],"source":["train = interactions_df\n","test = random_items\n","\n","pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')\n","pop_model.fit(train)\n","\n","recs = pd.DataFrame({'user_id': pd.Series(test).unique()})\n","recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)\n","# One row per (user, item), then back to one list per user; groupby also\n","# sorts the result by user_id. No rank column is built here because the\n","# aggregation below keeps only item_id.\n","recs = recs.explode('item_id')\n","recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":23,"status":"ok","timestamp":1642188699568,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"hmb6rWr2TIDJ","outputId":"3a1a7d85-1d2a-4be6-993a-c02b902e18f1"},"outputs":[{"data":{"text/html":["\n"," \n","
\n","
\n","\n","
\n"," \n"," \n"," \n"," user_id \n"," item_id \n"," \n"," \n"," \n"," \n"," 0 \n"," 10000 \n"," [9728, 15297, 10440, 14488, 13865, 12192, 341,... \n"," \n"," \n"," 1 \n"," 20000 \n"," [9728, 15297, 10440, 14488, 13865, 12192, 341,... \n"," \n"," \n"," 2 \n"," 83485 \n"," [9728, 15297, 10440, 14488, 13865, 12192, 341,... \n"," \n"," \n"," 3 \n"," 636568 \n"," [9728, 15297, 10440, 14488, 13865, 12192, 341,... \n"," \n"," \n"," 4 \n"," 669127 \n"," [9728, 15297, 10440, 14488, 13865, 12192, 341,... \n"," \n"," \n","
\n","
\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "],"text/plain":[" user_id item_id\n","0 10000 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n","1 20000 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n","2 83485 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n","3 636568 [9728, 15297, 10440, 14488, 13865, 12192, 341,...\n","4 669127 [9728, 15297, 10440, 14488, 13865, 12192, 341,..."]},"execution_count":85,"metadata":{},"output_type":"execute_result"}],"source":["recs.head()"]},{"cell_type":"markdown","metadata":{"id":"YT7-dpYKEqub"},"source":["---"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":714,"status":"ok","timestamp":1642188912940,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"},"user_tz":-330},"id":"SwzKjSnTFmfa","outputId":"490c855f-1a59-440e-8529-0690e102aa2c"},"outputs":[{"name":"stdout","output_type":"stream","text":["numpy 1.19.5\n","pandas 1.1.5\n","Sparsh A. 
\n","last updated: 2022-01-14 19:35:09 \n","\n","implicit 0.4.8\n","catboost 1.0.4\n","recohut 0.0.11\n","\n","compiler : GCC 7.5.0\n","system : Linux\n","release : 5.4.144+\n","machine : x86_64\n","processor : x86_64\n","CPU cores : 2\n","interpreter: 64bit\n"]}],"source":["!pip install -q watermark\n","%reload_ext watermark\n","%watermark -a \"Sparsh A.\" -m -iv -u -t -d -p implicit,catboost,recohut"]},{"cell_type":"markdown","metadata":{"id":"VaKjWG8IEquj"},"source":["---"]},{"cell_type":"markdown","metadata":{"id":"c1vxSboeEquj"},"source":["**END**"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyNLNtjY3nwpcDeDocgy8rzQ","collapsed_sections":["1KypvcFZI64_"],"mount_file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","name":"itempop and two-stage recommender on mts data","provenance":[{"file_id":"1FEZmnoLGIsTsGiK2gi1TsIHLAaWCXF_a","timestamp":1642188758676}],"toc_visible":true},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"widgets":{"application/vnd.jupyter.widget-state+json":{"1150ae9b77d04ee089151cdf9b3c97fd":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_cd203aa954364d05a2d9197f04bac18a","IPY_MODEL_7d5a993575214d4189c013bb12fc9080","IPY_MODEL_552747c596b440929459610765a70c67"],"layout":"IPY_MODEL_6404fcf9360b4c8bafac3d0c62dc7d58"}},"163ead0dd46c434ea3412e462a0938db":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":
"StyleView","bar_color":null,"description_width":""}},"1cfcd7deb21741cc95481b1ece102ced":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"552747c596b440929459610765a70c67":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_1cfcd7deb21741cc95481b1ece102ced","placeholder":"","style":"IPY_MODEL_6773a61987794d45bf453ac8a8f78a34","value":" 266854/266854 [00:23<00:00, 
14342.15it/s]"}},"6404fcf9360b4c8bafac3d0c62dc7d58":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6773a61987794d45bf453ac8a8f78a34":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6b9c40da36c746169708e1251c893b47":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"7d5a993575214d4189c013bb12fc9080":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"Flo
atProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_b6183320801a4148b75f6b36b65a9b13","max":266854,"min":0,"orientation":"horizontal","style":"IPY_MODEL_163ead0dd46c434ea3412e462a0938db","value":266854}},"b6183320801a4148b75f6b36b65a9b13":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"cd203aa954364d05a2d9197f04bac18a":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e6f
b3757a3db480aa1be1f0a91e19f4d","placeholder":"","style":"IPY_MODEL_6b9c40da36c746169708e1251c893b47","value":"100%"}},"e6fb3757a3db480aa1be1f0a91e19f4d":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}}}}},"nbformat":4,"nbformat_minor":0}