{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-24-movielens-data-split.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/T273370%20%7C%20MovieLens%20Data%20Splitting.ipynb","timestamp":1644668893373},{"file_id":"1d01gFWcCm9PtVSYcAPKW8aKdNs8IDmN6","timestamp":1630677495334}],"collapsed_sections":[],"mount_file_id":"1d01gFWcCm9PtVSYcAPKW8aKdNs8IDmN6","authorship_tag":"ABX9TyMf1MNBKD3QFsOS/ZXU7G//"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# MovieLens Data Splitting"],"metadata":{"id":"ACNt2GFq6ItJ"}},{"cell_type":"markdown","metadata":{"id":"Hx4U0cpYvhih"},"source":["### Data Split"]},{"cell_type":"code","metadata":{"id":"9hZnrRahvp2b"},"source":["import time\n","import math\n","import numpy as np\n","import pandas as pd\n","from sklearn.model_selection import train_test_split"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"GwEypSUXwO6B"},"source":["def _check_and_convert_ratio(test_size, multi_ratios):\n"," if not test_size and not multi_ratios:\n"," raise ValueError(\"must provide either 'test_size' or 'multi_ratios'\")\n","\n"," elif test_size is not None:\n"," assert isinstance(test_size, float), \"test_size must be float value\"\n"," assert 0.0 < test_size < 1.0, \"test_size must be in (0.0, 1.0)\"\n"," ratios = [1 - test_size, test_size]\n"," return ratios, 2\n","\n"," elif isinstance(multi_ratios, (list, tuple)):\n"," assert len(multi_ratios) > 1, (\n"," \"multi_ratios must at least have two elements\")\n"," assert all([r > 0.0 for r in multi_ratios]), (\n"," \"ratios should be positive values\")\n"," if math.fsum(multi_ratios) != 1.0:\n"," ratios = [r / math.fsum(multi_ratios) for r in multi_ratios]\n"," else:\n"," ratios = multi_ratios\n"," return ratios, len(ratios)\n","\n"," else:\n"," raise ValueError(\"multi_ratios should be list or tuple\")\n","\n","\n","def _filter_unknown_user_item(data_list):\n"," train_data = data_list[0]\n"," unique_values = dict(user=set(train_data.user.tolist()),\n"," item=set(train_data.item.tolist()))\n","\n"," split_data_all = [train_data]\n"," for i, test_data in enumerate(data_list[1:], start=1):\n"," # print(f\"Non_train_data {i} size before filtering: {len(test_data)}\")\n"," out_of_bounds_row_indices = set()\n"," for col in [\"user\", \"item\"]:\n"," for j, val in enumerate(test_data[col]):\n"," if val not in unique_values[col]:\n"," out_of_bounds_row_indices.add(j)\n","\n"," mask = np.arange(len(test_data))\n"," test_data_clean = test_data[~np.isin(\n"," mask, list(out_of_bounds_row_indices))]\n"," split_data_all.append(test_data_clean)\n"," # print(f\"Non_train_data {i} size after filtering: \"\n"," # f\"{len(test_data_clean)}\")\n"," return split_data_all\n","\n","\n","def _pad_unknown_user_item(data_list):\n"," train_data, test_data = data_list\n"," n_users = train_data.user.nunique()\n"," n_items = train_data.item.nunique()\n"," unique_users = set(train_data.user.tolist())\n"," unique_items = set(train_data.item.tolist())\n","\n"," split_data_all = [train_data]\n"," for i, test_data in enumerate(data_list[1:], start=1):\n"," test_data.loc[~test_data.user.isin(unique_users), \"user\"] = n_users\n"," test_data.loc[~test_data.item.isin(unique_items), \"item\"] = n_items\n"," split_data_all.append(test_data)\n"," return split_data_all\n","\n","\n","def _groupby_user(user_indices, order):\n"," sort_kind = \"mergesort\" if order else \"quicksort\"\n"," users, user_position, user_counts = np.unique(user_indices,\n"," return_inverse=True,\n"," return_counts=True)\n"," user_split_indices = np.split(np.argsort(user_position, kind=sort_kind),\n"," np.cumsum(user_counts)[:-1])\n"," return user_split_indices"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"SeQjDRWjwBqP"},"source":["def random_split(data, test_size=None, multi_ratios=None, shuffle=True,\n"," filter_unknown=True, pad_unknown=False, seed=42):\n"," ratios, n_splits = _check_and_convert_ratio(test_size, multi_ratios)\n"," if not isinstance(ratios, list):\n"," ratios = list(ratios)\n","\n"," # if we want to split data in multiple folds,\n"," # then iteratively split data based on modified ratios\n"," train_data = data.copy()\n"," split_data_all = []\n"," for i in range(n_splits - 1):\n"," size = ratios.pop(-1)\n"," ratios = [r / math.fsum(ratios) for r in ratios]\n"," train_data, split_data = train_test_split(train_data,\n"," test_size=size,\n"," shuffle=shuffle,\n"," random_state=seed)\n"," split_data_all.insert(0, split_data)\n"," split_data_all.insert(0, train_data) # insert final fold of data\n","\n"," if filter_unknown:\n"," split_data_all = _filter_unknown_user_item(split_data_all)\n"," elif pad_unknown:\n"," split_data_all = _pad_unknown_user_item(split_data_all)\n"," return split_data_all\n","\n","\n","def split_by_ratio(data, order=True, shuffle=False, test_size=None,\n"," multi_ratios=None, filter_unknown=True, pad_unknown=False,\n"," seed=42):\n"," np.random.seed(seed)\n"," assert (\"user\" in data.columns), \"data must contains user column\"\n"," ratios, n_splits = _check_and_convert_ratio(test_size, multi_ratios)\n","\n"," n_users = data.user.nunique()\n"," user_indices = data.user.to_numpy()\n"," user_split_indices = _groupby_user(user_indices, order)\n","\n"," cum_ratios = np.cumsum(ratios).tolist()[:-1]\n"," split_indices_all = [[] for _ in range(n_splits)]\n"," for u in range(n_users):\n"," u_data = user_split_indices[u]\n"," u_data_len = len(u_data)\n"," if u_data_len <= 3: # keep items of rare users in trainset\n"," split_indices_all[0].extend(u_data)\n"," else:\n"," u_split_data = np.split(u_data, [\n"," round(cum * u_data_len) for cum in cum_ratios\n"," ])\n"," for i in range(n_splits):\n"," split_indices_all[i].extend(list(u_split_data[i]))\n","\n"," if shuffle:\n"," split_data_all = tuple(\n"," np.random.permutation(data[idx]) for idx in split_indices_all)\n"," else:\n"," split_data_all = list(data.iloc[idx] for idx in split_indices_all)\n","\n"," if filter_unknown:\n"," split_data_all = _filter_unknown_user_item(split_data_all)\n"," elif pad_unknown:\n"," split_data_all = _pad_unknown_user_item(split_data_all)\n"," return split_data_all\n","\n","\n","def split_by_num(data, order=True, shuffle=False, test_size=1,\n"," filter_unknown=True, pad_unknown=False, seed=42):\n"," np.random.seed(seed)\n"," assert (\"user\" in data.columns), \"data must contains user column\"\n"," assert isinstance(test_size, int), \"test_size must be int value\"\n"," assert 0 < test_size < len(data), \"test_size must be in (0, len(data))\"\n","\n"," n_users = data.user.nunique()\n"," user_indices = data.user.to_numpy()\n"," user_split_indices = _groupby_user(user_indices, order)\n","\n"," train_indices = []\n"," test_indices = []\n"," for u in range(n_users):\n"," u_data = user_split_indices[u]\n"," u_data_len = len(u_data)\n"," if u_data_len <= 3: # keep items of rare users in trainset\n"," train_indices.extend(u_data)\n"," elif u_data_len <= test_size:\n"," train_indices.extend(u_data[:-1])\n"," test_indices.extend(u_data[-1:])\n"," else:\n"," k = test_size\n"," train_indices.extend(u_data[:(u_data_len-k)])\n"," test_indices.extend(u_data[-k:])\n","\n"," if shuffle:\n"," train_indices = np.random.permutation(train_indices)\n"," test_indices = np.random.permutation(test_indices)\n","\n"," split_data_all = (data.iloc[train_indices], data.iloc[test_indices])\n"," if filter_unknown:\n"," split_data_all = _filter_unknown_user_item(split_data_all)\n"," elif pad_unknown:\n"," split_data_all = _pad_unknown_user_item(split_data_all)\n"," return split_data_all\n","\n","\n","def split_by_ratio_chrono(data, order=True, shuffle=False, test_size=None,\n"," multi_ratios=None, seed=42):\n"," assert all([\n"," \"user\" in data.columns,\n"," \"time\" in data.columns\n"," ]), \"data must contains user and time column\"\n","\n"," data.sort_values(by=[\"time\"], inplace=True)\n"," data.reset_index(drop=True, inplace=True)\n"," return split_by_ratio(**locals())\n","\n","\n","def split_by_num_chrono(data, order=True, shuffle=False, test_size=1, seed=42):\n"," assert all([\n"," \"user\" in data.columns,\n"," \"time\" in data.columns\n"," ]), \"data must contains user and time column\"\n","\n"," data.sort_values(by=[\"time\"], inplace=True)\n"," data.reset_index(drop=True, inplace=True)\n"," return split_by_num(**locals())"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":886},"id":"_ANES5MHySvI","executionInfo":{"status":"ok","timestamp":1630675873079,"user_tz":-330,"elapsed":919,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"1a6b0dbf-4973-473e-a325-3c1e41a4d6dd"},"source":["data = pd.read_csv('sample_movielens_merged.csv')\n","data"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
useritemlabeltimesexageoccupationgenre1genre2genre3
046172603964137835F256actionadventurefantasy
1461724684964137948F256actioncomedyromance
246175935964138086F256dramathrillermissing
346172962964138229F256crimedramamissing
446176082964138310F256crimedramathriller
546177803964138459F256actionsci-fiwar
6461716433964138734F256dramaromancemissing
746174403964138734F256comedyromancemissing
8461715694964138754F256comedyromancemissing
9461717321964138882F256comedycrimemystery
1037064362966279993M2512dramathrillermissing
11370617722966280154M2512actioncomedymusical
12370633343966280523M2512crimedramafilm-noir
13370611365966376465M2512comedymissingmissing
14370613944966376516M2512comedymissingmissing
15213739485974639801F110comedymissingmissing
16213712153974640099F110actionadventurecomedy
17213713564974640343F110actionadventuresci-fi
18213720211974640436F110fantasysci-fimissing
1921377805974640455F110actionsci-fiwar
20213720124974640506F110comedysci-fiwestern
21213710375974640534F110actionsci-fithriller
22213727013974640720F110actionsci-fiwestern
232137344974641074F110children'scomedydrama
2421377484974641742F110actionsci-fithriller
25213737455974641844F110adventureanimationsci-fi
26213737935974641844F110actionsci-fimissing
\n","
"],"text/plain":[" user item label time ... occupation genre1 genre2 genre3\n","0 4617 260 3 964137835 ... 6 action adventure fantasy\n","1 4617 2468 4 964137948 ... 6 action comedy romance\n","2 4617 593 5 964138086 ... 6 drama thriller missing\n","3 4617 296 2 964138229 ... 6 crime drama missing\n","4 4617 608 2 964138310 ... 6 crime drama thriller\n","5 4617 780 3 964138459 ... 6 action sci-fi war\n","6 4617 1643 3 964138734 ... 6 drama romance missing\n","7 4617 440 3 964138734 ... 6 comedy romance missing\n","8 4617 1569 4 964138754 ... 6 comedy romance missing\n","9 4617 1732 1 964138882 ... 6 comedy crime mystery\n","10 3706 436 2 966279993 ... 12 drama thriller missing\n","11 3706 1772 2 966280154 ... 12 action comedy musical\n","12 3706 3334 3 966280523 ... 12 crime drama film-noir\n","13 3706 1136 5 966376465 ... 12 comedy missing missing\n","14 3706 1394 4 966376516 ... 12 comedy missing missing\n","15 2137 3948 5 974639801 ... 10 comedy missing missing\n","16 2137 1215 3 974640099 ... 10 action adventure comedy\n","17 2137 1356 4 974640343 ... 10 action adventure sci-fi\n","18 2137 2021 1 974640436 ... 10 fantasy sci-fi missing\n","19 2137 780 5 974640455 ... 10 action sci-fi war\n","20 2137 2012 4 974640506 ... 10 comedy sci-fi western\n","21 2137 1037 5 974640534 ... 10 action sci-fi thriller\n","22 2137 2701 3 974640720 ... 10 action sci-fi western\n","23 2137 34 4 974641074 ... 10 children's comedy drama\n","24 2137 748 4 974641742 ... 10 action sci-fi thriller\n","25 2137 3745 5 974641844 ... 10 adventure animation sci-fi\n","26 2137 3793 5 974641844 ... 10 action sci-fi missing\n","\n","[27 rows x 10 columns]"]},"metadata":{},"execution_count":37}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"NhRMOUyKydVz","executionInfo":{"status":"ok","timestamp":1630675845213,"user_tz":-330,"elapsed":723,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"978b98a0-88d6-42f8-b2f8-1710916e071e"},"source":["data.info()"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","RangeIndex: 27 entries, 0 to 26\n","Data columns (total 10 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 user 27 non-null int64 \n"," 1 item 27 non-null int64 \n"," 2 label 27 non-null int64 \n"," 3 time 27 non-null int64 \n"," 4 sex 27 non-null object\n"," 5 age 27 non-null int64 \n"," 6 occupation 27 non-null int64 \n"," 7 genre1 27 non-null object\n"," 8 genre2 27 non-null object\n"," 9 genre3 27 non-null object\n","dtypes: int64(6), object(4)\n","memory usage: 2.2+ KB\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"JFq9ehHuymz4","executionInfo":{"status":"ok","timestamp":1630676818903,"user_tz":-330,"elapsed":727,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"3f2178df-e54d-447b-f754-b178bcb64869"},"source":["train_data, eval_data, test_data = random_split(data, multi_ratios=[0.5, 0.1, 0.1], seed=42,\n"," filter_unknown=False)\n","\n","train_data.shape, eval_data.shape, test_data.shape"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((19, 10), (4, 10), (4, 10))"]},"metadata":{},"execution_count":41}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":173},"id":"Qp7UwylEy4v6","executionInfo":{"status":"ok","timestamp":1630676823222,"user_tz":-330,"elapsed":13,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"9df2c1ee-401a-4535-b945-8417c6cf7bf1"},"source":["test_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
useritemlabeltimesexageoccupationgenre1genre2genre3
8461715694964138754F256comedyromancemissing
13370611365966376465M2512comedymissingmissing
9461717321964138882F256comedycrimemystery
21213710375974640534F110actionsci-fithriller
\n","
"],"text/plain":[" user item label time ... occupation genre1 genre2 genre3\n","8 4617 1569 4 964138754 ... 6 comedy romance missing\n","13 3706 1136 5 966376465 ... 12 comedy missing missing\n","9 4617 1732 1 964138882 ... 6 comedy crime mystery\n","21 2137 1037 5 974640534 ... 10 action sci-fi thriller\n","\n","[4 rows x 10 columns]"]},"metadata":{},"execution_count":42}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tnAklPyo6ha2","executionInfo":{"status":"ok","timestamp":1630677356407,"user_tz":-330,"elapsed":1731,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"cf8774b7-b70a-4912-b122-bcda12f51f6b"},"source":["train_data, eval_data, test_data = random_split(data,\n"," multi_ratios=[0.8, 0.1, 0.1],\n"," seed=42,\n"," filter_unknown=True,\n"," pad_unknown=False)\n","\n","train_data.shape, eval_data.shape, test_data.shape"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((21, 10), (0, 10), (0, 10))"]},"metadata":{},"execution_count":48}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49},"id":"wq2Q5IRq6hbM","executionInfo":{"status":"ok","timestamp":1630677358093,"user_tz":-330,"elapsed":1693,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"e40fe4dd-c2d3-4576-feb3-cdd4f6815620"},"source":["eval_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
useritemlabeltimesexageoccupationgenre1genre2genre3
\n","
"],"text/plain":["Empty DataFrame\n","Columns: [user, item, label, time, sex, age, occupation, genre1, genre2, genre3]\n","Index: []"]},"metadata":{},"execution_count":49}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"JFTnd-bHzMaF","executionInfo":{"status":"ok","timestamp":1630674989174,"user_tz":-330,"elapsed":725,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"18727508-ec39-4c12-885b-5532c051c87a"},"source":["train_data, eval_data = split_by_ratio(data, test_size=0.2)\n","\n","train_data.shape, eval_data.shape"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((80490, 10), (19473, 10))"]},"metadata":{},"execution_count":20}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":204},"id":"S82MAfhVzvRz","executionInfo":{"status":"ok","timestamp":1630675079062,"user_tz":-330,"elapsed":507,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"eb0c8abb-1ff5-460e-aa1d-28e3a97d2307"},"source":["eval_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
useritemlabeltimesexageoccupationgenre1genre2genre3
90449215444978300174M5616actionadventuresci-fi
90418323555978298430M2515animationchildren'scomedy
90382530792978246162M2520dramamissingmissing
90344637174978238371F509actioncrimemissing
90335715734978234874M351actionsci-fithriller
\n","
"],"text/plain":[" user item label ... genre1 genre2 genre3\n","90449 2 1544 4 ... action adventure sci-fi\n","90418 3 2355 5 ... animation children's comedy\n","90382 5 3079 2 ... drama missing missing\n","90344 6 3717 4 ... action crime missing\n","90335 7 1573 4 ... action sci-fi thriller\n","\n","[5 rows x 10 columns]"]},"metadata":{},"execution_count":24}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kQddKIvbzh8k","executionInfo":{"status":"ok","timestamp":1630675027448,"user_tz":-330,"elapsed":686,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"b1c8e59f-6f37-4746-b086-b680ae326aee"},"source":["train_data, eval_data = split_by_num(data, test_size=1)\n","\n","train_data.shape, eval_data.shape"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((95128, 10), (4882, 10))"]},"metadata":{},"execution_count":21}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"n01ZszMgzmpP","executionInfo":{"status":"ok","timestamp":1630675046442,"user_tz":-330,"elapsed":515,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"1019c798-da35-4782-e511-bcc5d74a425c"},"source":["train_data, eval_data = split_by_ratio_chrono(data, test_size=0.2)\n","\n","train_data.shape, eval_data.shape"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((80490, 10), (19392, 10))"]},"metadata":{},"execution_count":22}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":204},"id":"vJieaGB10En2","executionInfo":{"status":"ok","timestamp":1630675169223,"user_tz":-330,"elapsed":698,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"128d265c-a156-4440-9346-28a14f8ee5be"},"source":["eval_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
useritemlabeltimesexageoccupationgenre1genre2genre3
90449215444978300174M5616actionadventuresci-fi
90418323555978298430M2515animationchildren'scomedy
90382530792978246162M2520dramamissingmissing
90344637174978238371F509actioncrimemissing
90335715734978234874M351actionsci-fithriller
\n","
"],"text/plain":[" user item label ... genre1 genre2 genre3\n","90449 2 1544 4 ... action adventure sci-fi\n","90418 3 2355 5 ... animation children's comedy\n","90382 5 3079 2 ... drama missing missing\n","90344 6 3717 4 ... action crime missing\n","90335 7 1573 4 ... action sci-fi thriller\n","\n","[5 rows x 10 columns]"]},"metadata":{},"execution_count":25}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"a16L78A9zsC6","executionInfo":{"status":"ok","timestamp":1630675067401,"user_tz":-330,"elapsed":824,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"f07810ba-4b43-45ee-e555-2aed83654a95"},"source":["train_data, eval_data = split_by_num_chrono(data, test_size=1)\n","\n","train_data.shape, eval_data.shape"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["((95128, 10), (4880, 10))"]},"metadata":{},"execution_count":23}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":204},"id":"jl2LGa51vjBZ","executionInfo":{"status":"ok","timestamp":1630675191287,"user_tz":-330,"elapsed":497,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"19d1f059-d07d-405d-f1a4-a3be71ffa551"},"source":["eval_data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
useritemlabeltimesexageoccupationgenre1genre2genre3
90449215444978300174M5616actionadventuresci-fi
90418323555978298430M2515animationchildren'scomedy
90382530792978246162M2520dramamissingmissing
90344637174978238371F509actioncrimemissing
90335715734978234874M351actionsci-fithriller
\n","
"],"text/plain":[" user item label ... genre1 genre2 genre3\n","90449 2 1544 4 ... action adventure sci-fi\n","90418 3 2355 5 ... animation children's comedy\n","90382 5 3079 2 ... drama missing missing\n","90344 6 3717 4 ... action crime missing\n","90335 7 1573 4 ... action sci-fi thriller\n","\n","[5 rows x 10 columns]"]},"metadata":{},"execution_count":26}]}]}