{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-24-group-data.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/T277604%20%7C%20Group%20Data%20Generator%20on%20ML-1m.ipynb","timestamp":1644668818466},{"file_id":"1xBn4HDiXTzxOUsCgEh9WymDFl9Ldo7O6","timestamp":1637861705275}],"collapsed_sections":[],"authorship_tag":"ABX9TyP5GYPb3ae+z6Hs3VmrlX/5"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"YhZ-Pzmo-jRE"},"source":["# Group Data Generator on ML-1m"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oWPJ3gsjEk7U","executionInfo":{"status":"ok","timestamp":1637860150517,"user_tz":-330,"elapsed":2746,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"7687619a-2332-4d95-a860-c95567ef379e"},"source":["!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["ml-1m.zip 100%[===================>] 5.64M 5.84MB/s in 1.0s \n"]}]},{"cell_type":"code","metadata":{"id":"9Q_85jBJEvf3"},"source":["import os\n","import shutil\n","import zipfile\n","\n","import numpy as np\n","import scipy.sparse as sp\n","from collections import Counter"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"BlVc0qp_E8GQ"},"source":["class GroupGenerator(object):\n"," \"\"\"\n"," Group Data Generator\n"," \"\"\"\n"," def __init__(self, data_path, output_path, rating_threshold, num_groups,\n"," group_sizes, min_num_ratings, train_ratio, val_ratio,\n"," negative_sample_size, verbose=False):\n"," self.rating_threshold = rating_threshold\n"," self.negative_sample_size = negative_sample_size\n"," users_path = os.path.join(data_path, 'users.dat')\n"," items_path = os.path.join(data_path, 'movies.dat')\n"," ratings_path = os.path.join(data_path, 'ratings.dat')\n","\n"," users = self.load_users_file(users_path)\n"," items = self.load_items_file(items_path)\n"," rating_mat, timestamp_mat = \\\n"," self.load_ratings_file(ratings_path, max(users), max(items))\n","\n"," groups, group_ratings, groups_rated_items_dict, groups_rated_items_set = \\\n"," self.generate_group_ratings(users, rating_mat, timestamp_mat,\n"," num_groups=num_groups,\n"," group_sizes=group_sizes,\n"," min_num_ratings=min_num_ratings)\n"," members, group_ratings_train, group_ratings_val, group_ratings_test, \\\n"," group_negative_items_val, group_negative_items_test, \\\n"," user_ratings_train, user_ratings_val, user_ratings_test, \\\n"," user_negative_items_val, user_negative_items_test = \\\n"," self.split_ratings(group_ratings, rating_mat, timestamp_mat,\n"," groups, groups_rated_items_dict, groups_rated_items_set,\n"," train_ratio=train_ratio, val_ratio=val_ratio)\n","\n"," groups_path = os.path.join(output_path, 'groupMember.dat')\n"," group_ratings_train_path = os.path.join(output_path, 'groupRatingTrain.dat')\n"," group_ratings_val_path = os.path.join(output_path, 'groupRatingVal.dat')\n"," group_ratings_test_path = os.path.join(output_path, 'groupRatingTest.dat')\n"," group_negative_items_val_path = os.path.join(output_path, 'groupRatingValNegative.dat')\n"," group_negative_items_test_path = os.path.join(output_path, 'groupRatingTestNegative.dat')\n"," user_ratings_train_path = os.path.join(output_path, 'userRatingTrain.dat')\n"," user_ratings_val_path = os.path.join(output_path, 'userRatingVal.dat')\n"," user_ratings_test_path = os.path.join(output_path, 'userRatingTest.dat')\n"," user_negative_items_val_path = os.path.join(output_path, 'userRatingValNegative.dat')\n"," user_negative_items_test_path = os.path.join(output_path, 'userRatingTestNegative.dat')\n","\n"," self.save_groups(groups_path, groups)\n"," self.save_ratings(group_ratings_train, group_ratings_train_path)\n"," self.save_ratings(group_ratings_val, group_ratings_val_path)\n"," self.save_ratings(group_ratings_test, group_ratings_test_path)\n"," self.save_negative_samples(group_negative_items_val, group_negative_items_val_path)\n"," self.save_negative_samples(group_negative_items_test, group_negative_items_test_path)\n"," self.save_ratings(user_ratings_train, user_ratings_train_path)\n"," self.save_ratings(user_ratings_val, user_ratings_val_path)\n"," self.save_ratings(user_ratings_test, user_ratings_test_path)\n"," self.save_negative_samples(user_negative_items_val, user_negative_items_val_path)\n"," self.save_negative_samples(user_negative_items_test, user_negative_items_test_path)\n"," shutil.copyfile(src=os.path.join(data_path, 'movies.dat'), dst=os.path.join(output_path, 'movies.dat'))\n"," shutil.copyfile(src=os.path.join(data_path, 'users.dat'), dst=os.path.join(output_path, 'users.dat'))\n","\n"," if verbose:\n"," num_group_ratings = len(group_ratings)\n"," num_user_ratings = len(user_ratings_train) + len(user_ratings_val) + len(user_ratings_test)\n"," num_rated_items = len(groups_rated_items_set)\n","\n"," print('Save data: ' + output_path)\n"," print('# Users: ' + str(len(members)))\n"," print('# Items: ' + str(num_rated_items))\n"," print('# Groups: ' + str(len(groups)))\n"," print('# U-I ratings: ' + str(num_user_ratings))\n"," print('# G-I ratings: ' + str(num_group_ratings))\n"," print('Avg. # ratings / user: {:.2f}'.format(num_user_ratings / len(members)))\n"," print('Avg. # ratings / group: {:.2f}'.format(num_group_ratings / len(groups)))\n"," print('Avg. group size: {:.2f}'.format(np.mean(list(map(len, groups)))))\n","\n"," def load_users_file(self, users_path):\n"," users = []\n","\n"," with open(users_path, 'r') as file:\n"," for line in file.readlines():\n"," users.append(int(line.split('::')[0]))\n","\n"," return users\n","\n"," def load_items_file(self, items_path):\n"," items = []\n","\n"," with open(items_path, 'r', encoding='iso-8859-1') as file:\n"," for line in file.readlines():\n"," items.append(int(line.split('::')[0]))\n","\n"," return items\n","\n"," def load_ratings_file(self, ratings_path, max_num_users, max_num_items):\n"," rating_mat = sp.dok_matrix((max_num_users + 1, max_num_items + 1),\n"," dtype=np.int)\n"," timestamp_mat = rating_mat.copy()\n","\n"," with open(ratings_path, 'r') as file:\n"," for line in file.readlines():\n"," arr = line.replace('\\n', '').split('::')\n"," user, item, rating, timestamp = \\\n"," int(arr[0]), int(arr[1]), int(arr[2]), int(arr[3])\n"," rating_mat[user, item] = rating\n"," timestamp_mat[user, item] = timestamp\n","\n"," return rating_mat, timestamp_mat\n","\n"," def generate_group_ratings(self, users, rating_mat, timestamp_mat,\n"," num_groups, group_sizes, min_num_ratings):\n"," np.random.seed(0)\n"," groups = set()\n"," groups_ratings = []\n"," groups_rated_items_dict = {}\n"," groups_rated_items_set = set()\n","\n"," while len(groups) < num_groups:\n"," group_id = len(groups) + 1\n","\n"," while True:\n"," group = tuple(np.sort(\n"," np.random.choice(users, np.random.choice(group_sizes),\n"," replace=False)))\n"," if group not in groups:\n"," break\n","\n"," pos_group_rating_counter = Counter()\n"," neg_group_rating_counter = Counter()\n"," group_rating_list = []\n"," group_rated_items = set()\n","\n"," for member in group:\n"," _, items = rating_mat[member, :].nonzero()\n"," pos_items = [item for item in items\n"," if rating_mat[member, item] >= self.rating_threshold]\n"," neg_items = [item for item in items\n"," if rating_mat[member, item] < self.rating_threshold]\n"," pos_group_rating_counter.update(pos_items)\n"," neg_group_rating_counter.update(neg_items)\n","\n"," for item, num_ratings in pos_group_rating_counter.items():\n"," if num_ratings == len(group):\n"," timestamp = max([timestamp_mat[member, item]\n"," for member in group])\n"," group_rated_items.add(item)\n"," group_rating_list.append((group_id, item, 1, timestamp))\n","\n"," for item, num_ratings in neg_group_rating_counter.items():\n"," if (num_ratings == len(group)) \\\n"," or (num_ratings + pos_group_rating_counter[item] == len(group)):\n"," timestamp = max([timestamp_mat[member, item]\n"," for member in group])\n"," group_rated_items.add(item)\n"," group_rating_list.append((group_id, item, 0, timestamp))\n","\n"," if len(group_rating_list) >= min_num_ratings:\n"," groups.add(group)\n"," groups_rated_items_dict[group_id] = group_rated_items\n"," groups_rated_items_set.update(group_rated_items)\n"," for group_rating in group_rating_list:\n"," groups_ratings.append(group_rating)\n","\n"," return list(groups), groups_ratings, groups_rated_items_dict, groups_rated_items_set\n","\n"," def split_ratings(self, group_ratings, rating_mat, timestamp_mat,\n"," groups, groups_rated_items_dict, groups_rated_items_set, train_ratio, val_ratio):\n"," num_group_ratings = len(group_ratings)\n"," num_train = int(num_group_ratings * train_ratio)\n"," num_test = int(num_group_ratings * (1 - train_ratio - val_ratio))\n","\n"," group_ratings = \\\n"," sorted(group_ratings, key=lambda group_rating: group_rating[-1])\n"," group_ratings_train = group_ratings[:num_train]\n"," group_ratings_val = group_ratings[num_train:-num_test]\n"," group_ratings_test = group_ratings[-num_test:]\n","\n"," timestamp_split_train = group_ratings_train[-1][-1]\n"," timestamp_split_val = group_ratings_val[-1][-1]\n","\n"," user_ratings_train = []\n"," user_ratings_val = []\n"," user_ratings_test = []\n","\n"," members = set()\n"," users_rated_items_dict = {}\n","\n"," for group in groups:\n"," for member in group:\n"," if member in members:\n"," continue\n"," members.add(member)\n"," user_rated_items = set()\n"," _, items = rating_mat[member, :].nonzero()\n"," for item in items:\n"," if item not in groups_rated_items_set:\n"," continue\n"," user_rated_items.add(item)\n"," if rating_mat[member, item] >= self.rating_threshold:\n"," rating_tuple = (member, item, 1,\n"," timestamp_mat[member, item])\n"," else:\n"," rating_tuple = (member, item, 0,\n"," timestamp_mat[member, item])\n"," if timestamp_mat[member, item] <= timestamp_split_train:\n"," user_ratings_train.append(rating_tuple)\n"," elif timestamp_split_train < timestamp_mat[member, item] <= timestamp_split_val:\n"," user_ratings_val.append(rating_tuple)\n"," else:\n"," user_ratings_test.append(rating_tuple)\n","\n"," users_rated_items_dict[member] = user_rated_items\n","\n"," np.random.seed(0)\n","\n"," user_negative_items_val = self.get_negative_samples(\n"," user_ratings_val, groups_rated_items_set, users_rated_items_dict)\n"," user_negative_items_test = self.get_negative_samples(\n"," user_ratings_test, groups_rated_items_set, users_rated_items_dict)\n"," group_negative_items_val = self.get_negative_samples(\n"," group_ratings_val, groups_rated_items_set, groups_rated_items_dict)\n"," group_negative_items_test = self.get_negative_samples(\n"," group_ratings_test, groups_rated_items_set, groups_rated_items_dict)\n","\n"," return members, group_ratings_train, group_ratings_val, group_ratings_test, \\\n"," group_negative_items_val, group_negative_items_test, \\\n"," user_ratings_train, user_ratings_val, user_ratings_test, \\\n"," user_negative_items_val, user_negative_items_test\n","\n"," def get_negative_samples(self, ratings, groups_rated_items_set, rated_items_dict):\n"," negative_items_list = []\n"," for sample in ratings:\n"," sample_id, item, _, _ = sample\n"," missed_items = groups_rated_items_set - rated_items_dict[sample_id]\n"," negative_items = \\\n"," np.random.choice(list(missed_items), self.negative_sample_size,\n"," replace=(len(missed_items) < self.negative_sample_size))\n"," negative_items_list.append((sample_id, item, negative_items))\n"," return negative_items_list\n","\n"," def save_groups(self, groups_path, groups):\n"," with open(groups_path, 'w') as file:\n"," for i, group in enumerate(groups):\n"," file.write(str(i + 1) + ' '\n"," + ','.join(map(str, list(group))) + '\\n')\n","\n"," def save_ratings(self, ratings, ratings_path):\n"," with open(ratings_path, 'w') as file:\n"," for rating in ratings:\n"," file.write(' '.join(map(str, list(rating))) + '\\n')\n","\n"," def save_negative_samples(self, negative_items, negative_items_path):\n"," with open(negative_items_path, 'w') as file:\n"," for samples in negative_items:\n"," user, item, negative_items = samples\n"," file.write('({},{}) '.format(user, item)\n"," + ' '.join(map(str, list(negative_items))) + '\\n')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UMKVslZjF3lX","executionInfo":{"status":"ok","timestamp":1637927961707,"user_tz":-330,"elapsed":446,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"5c7a7409-16fe-4963-870e-0db6fd7445e8"},"source":["import pandas as pd\n","files_info = pd.DataFrame({\n"," 'Files': ['movies.dat', 'users.dat', 'groupMember.dat',\n"," 'group(user)RatingTrain.dat', 'group(user)RatingVal(Test).dat',\n"," 'group(user)RatingVal(Test)Negative.dat'],\n"," 'Description': ['Movie information file from MovieLens-1M',\n"," 'User information file from MovieLens-1M',\n"," 'File including group members. Each line is a group instance: groupID userID1,userID2,...',\n"," 'Train file. Each line is a training instance: groupID(userID) itemID rating timestamp',\n"," 'group (user) validation (test) file (positive instances). Each line is a validation (test) instance: groupID(userID) itemID rating timestamp',\n"," 'group (user) validation (test) file (negative instances). Each line corresponds to the line of group(user)RatingVal(Test).dat, containing 100 negative samples. Each line is in the format: (groupID(userID),itemID) negativeItemID1, negativeItemID2, ...',\n"," ]})\n","print(files_info.to_markdown())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["| | Files | Description |\n","|---:|:---------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n","| 0 | movies.dat | Movie information file from MovieLens-1M |\n","| 1 | users.dat | User information file from MovieLens-1M |\n","| 2 | groupMember.dat | File including group members. Each line is a group instance: groupID userID1,userID2,... |\n","| 3 | group(user)RatingTrain.dat | Train file. Each line is a training instance: groupID(userID) itemID rating timestamp |\n","| 4 | group(user)RatingVal(Test).dat | group (user) validation (test) file (positive instances). Each line is a validation (test) instance: groupID(userID) itemID rating timestamp |\n","| 5 | group(user)RatingVal(Test)Negative.dat | group (user) validation (test) file (negative instances). Each line corresponds to the line of group(user)RatingVal(Test).dat, containing 100 negative samples. Each line is in the format: (groupID(userID),itemID) negativeItemID1, negativeItemID2, ... |\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":35},"id":"20qArY6_J6jG","executionInfo":{"status":"ok","timestamp":1637861569716,"user_tz":-330,"elapsed":741,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"e6a1db75-c09d-4412-8819-2c60f86d1b73"},"source":["print('\\nTakes approx. 5 mins...')"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'Takes approx. 5 mins...'"]},"metadata":{},"execution_count":19}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_sprYbwKFPUU","executionInfo":{"status":"ok","timestamp":1637861506641,"user_tz":-330,"elapsed":306120,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"bed68e1c-961c-4011-f139-4b88d2764565"},"source":["data_folder_path = '.'\n","data_path = os.path.join(data_folder_path, 'ml-1m')\n","data_zip_path = os.path.join(data_folder_path, 'ml-1m.zip')\n","output_path = os.path.join(data_folder_path, 'MovieLens-Rand')\n","\n","if not os.path.exists(data_path):\n"," with zipfile.ZipFile(data_zip_path, 'r') as data_zip:\n"," data_zip.extractall(data_folder_path)\n"," print('Unzip file: ' + data_zip_path)\n","\n","if not os.path.exists(output_path):\n"," os.mkdir(output_path)\n","\n","group_generator = GroupGenerator(data_path, output_path,\n"," rating_threshold=4,\n"," num_groups=1000,\n"," group_sizes=[2, 3, 4, 5],\n"," min_num_ratings=20,\n"," train_ratio=0.7,\n"," val_ratio=0.1,\n"," negative_sample_size=100,\n"," verbose=True)"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Save data: ./MovieLens-Rand\n","# Users: 1626\n","# Items: 1998\n","# Groups: 1000\n","# U-I ratings: 438129\n","# G-I ratings: 53248\n","Avg. # ratings / user: 269.45\n","Avg. # ratings / group: 53.25\n","Avg. group size: 2.19\n"]}]},{"cell_type":"markdown","metadata":{"id":"8eSxb6TyKSdx"},"source":["---"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cf5sqHBSF8P3","executionInfo":{"status":"ok","timestamp":1637861586624,"user_tz":-330,"elapsed":6161,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"e0c7938e-5981-41d8-8a4e-c0ea9a5ee7a7"},"source":["!apt-get -qq install tree"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Selecting previously unselected package tree.\n","(Reading database ... 155222 files and directories currently installed.)\n","Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n","Unpacking tree (1.7.0-5) ...\n","Setting up tree (1.7.0-5) ...\n","Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GThrPK-YGG87","executionInfo":{"status":"ok","timestamp":1637861619498,"user_tz":-330,"elapsed":617,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"cf426ef4-1182-4b0d-d2ee-da706c080baa"},"source":["!tree --du -h -C ."],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[01;34m.\u001b[00m\n","├── [ 24M] \u001b[01;34mml-1m\u001b[00m\n","│   ├── [167K] movies.dat\n","│   ├── [ 23M] ratings.dat\n","│   ├── [5.4K] README\n","│   └── [131K] users.dat\n","├── [5.6M] \u001b[01;31mml-1m.zip\u001b[00m\n","└── [ 55M] \u001b[01;34mMovieLens-Rand\u001b[00m\n"," ├── [ 14K] groupMember.dat\n"," ├── [218K] groupRatingTest.dat\n"," ├── [4.9M] groupRatingTestNegative.dat\n"," ├── [749K] groupRatingTrain.dat\n"," ├── [107K] groupRatingVal.dat\n"," ├── [2.5M] groupRatingValNegative.dat\n"," ├── [167K] movies.dat\n"," ├── [1.1M] userRatingTest.dat\n"," ├── [ 24M] userRatingTestNegative.dat\n"," ├── [7.4M] userRatingTrain.dat\n"," ├── [561K] userRatingVal.dat\n"," ├── [ 13M] userRatingValNegative.dat\n"," └── [131K] users.dat\n","\n"," 84M used in 2 directories, 18 files\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8v_qPkSRKSd0","executionInfo":{"status":"ok","timestamp":1637861656761,"user_tz":-330,"elapsed":2890,"user":{"displayName":"Sparsh Agarwal","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"13037694610922482904"}},"outputId":"069cc3d9-357d-4713-c191-0bfcfed8923d"},"source":["!pip install -q watermark\n","%reload_ext watermark\n","%watermark -a \"Sparsh A.\" -m -iv -u -t -d"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Author: Sparsh A.\n","\n","Last updated: 2021-11-25 17:34:14\n","\n","Compiler : GCC 7.5.0\n","OS : Linux\n","Release : 5.4.104+\n","Machine : x86_64\n","Processor : x86_64\n","CPU cores : 2\n","Architecture: 64bit\n","\n","IPython: 5.5.0\n","numpy : 1.19.5\n","scipy : 1.4.1\n","\n"]}]},{"cell_type":"markdown","metadata":{"id":"0OwxdA_cKSd1"},"source":["---"]},{"cell_type":"markdown","metadata":{"id":"gfOimpkwKSd1"},"source":["**END**"]}]}