{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "C299883 | Epinions Data Preprocessing",
"provenance": [],
"collapsed_sections": [],
"mount_file_id": "1mR-aJmCRlP6UAgs_VMhfoGgcXd3YU79T",
"authorship_tag": "ABX9TyNP3C4I3YRxVLuv/2Qdc87K"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Epinions Data Preprocessing\n",
"> Loading and transforming the Epinions user-item interaction dataset\n",
"\n",
"- toc: true\n",
"- badges: true\n",
"- comments: true\n",
"- categories: [data processing]"
],
"metadata": {
"id": "pej35mf5F4Rj"
}
},
{
"cell_type": "code",
"metadata": {
"id": "ErXSaQ9RGWV_"
},
"source": [
"# Pinned to the LensKit release recorded by the watermark cell at the end of this notebook;\n",
"# the cells below rely on the `lenskit.crossfold` module as it exists in that release.\n",
"!pip install -q lenskit==0.13.1"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hTa3ROMZGeYW",
"outputId": "a1962f18-7bdf-49b5-8581-8948ddf4cf72"
},
"source": [
"# -nc (no-clobber): skip the download when trust_data.txt already exists, so re-running this\n",
"# cell does not leave numbered duplicates (trust_data.txt.1, ...) next to the file read below.\n",
"!wget -q --show-progress -nc https://github.com/RecoHut-Datasets/epinions/raw/v1/trust_data.txt"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\rtrust_data.txt 0%[ ] 0 --.-KB/s \rtrust_data.txt 100%[===================>] 6.06M --.-KB/s in 0.08s \n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "aOkn7m2pGod4"
},
"source": [
"import json\n",
"import os\n",
"from typing import Tuple\n",
"\n",
"import lenskit.crossfold as xf\n",
"import numpy as np\n",
"import pandas as pd"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "id3lZcQXGyKY",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4afd4a84-9086-4292-85f9-576e5b8c5bef"
},
"source": [
"# Space-separated file without a header row; pandas generates the row index.\n",
"ratings = (\n",
"    pd.read_csv('trust_data.txt', sep=' ', header=None, index_col=None)\n",
"    # Discard columns that hold no values at all before naming the remaining ones.\n",
"    .dropna(axis=1, how='all')\n",
")\n",
"columns = ['user', 'item', 'rating']\n",
"ratings.columns = columns\n",
"print(ratings.head())"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" user item rating\n",
"0 22605 42915 1\n",
"1 22605 5052 1\n",
"2 22605 42913 1\n",
"3 22605 18420 1\n",
"4 22605 42914 1\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lnHJJIXLG7jd",
"outputId": "eac78cb0-e56e-4678-bdfc-ed80b339fc5c"
},
"source": [
"# Distinct users and items in the unfiltered data.\n",
"n_user = len(ratings['user'].unique())\n",
"n_item = len(ratings['item'].unique())\n",
"\n",
"counts_report = \"Num_of_users: {}\\nNum_of_items: {}\".format(n_user, n_item)\n",
"print(counts_report)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Num_of_users: 33960\n",
"Num_of_items: 49288\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "qWH3j4SNPTTI",
"outputId": "cf883704-3777-4ceb-e260-1b79bee9a14d"
},
"source": [
"# Rich (HTML) rendering of the first five rows.\n",
"ratings.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user | \n",
" item | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 22605 | \n",
" 42915 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 22605 | \n",
" 5052 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 22605 | \n",
" 42913 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 22605 | \n",
" 18420 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 22605 | \n",
" 42914 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user item rating\n",
"0 22605 42915 1\n",
"1 22605 5052 1\n",
"2 22605 42913 1\n",
"3 22605 18420 1\n",
"4 22605 42914 1"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LsdLwUbUIcrV",
"outputId": "0f667bfd-f457-4429-9014-c0f298bf8df5"
},
"source": [
"# Minimum number of interactions a user needs in order to be kept.\n",
"MIN_INTERACTIONS = 25\n",
"\n",
"# Count each user's rows once and reuse the result for both the mask and the index lookup.\n",
"interactions_per_user = ratings.user.value_counts()\n",
"active_users = interactions_per_user[interactions_per_user >= MIN_INTERACTIONS].index\n",
"df_25 = ratings[ratings.user.isin(active_users)].reset_index(drop=True)\n",
"\n",
"# The \\033[4m / \\033[0m escape codes switch underlining on and off around the heading.\n",
"print(\"\\033[4mCount after only keeping users with at least {} relevant interactions\\033[0m\".format(MIN_INTERACTIONS))\n",
"print(\"Num_of_users: {}\\nNum_of_items: {}\\nTotal_interactions: {}\".format(len(pd.unique(df_25.user)), len(pd.unique(df_25.item)), len(df_25)))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[4mCount after only keeping users with at least 25 relevant interactions\u001b[0m\n",
"Num_of_users: 4718\n",
"Num_of_items: 36165\n",
"Total_interactions: 346035\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qASWgDMHPFeK",
"outputId": "5535886d-5faf-49cf-bad2-c1d17a88adbf"
},
"source": [
"# Plain-text preview of the filtered frame after its index was reset.\n",
"print(df_25.head())"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" user item rating\n",
"0 2824 2696 1\n",
"1 2824 14915 1\n",
"2 2824 18333 1\n",
"3 2824 2143 1\n",
"4 2824 10308 1\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "vIIkD0GBJMfE"
},
"source": [
"def get_unique_id(data_pd: pd.DataFrame, column: str) -> Tuple[dict, pd.DataFrame]:\n",
"    \"\"\"\n",
"    Map the distinct values of ``column`` to consecutive integer ids.\n",
"\n",
"    Ids start at 0 and follow the order in which each value first appears in\n",
"    ``data_pd``. The input frame is not modified; a merged copy is returned.\n",
"\n",
"    :param data_pd: frame that contains ``column`` and no ``<column>_id`` column yet\n",
"    :param column: name of the column whose values are re-indexed\n",
"    :return: tuple ``(mapping, frame)`` where ``mapping`` is ``{value: id}`` and\n",
"        ``frame`` is ``data_pd`` with an additional ``<column>_id`` column\n",
"    :raises AssertionError: if ``<column>_id`` already exists in ``data_pd``\n",
"    \"\"\"\n",
"    new_column = '{}_id'.format(column)\n",
"    assert new_column not in data_pd.columns, '{} already exists'.format(new_column)\n",
"\n",
"    # One row per distinct value, in first-appearance order; the fresh 0..n-1\n",
"    # index of that frame becomes the id.\n",
"    lookup = data_pd[[column]].drop_duplicates().reset_index(drop=True)\n",
"    lookup[new_column] = lookup.index\n",
"    # Index the lookup by the original value so the merge below can join on it.\n",
"    lookup = lookup.set_index(column)\n",
"\n",
"    merged = pd.merge(left=data_pd,\n",
"                      right=lookup,\n",
"                      left_on=column,\n",
"                      right_index=True,\n",
"                      how='left')\n",
"\n",
"    return lookup[new_column].to_dict(), merged"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "FtaemIZSJf-2",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "42fb51df-ee68-4ad7-db81-4b8ac5e8add2"
},
"source": [
"# Remove id columns left behind by an earlier run of this cell; without this, re-running the\n",
"# cell fails on the duplicate-column assertion in get_unique_id.\n",
"df_25 = df_25.drop(columns=['user_id', 'item_id'], errors='ignore')\n",
"# Keep the {value: id} mappings under explicit names instead of assigning to `_`,\n",
"# which IPython also uses for the last cell output.\n",
"user_to_id, df_25 = get_unique_id(df_25, 'user')\n",
"item_to_id, df_25 = get_unique_id(df_25, 'item')\n",
"print(df_25.head())"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" user item rating user_id item_id\n",
"0 2824 2696 1 0 0\n",
"1 2824 14915 1 0 1\n",
"2 2824 18333 1 0 2\n",
"3 2824 2143 1 0 3\n",
"4 2824 10308 1 0 4\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-HPTdD6dJg0C",
"outputId": "e814e61d-9e09-4039-d462-008b0adea60a"
},
"source": [
"# Distinct re-indexed users and items in the filtered frame.\n",
"n_user = len(df_25['user_id'].unique())\n",
"n_item = len(df_25['item_id'].unique())\n",
"print(n_user, n_item)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"4718 36165\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "QAIOH1exJgxf"
},
"source": [
"# Size summary of the filtered dataset, written to the working directory as JSON.\n",
"dataset_meta_info = {\n",
"    'dataset_size': len(df_25),\n",
"    'user_size': n_user,\n",
"    'item_size': n_item,\n",
"}\n",
"with open('dataset_meta_info.json', 'w') as f:\n",
"    json.dump(dataset_meta_info, f)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QhpdSqNCJgu9",
"outputId": "15cd2ccf-167b-40cc-f41d-5c4fc7acb13a"
},
"source": [
"seeds = [1, 777, 1992, 2003, 2020]\n",
"# Number of interactions sampled per user by xf.SampleN for each split.\n",
"SAMPLES_PER_USER = 20\n",
"\n",
"for seed in seeds:\n",
"    # One sub-directory per seed, so a later seed no longer overwrites an earlier split.\n",
"    save_path = os.path.join('.', 'seed_{}'.format(seed))\n",
"    os.makedirs(save_path, exist_ok=True)\n",
"    for tp in xf.partition_users(df_25, partitions=1, method=xf.SampleN(SAMPLES_PER_USER), rng_spec=seed):\n",
"        # NOTE(review): the roles are kept exactly as in the original cell - the sampled rows\n",
"        # (tp.test) are saved as train.csv and the remaining rows (tp.train) as test.csv.\n",
"        # This looks swapped; confirm that it is the intended protocol.\n",
"        train = tp.test\n",
"        test = tp.train\n",
"\n",
"        train.to_csv(os.path.join(save_path, 'train.csv'))\n",
"        test.to_csv(os.path.join(save_path, 'test.csv'))\n",
"        print(len(tp.train))\n",
"        print(len(tp.test))\n",
"\n",
"# Also keep the split of the last seed in the working directory, which is where the original\n",
"# cell left it and where train.csv is read from further down.\n",
"train.to_csv('train.csv')\n",
"test.to_csv('test.csv')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"251675\n",
"94360\n",
"251675\n",
"94360\n",
"251675\n",
"94360\n",
"251675\n",
"94360\n",
"251675\n",
"94360\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "JkWsWld0LArn",
"outputId": "f0dd0cae-f48d-451f-89a1-6f901f1e10e8"
},
"source": [
"# index_col=0 reads the first CSV column (the row index that to_csv wrote) back in as the index.\n",
"train_df = pd.read_csv('train.csv', index_col=0)\n",
"train_df.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user | \n",
" item | \n",
" rating | \n",
" user_id | \n",
" item_id | \n",
"
\n",
" \n",
" \n",
" \n",
" | 86621 | \n",
" 1 | \n",
" 14 | \n",
" 1 | \n",
" 1201 | \n",
" 5233 | \n",
"
\n",
" \n",
" | 86470 | \n",
" 1 | \n",
" 77 | \n",
" 1 | \n",
" 1201 | \n",
" 2226 | \n",
"
\n",
" \n",
" | 86531 | \n",
" 1 | \n",
" 163 | \n",
" 1 | \n",
" 1201 | \n",
" 1312 | \n",
"
\n",
" \n",
" | 86603 | \n",
" 1 | \n",
" 297 | \n",
" 1 | \n",
" 1201 | \n",
" 6344 | \n",
"
\n",
" \n",
" | 86451 | \n",
" 1 | \n",
" 319 | \n",
" 1 | \n",
" 1201 | \n",
" 426 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user item rating user_id item_id\n",
"86621 1 14 1 1201 5233\n",
"86470 1 77 1 1201 2226\n",
"86531 1 163 1 1201 1312\n",
"86603 1 297 1 1201 6344\n",
"86451 1 319 1 1201 426"
]
},
"metadata": {},
"execution_count": 18
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7TS-BvKDLAoh",
"outputId": "354d9c3c-f00d-40f5-a5d5-2274e7ac53cd"
},
"source": [
"# Index range, per-column non-null counts, dtypes and memory usage of the saved split.\n",
"train_df.info()"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Int64Index: 94360 entries, 86621 to 238809\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 user 94360 non-null int64\n",
" 1 item 94360 non-null int64\n",
" 2 rating 94360 non-null int64\n",
" 3 user_id 94360 non-null int64\n",
" 4 item_id 94360 non-null int64\n",
"dtypes: int64(5)\n",
"memory usage: 4.3 MB\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "iEe13D6ALAlS",
"outputId": "ecb1b33e-450b-4da6-b4fa-be874b706ca7"
},
"source": [
"# Summary statistics, transposed so that each column of train_df becomes a row.\n",
"train_df.describe().T"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" mean | \n",
" std | \n",
" min | \n",
" 25% | \n",
" 50% | \n",
" 75% | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" | user | \n",
" 94360.0 | \n",
" 6666.024375 | \n",
" 6900.551097 | \n",
" 1.0 | \n",
" 1652.0 | \n",
" 4199.0 | \n",
" 9581.0 | \n",
" 47624.0 | \n",
"
\n",
" \n",
" | item | \n",
" 94360.0 | \n",
" 6524.838650 | \n",
" 9194.674987 | \n",
" 1.0 | \n",
" 729.0 | \n",
" 2287.0 | \n",
" 8658.0 | \n",
" 49046.0 | \n",
"
\n",
" \n",
" | rating | \n",
" 94360.0 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | user_id | \n",
" 94360.0 | \n",
" 2358.500000 | \n",
" 1361.976471 | \n",
" 0.0 | \n",
" 1179.0 | \n",
" 2358.5 | \n",
" 3538.0 | \n",
" 4717.0 | \n",
"
\n",
" \n",
" | item_id | \n",
" 94360.0 | \n",
" 4957.928116 | \n",
" 7146.040509 | \n",
" 0.0 | \n",
" 684.0 | \n",
" 2012.0 | \n",
" 5690.0 | \n",
" 36158.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std ... 50% 75% max\n",
"user 94360.0 6666.024375 6900.551097 ... 4199.0 9581.0 47624.0\n",
"item 94360.0 6524.838650 9194.674987 ... 2287.0 8658.0 49046.0\n",
"rating 94360.0 1.000000 0.000000 ... 1.0 1.0 1.0\n",
"user_id 94360.0 2358.500000 1361.976471 ... 2358.5 3538.0 4717.0\n",
"item_id 94360.0 4957.928116 7146.040509 ... 2012.0 5690.0 36158.0\n",
"\n",
"[5 rows x 8 columns]"
]
},
"metadata": {},
"execution_count": 20
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BIyZZlffKTpy"
},
"source": [
"---"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x1bfRShBKU-l",
"outputId": "5c570644-2391-4113-cc8f-b1bf13fc91c2"
},
"source": [
"# Long listing of the working directory, hidden entries included, to check the written files.\n",
"!ls -al ."
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"total 15684\n",
"drwxr-xr-x 1 root root 4096 Nov 25 14:04 .\n",
"drwxr-xr-x 1 root root 4096 Nov 25 13:08 ..\n",
"drwxr-xr-x 4 root root 4096 Nov 18 14:35 .config\n",
"-rw-r--r-- 1 root root 63 Nov 25 14:04 dataset_meta_info.json\n",
"drwxr-xr-x 1 root root 4096 Nov 18 14:36 sample_data\n",
"-rw-r--r-- 1 root root 7020158 Nov 25 14:04 test.csv\n",
"-rw-r--r-- 1 root root 2655420 Nov 25 14:04 train.csv\n",
"-rw-r--r-- 1 root root 6357397 Nov 25 13:13 trust_data.txt\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uFongZU4KTp2",
"outputId": "efcf3245-3866-4fd8-82ae-fc40b0273419"
},
"source": [
"# Record author, last-updated date and time, machine details and imported package versions.\n",
"!pip install -q watermark\n",
"%reload_ext watermark\n",
"%watermark -a \"Sparsh A.\" -m -iv -u -t -d"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Author: Sparsh A.\n",
"\n",
"Last updated: 2021-11-25 12:56:37\n",
"\n",
"Compiler : GCC 7.5.0\n",
"OS : Linux\n",
"Release : 5.4.104+\n",
"Machine : x86_64\n",
"Processor : x86_64\n",
"CPU cores : 2\n",
"Architecture: 64bit\n",
"\n",
"numpy : 1.19.5\n",
"pandas : 1.1.5\n",
"lenskit: 0.13.1\n",
"json : 2.0.9\n",
"IPython: 5.5.0\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "O1HWMD6TKTp4"
},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "t5RatZHVKTp5"
},
"source": [
"**END**"
]
}
]
}