{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "C299883 | Epinions Data Preprocessing", "provenance": [], "collapsed_sections": [], "mount_file_id": "1mR-aJmCRlP6UAgs_VMhfoGgcXd3YU79T", "authorship_tag": "ABX9TyNP3C4I3YRxVLuv/2Qdc87K" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# Epinions Data Preprocessing\n", "> Loading and transformation of epinions user item interaction dataset\n", "\n", "- toc: true\n", "- badges: true\n", "- comments: true\n", "- categories: [data processing]" ], "metadata": { "id": "pej35mf5F4Rj" } },
{ "cell_type": "code", "metadata": { "id": "ErXSaQ9RGWV_" }, "source": [ "# Pin LensKit to the release recorded by the watermark cell at the end of this\n", "# notebook; the split cell further down depends on that release's\n", "# lenskit.crossfold API.\n", "!pip install lenskit==0.13.1" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hTa3ROMZGeYW", "outputId": "a1962f18-7bdf-49b5-8581-8948ddf4cf72" }, "source": [ "!wget -q --show-progress https://github.com/RecoHut-Datasets/epinions/raw/v1/trust_data.txt" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\rtrust_data.txt 0%[ ] 0 --.-KB/s \rtrust_data.txt 100%[===================>] 6.06M --.-KB/s in 0.08s \n" ] } ] },
{ "cell_type": "code", "metadata": { "id": "aOkn7m2pGod4" }, "source": [ "import pandas as pd\n", "import lenskit.crossfold as xf\n", "import numpy as np\n", "import json" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "id3lZcQXGyKY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "4afd4a84-9086-4292-85f9-576e5b8c5bef" }, "source": [ "# The file is space-separated and has no header row.\n", "ratings = pd.read_csv('trust_data.txt', header=None, index_col=None, sep=' ')\n", "# Drop columns that are entirely NaN before naming the remaining ones.\n", "ratings = ratings.dropna(axis=1, how='all')\n", "columns = ['user', 'item', 'rating']\n", "ratings.columns = columns\n", "print(ratings.head())" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " user item rating\n", "0 22605 42915 1\n", "1 22605 5052 1\n", "2 22605 42913 1\n", "3 22605 18420 1\n", "4 22605 42914 1\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lnHJJIXLG7jd", "outputId": "eac78cb0-e56e-4678-bdfc-ed80b339fc5c" }, "source": [ "# Distinct users and items in the raw interaction log.\n", "n_user = len(pd.unique(ratings.user))\n", "n_item = len(pd.unique(ratings.item))\n", "\n", "print(\"Num_of_users: {}\\nNum_of_items: {}\".format(n_user, n_item))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Num_of_users: 33960\n", "Num_of_items: 49288\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "qWH3j4SNPTTI", "outputId": "cf883704-3777-4ceb-e260-1b79bee9a14d" }, "source": [ "ratings.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemrating
022605429151
12260550521
222605429131
322605184201
422605429141
\n", "
" ], "text/plain": [ " user item rating\n", "0 22605 42915 1\n", "1 22605 5052 1\n", "2 22605 42913 1\n", "3 22605 18420 1\n", "4 22605 42914 1" ] }, "metadata": {}, "execution_count": 9 } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LsdLwUbUIcrV", "outputId": "0f667bfd-f457-4429-9014-c0f298bf8df5" }, "source": [ "# Count interactions per user once and keep users with at least 25 of them.\n", "user_counts = ratings.user.value_counts()\n", "df_25 = ratings[ratings.user.isin(user_counts[user_counts >= 25].index)]\n", "df_25 = df_25.reset_index(drop=True)\n", "print(\"\\033[4mCount after only keeping users with at least 25 relevant interactions\\033[0m\")\n", "print(\"Num_of_users: {}\\nNum_of_items: {}\\nTotal_interactions: {}\".format(len(pd.unique(df_25.user)), len(pd.unique(df_25.item)), len(df_25)))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[4mCount after only keeping users with at least 25 relevant interactions\u001b[0m\n", "Num_of_users: 4718\n", "Num_of_items: 36165\n", "Total_interactions: 346035\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qASWgDMHPFeK", "outputId": "5535886d-5faf-49cf-bad2-c1d17a88adbf" }, "source": [ "print(df_25.head())" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " user item rating\n", "0 2824 2696 1\n", "1 2824 14915 1\n", "2 2824 18333 1\n", "3 2824 2143 1\n", "4 2824 10308 1\n" ] } ] },
{ "cell_type": "code", "metadata": { "id": "vIIkD0GBJMfE" }, "source": [ "from typing import Tuple\n", "\n", "\n", "def get_unique_id(data_pd: pd.DataFrame, column: str) -> Tuple[dict, pd.DataFrame]:\n", "\t\"\"\"\n", "\tMap the distinct values of `column` to consecutive integer ids starting at 0.\n", "\n", "\tIds follow the order in which each value first appears in `data_pd`.\n", "\tThe frame passed in is left untouched; a new frame is returned.\n", "\n", "\t:param data_pd: frame that contains `column`\n", "\t:param column: name of the column whose values are re-indexed\n", "\t:return: ({value: id}, `data_pd` plus a new `<column>_id` column)\n", "\t:raises AssertionError: if `<column>_id` is already a column of `data_pd`\n", "\t\"\"\"\n", "\tnew_column = '{}_id'.format(column)\n", "\tassert new_column not in data_pd.columns, '{} already exists'.format(new_column)\n", "\t# One row per distinct value; its position in this table is the new id.\n", "\ttemp = data_pd.loc[:, [column]].drop_duplicates().reset_index(drop=True)\n", "\ttemp[new_column] = temp.index\n", "\ttemp = temp.set_index(column)\n", "\t# Left-join the lookup table so every input row receives its id.\n", "\tdata_pd = pd.merge(left=data_pd,\n", "\t\tright=temp,\n", "\t\tleft_on=column,\n", "\t\tright_index=True,\n", "\t\thow='left')\n", "\n", "\treturn temp[new_column].to_dict(), data_pd" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "FtaemIZSJf-2", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "42fb51df-ee68-4ad7-db81-4b8ac5e8add2" }, "source": [ "# Select the three raw columns first so that re-running this cell does not\n", "# fail on the already-present *_id columns.\n", "user_id_map, df_25 = get_unique_id(df_25[['user', 'item', 'rating']], 'user')\n", "item_id_map, df_25 = get_unique_id(df_25, 'item')\n", "print(df_25.head())" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " user item rating user_id item_id\n", "0 2824 2696 1 0 0\n", "1 2824 14915 1 0 1\n", "2 2824 18333 1 0 2\n", "3 2824 2143 1 0 3\n", "4 2824 10308 1 0 4\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-HPTdD6dJg0C", "outputId": "e814e61d-9e09-4039-d462-008b0adea60a" }, "source": [ "n_user = df_25.user_id.nunique()\n", "n_item = df_25.item_id.nunique()\n", "# The new ids must be contiguous: 0 .. n - 1.\n", "assert df_25.user_id.max() == n_user - 1\n", "assert df_25.item_id.max() == n_item - 1\n", "print(n_user, n_item)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "4718 36165\n" ] } ] },
{ "cell_type": "code", "metadata": { "id": "QAIOH1exJgxf" }, "source": [ "import os\n", "\n", "dataset_meta_info = {\n", "\t'dataset_size': len(df_25),\n", "\t'user_size': n_user,\n", "\t'item_size': n_item,\n", "}\n", "with open('dataset_meta_info.json', 'w') as f:\n", "\tjson.dump(dataset_meta_info, f)" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QhpdSqNCJgu9", "outputId": "15cd2ccf-167b-40cc-f41d-5c4fc7acb13a" }, "source": [ "seeds = [1, 777, 1992, 2003, 2020]\n", "\n", "for seed in seeds:\n", "\t# partitions=1 produces one split per seed.\n", "\tfor tp in xf.partition_users(df_25, partitions=1, method=xf.SampleN(20), rng_spec=seed):\n", "\t\t# Each seed gets its own directory. Writing every split to the same path\n", "\t\t# left only the last seed's files on disk.\n", "\t\tsave_path = os.path.join('.', 'seed_{}'.format(seed))\n", "\t\tos.makedirs(save_path, exist_ok=True)\n", "\t\t# tp.test holds the 20 rows sampled per user by SampleN(20); they are saved\n", "\t\t# as the training set and the remaining rows as the test set.\n", "\t\t# NOTE(review): this swap looks deliberate -- confirm with downstream users.\n", "\t\ttrain = tp.test\n", "\t\ttest = tp.train\n", "\n", "\t\ttrain.to_csv(os.path.join(save_path, 'train.csv'))\n", "\t\ttest.to_csv(os.path.join(save_path, 'test.csv'))\n", "\t\tprint(len(tp.train))\n", "\t\tprint(len(tp.test))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "251675\n", "94360\n", "251675\n", "94360\n", "251675\n", "94360\n", "251675\n", "94360\n", "251675\n", "94360\n" ] } ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "JkWsWld0LArn", "outputId": "f0dd0cae-f48d-451f-89a1-6f901f1e10e8" }, "source": [ "# Inspect the training split written for the last seed.\n", "train_df = pd.read_csv(os.path.join('seed_{}'.format(seeds[-1]), 'train.csv'), index_col=0)\n", "train_df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
useritemratinguser_iditem_id
86621114112015233
86470177112012226
865311163112011312
866031297112016344
86451131911201426
\n", "
" ], "text/plain": [ " user item rating user_id item_id\n", "86621 1 14 1 1201 5233\n", "86470 1 77 1 1201 2226\n", "86531 1 163 1 1201 1312\n", "86603 1 297 1 1201 6344\n", "86451 1 319 1 1201 426" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7TS-BvKDLAoh", "outputId": "354d9c3c-f00d-40f5-a5d5-2274e7ac53cd" }, "source": [ "train_df.info()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Int64Index: 94360 entries, 86621 to 238809\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 user 94360 non-null int64\n", " 1 item 94360 non-null int64\n", " 2 rating 94360 non-null int64\n", " 3 user_id 94360 non-null int64\n", " 4 item_id 94360 non-null int64\n", "dtypes: int64(5)\n", "memory usage: 4.3 MB\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "iEe13D6ALAlS", "outputId": "ecb1b33e-450b-4da6-b4fa-be874b706ca7" }, "source": [ "train_df.describe().T" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
user94360.06666.0243756900.5510971.01652.04199.09581.047624.0
item94360.06524.8386509194.6749871.0729.02287.08658.049046.0
rating94360.01.0000000.0000001.01.01.01.01.0
user_id94360.02358.5000001361.9764710.01179.02358.53538.04717.0
item_id94360.04957.9281167146.0405090.0684.02012.05690.036158.0
\n", "
" ], "text/plain": [ " count mean std ... 50% 75% max\n", "user 94360.0 6666.024375 6900.551097 ... 4199.0 9581.0 47624.0\n", "item 94360.0 6524.838650 9194.674987 ... 2287.0 8658.0 49046.0\n", "rating 94360.0 1.000000 0.000000 ... 1.0 1.0 1.0\n", "user_id 94360.0 2358.500000 1361.976471 ... 2358.5 3538.0 4717.0\n", "item_id 94360.0 4957.928116 7146.040509 ... 2012.0 5690.0 36158.0\n", "\n", "[5 rows x 8 columns]" ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "markdown", "metadata": { "id": "BIyZZlffKTpy" }, "source": [ "---" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x1bfRShBKU-l", "outputId": "5c570644-2391-4113-cc8f-b1bf13fc91c2" }, "source": [ "!ls -al ." ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "total 15684\n", "drwxr-xr-x 1 root root 4096 Nov 25 14:04 .\n", "drwxr-xr-x 1 root root 4096 Nov 25 13:08 ..\n", "drwxr-xr-x 4 root root 4096 Nov 18 14:35 .config\n", "-rw-r--r-- 1 root root 63 Nov 25 14:04 dataset_meta_info.json\n", "drwxr-xr-x 1 root root 4096 Nov 18 14:36 sample_data\n", "-rw-r--r-- 1 root root 7020158 Nov 25 14:04 test.csv\n", "-rw-r--r-- 1 root root 2655420 Nov 25 14:04 train.csv\n", "-rw-r--r-- 1 root root 6357397 Nov 25 13:13 trust_data.txt\n" ] } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uFongZU4KTp2", "outputId": "efcf3245-3866-4fd8-82ae-fc40b0273419" }, "source": [ "!pip install -q watermark\n", "%reload_ext watermark\n", "%watermark -a \"Sparsh A.\" -m -iv -u -t -d" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Author: Sparsh A.\n", "\n", "Last updated: 2021-11-25 12:56:37\n", "\n", "Compiler : GCC 7.5.0\n", "OS : Linux\n", "Release : 5.4.104+\n", "Machine : x86_64\n", "Processor : x86_64\n", "CPU cores : 2\n", "Architecture: 64bit\n", "\n", "numpy : 1.19.5\n", "pandas : 1.1.5\n", "lenskit: 0.13.1\n", 
"json : 2.0.9\n", "IPython: 5.5.0\n", "\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "O1HWMD6TKTp4" }, "source": [ "---" ] }, { "cell_type": "markdown", "metadata": { "id": "t5RatZHVKTp5" }, "source": [ "**END**" ] } ] }