{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "2021-06-19-recsys20-tutorial-feature-engineering-part-1.ipynb", "provenance": [], "collapsed_sections": [], "mount_file_id": "1bnZOC6wVT95f8hS9bWb6x03D_FnxaYWL", "authorship_tag": "ABX9TyPmPXgo7GPE2xbCVlv6wGUY" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "Qp9CyeCjAP6I" }, "source": [ "# Recsys'20 Feature Engineering Tutorial Part 1\n", "> RecSys'20 tutorial on feature engineering on a large retail dataset\n", "\n", "- toc: true\n", "- badges: true\n", "- comments: true\n", "- categories: [features, recsys, cudf, retail, bigdata]\n", "- image: " ] }, { "cell_type": "markdown", "metadata": { "id": "glqcrXsxGNq1" }, "source": [ "### Data download" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nMLPnhbYEm8A", "outputId": "594c3c96-9665-4eb1-964b-6e2c1d3ac780" }, "source": [ "!pip install -q -U kaggle\n", "!pip install --upgrade --force-reinstall --no-deps kaggle\n", "!mkdir ~/.kaggle\n", "!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/\n", "!chmod 600 ~/.kaggle/kaggle.json\n", "\n", "!kaggle datasets download -d mkechinov/ecommerce-behavior-data-from-multi-category-store" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Collecting kaggle\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3a/e7/3bac01547d2ed3d308ac92a0878fbdb0ed0f3d41fb1906c319ccbba1bfbc/kaggle-1.5.12.tar.gz (58kB)\n", "\r\u001b[K |█████▋ | 10kB 6.6MB/s eta 0:00:01\r\u001b[K |███████████▏ | 20kB 10.2MB/s eta 0:00:01\r\u001b[K |████████████████▊ | 30kB 13.1MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 40kB 11.6MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 51kB 6.1MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 2.9MB/s \n", "\u001b[?25hBuilding wheels for collected packages: kaggle\n", " Building wheel for kaggle (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for kaggle: filename=kaggle-1.5.12-cp37-none-any.whl size=73053 sha256=845e8bef0aaf19c5d7a8d3d66934eb5e52047f126c5e82fe7dcb5a6c1f107b0a\n", " Stored in directory: /root/.cache/pip/wheels/a1/6a/26/d30b7499ff85a4a4593377a87ecf55f7d08af42f0de9b60303\n", "Successfully built kaggle\n", "Installing collected packages: kaggle\n", " Found existing installation: kaggle 1.5.12\n", " Uninstalling kaggle-1.5.12:\n", " Successfully uninstalled kaggle-1.5.12\n", "Successfully installed kaggle-1.5.12\n", "Downloading ecommerce-behavior-data-from-multi-category-store.zip to /content\n", "100% 4.29G/4.29G [01:24<00:00, 50.2MB/s]\n", "100% 4.29G/4.29G [01:24<00:00, 54.3MB/s]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2Tac1tdjBvqs", "outputId": "aef95b25-7997-4245-8491-b58692b814fc" }, "source": [ "!gdown --id 1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P\n", "!gdown --id 1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT\n", "!gdown --id 1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB\n", "!gdown --id 1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8\n", "!gdown --id 1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "Downloading...\n", "From: https://drive.google.com/uc?id=1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P\n", "To: /content/2019-Dec.csv.gz\n", "2.95GB [01:08, 42.9MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT\n", "To: /content/2020-Jan.csv.gz\n", "2.39GB [00:58, 40.7MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB\n", "To: /content/2020-Feb.csv.gz\n", "2.35GB [00:56, 41.7MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8\n", "To: /content/2020-Mar.csv.gz\n", "2.42GB [00:40, 60.3MB/s]\n", "Downloading...\n", "From: https://drive.google.com/uc?id=1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL\n", "To: /content/2020-Apr.csv.gz\n", "2.93GB [00:44, 66.3MB/s]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "cbqqSo1mGRFU" }, "source": [ "### Data extract" ] }, { "cell_type": "code", "metadata": { "id": "yEDIXNVkGuCW" }, "source": [ "import os\n", "import gc\n", "import glob\n", "import pandas as pd" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eWEtIxIXDlfW", "outputId": "51f4a064-f076-40e9-a89b-a7f48bb67465" }, "source": [ "!unzip /content/ecommerce-behavior-data-from-multi-category-store.zip\n", "!rm /content/ecommerce-behavior-data-from-multi-category-store.zip" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "Archive: /content/ecommerce-behavior-data-from-multi-category-store.zip\n", " inflating: 2019-Nov.csv \n", " inflating: 2019-Oct.csv \n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IOxOL1pJGpWM", "outputId": "2caa1dbf-462e-40fc-c8fe-fdfb41fcebcf" }, "source": [ "list_gz_files = glob.glob('/content/*.gz')\n", "list_gz_files" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['/content/2020-Jan.csv.gz',\n", " '/content/2020-Mar.csv.gz',\n", " '/content/2019-Dec.csv.gz',\n", " '/content/2020-Apr.csv.gz',\n", " '/content/2020-Feb.csv.gz']" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PVC7SGvzHFm1", "outputId": "8ee095e2-ad18-4d92-8a0b-013ff484933c" }, "source": [ "for file in list_gz_files:\n", " print(file)\n", " !gunzip $file" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "/content/2020-Jan.csv.gz\n", "/content/2020-Mar.csv.gz\n", "/content/2019-Dec.csv.gz\n", "/content/2020-Apr.csv.gz\n", "/content/2020-Feb.csv.gz\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 479 }, "id": "teiuKEb1QzcQ", "outputId": "c64a375c-41c4-4040-e92a-49e80d875932" }, "source": [ "snapshot = pd.read_csv(\"/content/2019-Oct.csv\").sample(1000)\n", "snapshot.head()" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_timeevent_typeproduct_idcategory_idcategory_codebrandpriceuser_iduser_session
197046812019-10-15 12:13:56 UTCview13073482053013558920217191computers.notebookacer694.975152632771f854496-4dae-4114-af89-4f027ba31e56
416614712019-10-31 09:41:21 UTCview127091692053013553559896355NaNmichelin68.21512784291caa01956-1d21-4966-8767-3fa5013ef13a
146364832019-10-12 06:19:13 UTCview113000592053013555531219353electronics.telephonetexet17.46519299950d67e54c3-88e4-4df2-bc37-d8c7cf656a6a
412728372019-10-31 03:29:40 UTCview115002732053013554625249641NaNjbl94.71543868618a162e395-7b69-49aa-9d45-f05a60e19641
35274622019-10-03 20:29:36 UTCview10047772053013555631882655electronics.smartphonexiaomi135.0152083467198d5af9f-c141-4b79-bdd1-dc9bd1d9a7cb
\n", "
" ], "text/plain": [ " event_time ... user_session\n", "19704681 2019-10-15 12:13:56 UTC ... 1f854496-4dae-4114-af89-4f027ba31e56\n", "41661471 2019-10-31 09:41:21 UTC ... caa01956-1d21-4966-8767-3fa5013ef13a\n", "14636483 2019-10-12 06:19:13 UTC ... d67e54c3-88e4-4df2-bc37-d8c7cf656a6a\n", "41272837 2019-10-31 03:29:40 UTC ... a162e395-7b69-49aa-9d45-f05a60e19641\n", "3527462 2019-10-03 20:29:36 UTC ... 98d5af9f-c141-4b79-bdd1-dc9bd1d9a7cb\n", "\n", "[5 rows x 9 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "code", "metadata": { "id": "Jq2EU5X4RAgG" }, "source": [ "!mkdir -p /content/data/tmp\n", "snapshot.to_csv('/content/data/tmp/snapshot.csv', index=False)" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CYgxGyUMHmDQ", "outputId": "72be5f21-5abd-4169-ca86-432920d660f7" }, "source": [ "gc.collect()" ], "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "123" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "markdown", "metadata": { "id": "CD_eROhoNxEu" }, "source": [ "### Data transform" ] }, { "cell_type": "code", "metadata": { "id": "yTtSL-aeN464" }, "source": [ "import os\n", "import gc\n", "import glob\n", "import pandas as pd\n", "from pathlib import Path" ], "execution_count": 1, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bD1X1HtbXxqU" }, "source": [ "file = '/content/data/tmp/snapshot.csv'" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "LlHEnxnZR056", "colab": { "base_uri": "https://localhost:8080/", "height": 374 }, "outputId": "6db3f1c9-8543-4270-e3e2-4d05defa1923" }, "source": [ "df_tmp = pd.read_csv(file)\n", "df_tmp['session_purchase'] = df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str)\n", "df_purchase = df_tmp[df_tmp['event_type']=='purchase']\n", "df_cart = df_tmp[df_tmp['event_type']=='cart']\n", "df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])]\n", "df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))]\n", "df_cart['target'] = 0\n", "df_purchase['target'] = 1\n", "df = pd.concat([df_cart, df_purchase])\n", "df = df.drop('category_id', axis=1)\n", "df = df.drop('session_purchase', axis=1)\n", "df.head()" ], "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_timeevent_typeproduct_idcategory_codebrandpriceuser_iduser_sessiontarget
552019-10-01 14:17:59 UTCcart4804055electronics.audio.headphoneapple189.10522564858c3f614f6-bdcb-46d7-ad93-18d03b0e11ba0
752019-10-10 10:16:18 UTCcart1004708electronics.smartphonehuawei153.96516882359b513996d-0335-4337-bb4d-09c3c14ee2cf0
1332019-10-18 08:22:44 UTCcart2701288appliances.kitchen.refrigeratorssamsung849.265127625672dd77291-d54e-4d3c-a25a-2c915df9fff90
1402019-10-10 04:24:55 UTCcart1004767electronics.smartphonesamsung250.93558658709d6c93da2-f4ee-42a1-951a-45fc52bbbf820
1632019-10-12 20:19:27 UTCcart1004750electronics.smartphonesamsung196.80549588267d63ac7ef-d492-4034-b846-351fd2a5c32b0
\n", "
" ], "text/plain": [ " event_time ... target\n", "55 2019-10-01 14:17:59 UTC ... 0\n", "75 2019-10-10 10:16:18 UTC ... 0\n", "133 2019-10-18 08:22:44 UTC ... 0\n", "140 2019-10-10 04:24:55 UTC ... 0\n", "163 2019-10-12 20:19:27 UTC ... 0\n", "\n", "[5 rows x 9 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 479 }, "id": "H_rxW2oGRkIh", "outputId": "1830c320-50e1-41cf-bbbf-bfe8887306d6" }, "source": [ "df[['cat_0', 'cat_1', 'cat_2']] = df['category_code'].str.split(\"\\.\", n = 3, expand = True).fillna('NA')\n", "# df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split(\"\\.\", n = 3, expand = True).fillna('NA')\n", "df['brand'] = df['brand'].fillna('NA')\n", "df = df.drop('category_code', axis=1)\n", "df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', ''))\n", "df['ts_hour'] = df['timestamp'].dt.hour\n", "df['ts_minute'] = df['timestamp'].dt.minute\n", "df['ts_weekday'] = df['timestamp'].dt.weekday\n", "df['ts_day'] = df['timestamp'].dt.day\n", "df['ts_month'] = df['timestamp'].dt.month\n", "df['ts_year'] = df['timestamp'].dt.year\n", "df.head()" ], "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_timeevent_typeproduct_idbrandpriceuser_iduser_sessiontargetcat_0cat_1cat_2timestampts_hourts_minutets_weekdayts_dayts_monthts_year
552019-10-01 14:17:59 UTCcart4804055apple189.10522564858c3f614f6-bdcb-46d7-ad93-18d03b0e11ba0electronicsaudioheadphone2019-10-01 14:17:59141711102019
752019-10-10 10:16:18 UTCcart1004708huawei153.96516882359b513996d-0335-4337-bb4d-09c3c14ee2cf0electronicssmartphoneNA2019-10-10 10:16:181016310102019
1332019-10-18 08:22:44 UTCcart2701288samsung849.265127625672dd77291-d54e-4d3c-a25a-2c915df9fff90applianceskitchenrefrigerators2019-10-18 08:22:44822418102019
1402019-10-10 04:24:55 UTCcart1004767samsung250.93558658709d6c93da2-f4ee-42a1-951a-45fc52bbbf820electronicssmartphoneNA2019-10-10 04:24:55424310102019
1632019-10-12 20:19:27 UTCcart1004750samsung196.80549588267d63ac7ef-d492-4034-b846-351fd2a5c32b0electronicssmartphoneNA2019-10-12 20:19:272019512102019
\n", "
" ], "text/plain": [ " event_time event_type product_id ... ts_day ts_month ts_year\n", "55 2019-10-01 14:17:59 UTC cart 4804055 ... 1 10 2019\n", "75 2019-10-10 10:16:18 UTC cart 1004708 ... 10 10 2019\n", "133 2019-10-18 08:22:44 UTC cart 2701288 ... 18 10 2019\n", "140 2019-10-10 04:24:55 UTC cart 1004767 ... 10 10 2019\n", "163 2019-10-12 20:19:27 UTC cart 1004750 ... 12 10 2019\n", "\n", "[5 rows x 18 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 13 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "M95wOkSINI_K", "outputId": "8b637a53-be8f-4c29-a286-d5207cf1b41e" }, "source": [ "list_files = glob.glob('/content/*.csv')\n", "list_files" ], "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['/content/2020-Apr.csv',\n", " '/content/2019-Nov.csv',\n", " '/content/2020-Jan.csv',\n", " '/content/2019-Dec.csv',\n", " '/content/2020-Mar.csv',\n", " '/content/2019-Oct.csv',\n", " '/content/2020-Feb.csv']" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "code", "metadata": { "id": "AAVAUULwNriQ" }, "source": [ "def process_files(df_tmp, chunkname):\n", " df_tmp['session_purchase'] = df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str)\n", " df_purchase = df_tmp[df_tmp['event_type']=='purchase']\n", " df_cart = df_tmp[df_tmp['event_type']=='cart']\n", " df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])]\n", " df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))]\n", " df_cart['target'] = 0\n", " df_purchase['target'] = 1\n", " df = pd.concat([df_cart, df_purchase])\n", " df = df.drop('category_id', axis=1)\n", " df = df.drop('session_purchase', axis=1)\n", " # df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split(\"\\.\", n = 3, expand = True).fillna('NA')\n", " df['brand'] = df['brand'].fillna('NA')\n", " # df = df.drop('category_code', axis=1)\n", " df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', ''))\n", " df['ts_hour'] = df['timestamp'].dt.hour\n", " df['ts_minute'] = df['timestamp'].dt.minute\n", " df['ts_weekday'] = df['timestamp'].dt.weekday\n", " df['ts_day'] = df['timestamp'].dt.day\n", " df['ts_month'] = df['timestamp'].dt.month\n", " df['ts_year'] = df['timestamp'].dt.year\n", " df.to_csv(chunkname, index=False)" ], "execution_count": 15, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "l8_QGBIzWxcO" }, "source": [ "base_path_silver = \"/content/data/silver\"\n", "!mkdir -p $base_path_silver" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rlpfP0TRdsUK", "outputId": "1d002eaf-c8d7-475b-a63b-21a17e6edd1b" }, "source": [ "for idx, chunk in enumerate(list_files[:2]):\n", " chunkname = os.path.join(base_path_silver, Path(chunk).stem + '-' + str(idx) + '.csv')\n", " print(chunkname)" ], "execution_count": 24, "outputs": [ { "output_type": "stream", "text": [ "/content/data/silver/2020-Apr-0.csv\n", "/content/data/silver/2019-Nov-1.csv\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RiN1K5yxOSik", "outputId": "7ee593d1-0673-444e-8ff2-64ca109a30f7" }, "source": [ "chunksize = 10 ** 6\n", "\n", "for file in list_files:\n", " print(file)\n", " for idx, chunk in enumerate(pd.read_csv(file, chunksize=chunksize)):\n", " chunkname = os.path.join(base_path_silver, Path(file).stem + '-' + str(idx) + '.csv')\n", " print(chunkname)\n", " if not os.path.exists(chunkname):\n", " process_files(chunk, chunkname)" ], "execution_count": 26, "outputs": [ { "output_type": "stream", "text": [ "/content/2020-Apr.csv\n", "/content/data/silver/2020-Apr-0.csv\n", "/content/data/silver/2020-Apr-1.csv\n", "/content/data/silver/2020-Apr-2.csv\n", "/content/data/silver/2020-Apr-3.csv\n", "/content/data/silver/2020-Apr-4.csv\n", "/content/data/silver/2020-Apr-5.csv\n", "/content/data/silver/2020-Apr-6.csv\n", "/content/data/silver/2020-Apr-7.csv\n", "/content/data/silver/2020-Apr-8.csv\n", "/content/data/silver/2020-Apr-9.csv\n", "/content/data/silver/2020-Apr-10.csv\n", "/content/data/silver/2020-Apr-11.csv\n", "/content/data/silver/2020-Apr-12.csv\n", "/content/data/silver/2020-Apr-13.csv\n", "/content/data/silver/2020-Apr-14.csv\n", "/content/data/silver/2020-Apr-15.csv\n", "/content/data/silver/2020-Apr-16.csv\n", "/content/data/silver/2020-Apr-17.csv\n", "/content/data/silver/2020-Apr-18.csv\n", "/content/data/silver/2020-Apr-19.csv\n", "/content/data/silver/2020-Apr-20.csv\n", "/content/data/silver/2020-Apr-21.csv\n", "/content/data/silver/2020-Apr-22.csv\n", "/content/data/silver/2020-Apr-23.csv\n", "/content/data/silver/2020-Apr-24.csv\n", "/content/data/silver/2020-Apr-25.csv\n", "/content/data/silver/2020-Apr-26.csv\n", "/content/data/silver/2020-Apr-27.csv\n", "/content/data/silver/2020-Apr-28.csv\n", "/content/data/silver/2020-Apr-29.csv\n", "/content/data/silver/2020-Apr-30.csv\n", "/content/data/silver/2020-Apr-31.csv\n", "/content/data/silver/2020-Apr-32.csv\n", "/content/data/silver/2020-Apr-33.csv\n", "/content/data/silver/2020-Apr-34.csv\n", "/content/data/silver/2020-Apr-35.csv\n", "/content/data/silver/2020-Apr-36.csv\n", "/content/data/silver/2020-Apr-37.csv\n", "/content/data/silver/2020-Apr-38.csv\n", "/content/data/silver/2020-Apr-39.csv\n", "/content/data/silver/2020-Apr-40.csv\n", "/content/data/silver/2020-Apr-41.csv\n", "/content/data/silver/2020-Apr-42.csv\n", "/content/data/silver/2020-Apr-43.csv\n", "/content/data/silver/2020-Apr-44.csv\n", "/content/data/silver/2020-Apr-45.csv\n", "/content/data/silver/2020-Apr-46.csv\n", "/content/data/silver/2020-Apr-47.csv\n", "/content/data/silver/2020-Apr-48.csv\n", "/content/data/silver/2020-Apr-49.csv\n", "/content/data/silver/2020-Apr-50.csv\n", "/content/data/silver/2020-Apr-51.csv\n", "/content/data/silver/2020-Apr-52.csv\n", "/content/data/silver/2020-Apr-53.csv\n", "/content/data/silver/2020-Apr-54.csv\n", "/content/data/silver/2020-Apr-55.csv\n", "/content/data/silver/2020-Apr-56.csv\n", "/content/data/silver/2020-Apr-57.csv\n", "/content/data/silver/2020-Apr-58.csv\n", "/content/data/silver/2020-Apr-59.csv\n", "/content/data/silver/2020-Apr-60.csv\n", "/content/data/silver/2020-Apr-61.csv\n", "/content/data/silver/2020-Apr-62.csv\n", "/content/data/silver/2020-Apr-63.csv\n", "/content/data/silver/2020-Apr-64.csv\n", "/content/data/silver/2020-Apr-65.csv\n", "/content/data/silver/2020-Apr-66.csv\n", "/content/2019-Nov.csv\n", "/content/data/silver/2019-Nov-0.csv\n", "/content/data/silver/2019-Nov-1.csv\n", "/content/data/silver/2019-Nov-2.csv\n", "/content/data/silver/2019-Nov-3.csv\n", "/content/data/silver/2019-Nov-4.csv\n", "/content/data/silver/2019-Nov-5.csv\n", "/content/data/silver/2019-Nov-6.csv\n", "/content/data/silver/2019-Nov-7.csv\n", "/content/data/silver/2019-Nov-8.csv\n", "/content/data/silver/2019-Nov-9.csv\n", "/content/data/silver/2019-Nov-10.csv\n", "/content/data/silver/2019-Nov-11.csv\n", "/content/data/silver/2019-Nov-12.csv\n", "/content/data/silver/2019-Nov-13.csv\n", "/content/data/silver/2019-Nov-14.csv\n", "/content/data/silver/2019-Nov-15.csv\n", "/content/data/silver/2019-Nov-16.csv\n", "/content/data/silver/2019-Nov-17.csv\n", "/content/data/silver/2019-Nov-18.csv\n", "/content/data/silver/2019-Nov-19.csv\n", "/content/data/silver/2019-Nov-20.csv\n", "/content/data/silver/2019-Nov-21.csv\n", "/content/data/silver/2019-Nov-22.csv\n", "/content/data/silver/2019-Nov-23.csv\n", "/content/data/silver/2019-Nov-24.csv\n", "/content/data/silver/2019-Nov-25.csv\n", "/content/data/silver/2019-Nov-26.csv\n", "/content/data/silver/2019-Nov-27.csv\n", "/content/data/silver/2019-Nov-28.csv\n", "/content/data/silver/2019-Nov-29.csv\n", "/content/data/silver/2019-Nov-30.csv\n", "/content/data/silver/2019-Nov-31.csv\n", "/content/data/silver/2019-Nov-32.csv\n", "/content/data/silver/2019-Nov-33.csv\n", "/content/data/silver/2019-Nov-34.csv\n", "/content/data/silver/2019-Nov-35.csv\n", "/content/data/silver/2019-Nov-36.csv\n", "/content/data/silver/2019-Nov-37.csv\n", "/content/data/silver/2019-Nov-38.csv\n", "/content/data/silver/2019-Nov-39.csv\n", "/content/data/silver/2019-Nov-40.csv\n", "/content/data/silver/2019-Nov-41.csv\n", "/content/data/silver/2019-Nov-42.csv\n", "/content/data/silver/2019-Nov-43.csv\n", "/content/data/silver/2019-Nov-44.csv\n", "/content/data/silver/2019-Nov-45.csv\n", "/content/data/silver/2019-Nov-46.csv\n", "/content/data/silver/2019-Nov-47.csv\n", "/content/data/silver/2019-Nov-48.csv\n", "/content/data/silver/2019-Nov-49.csv\n", "/content/data/silver/2019-Nov-50.csv\n", "/content/data/silver/2019-Nov-51.csv\n", "/content/data/silver/2019-Nov-52.csv\n", "/content/data/silver/2019-Nov-53.csv\n", "/content/data/silver/2019-Nov-54.csv\n", "/content/data/silver/2019-Nov-55.csv\n", "/content/data/silver/2019-Nov-56.csv\n", "/content/data/silver/2019-Nov-57.csv\n", "/content/data/silver/2019-Nov-58.csv\n", "/content/data/silver/2019-Nov-59.csv\n", "/content/data/silver/2019-Nov-60.csv\n", "/content/data/silver/2019-Nov-61.csv\n", "/content/data/silver/2019-Nov-62.csv\n", "/content/data/silver/2019-Nov-63.csv\n", "/content/data/silver/2019-Nov-64.csv\n", "/content/data/silver/2019-Nov-65.csv\n", "/content/data/silver/2019-Nov-66.csv\n", "/content/data/silver/2019-Nov-67.csv\n", "/content/2020-Jan.csv\n", "/content/data/silver/2020-Jan-0.csv\n", "/content/data/silver/2020-Jan-1.csv\n", "/content/data/silver/2020-Jan-2.csv\n", "/content/data/silver/2020-Jan-3.csv\n", "/content/data/silver/2020-Jan-4.csv\n", "/content/data/silver/2020-Jan-5.csv\n", "/content/data/silver/2020-Jan-6.csv\n", "/content/data/silver/2020-Jan-7.csv\n", "/content/data/silver/2020-Jan-8.csv\n", "/content/data/silver/2020-Jan-9.csv\n", "/content/data/silver/2020-Jan-10.csv\n", "/content/data/silver/2020-Jan-11.csv\n", "/content/data/silver/2020-Jan-12.csv\n", "/content/data/silver/2020-Jan-13.csv\n", "/content/data/silver/2020-Jan-14.csv\n", "/content/data/silver/2020-Jan-15.csv\n", "/content/data/silver/2020-Jan-16.csv\n", "/content/data/silver/2020-Jan-17.csv\n", "/content/data/silver/2020-Jan-18.csv\n", "/content/data/silver/2020-Jan-19.csv\n", "/content/data/silver/2020-Jan-20.csv\n", "/content/data/silver/2020-Jan-21.csv\n", "/content/data/silver/2020-Jan-22.csv\n", "/content/data/silver/2020-Jan-23.csv\n", "/content/data/silver/2020-Jan-24.csv\n", "/content/data/silver/2020-Jan-25.csv\n", "/content/data/silver/2020-Jan-26.csv\n", "/content/data/silver/2020-Jan-27.csv\n", "/content/data/silver/2020-Jan-28.csv\n", "/content/data/silver/2020-Jan-29.csv\n", "/content/data/silver/2020-Jan-30.csv\n", "/content/data/silver/2020-Jan-31.csv\n", "/content/data/silver/2020-Jan-32.csv\n", "/content/data/silver/2020-Jan-33.csv\n", "/content/data/silver/2020-Jan-34.csv\n", "/content/data/silver/2020-Jan-35.csv\n", "/content/data/silver/2020-Jan-36.csv\n", "/content/data/silver/2020-Jan-37.csv\n", "/content/data/silver/2020-Jan-38.csv\n", "/content/data/silver/2020-Jan-39.csv\n", "/content/data/silver/2020-Jan-40.csv\n", "/content/data/silver/2020-Jan-41.csv\n", "/content/data/silver/2020-Jan-42.csv\n", "/content/data/silver/2020-Jan-43.csv\n", "/content/data/silver/2020-Jan-44.csv\n", "/content/data/silver/2020-Jan-45.csv\n", "/content/data/silver/2020-Jan-46.csv\n", "/content/data/silver/2020-Jan-47.csv\n", "/content/data/silver/2020-Jan-48.csv\n", "/content/data/silver/2020-Jan-49.csv\n", "/content/data/silver/2020-Jan-50.csv\n", "/content/data/silver/2020-Jan-51.csv\n", "/content/data/silver/2020-Jan-52.csv\n", "/content/data/silver/2020-Jan-53.csv\n", "/content/data/silver/2020-Jan-54.csv\n", "/content/data/silver/2020-Jan-55.csv\n", "/content/2019-Dec.csv\n", "/content/data/silver/2019-Dec-0.csv\n", "/content/data/silver/2019-Dec-1.csv\n", "/content/data/silver/2019-Dec-2.csv\n", "/content/data/silver/2019-Dec-3.csv\n", "/content/data/silver/2019-Dec-4.csv\n", "/content/data/silver/2019-Dec-5.csv\n", "/content/data/silver/2019-Dec-6.csv\n", "/content/data/silver/2019-Dec-7.csv\n", "/content/data/silver/2019-Dec-8.csv\n", "/content/data/silver/2019-Dec-9.csv\n", "/content/data/silver/2019-Dec-10.csv\n", "/content/data/silver/2019-Dec-11.csv\n", "/content/data/silver/2019-Dec-12.csv\n", "/content/data/silver/2019-Dec-13.csv\n", "/content/data/silver/2019-Dec-14.csv\n", "/content/data/silver/2019-Dec-15.csv\n", "/content/data/silver/2019-Dec-16.csv\n", "/content/data/silver/2019-Dec-17.csv\n", "/content/data/silver/2019-Dec-18.csv\n", "/content/data/silver/2019-Dec-19.csv\n", "/content/data/silver/2019-Dec-20.csv\n", "/content/data/silver/2019-Dec-21.csv\n", "/content/data/silver/2019-Dec-22.csv\n", "/content/data/silver/2019-Dec-23.csv\n", "/content/data/silver/2019-Dec-24.csv\n", "/content/data/silver/2019-Dec-25.csv\n", "/content/data/silver/2019-Dec-26.csv\n", "/content/data/silver/2019-Dec-27.csv\n", "/content/data/silver/2019-Dec-28.csv\n", "/content/data/silver/2019-Dec-29.csv\n", "/content/data/silver/2019-Dec-30.csv\n", "/content/data/silver/2019-Dec-31.csv\n", "/content/data/silver/2019-Dec-32.csv\n", "/content/data/silver/2019-Dec-33.csv\n", "/content/data/silver/2019-Dec-34.csv\n", "/content/data/silver/2019-Dec-35.csv\n", "/content/data/silver/2019-Dec-36.csv\n", "/content/data/silver/2019-Dec-37.csv\n", "/content/data/silver/2019-Dec-38.csv\n", "/content/data/silver/2019-Dec-39.csv\n", "/content/data/silver/2019-Dec-40.csv\n", "/content/data/silver/2019-Dec-41.csv\n", "/content/data/silver/2019-Dec-42.csv\n", "/content/data/silver/2019-Dec-43.csv\n", "/content/data/silver/2019-Dec-44.csv\n", "/content/data/silver/2019-Dec-45.csv\n", "/content/data/silver/2019-Dec-46.csv\n", "/content/data/silver/2019-Dec-47.csv\n", "/content/data/silver/2019-Dec-48.csv\n", "/content/data/silver/2019-Dec-49.csv\n", "/content/data/silver/2019-Dec-50.csv\n", "/content/data/silver/2019-Dec-51.csv\n", "/content/data/silver/2019-Dec-52.csv\n", "/content/data/silver/2019-Dec-53.csv\n", "/content/data/silver/2019-Dec-54.csv\n", "/content/data/silver/2019-Dec-55.csv\n", "/content/data/silver/2019-Dec-56.csv\n", "/content/data/silver/2019-Dec-57.csv\n", "/content/data/silver/2019-Dec-58.csv\n", "/content/data/silver/2019-Dec-59.csv\n", "/content/data/silver/2019-Dec-60.csv\n", "/content/data/silver/2019-Dec-61.csv\n", "/content/data/silver/2019-Dec-62.csv\n", "/content/data/silver/2019-Dec-63.csv\n", "/content/data/silver/2019-Dec-64.csv\n", "/content/data/silver/2019-Dec-65.csv\n", "/content/data/silver/2019-Dec-66.csv\n", "/content/data/silver/2019-Dec-67.csv\n", "/content/2020-Mar.csv\n", "/content/data/silver/2020-Mar-0.csv\n", "/content/data/silver/2020-Mar-1.csv\n", "/content/data/silver/2020-Mar-2.csv\n", "/content/data/silver/2020-Mar-3.csv\n", "/content/data/silver/2020-Mar-4.csv\n", "/content/data/silver/2020-Mar-5.csv\n", "/content/data/silver/2020-Mar-6.csv\n", "/content/data/silver/2020-Mar-7.csv\n", "/content/data/silver/2020-Mar-8.csv\n", "/content/data/silver/2020-Mar-9.csv\n", "/content/data/silver/2020-Mar-10.csv\n", "/content/data/silver/2020-Mar-11.csv\n", "/content/data/silver/2020-Mar-12.csv\n", "/content/data/silver/2020-Mar-13.csv\n", "/content/data/silver/2020-Mar-14.csv\n", "/content/data/silver/2020-Mar-15.csv\n", "/content/data/silver/2020-Mar-16.csv\n", "/content/data/silver/2020-Mar-17.csv\n", "/content/data/silver/2020-Mar-18.csv\n", "/content/data/silver/2020-Mar-19.csv\n", "/content/data/silver/2020-Mar-20.csv\n", "/content/data/silver/2020-Mar-21.csv\n", "/content/data/silver/2020-Mar-22.csv\n", "/content/data/silver/2020-Mar-23.csv\n", "/content/data/silver/2020-Mar-24.csv\n", "/content/data/silver/2020-Mar-25.csv\n", "/content/data/silver/2020-Mar-26.csv\n", "/content/data/silver/2020-Mar-27.csv\n", "/content/data/silver/2020-Mar-28.csv\n", "/content/data/silver/2020-Mar-29.csv\n", "/content/data/silver/2020-Mar-30.csv\n", "/content/data/silver/2020-Mar-31.csv\n", "/content/data/silver/2020-Mar-32.csv\n", "/content/data/silver/2020-Mar-33.csv\n", "/content/data/silver/2020-Mar-34.csv\n", "/content/data/silver/2020-Mar-35.csv\n", "/content/data/silver/2020-Mar-36.csv\n", "/content/data/silver/2020-Mar-37.csv\n", "/content/data/silver/2020-Mar-38.csv\n", "/content/data/silver/2020-Mar-39.csv\n", "/content/data/silver/2020-Mar-40.csv\n", "/content/data/silver/2020-Mar-41.csv\n", "/content/data/silver/2020-Mar-42.csv\n", "/content/data/silver/2020-Mar-43.csv\n", "/content/data/silver/2020-Mar-44.csv\n", "/content/data/silver/2020-Mar-45.csv\n", "/content/data/silver/2020-Mar-46.csv\n", "/content/data/silver/2020-Mar-47.csv\n", "/content/data/silver/2020-Mar-48.csv\n", "/content/data/silver/2020-Mar-49.csv\n", "/content/data/silver/2020-Mar-50.csv\n", "/content/data/silver/2020-Mar-51.csv\n", "/content/data/silver/2020-Mar-52.csv\n", "/content/data/silver/2020-Mar-53.csv\n", "/content/data/silver/2020-Mar-54.csv\n", "/content/data/silver/2020-Mar-55.csv\n", "/content/data/silver/2020-Mar-56.csv\n", "/content/2019-Oct.csv\n", "/content/data/silver/2019-Oct-0.csv\n", "/content/data/silver/2019-Oct-1.csv\n", "/content/data/silver/2019-Oct-2.csv\n", "/content/data/silver/2019-Oct-3.csv\n", "/content/data/silver/2019-Oct-4.csv\n", "/content/data/silver/2019-Oct-5.csv\n", "/content/data/silver/2019-Oct-6.csv\n", "/content/data/silver/2019-Oct-7.csv\n", "/content/data/silver/2019-Oct-8.csv\n", "/content/data/silver/2019-Oct-9.csv\n", "/content/data/silver/2019-Oct-10.csv\n", "/content/data/silver/2019-Oct-11.csv\n", "/content/data/silver/2019-Oct-12.csv\n", "/content/data/silver/2019-Oct-13.csv\n", "/content/data/silver/2019-Oct-14.csv\n", "/content/data/silver/2019-Oct-15.csv\n", "/content/data/silver/2019-Oct-16.csv\n", "/content/data/silver/2019-Oct-17.csv\n", "/content/data/silver/2019-Oct-18.csv\n", "/content/data/silver/2019-Oct-19.csv\n", "/content/data/silver/2019-Oct-20.csv\n", "/content/data/silver/2019-Oct-21.csv\n", "/content/data/silver/2019-Oct-22.csv\n", "/content/data/silver/2019-Oct-23.csv\n", "/content/data/silver/2019-Oct-24.csv\n", "/content/data/silver/2019-Oct-25.csv\n", "/content/data/silver/2019-Oct-26.csv\n", "/content/data/silver/2019-Oct-27.csv\n", "/content/data/silver/2019-Oct-28.csv\n", "/content/data/silver/2019-Oct-29.csv\n", "/content/data/silver/2019-Oct-30.csv\n", "/content/data/silver/2019-Oct-31.csv\n", "/content/data/silver/2019-Oct-32.csv\n", "/content/data/silver/2019-Oct-33.csv\n", "/content/data/silver/2019-Oct-34.csv\n", "/content/data/silver/2019-Oct-35.csv\n", "/content/data/silver/2019-Oct-36.csv\n", "/content/data/silver/2019-Oct-37.csv\n", "/content/data/silver/2019-Oct-38.csv\n", "/content/data/silver/2019-Oct-39.csv\n", "/content/data/silver/2019-Oct-40.csv\n", "/content/data/silver/2019-Oct-41.csv\n", "/content/data/silver/2019-Oct-42.csv\n", "/content/2020-Feb.csv\n", "/content/data/silver/2020-Feb-0.csv\n", "/content/data/silver/2020-Feb-1.csv\n", "/content/data/silver/2020-Feb-2.csv\n", "/content/data/silver/2020-Feb-3.csv\n", "/content/data/silver/2020-Feb-4.csv\n", "/content/data/silver/2020-Feb-5.csv\n", "/content/data/silver/2020-Feb-6.csv\n", "/content/data/silver/2020-Feb-7.csv\n", "/content/data/silver/2020-Feb-8.csv\n", "/content/data/silver/2020-Feb-9.csv\n", "/content/data/silver/2020-Feb-10.csv\n", "/content/data/silver/2020-Feb-11.csv\n", "/content/data/silver/2020-Feb-12.csv\n", "/content/data/silver/2020-Feb-13.csv\n", "/content/data/silver/2020-Feb-14.csv\n", "/content/data/silver/2020-Feb-15.csv\n", "/content/data/silver/2020-Feb-16.csv\n", "/content/data/silver/2020-Feb-17.csv\n", "/content/data/silver/2020-Feb-18.csv\n", "/content/data/silver/2020-Feb-19.csv\n", "/content/data/silver/2020-Feb-20.csv\n", "/content/data/silver/2020-Feb-21.csv\n", "/content/data/silver/2020-Feb-22.csv\n", "/content/data/silver/2020-Feb-23.csv\n", "/content/data/silver/2020-Feb-24.csv\n", "/content/data/silver/2020-Feb-25.csv\n", "/content/data/silver/2020-Feb-26.csv\n", "/content/data/silver/2020-Feb-27.csv\n", "/content/data/silver/2020-Feb-28.csv\n", "/content/data/silver/2020-Feb-29.csv\n", "/content/data/silver/2020-Feb-30.csv\n", "/content/data/silver/2020-Feb-31.csv\n", "/content/data/silver/2020-Feb-32.csv\n", "/content/data/silver/2020-Feb-33.csv\n", "/content/data/silver/2020-Feb-34.csv\n", "/content/data/silver/2020-Feb-35.csv\n", "/content/data/silver/2020-Feb-36.csv\n", "/content/data/silver/2020-Feb-37.csv\n", "/content/data/silver/2020-Feb-38.csv\n", "/content/data/silver/2020-Feb-39.csv\n", "/content/data/silver/2020-Feb-40.csv\n", "/content/data/silver/2020-Feb-41.csv\n", "/content/data/silver/2020-Feb-42.csv\n", "/content/data/silver/2020-Feb-43.csv\n", "/content/data/silver/2020-Feb-44.csv\n", "/content/data/silver/2020-Feb-45.csv\n", "/content/data/silver/2020-Feb-46.csv\n", "/content/data/silver/2020-Feb-47.csv\n", "/content/data/silver/2020-Feb-48.csv\n", "/content/data/silver/2020-Feb-49.csv\n", "/content/data/silver/2020-Feb-50.csv\n", "/content/data/silver/2020-Feb-51.csv\n", "/content/data/silver/2020-Feb-52.csv\n", "/content/data/silver/2020-Feb-53.csv\n", "/content/data/silver/2020-Feb-54.csv\n", "/content/data/silver/2020-Feb-55.csv\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "fluni49HqMrd" }, "source": [ "for file in list_files:\n", " !rm $file" ], "execution_count": 27, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-f6cVkEiqtzh", "outputId": "f7d12876-69c3-4184-ae7d-a5c5936d5ea0" }, "source": [ "list_chunks = glob.glob(os.path.join(base_path_silver,'*.csv'))\n", "list_chunks[:10]" ], "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['/content/data/silver/2020-Jan-40.csv',\n", " '/content/data/silver/2019-Dec-54.csv',\n", " '/content/data/silver/2019-Oct-27.csv',\n", " '/content/data/silver/2019-Nov-12.csv',\n", " '/content/data/silver/2020-Mar-36.csv',\n", " '/content/data/silver/2020-Apr-14.csv',\n", " '/content/data/silver/2020-Feb-40.csv',\n", " '/content/data/silver/2019-Nov-3.csv',\n", " '/content/data/silver/2020-Mar-3.csv',\n", " '/content/data/silver/2020-Apr-39.csv']" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "N5VRhVZsq0-Z", "outputId": "e2a638bd-8a64-4a39-abbc-4e0596289b82" }, "source": [ "!cd $base_path_silver && zip /content/data_silver.zip ./*.csv" ], "execution_count": 31, "outputs": [ { "output_type": "stream", "text": [ " adding: 2019-Dec-0.csv (deflated 76%)\n", " adding: 2019-Dec-10.csv (deflated 76%)\n", " adding: 2019-Dec-11.csv (deflated 76%)\n", " adding: 2019-Dec-12.csv (deflated 76%)\n", " adding: 2019-Dec-13.csv (deflated 76%)\n", " adding: 2019-Dec-14.csv (deflated 76%)\n", " adding: 2019-Dec-15.csv (deflated 76%)\n", " adding: 2019-Dec-16.csv (deflated 77%)\n", " adding: 2019-Dec-17.csv (deflated 76%)\n", " adding: 2019-Dec-18.csv (deflated 76%)\n", " adding: 2019-Dec-19.csv (deflated 76%)\n", " adding: 2019-Dec-1.csv (deflated 76%)\n", " adding: 2019-Dec-20.csv (deflated 76%)\n", " adding: 2019-Dec-21.csv (deflated 76%)\n", " adding: 2019-Dec-22.csv (deflated 76%)\n", " adding: 2019-Dec-23.csv (deflated 77%)\n", " adding: 2019-Dec-24.csv (deflated 76%)\n", " adding: 2019-Dec-25.csv (deflated 77%)\n", " adding: 2019-Dec-26.csv (deflated 76%)\n", " adding: 2019-Dec-27.csv (deflated 77%)\n", " adding: 2019-Dec-28.csv (deflated 77%)\n", " adding: 2019-Dec-29.csv (deflated 76%)\n", " adding: 2019-Dec-2.csv (deflated 76%)\n", " adding: 2019-Dec-30.csv (deflated 76%)\n", " adding: 2019-Dec-31.csv (deflated 77%)\n", " adding: 2019-Dec-32.csv (deflated 76%)\n", " adding: 2019-Dec-33.csv (deflated 76%)\n", " adding: 2019-Dec-34.csv (deflated 77%)\n", " adding: 2019-Dec-35.csv (deflated 76%)\n", " adding: 2019-Dec-36.csv (deflated 76%)\n", " adding: 2019-Dec-37.csv (deflated 76%)\n", " adding: 2019-Dec-38.csv (deflated 76%)\n", " adding: 2019-Dec-39.csv (deflated 76%)\n", " adding: 2019-Dec-3.csv (deflated 76%)\n", " adding: 2019-Dec-40.csv (deflated 76%)\n", " adding: 2019-Dec-41.csv (deflated 76%)\n", " adding: 2019-Dec-42.csv (deflated 76%)\n", " adding: 2019-Dec-43.csv (deflated 76%)\n", " adding: 2019-Dec-44.csv (deflated 76%)\n", " adding: 2019-Dec-45.csv (deflated 76%)\n", " adding: 2019-Dec-46.csv (deflated 76%)\n", " adding: 2019-Dec-47.csv (deflated 76%)\n", " adding: 2019-Dec-48.csv (deflated 76%)\n", " adding: 2019-Dec-49.csv (deflated 76%)\n", " adding: 2019-Dec-4.csv (deflated 76%)\n", " adding: 2019-Dec-50.csv (deflated 76%)\n", " adding: 2019-Dec-51.csv (deflated 76%)\n", " adding: 2019-Dec-52.csv (deflated 76%)\n", " adding: 2019-Dec-53.csv (deflated 76%)\n", " adding: 2019-Dec-54.csv (deflated 76%)\n", " adding: 2019-Dec-55.csv (deflated 76%)\n", " adding: 2019-Dec-56.csv (deflated 76%)\n", " adding: 2019-Dec-57.csv (deflated 77%)\n", " adding: 2019-Dec-58.csv (deflated 76%)\n", " adding: 2019-Dec-59.csv (deflated 77%)\n", " adding: 2019-Dec-5.csv (deflated 76%)\n", " adding: 2019-Dec-60.csv (deflated 76%)\n", " adding: 2019-Dec-61.csv (deflated 77%)\n", " adding: 2019-Dec-62.csv (deflated 77%)\n", " adding: 2019-Dec-63.csv (deflated 76%)\n", " adding: 2019-Dec-64.csv (deflated 77%)\n", " adding: 2019-Dec-65.csv (deflated 77%)\n", " adding: 2019-Dec-66.csv (deflated 77%)\n", " adding: 2019-Dec-67.csv (deflated 78%)\n", " adding: 2019-Dec-6.csv (deflated 76%)\n", " adding: 2019-Dec-7.csv (deflated 76%)\n", " adding: 2019-Dec-8.csv (deflated 76%)\n", " adding: 2019-Dec-9.csv (deflated 76%)\n", " adding: 2019-Nov-0.csv (deflated 77%)\n", " adding: 2019-Nov-10.csv (deflated 77%)\n", " adding: 2019-Nov-11.csv (deflated 76%)\n", " adding: 2019-Nov-12.csv (deflated 76%)\n", " adding: 2019-Nov-13.csv (deflated 75%)\n", " adding: 2019-Nov-14.csv (deflated 76%)\n", " adding: 2019-Nov-15.csv (deflated 75%)\n", " adding: 2019-Nov-16.csv (deflated 76%)\n", " adding: 2019-Nov-17.csv (deflated 75%)\n", " adding: 2019-Nov-18.csv (deflated 76%)\n", " adding: 2019-Nov-19.csv (deflated 75%)\n", " adding: 2019-Nov-1.csv (deflated 77%)\n", " adding: 2019-Nov-20.csv (deflated 76%)\n", " adding: 2019-Nov-21.csv (deflated 75%)\n", " adding: 2019-Nov-22.csv (deflated 76%)\n", " adding: 2019-Nov-23.csv (deflated 75%)\n", " adding: 2019-Nov-24.csv (deflated 76%)\n", " adding: 2019-Nov-25.csv (deflated 78%)\n", " adding: 2019-Nov-26.csv (deflated 76%)\n", " adding: 2019-Nov-27.csv (deflated 76%)\n", " adding: 2019-Nov-28.csv (deflated 76%)\n", " adding: 2019-Nov-29.csv (deflated 77%)\n", " adding: 2019-Nov-2.csv (deflated 78%)\n", " adding: 2019-Nov-30.csv (deflated 77%)\n", " adding: 2019-Nov-31.csv (deflated 76%)\n", " adding: 2019-Nov-32.csv (deflated 77%)\n", " adding: 2019-Nov-33.csv (deflated 77%)\n", " adding: 2019-Nov-34.csv (deflated 76%)\n", " adding: 2019-Nov-35.csv (deflated 75%)\n", " adding: 2019-Nov-36.csv (deflated 75%)\n", " adding: 2019-Nov-37.csv (deflated 77%)\n", " adding: 2019-Nov-38.csv (deflated 76%)\n", " adding: 2019-Nov-39.csv (deflated 76%)\n", " adding: 2019-Nov-3.csv (deflated 77%)\n", " adding: 2019-Nov-40.csv (deflated 75%)\n", " adding: 2019-Nov-41.csv (deflated 75%)\n", " adding: 2019-Nov-42.csv (deflated 75%)\n", " adding: 2019-Nov-43.csv (deflated 75%)\n", " adding: 2019-Nov-44.csv (deflated 75%)\n", " adding: 2019-Nov-45.csv (deflated 76%)\n", " adding: 2019-Nov-46.csv (deflated 76%)\n", " adding: 2019-Nov-47.csv (deflated 75%)\n", " adding: 2019-Nov-48.csv (deflated 76%)\n", " adding: 2019-Nov-49.csv (deflated 75%)\n", " adding: 2019-Nov-4.csv (deflated 77%)\n", " adding: 2019-Nov-50.csv (deflated 76%)\n", " adding: 2019-Nov-51.csv (deflated 76%)\n", " adding: 2019-Nov-52.csv (deflated 76%)\n", " adding: 2019-Nov-53.csv (deflated 76%)\n", " adding: 2019-Nov-54.csv (deflated 76%)\n", " adding: 2019-Nov-55.csv (deflated 76%)\n", " adding: 2019-Nov-56.csv (deflated 76%)\n", " adding: 2019-Nov-57.csv (deflated 75%)\n", " adding: 2019-Nov-58.csv (deflated 76%)\n", " adding: 2019-Nov-59.csv (deflated 76%)\n", " adding: 2019-Nov-5.csv (deflated 78%)\n", " adding: 2019-Nov-60.csv (deflated 76%)\n", " adding: 2019-Nov-61.csv (deflated 76%)\n", " adding: 2019-Nov-62.csv (deflated 76%)\n", " adding: 2019-Nov-63.csv (deflated 76%)\n", " adding: 2019-Nov-64.csv (deflated 76%)\n", " adding: 2019-Nov-65.csv (deflated 76%)\n", " adding: 2019-Nov-66.csv (deflated 76%)\n", " adding: 2019-Nov-67.csv (deflated 76%)\n", " adding: 2019-Nov-6.csv (deflated 77%)\n", " adding: 2019-Nov-7.csv (deflated 77%)\n", " adding: 2019-Nov-8.csv (deflated 77%)\n", " adding: 2019-Nov-9.csv (deflated 77%)\n", " adding: 2019-Oct-0.csv (deflated 78%)\n", " adding: 2019-Oct-10.csv (deflated 78%)\n", " adding: 2019-Oct-11.csv (deflated 78%)\n", " adding: 2019-Oct-12.csv (deflated 78%)\n", " adding: 2019-Oct-13.csv (deflated 78%)\n", " adding: 2019-Oct-14.csv (deflated 78%)\n", " adding: 2019-Oct-15.csv (deflated 78%)\n", " adding: 2019-Oct-16.csv (deflated 78%)\n", " adding: 2019-Oct-17.csv (deflated 77%)\n", " adding: 2019-Oct-18.csv (deflated 77%)\n", " adding: 2019-Oct-19.csv (deflated 78%)\n", " adding: 2019-Oct-1.csv (deflated 78%)\n", " adding: 2019-Oct-20.csv (deflated 77%)\n", " adding: 2019-Oct-21.csv (deflated 77%)\n", " adding: 2019-Oct-22.csv (deflated 77%)\n", " adding: 2019-Oct-23.csv (deflated 77%)\n", " adding: 2019-Oct-24.csv (deflated 77%)\n", " adding: 2019-Oct-25.csv (deflated 77%)\n", " adding: 2019-Oct-26.csv (deflated 77%)\n", " adding: 2019-Oct-27.csv (deflated 77%)\n", " adding: 2019-Oct-28.csv (deflated 77%)\n", " adding: 2019-Oct-29.csv (deflated 77%)\n", " adding: 2019-Oct-2.csv (deflated 78%)\n", " adding: 2019-Oct-30.csv (deflated 77%)\n", " adding: 2019-Oct-31.csv (deflated 77%)\n", " adding: 2019-Oct-32.csv (deflated 77%)\n", " adding: 2019-Oct-33.csv (deflated 77%)\n", " adding: 2019-Oct-34.csv (deflated 77%)\n", " adding: 2019-Oct-35.csv (deflated 77%)\n", " adding: 2019-Oct-36.csv (deflated 77%)\n", " adding: 2019-Oct-37.csv (deflated 77%)\n", " adding: 2019-Oct-38.csv (deflated 78%)\n", " adding: 2019-Oct-39.csv (deflated 78%)\n", " adding: 2019-Oct-3.csv (deflated 78%)\n", " adding: 2019-Oct-40.csv (deflated 77%)\n", " adding: 2019-Oct-41.csv (deflated 77%)\n", " adding: 2019-Oct-42.csv (deflated 77%)\n", " adding: 2019-Oct-4.csv (deflated 78%)\n", " adding: 2019-Oct-5.csv (deflated 78%)\n", " adding: 2019-Oct-6.csv (deflated 78%)\n", " adding: 2019-Oct-7.csv (deflated 78%)\n", " adding: 2019-Oct-8.csv (deflated 78%)\n", " adding: 2019-Oct-9.csv (deflated 78%)\n", " adding: 2020-Apr-0.csv (deflated 77%)\n", " adding: 2020-Apr-10.csv (deflated 78%)\n", " adding: 2020-Apr-11.csv (deflated 77%)\n", " adding: 2020-Apr-12.csv (deflated 77%)\n", " adding: 2020-Apr-13.csv (deflated 77%)\n", " adding: 2020-Apr-14.csv (deflated 77%)\n", " adding: 2020-Apr-15.csv (deflated 77%)\n", " adding: 2020-Apr-16.csv (deflated 77%)\n", " adding: 2020-Apr-17.csv (deflated 77%)\n", " adding: 2020-Apr-18.csv (deflated 77%)\n", " adding: 2020-Apr-19.csv (deflated 77%)\n", " adding: 2020-Apr-1.csv (deflated 77%)\n", " adding: 2020-Apr-20.csv (deflated 77%)\n", " adding: 2020-Apr-21.csv (deflated 77%)\n", " adding: 2020-Apr-22.csv (deflated 78%)\n", " adding: 2020-Apr-23.csv (deflated 77%)\n", " adding: 2020-Apr-24.csv (deflated 77%)\n", " adding: 2020-Apr-25.csv (deflated 77%)\n", " adding: 2020-Apr-26.csv (deflated 77%)\n", " adding: 2020-Apr-27.csv (deflated 76%)\n", " adding: 2020-Apr-28.csv (deflated 77%)\n", " adding: 2020-Apr-29.csv (deflated 77%)\n", " adding: 2020-Apr-2.csv (deflated 77%)\n", " adding: 2020-Apr-30.csv (deflated 76%)\n", " adding: 2020-Apr-31.csv (deflated 76%)\n", " adding: 2020-Apr-32.csv (deflated 77%)\n", " adding: 2020-Apr-33.csv (deflated 77%)\n", " adding: 2020-Apr-34.csv (deflated 77%)\n", " adding: 2020-Apr-35.csv (deflated 77%)\n", " adding: 2020-Apr-36.csv (deflated 77%)\n", " adding: 2020-Apr-37.csv (deflated 77%)\n", " adding: 2020-Apr-38.csv (deflated 76%)\n", " adding: 2020-Apr-39.csv (deflated 76%)\n", " adding: 2020-Apr-3.csv (deflated 77%)\n", " adding: 2020-Apr-40.csv (deflated 76%)\n", " adding: 2020-Apr-41.csv (deflated 76%)\n", " adding: 2020-Apr-42.csv (deflated 76%)\n", " adding: 2020-Apr-43.csv (deflated 75%)\n", " adding: 2020-Apr-44.csv (deflated 75%)\n", " adding: 2020-Apr-45.csv (deflated 75%)\n", " adding: 2020-Apr-46.csv (deflated 75%)\n", " adding: 2020-Apr-47.csv (deflated 75%)\n", " adding: 2020-Apr-48.csv (deflated 75%)\n", " adding: 2020-Apr-49.csv (deflated 75%)\n", " adding: 2020-Apr-4.csv (deflated 77%)\n", " adding: 2020-Apr-50.csv (deflated 75%)\n", " adding: 2020-Apr-51.csv (deflated 75%)\n", " adding: 2020-Apr-52.csv (deflated 75%)\n", " adding: 2020-Apr-53.csv (deflated 75%)\n", " adding: 2020-Apr-54.csv (deflated 75%)\n", " adding: 2020-Apr-55.csv (deflated 75%)\n", " adding: 2020-Apr-56.csv (deflated 75%)\n", " adding: 2020-Apr-57.csv (deflated 75%)\n", " adding: 2020-Apr-58.csv (deflated 75%)\n", " adding: 2020-Apr-59.csv (deflated 75%)\n", " adding: 2020-Apr-5.csv (deflated 77%)\n", " adding: 2020-Apr-60.csv (deflated 75%)\n", " adding: 2020-Apr-61.csv (deflated 75%)\n", " adding: 2020-Apr-62.csv (deflated 75%)\n", " adding: 2020-Apr-63.csv (deflated 75%)\n", " adding: 2020-Apr-64.csv (deflated 75%)\n", " adding: 2020-Apr-65.csv (deflated 75%)\n", " adding: 2020-Apr-66.csv (deflated 75%)\n", " adding: 2020-Apr-6.csv (deflated 77%)\n", " adding: 2020-Apr-7.csv (deflated 77%)\n", " adding: 2020-Apr-8.csv (deflated 77%)\n", " adding: 2020-Apr-9.csv (deflated 77%)\n", " adding: 2020-Feb-0.csv (deflated 78%)\n", " adding: 2020-Feb-10.csv (deflated 77%)\n", " adding: 2020-Feb-11.csv (deflated 77%)\n", " adding: 2020-Feb-12.csv (deflated 78%)\n", " adding: 2020-Feb-13.csv (deflated 77%)\n", " adding: 2020-Feb-14.csv (deflated 78%)\n", " adding: 2020-Feb-15.csv (deflated 77%)\n", " adding: 2020-Feb-16.csv (deflated 78%)\n", " adding: 2020-Feb-17.csv (deflated 80%)\n", " adding: 2020-Feb-18.csv (deflated 82%)\n", " adding: 2020-Feb-19.csv (deflated 82%)\n", " adding: 2020-Feb-1.csv (deflated 79%)\n", " adding: 2020-Feb-20.csv (deflated 82%)\n", " adding: 2020-Feb-21.csv (deflated 82%)\n", " adding: 2020-Feb-22.csv (deflated 82%)\n", " adding: 2020-Feb-23.csv (deflated 82%)\n", " adding: 2020-Feb-24.csv (deflated 82%)\n", " adding: 2020-Feb-25.csv (deflated 82%)\n", " adding: 2020-Feb-26.csv (deflated 82%)\n", " adding: 2020-Feb-27.csv (deflated 82%)\n", " adding: 2020-Feb-28.csv (deflated 82%)\n", " adding: 2020-Feb-29.csv (deflated 84%)\n", " adding: 2020-Feb-2.csv (deflated 78%)\n", " adding: 2020-Feb-30.csv (deflated 82%)\n", " adding: 2020-Feb-31.csv (deflated 82%)\n", " adding: 2020-Feb-32.csv (deflated 82%)\n", " adding: 2020-Feb-33.csv (deflated 82%)\n", " adding: 2020-Feb-34.csv (deflated 82%)\n", " adding: 2020-Feb-35.csv (deflated 79%)\n", " adding: 2020-Feb-36.csv (deflated 77%)\n", " adding: 2020-Feb-37.csv (deflated 77%)\n", " adding: 2020-Feb-38.csv (deflated 77%)\n", " adding: 2020-Feb-39.csv (deflated 77%)\n", " adding: 2020-Feb-3.csv (deflated 78%)\n", " adding: 2020-Feb-40.csv (deflated 77%)\n", " adding: 2020-Feb-41.csv (deflated 77%)\n", " adding: 2020-Feb-42.csv (deflated 77%)\n", " adding: 2020-Feb-43.csv (deflated 77%)\n", " adding: 2020-Feb-44.csv (deflated 77%)\n", " adding: 2020-Feb-45.csv (deflated 77%)\n", " adding: 2020-Feb-46.csv (deflated 77%)\n", " adding: 2020-Feb-47.csv (deflated 77%)\n", " adding: 2020-Feb-48.csv (deflated 77%)\n", " adding: 2020-Feb-49.csv (deflated 77%)\n", " adding: 2020-Feb-4.csv (deflated 78%)\n", " adding: 2020-Feb-50.csv (deflated 77%)\n", " adding: 2020-Feb-51.csv (deflated 76%)\n", " adding: 2020-Feb-52.csv (deflated 77%)\n", " adding: 2020-Feb-53.csv (deflated 77%)\n", " adding: 2020-Feb-54.csv (deflated 77%)\n", " adding: 2020-Feb-55.csv (deflated 77%)\n", " adding: 2020-Feb-5.csv (deflated 78%)\n", " adding: 2020-Feb-6.csv (deflated 78%)\n", " adding: 2020-Feb-7.csv (deflated 78%)\n", " adding: 2020-Feb-8.csv (deflated 78%)\n", " adding: 2020-Feb-9.csv (deflated 78%)\n", " adding: 2020-Jan-0.csv (deflated 78%)\n", " adding: 2020-Jan-10.csv (deflated 76%)\n", " adding: 2020-Jan-11.csv (deflated 76%)\n", " adding: 2020-Jan-12.csv (deflated 76%)\n", " adding: 2020-Jan-13.csv (deflated 76%)\n", " adding: 2020-Jan-14.csv (deflated 76%)\n", " adding: 2020-Jan-15.csv (deflated 76%)\n", " adding: 2020-Jan-16.csv (deflated 76%)\n", " adding: 2020-Jan-17.csv (deflated 76%)\n", " adding: 2020-Jan-18.csv (deflated 77%)\n", " adding: 2020-Jan-19.csv (deflated 76%)\n", " adding: 2020-Jan-1.csv (deflated 78%)\n", " adding: 2020-Jan-20.csv (deflated 76%)\n", " adding: 2020-Jan-21.csv (deflated 76%)\n", " adding: 2020-Jan-22.csv (deflated 76%)\n", " adding: 2020-Jan-23.csv (deflated 76%)\n", " adding: 2020-Jan-24.csv (deflated 76%)\n", " adding: 2020-Jan-25.csv (deflated 76%)\n", " adding: 2020-Jan-26.csv (deflated 76%)\n", " adding: 2020-Jan-27.csv (deflated 77%)\n", " adding: 2020-Jan-28.csv (deflated 77%)\n", " adding: 2020-Jan-29.csv (deflated 77%)\n", " adding: 2020-Jan-2.csv (deflated 79%)\n", " adding: 2020-Jan-30.csv (deflated 77%)\n", " adding: 2020-Jan-31.csv (deflated 77%)\n", " adding: 2020-Jan-32.csv (deflated 77%)\n", " adding: 2020-Jan-33.csv (deflated 77%)\n", " adding: 2020-Jan-34.csv (deflated 77%)\n", " adding: 2020-Jan-35.csv (deflated 77%)\n", " adding: 2020-Jan-36.csv (deflated 77%)\n", " adding: 2020-Jan-37.csv (deflated 77%)\n", " adding: 2020-Jan-38.csv (deflated 77%)\n", " adding: 2020-Jan-39.csv (deflated 77%)\n", " adding: 2020-Jan-3.csv (deflated 78%)\n", " adding: 2020-Jan-40.csv (deflated 77%)\n", " adding: 2020-Jan-41.csv (deflated 76%)\n", " adding: 2020-Jan-42.csv (deflated 76%)\n", " adding: 2020-Jan-43.csv (deflated 76%)\n", " adding: 2020-Jan-44.csv (deflated 76%)\n", " adding: 2020-Jan-45.csv (deflated 76%)\n", " adding: 2020-Jan-46.csv (deflated 76%)\n", " adding: 2020-Jan-47.csv (deflated 76%)\n", " adding: 2020-Jan-48.csv (deflated 76%)\n", " adding: 2020-Jan-49.csv (deflated 77%)\n", " adding: 2020-Jan-4.csv (deflated 77%)\n", " adding: 2020-Jan-50.csv (deflated 76%)\n", " adding: 2020-Jan-51.csv (deflated 76%)\n", " adding: 2020-Jan-52.csv (deflated 78%)\n", " adding: 2020-Jan-53.csv (deflated 80%)\n", " adding: 2020-Jan-54.csv (deflated 79%)\n", " adding: 2020-Jan-55.csv (deflated 79%)\n", " adding: 2020-Jan-5.csv (deflated 76%)\n", " adding: 2020-Jan-6.csv (deflated 76%)\n", " adding: 2020-Jan-7.csv (deflated 76%)\n", " adding: 2020-Jan-8.csv (deflated 76%)\n", " adding: 2020-Jan-9.csv (deflated 76%)\n", " adding: 2020-Mar-0.csv (deflated 77%)\n", " adding: 2020-Mar-10.csv (deflated 76%)\n", " adding: 2020-Mar-11.csv (deflated 76%)\n", " adding: 2020-Mar-12.csv (deflated 76%)\n", " adding: 2020-Mar-13.csv (deflated 77%)\n", " adding: 2020-Mar-14.csv (deflated 77%)\n", " adding: 2020-Mar-15.csv (deflated 77%)\n", " adding: 2020-Mar-16.csv (deflated 77%)\n", " adding: 2020-Mar-17.csv (deflated 77%)\n", " adding: 2020-Mar-18.csv (deflated 77%)\n", " adding: 2020-Mar-19.csv (deflated 77%)\n", " adding: 2020-Mar-1.csv (deflated 77%)\n", " adding: 2020-Mar-20.csv (deflated 76%)\n", " adding: 2020-Mar-21.csv (deflated 77%)\n", " adding: 2020-Mar-22.csv (deflated 76%)\n", " adding: 2020-Mar-23.csv (deflated 77%)\n", " adding: 2020-Mar-24.csv (deflated 76%)\n", " adding: 2020-Mar-25.csv (deflated 76%)\n", " adding: 2020-Mar-26.csv (deflated 76%)\n", " adding: 2020-Mar-27.csv (deflated 77%)\n", " adding: 2020-Mar-28.csv (deflated 76%)\n", " adding: 2020-Mar-29.csv (deflated 76%)\n", " adding: 2020-Mar-2.csv (deflated 77%)\n", " adding: 2020-Mar-30.csv (deflated 76%)\n", " adding: 2020-Mar-31.csv (deflated 76%)\n", " adding: 2020-Mar-32.csv (deflated 76%)\n", " adding: 2020-Mar-33.csv (deflated 76%)\n", " adding: 2020-Mar-34.csv (deflated 77%)\n", " adding: 2020-Mar-35.csv (deflated 76%)\n", " adding: 2020-Mar-36.csv (deflated 76%)\n", " adding: 2020-Mar-37.csv (deflated 76%)\n", " adding: 2020-Mar-38.csv (deflated 77%)\n", " adding: 2020-Mar-39.csv (deflated 76%)\n", " adding: 2020-Mar-3.csv (deflated 77%)\n", " adding: 2020-Mar-40.csv (deflated 76%)\n", " adding: 2020-Mar-41.csv (deflated 77%)\n", " adding: 2020-Mar-42.csv (deflated 77%)\n", " adding: 2020-Mar-43.csv (deflated 76%)\n", " adding: 2020-Mar-44.csv (deflated 77%)\n", " adding: 2020-Mar-45.csv (deflated 77%)\n", " adding: 2020-Mar-46.csv (deflated 77%)\n", " adding: 2020-Mar-47.csv (deflated 77%)\n", " adding: 2020-Mar-48.csv (deflated 77%)\n", " adding: 2020-Mar-49.csv (deflated 77%)\n", " adding: 2020-Mar-4.csv (deflated 76%)\n", " adding: 2020-Mar-50.csv (deflated 77%)\n", " adding: 2020-Mar-51.csv (deflated 77%)\n", " adding: 2020-Mar-52.csv (deflated 77%)\n", " adding: 2020-Mar-53.csv (deflated 77%)\n", " adding: 2020-Mar-54.csv (deflated 77%)\n", " adding: 2020-Mar-55.csv (deflated 77%)\n", " adding: 2020-Mar-56.csv (deflated 77%)\n", " adding: 2020-Mar-5.csv (deflated 76%)\n", " adding: 2020-Mar-6.csv (deflated 77%)\n", " adding: 2020-Mar-7.csv (deflated 76%)\n", " adding: 2020-Mar-8.csv (deflated 76%)\n", " adding: 2020-Mar-9.csv (deflated 76%)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "vYCsnvmgrpAW" }, "source": [ "!cp data_silver.zip /content/drive/MyDrive/Recommendation" ], "execution_count": 32, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "S4FtC2JHOzpY" }, "source": [ "lp = []\n", "for file in list_chunks:\n", " lp.append(pd.read_csv(file))" ], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7Bc_EWETuEcP", "outputId": "91a58438-9285-4773-d175-bad2a70d7b72" }, "source": [ "df = pd.concat(lp)\n", "df.shape" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(16742775, 16)" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 462 }, "id": "mDU1BYyisccz", "outputId": "a4d2c045-1b99-47d9-ac46-d93e732e8525" }, "source": [ "df.head()" ], "execution_count": 35, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_timeevent_typeproduct_idcategory_codebrandpriceuser_iduser_sessiontargettimestampts_hourts_minutets_weekdayts_dayts_monthts_year
02020-01-23 04:32:33 UTCcart100039953appliances.personal.massagerkivi267.445129131399bebab16-bce8-43cd-ac97-f763ee4664dd02020-01-23 04:32:3343232312020
12020-01-23 04:32:36 UTCcart1004767construction.tools.lightsamsung226.27549154456659387e4-032e-4e71-8b0b-3b90f0538f1402020-01-23 04:32:3643232312020
22020-01-23 04:32:38 UTCcart1003306construction.tools.lightapple614.7958562479841a24f4f-afb8-4342-ae6f-003aa968f82f02020-01-23 04:32:3843232312020
32020-01-23 04:32:38 UTCcart4804055sport.bicycleapple192.0752586080670b5ed9a-4395-481e-9f91-d811ced8a4c302020-01-23 04:32:3843232312020
42020-01-23 04:32:43 UTCcart1005100construction.tools.lightsamsung141.06599724791bc35693d-9c73-457f-a7ef-8ced2f0bb72902020-01-23 04:32:4343232312020
\n", "
" ], "text/plain": [ " event_time event_type product_id ... ts_day ts_month ts_year\n", "0 2020-01-23 04:32:33 UTC cart 100039953 ... 23 1 2020\n", "1 2020-01-23 04:32:36 UTC cart 1004767 ... 23 1 2020\n", "2 2020-01-23 04:32:38 UTC cart 1003306 ... 23 1 2020\n", "3 2020-01-23 04:32:38 UTC cart 4804055 ... 23 1 2020\n", "4 2020-01-23 04:32:43 UTC cart 1005100 ... 23 1 2020\n", "\n", "[5 rows x 16 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 35 } ] }, { "cell_type": "code", "metadata": { "id": "lMDUiiz7sarl" }, "source": [ "# df2 = df['category_code'].str.split(\"\\.\", n=3, expand=True).fillna('NA')\n", "# df2.columns = ['cat_{}'.format(x+1) for x in df2.columns]\n", "# df2.to_parquet('/content/data/silver_l2/df_cat.parquet', index=False)" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "YFkf_K_jPEGS" }, "source": [ "df_test = df[df['ts_month']==4]\n", "df_valid = df[df['ts_month']==3]\n", "df_train = df[(df['ts_month']!=3)&(df['ts_month']!=4)]" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OXVudXP-yIJG", "outputId": "14323bda-293f-4d13-e202-5b9f0b4779e4" }, "source": [ "df_train.shape, df_valid.shape, df_test.shape" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "((11495245, 16), (2466048, 16), (2781482, 16))" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "code", "metadata": { "id": "KBEAURWbxtBa" }, "source": [ "!mkdir -p /content/data/silver_l2\n", "df_train.to_parquet('/content/data/silver_l2/train.parquet', index=False)\n", "df_valid.to_parquet('/content/data/silver_l2/valid.parquet', index=False)\n", "df_test.to_parquet('/content/data/silver_l2/test.parquet', index=False)" ], "execution_count": 8, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Q8E5moaL3pb6" }, "source": [ "### Data load" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pN-Foz-x2-uO", "outputId": "b2e3eed1-23fa-4b3c-ce21-09c3337e0d21" }, "source": [ "!cd /content/data/silver_l2 && zip /content/data_silver_l2.zip ./*.parquet\n", "!cp /content/data_silver_l2.zip /content/drive/MyDrive/Recommendation" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ " adding: test.parquet (deflated 24%)\n", " adding: train.parquet (deflated 25%)\n", " adding: valid.parquet (deflated 24%)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t5g5KwR66gTi", "outputId": "784bda5b-5cb6-49fd-9afa-ded102c66b9e" }, "source": [ "import pandas as pd\n", "\n", "df_train = pd.read_parquet('/content/data/silver_l2/train.parquet')\n", "df_valid = pd.read_parquet('/content/data/silver_l2/valid.parquet')\n", "df_test = pd.read_parquet('/content/data/silver_l2/test.parquet')\n", "\n", "df = pd.concat([df_train, df_valid, df_test],ignore_index=True)\n", "\n", "df.shape" ], "execution_count": 1, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(16742775, 16)" ] }, "metadata": { "tags": [] }, "execution_count": 1 } ] }, { "cell_type": "markdown", "metadata": { "id": "tcmIcf3s3sZo" }, "source": [ "### EDA" ] }, { "cell_type": "code", "metadata": { "id": "L_qjZpHn5izl" }, "source": [ "import IPython\n", "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "%matplotlib inline" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "H16gdtbZ3aMb", "outputId": "6c4c7561-2041-43c2-876d-3e152f0a880d" }, "source": [ "df.dtypes" ], "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "event_time object\n", "event_type object\n", "product_id int64\n", "category_code object\n", "brand object\n", "price float64\n", "user_id int64\n", "user_session object\n", "target int64\n", "timestamp object\n", "ts_hour int64\n", "ts_minute int64\n", "ts_weekday int64\n", "ts_day int64\n", "ts_month int64\n", "ts_year int64\n", "dtype: object" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] }, { "cell_type": "code", "metadata": { "id": "LIdlCF-l4wvD" }, "source": [ "df['timestamp'] = pd.to_datetime(df['timestamp'])" ], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zXcy3O_q40cX", "outputId": "5a67dad2-b1d7-4e34-9aba-5c3662474169" }, "source": [ "df.target.mean()" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.36775719676099095" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v9P5YniL5Bh9", "outputId": "3ad07e88-1560-4303-cafa-756c8a0effb2" }, "source": [ "df['event_type'].value_counts(normalize=True)" ], "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "cart 0.632243\n", "purchase 0.367757\n", "Name: event_type, dtype: float64" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "markdown", "metadata": { "id": "XxVZV8Vn5HdU" }, "source": [ "Around 37.0% of datapoints are purchases. Lets take a look on the sparsity of the data." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uukjSMRT5FQG", "outputId": "8b8ab191-dedf-4b8f-c6f1-d9ad81da7f68" }, "source": [ "print('# of datapoints:' + str(df.shape))\n", "print('# of unique users:' + str(df['user_id'].drop_duplicates().shape))\n", "print('# of unique products:' + str(df['product_id'].drop_duplicates().shape))\n", "print('# of unique sessions:' + str(df['user_session'].drop_duplicates().shape))" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "# of datapoints:(16742775, 16)\n", "# of unique users:(3584809,)\n", "# of unique products:(214907,)\n", "# of unique sessions:(10715053,)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "gIWDc9iF5Ktq" }, "source": [ "def plot_sparse(df, col):\n", " stats = df[[col, 'target']].groupby(col).agg(['count', 'mean', 'sum'])\n", " stats = stats.reset_index()\n", " stats.columns = [col, 'count', 'mean', 'sum']\n", " stats_sort = stats['count'].value_counts().reset_index()\n", " stats_sort = stats_sort.sort_values('index')\n", " plt.figure(figsize=(15,4))\n", " plt.bar(stats_sort['index'].astype(str).values[0:20], stats_sort['count'].values[0:20])\n", " plt.title('Frequency of ' + str(col))\n", " plt.xlabel('Number frequency')\n", " plt.ylabel('Frequency')" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 295 }, "id": "ThKVAx3K5bqH", "outputId": "d174827b-6d6a-49be-acd4-e057872a4fba" }, "source": [ "plot_sparse(df, 'product_id')" ], "execution_count": 9, "outputs": [ { "output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAA4cAAAEWCAYAAADVSOJZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dd7hkVZm28fuBBgkqiLSoNNqooKIiAiKGGRUMKApm8TOgMqCfGPAzYRjjMINjQJ1xjDAEA2BGwQEU0XFUoEFSg0iLIEloBUVFQeD9/tjrjMXhnO5q6Kpdp7l/11XX2Xvt9Ow63afqrbX3qlQVkiRJkqTbt9X6DiBJkiRJ6p/FoSRJkiTJ4lCSJEmSZHEoSZIkScLiUJIkSZKExaEkSZIkCYtDSZJ6keT+SU5P8ockr+3h+BcmecIYj/fJJP+4jOWV5H7jyiNJuqV5fQeQJK16klwIbATcONC8eVVd1k+iifRm4HtVtVXfQW6tJC8F/qGqHrO8davqlaNPJEm6Lew5lCSNytOr6o4Dj5sVhklu7x9Q3htYvDJ25HMpSVoZLA4lSWPTLh3cO8n5wPmt7Wnt8srfJflRki0H1n9YktPapZdHJDk8yT+1ZS9N8sMZ9n+/Nn2HJB9M8qskV7TLGtduyx6X5JIkb0hyZZLLk7xsYD9rJ/lQkouS/D7JD1vb0UleM+2YZyZ55iznu0uSxe3cTkzywNZ+AvB44N+T/DHJ5jNse2KSf0lycpJrknwjyQZt2cJ2rnsk+RVwQpLVkryjZb4yyaFJ1hvY34vbst8mefu0Yx089bwOPj8D85sk+WqSpW37f2/n8kngke0cfjfTc7CMY7ypPe+XJXn5sraVJI2HxaEkadyeATwC2CLJw4CDgFcAdwU+BRzVCrs1ga8DhwEbAF8Cnr0Cx9kf2BzYCrgfsDHwzoHldwfWa+17AB9Pcpe27IPANsCj2rHfDNwEHAK8aGoHSR7atj96+sFbwfdFYB9gPnAM8M0ka1bVDsB/A69uvao/n+UcXgK8HLgHcAPwsWnLHws8EHgy8NL2eDxwH+COwL+3LFsAnwBeDNyT7rleMMsxp5/H6sC3gIuAhe18D6+qc4FXAj9u57D+MPtr+9wJeCPwRGAzYGz3PkqSZmdxKEkala+3HrPfJfn6QPu/VNVVVfVnYC/gU1V1UlXdWFWHANcB27fHGsBHquqvVfVl4JRhDpwkbd+vb8f6A/DPwG4Dq/0VeG/b9zHAH4H7J1mNriB7XVVd2nL9qKquA44CNk+yWdvHi4Ejqur6GWI8Hzi6qo6vqr/SFZxr0xWcwzqsqs6uqj8B/wg8rxVrU95dVX9qz+ULgQ9X1QVV9UfgrcBu7ZLT5wDfqqoftPP4R7pidxjb0RWUb2rH+ktV/XB5Gy3H84D/HDi3d9/G/UmSVgLvUZAkjcozquo7M7RfPDB9b2D3aZdqrklXjBRwaVXVwLKLhjz2fGAd4NSuTgQgwGBh9duqumFg/lq63rYNgbWAX0zfaVX9JckRwIuSvAd4AV3hNZN7DuatqpuSXEzX8zaswefqIrpiecNZlt/seG16Ht3AQPccXLeq/pTkt0Nm2AS4aNpzdVvdEzh1YH7Y36skaYTsOZQkjdtgsXcxsF9VrT/wWKeqvghcDmycgeoOuNfA9J/oCkAAktx9YNlvgD8DDxrY73pVdcch8v0G+Atw31mWH0LXS7cjcG1V/XiW9S6jK36n8oWu0Lp0iAxTNhmYvhddb+dvBtoGn8ubHa+tfwNwBd1z+b/7SrIO3aWlU272XNJdcjvlYuBeswx6UzO0DeNmebj571WS1BOLQ0lSnz4DvDLJI9JZN8nOSe4E/JiuuHltkjWSPIvuEscpZwAPSrJVkrUYuDSxqm5q+z4gyd0Akmyc5MnLC9S2PQj4cJJ7Jlk9ySOT3KEt/zHdJZkforsfcjZHAjsn2THJGsAb6C6Z/dFQz0znRUm2aMXce4EvV9WNs6z7ReD1STZNcke6y2iPaD1+XwaeluQx7V7O93Lz9wCnA09NskErsvcZWHYyXTG3f/v9rJXk0W3ZFcCCts8VcSTw0oFze9cKbi9JGgGLQ0lSb6pqEbAn3cApVwNL6AZVod3H96w2fxXdPXxfHdj253RFznfoRj6dfh/cW9r+fpLkmrbe/YeM9kbgLLp7HK8C3s/NXzMPBR4CfG4Z53Ye3eA1/0bX2/d0uq/3mOn+xNkcBhwM/JruUtfXLmPdg9r6PwB+Sdf7+ZqWZTGwN/AFukLvauCSgW0Poyu2LwSOA44YOI8bW/b7Ab9q2z2/LT6B7us4fp1ksEdzmarq28BH2vZL2k9JUs9y81s5JEmaXEkOBi6pqnf0nOMlwF7DfPn7bTjGicDnquqzozqGJEmD7DmUJGkFtMsgXwV8uu8skiStTBaHkiQNqd2zuJTuXrsv9Bxn4iRZnOSPMzxe2Hc2SdLyeVmpJEmSJMmeQ0mSJElS9+W4tysbbrhhLVy4sO8YkiRJktSLU0899TdVNX96++2uOFy4cCGLFi3qO4YkSZIk9SLJRTO1e1mpJEmSJMniUJIkSZJkcShJkiRJwuJQkiRJkoTFoSRJkiQJi0NJkiRJEhaHkiRJkiQsDiVJkiRJWBxKkiRJkoB5fQdQZ+G+R/dy3Av337mX40qSJEmaLPYcSpIkSZIsDiVJkiRJFoeSJEmSJCwOJUmSJElYHEqSJEmSsDiUJEmSJGFxKEmSJEnC4lCSJEmShMWhJEmSJAmLQ0mSJEkSFoeSJEmSJCwOJUmSJElYHEqSJEmSsDiUJEmSJDGG4jDJ6kl+muRbbX7TJCclWZLkiCRrtvY7tPklbfnCgX28tbWfl+TJA+07tbYlSfYd9blIkiRJ0qpqHD2HrwPOHZh/P3BAVd0PuBrYo7XvAVzd2g9o65FkC2A34EHATsB/tIJzdeDjwFOALYAXtHUlSZIkSStopMVhkgXAzsBn23yAHYAvt1UOAZ7Rpndt87TlO7b1dwUOr6rrquqXwBJgu/ZYUlUXVNX1wOFtXUmSJEnSChp1z+FHgDcDN7X5uwK/q6ob2vwlwMZtemPgYoC2/Pdt/f9tn7bNbO23kGSvJIuSLFq6dOltPSdJkiRJWuWMrDhM8jTgyqo6dVTHGFZVfbqqtq2qbefPn993HEmSJEmaOPNGuO9HA7skeSqwFnBn4KPA+knmtd7BBcClbf1LgU2AS5LMA9YDfjvQPmVwm9naJUmSJEkrYGQ9h1X11qpaUFUL6QaUOaGqXgh8D3hOW2134Btt+qg2T1t+QlVVa9+tjWa6KbAZcDJwCrBZG/10zXaMo0Z1PpIkSZK0Khtlz+Fs3gIcnuSfgJ8CB7b2A4HDkiwBrqIr9qiqxUmOBM4BbgD2rqobAZK8GjgWWB04qKoWj/VMJEmSJGkVMZbisKpOBE5s0xfQjTQ6fZ2/AM+dZfv9gP1maD8GOGYlRpUkSZKk26VxfM+hJEmSJGnCWRxKkiRJkiwOJUmSJEkWh5IkSZIkLA4lSZIkSVgcSpIkSZKwOJQkSZIkYXEoSZIkScLiUJIkSZKExaEkSZIkCYtDSZIkSRIWh5IkSZIkLA4lSZIkSVgcSpIkSZKwOJQkSZIkYXEoSZIkScLiUJIkSZKExaEkSZIkCYtDSZIkSRIWh5IkSZIkLA4lSZIkSVgcSpIkSZKwOJQkSZIkYXEoSZIkScLiUJIkSZKExaEkSZIkCYtDSZIkSRIWh5IkSZIkLA4lSZIkSVgcSpIkSZKwOJQkSZIkYXEoSZIkScLiUJIkSZKExaEkSZIkCYtDSZIkSRIWh5IkSZIkLA4lSZIkSVgcSpIkSZIYYXGYZK0kJyc5I8niJO9p7ZsmOSnJkiRHJFmztd+hzS9pyxcO7Outrf28JE8eaN+ptS1Jsu+ozkWSJEmSVnWj7Dm8Dtihqh4KbAXslGR74P3AAVV1P+BqYI+2/h7A1a39gLYeSbYAdgMeBOwE/EeS1ZOsDnwceAqwBfCCtq4kSZIkaQWNrDiszh/b7BrtUcAOwJdb+yHAM9r0rm2etnzHJGnth1fVdVX1S2AJsF17LKmqC6rqeuDwtq4kSZIkaQWN9J7D1sN3OnAlcDzwC+B3VXVDW+USYOM2vTFwMUBb/nvgroPt07aZrX2mHHslWZRk0dKlS1fGqUmSJEnSKmWkxWFV3VhVWwEL6Hr6HjDK4y0jx6eratuq2nb+/Pl9RJAkSZKkiTaW0Uqr6nfA94BHAusnmdcWLQAubdOXApsAtOXrAb8dbJ+2zWztkiRJkqQVNMrRSucnWb9Nrw08ETiXrkh8Tlttd+AbbfqoNk9bfkJVVWvfrY1muimwGXAycAqwWRv9dE26QWuOGtX5SJIkSdKqbN7yV7nV7gEc0kYVXQ04sqq+leQc4PAk/wT8FDiwrX8gcFiSJcBVdMUeVbU4yZHAOcANwN5VdSNAklcDxwKrAwdV1eIRno8kSZIkrbJGVhxW1ZnAw2Zov4Du/sPp7X8BnjvLvvYD9puh/RjgmNscVpIkSZJu58Zyz6EkSZIkabJZHEqSJEmSLA4lSZIkSRaHkiRJkiSGLA6TPGTUQSRJkiRJ/Rm25/A/kpyc5FVJ1htpIkmSJEnS2A1VHFbV3wEvBDYBTk3yhSRPHGkySZIkSdLYDH3PYVWdD7wDeAvwWOBjSX6W5FmjCidJkiRJGo9h7zncMskBwLnADsDTq+qBbfqAEeaTJEmSJI3BvCHX+zfgs8DbqurPU41VdVmSd4wkmSRJkiRpbIYtDncG/lxVNwIkWQ1Yq6qurarDRpZOkiRJkjQWwxaH3wGeAPyxza8DHAc8ahShNDkW7nt0b8e+cP+dezu2JEmSdHsz7IA0a1XVVGFIm15nNJEkSZIkSeM2bHH4pyRbT80k2Qb48zLWlyRJkiTNIcNeVroP8KUklwEB7g48f2SpJEmSJEljNVRxWFWnJHkAcP/WdF5V/XV0sSRJkiRJ4zRszyHAw4GFbZutk1BVh44klSRJkiRprIYqDpMcBtwXOB24sTUXYHEoSZIkSauAYXsOtwW2qKoaZRhJkiRJUj+GHa30bLpBaCRJkiRJq6Bhew43BM5JcjJw3VRjVe0yklSSJEmSpLEatjh89yhDSJIkSZL6NexXWXw/yb2BzarqO0nWAVYfbTRJkiRJ0rgMdc9hkj2BLwOfak0bA18fVShJkiRJ0ngNOyDN3sCjgWsAqup84G6jCiVJkiRJGq9hi8Prqur6qZkk8+i+51CSJEmStAoYtjj8fpK3AWsneSLwJeCbo4slSZIkSRqnYYvDfYGlwFnAK4BjgHeMKpQkSZIkabyGHa30JuAz7SFJkiRJWsUMVRwm+SUz3GNYVfdZ6YkkSZIkSWM3VHEIbDswvRbwXGCDlR9HkiRJktSHoe45rKrfDjwuraqPADuPOJskSZIkaUyGvax064HZ1eh6EoftdZQkSZIkTbhhC7wPDUzfAFwIPG+lp5EkSZIk9WLY0UofP+ogkiRJkqT+DHtZ6f9b1vKq+vDKiSNJkiRJ6sOKjFb6cOCoNv904GTg/FGEkiRJkiSN17DF4QJg66r6A0CSdwNHV9WLRhVMkiRJkjQ+Q32VBbARcP3A/PWtbVZJNknyvSTnJFmc5HWtfYMkxyc5v/28S2tPko8lWZLkzMERUpPs3tY/P8nuA+3bJDmrbfOxJBn2xCVJkiRJfzNscXgocHKSd7dew5OAQ5azzQ3AG6pqC2B7YO8kWwD7At+tqs2A77Z5gKcAm7XHXsAnoCsmgXcBjwC2A941VVC2dfYc2G6nIc9HkiRJkjRgqOKwqvYDXgZc3R4vq6p/Xs42l1fVaW36D8C5wMbArvytsDwEeEab3hU4tDo/AdZPcg/gycDxVXVVVV0NHA/s1Jbduap+UlVFV8BO7UuSJEmStAKG7TkEWAe4pqo+ClySZNNhN0yyEHgYXY/jRlV1eVv0a/52eerGwMUDm13S2pbVfskM7TMdf68ki5IsWrp06bCxJUmSJOl2Y6jiMMm7gLcAb21NawCfG3LbOwJfAfapqmsGl7Uevxo67a1UVZ+uqm2ratv58+eP+nCSJEmSNOcM23P4TGAX4E8AVXUZcKflbZRkDbrC8PNV9dXWfEW7JJT288rWfimwycDmC1rbstoXzNAuSZIkSVpBwxaH1w/28iVZd3kbtJFDDwTOraoPDyw6CpgacXR34BsD7S9po5ZuD/y+XX56LPCkJHdpA9E8CTi2LbsmyfbtWC8Z2JckSZIkaQUM+z2HRyb5FN0gMXsCLwc+s5xtHg28GDgryemt7W3A/m1/ewAXAc9ry44BngosAa6lGwCHqroqyfuAU9p6762qq9r0q4CDgbWBb7eHJEmSJGkFLbc4bL1yRwAPAK4B7g+8s6qOX9Z2VfVDYLbvHdxxhvUL2HuWfR0EHDRD+yLgwcvKIUmSJElavuUWh1VVSY6pqofQfY2EJEmSJGkVM+w9h6clefhIk0iSJEmSejPsPYePAF6U5EK6EUtD16m45aiCSZIkSZLGZ5nFYZJ7VdWvgCePKY8kSZIkqQfL6zn8OrB1VV2U5CtV9exxhJIkSZIkjdfy7jkcHG30PqMMIkmSJEnqz/J6DmuWaalXC/c9urdjX7j/zr0dW5IkSRqV5RWHD01yDV0P4tptGv42IM2dR5pOkiRJkjQWyywOq2r1cQWRJEmSJPVn2O85lCRJkiStwiwOJUmSJEkWh5IkSZIki0NJkiRJEhaHkiRJkiQsDiVJkiRJWBxKkiRJkrA4lCRJkiRhcShJkiRJwuJQkiRJkoTFoSRJkiQJi0NJkiRJEhaHkiRJkiQsDiVJkiRJWBxKkiRJkrA4lCRJkiQB8/oOIK1qFu57dG/HvnD/nXs7tiRJkuY2ew4lSZIkSRaHkiRJkiSLQ0mSJEkSFoeSJEmSJCwOJUmSJElYHEqSJEmSsDiUJEmSJGFxKEmSJEnC4lCSJEmShMWhJEmSJAmLQ0mSJEkSFoeSJEmSJEZYHCY5KMmVSc4eaNsgyfFJzm8/79Lak+RjSZYkOTPJ1gPb7N7WPz/J7gPt2yQ5q23zsSQZ1blIkiRJ0qpulD2HBwM7TWvbF/huVW0GfLfNAzwF2Kw99gI+AV0xCbwLeASwHfCuqYKyrbPnwHbTjyVJkiRJGtK8Ue24qn6QZOG05l2Bx7XpQ4ATgbe09kOrqoCfJFk/yT3ausdX1VUASY4HdkpyInDnqvpJaz8UeAbw7VGdjzTXLdz36N6OfeH+O/d2bEmSJA1n3PccblRVl7fpXwMbtemNgYsH1ruktS2r/ZIZ2iVJkiRJt0JvA9K0XsIax7GS7JVkUZJFS5cuHcchJUmSJGlOGXdxeEW7XJT288rWfimwycB6C1rbstoXzNA+o6r6dFVtW1Xbzp8//zafhCRJkiStasZdHB4FTI04ujvwjYH2l7RRS7cHft8uPz0WeFKSu7SBaJ4EHNuWXZNk+zZK6UsG9iVJkiRJWkEjG5AmyRfpBpTZMMkldKOO7g8cmWQP4CLgeW31Y4CnAkuAa4GXAVTVVUneB5zS1nvv1OA0wKvoRkRdm24gGgejkSRJkqRbaZSjlb5glkU7zrBuAXvPsp+DgINmaF8EPPi2ZJQkSZIkdXobkEaSJEmSNDksDiVJkiRJFoeSJEmSpBHecyhJw1q479G9HfvC/Xfu7diSJEmTxJ5DSZIkSZLFoSRJkiTJ4lCSJEmShMWhJEmSJAkHpJGkWTlQjiRJuj2x51CSJEmSZHEoSZIkSbI4lCRJkiRhcShJkiRJwuJQkiRJkoSjlUrSnNTXSKqOoipJ0qrLnkNJkiRJksWhJEmSJMniUJIkSZKE9xxKklaivu6FBO+HlCTptrLnUJIkSZJkcShJkiRJ8rJSSdLtgJe7SpK0fPYcSpIkSZIsDiVJkiRJXlYqSVKvvORVkjQp7DmUJEmSJNlzKEmSbskeTUm6/bHnUJIkSZJkz6EkSZpb7NWUpNGwOJQkSVoJLFolzXUWh5IkSau4vgpXi1ZpbrE4lCRJUi/sbZUmi8WhJEmSNGCSi9ZJzqa5z9FKJUmSJEn2HEqSJEm6bSa5R3OSs00aew4lSZIkSRaHkiRJkiSLQ0mSJEkSFoeSJEmSJFaB4jDJTknOS7Ikyb5955EkSZKkuWhOF4dJVgc+DjwF2AJ4QZIt+k0lSZIkSXPPnC4Oge2AJVV1QVVdDxwO7NpzJkmSJEmac1JVfWe41ZI8B9ipqv6hzb8YeERVvXraensBe7XZ+wPnjTXo6G0I/KbvEDOY1FwwudkmNRdMbrZJzQWTm81cK25Ss01qLpjcbJOaCyY3m7lW3KRmm9RcMLnZJjXXbXXvqpo/vXFeH0nGrao+DXy67xyjkmRRVW3bd47pJjUXTG62Sc0Fk5ttUnPB5GYz14qb1GyTmgsmN9uk5oLJzWauFTep2SY1F0xutknNNSpz/bLSS4FNBuYXtDZJkiRJ0gqY68XhKcBmSTZNsiawG3BUz5kkSZIkac6Z05eVVtUNSV4NHAusDhxUVYt7jtWHSb1kdlJzweRmm9RcMLnZJjUXTG42c624Sc02qblgcrNNai6Y3GzmWnGTmm1Sc8HkZpvUXCMxpwekkSRJkiStHHP9slJJkiRJ0kpgcShJkiRJsjicy5IclOTKJGf3nWVQkk2SfC/JOUkWJ3ld35kAkqyV5OQkZ7Rc7+k703RJVk/y0yTf6jvLlCQXJjkryelJFvWdZ1CS9ZN8OcnPkpyb5JETkOn+7bmaelyTZJ++c01J8vr27//sJF9MslbfmQCSvK5lWtz38zXT39YkGyQ5Psn57eddJiTXc9tzdlOSXoZanyXXB9r/yzOTfC3J+hOU7X0t1+lJjktyz0nINbDsDUkqyYbjzjVbtiTvTnLpwN+1p05Crtb+mvZvbXGSfx13rtmyJTli4Pm6MMnpE5JrqyQ/mXpNT7LdhOR6aJIft/cb30xy53HnajlmfA87Ca8B42JxOLcdDOzUd4gZ3AC8oaq2ALYH9k6yRc+ZAK4DdqiqhwJbATsl2b7nTNO9Dji37xAzeHxVbTWB3/PzUeC/quoBwEOZgOeuqs5rz9VWwDbAtcDXeo4FQJKNgdcC21bVg+kG8tqt31SQ5MHAnsB2dL/HpyW5X4+RDuaWf1v3Bb5bVZsB323z43Ywt8x1NvAs4AdjT/M3B3PLXMcDD66qLYGfA28dd6jmYG6Z7QNVtWX7P/ot4J1jTzXL63eSTYAnAb8ad6ABBzPze4sDpv62VdUxY84EM+RK8nhgV+ChVfUg4IM95IIZslXV8wdeC74CfHUScgH/Cryn5Xpnmx+3g7llrs8C+1bVQ+heM9807lDNbO9hJ+E1YCwsDuewqvoBcFXfOaarqsur6rQ2/Qe6N+wb95sKqvPHNrtGe0zMiExJFgA70/2B1HIkWQ/4e+BAgKq6vqp+12+qW9gR+EVVXdR3kAHzgLWTzAPWAS7rOQ/AA4GTquraqroB+D5dwdOLWf627goc0qYPAZ4x1lDMnKuqzq2q88adZVqGmXId136XAD+h+x7isZsl2zUDs+vSw+vAMl6/DwDeTI+vTRP83mKmXP8X2L+qrmvrXDn2YCz7OUsS4HnAF8caillzFTDVK7cePbwGzJJrc/72IdfxwLPHGqpZxnvY3l8DxsXiUCOVZCHwMOCkfpN02mWbpwNXAsdX1UTkaj5C96bgpr6DTFPAcUlOTbJX32EGbAosBf4z3aW4n02ybt+hptmNHt4QzKaqLqX7ZP1XwOXA76vquH5TAV3v198luWuSdYCnApv0nGm6jarq8jb9a2CjPsPMMS8Hvt13iEFJ9ktyMfBC+uk5vIUkuwKXVtUZfWeZxavb5bgHTdAldZvT/e04Kcn3kzy870Az+Dvgiqo6v+8gzT7AB9q//w/SX6/+dIvpCjCA5zIBrwHT3sPebl4DLA41MknuSHcpxT7TPqntTVXd2C6lWABs1y5n612SpwFXVtWpfWeZwWOqamvgKXSXV/x934GaecDWwCeq6mHAn5igyzySrAnsAnyp7yxT2hu6XekK63sC6yZ5Ub+put4v4P3AccB/AacDN/Yaahmq+w6oibnqYJIleTvdZVqf7zvLoKp6e1VtQpfr1X3naR+KvI0JKVRn8AngvnS3ZFwOfKjfOP9rHrAB3eV/bwKObD11k+QFTNCHhHS9ra9v//5fT7v6ZgK8HHhVklOBOwHX9xlmWe9hV/XXAItDjUSSNej+U32+qvq4zn6Z2uWH32Ny7tl8NLBLkguBw4Edknyu30id1ts0dbnO1+juC5sElwCXDPT+fpmuWJwUTwFOq6or+g4y4AnAL6tqaVX9le4emEf1nAmAqjqwqrapqr8Hrqa7T22SXJHkHgDtZy+Xr80lSV4KPA14YU3ulyp/np4uX5vmvnQf2pzRXgcWAKcluXuvqZqquqJ9uHoT8Bkm63Xgq+22kZPprrzpZSCfmbTL958FHNF3lgG787f7H7/EhPwuq+pnVfWkqtqGrpj+RV9ZZnkPe7t5DbA41ErXPrU7EDi3qj7cd54pSeZPjZiXZG3gicDP+k3Vqaq3VtWCqlpIdyniCVXVe49OknWT3Glqmm6ghIkYHbeqfg1cnOT+rWlH4JweI003aZ8WQ3c56fZJ1mn/T3dkAgbxAUhyt/bzXnRvpr7Qb6JbOIruTRXt5zd6zDLxkuxEd5n8LlV1bd95BiXZbGB2VybgdaCqzqqqu1XVwvY6cAmwdfs717upN8XNM5mQ1wHg68DjAZJsDqwJ/KbXRDf3BOBnVXVJ30EGXAY8tk3vAEzE5a4DrwGrAe8APtlTjtnew95+XgOqysccfdC98bwc+CvdC8kefWdquR5D191+Jt3lYacDT52AXFsCP225zgbe2XemWXI+DvhW3zlalvsAZ7THYuDtfWealm8rYFH7nX4duEvfmVqudYHfAuv1nWWGbO+hezN8NnAYcIe+M7Vc/01X3J8B7Nhzllv8bQXuSjdC3fnAd4ANJiTXM9v0dcAVwLETkmsJcPHAa8AnJ+h3+ZX27/9M4JvAxpOQa9ryC4ENJ+g5Oww4qz1nRwH3mJBcawKfa7/P0+hGJJ+I56y1Hwy8smKKr6AAAAQ4SURBVI9My3jOHgOc2v7WngRsMyG5Xkd3xcjPgf2B9PSczfgedhJeA8b1SHsiJEmSJEm3Y15WKkmSJEmyOJQkSZIkWRxKkiRJkrA4lCRJkiRhcShJkiRJwuJQkjRHJakkHxqYf2OSd6+kfR+c5DkrY19tf89Ncm6S762sfUqStLJZHEqS5qrrgGcl2bDvIIOSzJuheQ9gz6p6/BDrSpLUC4tDSdJcdQPwaeD10xdM7/lL8sf283FJvp/kG0kuSLJ/khcmOTnJWUnuO7CbJyRZlOTnSZ7Wtl89yQeSnJLkzCSvGNjvfyc5CjhnWpZ30n2x8oFt25cmOSrJCcB3k6yb5KCW4adJdm3brZ3k8Nbj+LUkJyXZdvB82vRzkhzcpucn+UrLd0qSR7f2d7djnNjO+7UD27+kncsZSQ5Lcqckv0yyRlt+58F5SdKqy08sJUlz2ceBM5P86wps81DggcBVwAXAZ6tquySvA14D7NPWWwhsB9wX+F6S+wEvAX5fVQ9Pcgfgf5Ic19bfGnhwVf1y8GBV9d4kOwBvrKpFSV7a1t2yqq5K8s/ACVX18iTrAycn+Q7wCuDaqnpgki2B04Y4t48CB1TVD5PcCzi2nSvAA4DHA3cCzkvyCWBz4B3Ao6rqN0k2qKo/JDkR2Bn4OrAb8NWq+usQx5ckzWEWh5KkOauqrklyKPBa4M9DbnZKVV0OkOQXwFRxdxZd8TTlyKq6CTg/yQV0xdWTgC0HeiXXAzYDrgdOnl4YLsPxVXVVm34SsEuSN7b5tYB7AX8PfKyd55lJzhxiv08AtkgyNX/nJHds00dX1XXAdUmuBDYCdgC+VFW/aceZyvRZ4M10xeHLgD2HPC9J0hxmcShJmus+Qter9p8DbTfQbp1Ishqw5sCy6wambxqYv4mbvy7WtOMUEOA1VXXs4IIkjwP+tAKZB9cN8OyqOm/aPpe1/WC2tQamVwO2r6q/zLCvwfO+kWW8B6iq/0mysJ3X6lV19rLCSJJWDd5zKEma01pv15F0g75MuRDYpk3vAtya++Wem2S1dh/ifYDz6C7T/L8D9+NtnmTdW5u9ORZ4TVoFl+Rhrf0HwP9pbQ8GthzY5ookD2yF7zMH2o+juzSWtt1Wyzn2CXTnede2/gYDyw4FvsDNi25J0irM4lCStCr4EDA4aulngMcmOQN4JCvWqzflV8DJwLeBV7beuM/SDThzWpKzgU9x26/CeR9d8XpmksVtHuATwB2TnAu8Fzh1YJt9gW8BPwIuH2h/LbBtG2DmHOCVyzpwVS0G9gO+356rDw8s/jxwF+CLt/bEJElzS6qmXzUjSZImTRsk5o1VtWhMx3sOsGtVvXgcx5Mk9c97DiVJ0s0k+TfgKcBT+84iSRofew4lSZIkSd5zKEmSJEmyOJQkSZIkYXEoSZIkScLiUJIkSZKExaEkSZIkCfj/orSt3CJmAcUAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 295 }, "id": "zlYjAUzq5fD9", "outputId": "765f7fa1-5560-4f40-bbaf-ad0854c08020" }, "source": [ "plot_sparse(df, 'user_id')" ], "execution_count": 10, "outputs": [ { "output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3gAAAEWCAYAAAA0DzVNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dd5glZZ328e9NkiiGGRPBMQCKShzRXROYFkHBLKxZFN0V05pw9WUV130x6yoGVHYUFQRUdhRcMGBYlTAgaUAEEWWAhSGsqChBfu8fVfNyaLpnTs/M6Tpd8/1cV19UrrtOM13nd56nnpOqQpIkSZI0+63VdQBJkiRJ0uphgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSdIIJdkmyVlJ/pDk9V3nmSjJC5OctJz1P0zyypnMJElaeet0HUCSNHsluRS4N/DXgcVbV9UV3SQaS28DTq6qHboOMpmq+grwla5zSJJWD1vwJEmr6hlVtfHAzx2KuyRr+oeJ9wcWdxnA34EkrTks8CRJq12SSvLaJBcBF7XLnt52VfzfJD9Lst3A9jsmObPtxvi1JEcl+dd23cuS/Pckx39wO32XJB9K8rskVyX5TJIN2nW7JlmS5M1Jrk5yZZKXDxxngyQfTvLbJL9P8t/tsuOTvG7COc9J8qwprnevJIvba/thkoe2y38A7AZ8Mskfk2w9yb6XJnnywPy7k3y5nV4/yZeTXNse+/Qk927XbZrkC+01XZ7kX5OsPfCa/TTJR5NcC7x7Ob+rO7y+SZ6S5Jft6/FJIFPtK0kaPxZ4kqRReSbwKGDbJDsChwOvBu4JfBZY2BZn6wHHAUcA9wCOAZ4zjfMcAmwN7AA8GNgMOGhg/X2ATdvl+wGHJrl7u+5DwM7A37bnfhtwG/BF4EXLDpBk+3b/4yeevC3ajgTeCMwFTgC+lWS9qnoi8BPggLZ181fTuC6Al7bZt6B53V4D/LldtwC4tb3mHYGnAoPPyj0KuISmC+37hjlZkjnAN4B3AXOAXwOPmWZmSVKHZmWBl+Tw9pPY84bc/vlJzm8/Xf3qqPNJ0hrmuLZ16X+THDew/P9W1XVV9Wdgf+CzVXVqVf21qr4I3AQ8uv1ZF/hYVd1SVccCpw9z4iRpj/2m9lx/AP4N2Gdgs1uAg9tjnwD8EdgmyVrAK4A3VNXlba6fVdVNwEJg6yRbtcd4MfC1qrp5khgvAI6vqu9W1S00ReMGNEXjqrqFprB7cJvvjKq6oW3F2wN4Y1X9qaquBj464bqvqKpPVNWt7e9gGHsAi6vq2PZaPgb8z2q4DknSDJmtffIXAJ8EvrSiDdub8zuAx1TV9UnuNeJskrSmeWZVfW+S5ZcNTN8feOmEbo/rAfcDCri8qmpg3W+HPPdcYEPgjKbWA5ouhWsPbHNtVd06MH8jsDFNC9X6NK1Ud1BVf0nyNeBFSd4D7As8d4oM9xvMW1W3JbmMpsVvVR1B03p3VJK7AV8G3knzeq4LXDlw3Wtxx9d8cHpY9xvcr6qqvRZJ0iwxK1vwqurHwHWDy5I8KMl/JTkjyU+SPKRd9Srg0Kq6vt336hmOK0lrqsGC7TLgfVV1t4GfDavqSOBKYLMMVCrAlgPTf6Ip4gBIcp+BddfQdFl82MBxN62qjYfIdw3wF+BBU6z/IvBC4EnAjVX18ym2u4Km4FqWLzRF2eVDZIAJ10fTpRSAttXxPVW1LU2L4NOBl9C8njcBcwau+65V9bCB4wy+/sO6ss0+8VokSbPErCzwpnAY8Lqq2hl4C/CpdvnWNN1sfprklCS7d5ZQktZcnwNek+RRaWyUZM8kmwA/p3mW7PVJ1k3ybGCXgX3PBh6WZIck6zMwYEhV3dYe+6PLemgk2SzJ360oULvv4cBHktwvydpJ/ibJXdr1P6d5Hu/DNC1pUzka2DPJk5KsC7yZpvj62VCvDJwF7NNe+3wGWgqT7JbkEe3gKTfQdNm8raquBE4CPpzkrknWaj/ofMKQ55zK8TSv9bPTjLz5egYKTknS+OtFgZdkY5pPNo9JchbNw/v3bVevA2wF7ErTxeZzbTcXSdIMqapFND0qPglcD1wMvKxddzPw7Hb+Oppn2r4xsO+vgIOB79GMyHmHETWBt7fHOyXJDe122wwZ7S3AuTTP/F0HvJ873hu/BDyCpmvkVNd2Ic2ALJ+gaRV8Bs1XR0z2vN5k/g9NK+L1wHuAwWfF7wMcS1PcXQD8iNuLzZfQdHM9v933WG6/962UqroGeB7NwDXX0tw/f7oqx5Qkzazc8ZGH2SPJPODbVfXwJHcFLqyqO93YknwGOLWq/qOd/z5wYFUN9QC/JGnmJVkALKmqd3Wc4yXA/lX12C5zSJI0rF604FXVDcBvkjwPmmcG2iGtoRl6e9d2+RyaLpuXdJFTkjR7JNkQ+EeaRwAkSZoVZmWBl+RImmc2tknzBbb70TwIv1+Ss4HFwN7t5icC1yY5HzgZeGtVXdtFbknS7NA+w7cUuIo7dpmcldJ8+fsfJ/n5TNfZJEmr16ztoilJkiRJuqNZ2YInSZIkSbqzWfdF53PmzKl58+Z1HUOSJEmSOnHGGWdcU1VzJ1s36wq8efPmsWjRoq5jSJIkSVInkvx2qnV20ZQkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSesMCTJEmSpJ5Yp+sAfTHvwOM7O/elh+zZ2bklSZIkjQ9b8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknRlbgJTk8ydVJzpti/QuTnJPk3CQ/S7L9qLJIkiRJ0ppglC14C4Ddl7P+N8ATquoRwHuBw0aYRZIkSZJ6b51RHbiqfpxk3nLW/2xg9hRg81FlkSRJkqQ1wbg8g7cf8J2pVibZP8miJIuWLl06g7EkSZIkafbovMBLshtNgff2qbapqsOqan5VzZ87d+7MhZMkSZKkWWRkXTSHkWQ74PPA06rq2i6zSJIkSdJs11kLXpItgW8AL66qX3WVQ5IkSZL6YmQteEmOBHYF5iRZAvwLsC5AVX0GOAi4J/CpJAC3VtX8UeWRJEmSpL4b5Sia+65g/SuBV47q/JIkSZK0pul8kBVJkiRJ0uphgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST1hgSdJkiRJPWGBJ0mSJEk9YYEnSZIkST0xsgIvyeFJrk5y3hTrk+Tfk1yc5JwkO40qiyRJkiStCUbZgrcA2H05658GbNX+7A98eoRZJEmSJKn3RlbgVdWPgeuWs8newJeqcQpwtyT3HVUeSZIkSeq7Lp/B2wy4bGB+SbvsTpLsn2RRkkVLly6dkXCSJEmSNNvMikFWquqwqppfVfPnzp3bdRxJkiRJGktdFniXA1sMzG/eLpMkSZIkrYQuC7yFwEva0TQfDfy+qq7sMI8kSZIkzWrrjOrASY4EdgXmJFkC/AuwLkBVfQY4AdgDuBi4EXj5qLJIkiRJ0ppgZAVeVe27gvUFvHZU55ckSZKkNc2sGGRFkiRJkrRiFniSJEmS1BMWeJIkSZLUExZ4kiRJktQTFniSJEmS1BMWeJIkSZLUExZ4kiRJktQTFniSJEmS1BMWeJIkSZLUExZ4kiRJktQTFniSJEmS1BMWeJIkSZLUExZ4kiRJktQTFniSJEmS1BMWeJIkSZLUExZ4kiRJktQTFniSJEmS1BMWeJIkSZLUExZ4kiRJktQTFniSJEmS1BNDFXhJHjHqIJIkSZKkVTNsC96nkpyW5B+TbDrSRJIkSZKklTJUgVdVjwNeCGwBnJHkq0meMtJkkiRJkqRpGfoZvKq6CHgX8HbgCcC/J/llkmdPtU+S3ZNcmOTiJAdOsn7LJCcn+UWSc5LssTIXIUmSJEka/hm87ZJ8FLgAeCLwjKp6aDv90Sn2WRs4FHgasC2wb5JtJ2z2LuDoqtoR2Af41EpdhSRJkiRp6Ba8TwBnAttX1Wur6kyAqrqCpkibzC7AxVV1SVXdDBwF7D1hmwLu2k5vClwxnfCSJEmSpNutM+R2ewJ/rqq/AiRZC1i/qm6sqiOm2Gcz4LKB+SXAoyZs827gpCSvAzYCnjzZgZLsD+wPsOWWWw4ZWZIkSZLWLMO24H0P2GBgfsN22araF1hQVZsDewBHtMXjHVTVYVU1v6rmz507dzWcVpIkSZL6Z9gCb/2q+uOymXZ6wxXscznNqJvLbN4uG7QfcHR7zJ8D6wNzhswkSZIkSRowbIH3pyQ7LZtJsjPw5xXsczqwVZIHJFmPZhCVhRO2+R3wpPaYD6Up8JYOmUmSJEmSNGDYZ/DeCByT5AogwH2AFyxvh6q6NckBwInA2sDhVbU4ycHAoqpaCLwZ+FySN9EMuPKyqqqVvBZJkiRJWqMNVeBV1elJHgJs0y66sKpuGWK/E4ATJiw7aGD6fOAxw8eVJEmSJE1l2BY8gEcC89p9dkpCVX1pJKkkSZIkSdM2VIGX5AjgQcBZwF/bxQVY4EmSJEnSmBi2BW8+sK3Px0mSJEnS+Bp2FM3zaAZWkSRJkiSNqWFb8OYA5yc5Dbhp2cKq2mskqSRJkiRJ0zZsgffuUYaQJEmSJK26Yb8m4UdJ7g9sVVXfS7IhzXfbSZIkSZLGxFDP4CV5FXAs8Nl20WbAcaMKJUmSJEmavmEHWXktzReS3wBQVRcB9xpVKEmSJEnS9A1b4N1UVTcvm0myDs334EmSJEmSxsSwBd6PkvwzsEGSpwDHAN8aXSxJkiRJ0nQNW+AdCCwFzgVeDZwAvGtUoSRJkiRJ0zfsKJq3AZ9rfyRJkiRJY2ioAi/Jb5jkmbuqeuBqTyRJkiRJWinDftH5/IHp9YHnAfdY/XEkSZIkSStrqGfwquragZ/Lq+pjwJ4jziZJkiRJmoZhu2juNDC7Fk2L3rCtf5IkSZKkGTBskfbhgelbgUuB56/2NBqJeQce38l5Lz3ERl5JkiRpJg07iuZuow4iSZIkSVo1w3bR/Kflra+qj6yeOJIkSZKklTWdUTQfCSxs558BnAZcNIpQkiRJkqTpG7bA2xzYqar+AJDk3cDxVfWiUQWTJEmSJE3PUF+TANwbuHlg/uZ22XIl2T3JhUkuTnLgFNs8P8n5SRYn+eqQeSRJkiRJEwzbgvcl4LQk32znnwl8cXk7JFkbOBR4CrAEOD3Jwqo6f2CbrYB3AI+pquuT3Gu6FyBJkiRJagw7iub7knwHeFy76OVV9YsV7LYLcHFVXQKQ5Chgb+D8gW1eBRxaVde357l6OuElSZIkSbcbtosmwIbADVX1cWBJkgesYPvNgMsG5pe0ywZtDWyd5KdJTkmy+2QHSrJ/kkVJFi1dunQakSVJkiRpzTFUgZfkX4C303SnBFgX+PJqOP86wFbArsC+wOeS3G3iRlV1WFXNr6r5c+fOXQ2nlSRJkqT+GbYF71nAXsCfAKrqCmCTFexzObDFwPzm7bJBS4CFVXVLVf0G+BVNwSdJkiRJmqZhC7ybq6qAAkiy0RD7nA5sleQBSdYD9uH279Fb5jia1juSzKHpsnnJkJkkSZIkSQOGLfCOTvJZ4G5JXgV8D/jc8naoqluBA4ATgQuAo6tqcZKDk+zVbnYicG2S84GTgbdW1bUrcyGSJEmStKZb4SiaSQJ8DXgIcAOwDXBQVX13RftW1QnACROWHTQwXcA/tT+SJEmSpFWwwgKvqirJCVX1CGCFRZ0kSZIkqRvDdtE8M8kjR5pEkiRJkrRKhvqic+BRwIuSXEozkmZoGve2G1UwSZIkSdL0LLfAS7JlVf0O+LsZyiNJkiRJWkkrasE7Dtipqn6b5OtV9ZyZCCVJkiRJmr4VPYOXgekHjjKIJEmSJGnVrKjAqymmJUmSJEljZkVdNLdPcgNNS94G7TTcPsjKXUeaTpIkSZI0tOUWeFW19kwFkSRJkiStmmG/B0+SJEmSNOYs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJ0Za4CXZPcmFSS5OcuBytntOkkoyf5R5JEmSJKnPRlbgJVkbOBR4GrAtsG+SbSfZbhPgDcCpo8oiSZIkSWuCUbbg7QJcXFWXVNXNwFHA3pNs917g/cBfRphFkiRJknpvlAXeZsBlA/NL2mX/X5KdgC2q6vjlHSjJ/kkWJVm0dOnS1Z9UkiRJknqgs0FWkqwFfAR484q2rarDqmp+Vc2fO3fu6MNJkiRJ0iy0zgiPfTmwxcD85u2yZTYBHg78MAnAfYCFSfaqqkUjzKUxMe/A5TbcjtSlh+zZ2bklSZKkURllC97pwFZJHpBkPWAfYOGylVX1+6qaU1XzqmoecApgcSdJkiRJK2lkBV5V3QocAJwIXAAcXVWLkxycZK9RnVeSJEmS1lSj7KJJVZ0AnDBh2UFTbLvrKLNIkiRJUt91NsiKJEmSJGn1ssCTJEmSpJ6wwJMkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSeWKfrANI4mnfg8Z2c99JD9uzkvJIkSeoHW/AkSZIkqScs8CRJkiSpJyzwJEmSJKknLPAkSZIkqScs8CRJkiSpJ0Za4CXZPcmFSS5OcuAk6/8pyflJzkny/ST3H2UeSZIkSeqzkRV4SdYGDgWeBmwL7Jtk2wmb/QKYX1XbAccCHxhVHkmSJEnqu1G24O0CXFxVl1TVzcBRwN6DG1TVyVV1Yzt7CrD5CPNIkiRJUq+NssDbDLhsYH5Ju2wq+wHfmWxFkv2TLEqyaOnSpasxoiRJkiT1x1gMspLkRcB84IOTra+qw6pqflXNnzt37syGkyRJkqRZYp0RHvtyYIuB+c3bZXeQ5MnAO4EnVNVNI8wjSZIkSb02ygLvdGCrJA+gKez2Af5+cIMkOwKfBXavqqtHmEXqhXkHHt/ZuS89ZM/Ozi1JkqThjKyLZlXdChwAnAhcABxdVYuTHJxkr3azDwIbA8ckOSvJwlHlkSRJkqS+G2ULHlV1AnDChGUHDUw/eZTnlyRJkqQ1yVgMsiJJkiRJWnUWeJIkSZLUExZ4kiRJktQTFniSJEmS1BMjHWRF0prBr2+QJEkaD7bgSZIkSVJPWOBJkiRJUk9Y4EmSJElST1jgSZIkSVJPOMiKpF5zABhJkrQmsQVPkiRJknrCAk+SJEmSesICT5IkSZJ6wmfwJKkDPhsoSZJGwRY8SZIkSeoJW/AkSXfQVeuiLYuSJK06W/AkSZIkqSdswZMkzQo+tyhJ0orZgidJkiRJPWELniRJq8jnFiVJ48ICT5KknrJbqySteSzwJEnSjLLwlKTRscCTJElqjWvxOa65JI2fkRZ4SXYHPg6sDXy+qg6ZsP4uwJeAnYFrgRdU1aWjzCRJkqTVZ1yfQbUo1ppqZAVekrWBQ4GnAEuA05MsrKrzBzbbD7i+qh6cZB/g/cALRpVJkiRJ6ppF8Z3Zgr36jPJrEnYBLq6qS6rqZuAoYO8J2+wNfLGdPhZ4UpKMMJMkSZIk9VaqajQHTp4L7F5Vr2znXww8qqoOGNjmvHabJe38r9ttrplwrP2B/dvZbYALRxK6W3OAa1a41cwz1/SNa7ZxzQXjm21cc8H4ZjPX9I1rtnHNBeObbVxzwfhmM9f0jWu2cc0F451tZd2/quZOtmJWDLJSVYcBh3WdY5SSLKqq+V3nmMhc0zeu2cY1F4xvtnHNBeObzVzTN67ZxjUXjG+2cc0F45vNXNM3rtnGNReMd7ZRGGUXzcuBLQbmN2+XTbpNknWATWkGW5EkSZIkTdMoC7zTga2SPCDJesA+wMIJ2ywEXtpOPxf4QY2qz6gkSZIk9dzIumhW1a1JDgBOpPmahMOranGSg4FFVbUQ+AJwRJKLgetoisA11bh2QTXX9I1rtnHNBeObbVxzwfhmM9f0jWu2cc0F45ttXHPB+GYz1/SNa7ZxzQXjnW21G9kgK5IkSZKkmTXKLpqSJEmSpBlkgSdJkiRJPWGB17Ekhye5uv1OwLGRZIskJyc5P8niJG/oOhNAkvWTnJbk7DbXe7rONCjJ2kl+keTbXWcZlOTSJOcmOSvJoq7zLJPkbkmOTfLLJBck+ZuuMwEk2aZ9rZb93JDkjV3nAkjypvb//fOSHJlk/a4zASR5Q5tpcdev1WR/V5PcI8l3k1zU/vfuY5Ttee3rdluSTobxniLXB9t/m+ck+WaSu41Rtve2uc5KclKS+41DroF1b05SSeaMQ64k705y+cDftD1mOtdU2drlr2v/X1uc5APjkCvJ1wZer0uTnDXTuZaTbYckpyy7pyfZZUxybZ/k5+37jW8luWsHuSZ9/zou94CZYoHXvQXA7l2HmMStwJuralvg0cBrk2zbcSaAm4AnVtX2wA7A7kke3XGmQW8ALug6xBR2q6odxux7YD4O/FdVPQTYnjF57arqwva12gHYGbgR+GbHsUiyGfB6YH5VPZxmAKvOB6dK8nDgVcAuNL/Hpyd5cIeRFnDnv6sHAt+vqq2A77fzXVjAnbOdBzwb+PGMp7ndAu6c67vAw6tqO+BXwDtmOlRrAXfO9sGq2q79N/pt4KAZTzXF/TvJFsBTgd/NdKDWAiZ/X/HRZX/XquqEGc60zAImZEuyG7A3sH1VPQz40DjkqqoXDNwHvg58o4NcMPnv8wPAe9psB7XzM20Bd871eeDAqnoEzT3zrTMdiqnfv47LPWBGWOB1rKp+TDOC6Fipqiur6sx2+g80b7w36zYVVOOP7ey67c9YjBSUZHNgT5o/cFqBJJsCj6cZTZequrmq/rfbVJN6EvDrqvpt10Fa6wAbpPnu0A2BKzrOA/BQ4NSqurGqbgV+RFOwdGKKv6t7A19sp78IPHNGQ7Umy1ZVF1TVhV3kGcgwWa6T2t8nwCk032c746bIdsPA7EZ0cB9Yzv37o8Db6OjeNK7vK2DKbP8AHFJVN7XbXD0muQBIEuD5wJEzGqo1RbYClrWObUoH94Epcm3N7R9UfRd4zoyGYrnvX8fiHjBTLPC0QknmATsCp3abpNF2gzwLuBr4blWNRS7gYzQ39du6DjKJAk5KckaS/bsO03oAsBT4jzTdWj+fZKOuQ01iHzq6sU9UVZfTfLr9O+BK4PdVdVK3qYCmBepxSe6ZZENgD2CLjjNNdO+qurKd/h/g3l2GmYVeAXyn6xCDkrwvyWXAC+mmBe9OkuwNXF5VZ3edZRIHtN1aDx+z7mlb0/z9ODXJj5I8sutAEzwOuKqqLuo6yIA3Ah9s////EN21rk+0mKaQAngeHd8HJrx/XaPuARZ4Wq4kG9N0TXjjhE9MO1NVf227JWwO7NJ2D+tUkqcDV1fVGV1nmcJjq2on4Gk03RUe33UgmpaonYBPV9WOwJ8Ysy4TSdYD9gKO6ToLQPumbG+a4vh+wEZJXtRtqqYFCng/cBLwX8BZwF87DbUc1Xw/0Fi0/M8GSd5J0+3pK11nGVRV76yqLWhyHdB1nvbDjX9mTIrNCT4NPIjm0YYrgQ93G+cO1gHuQdOd7q3A0W2r2bjYlzH5kG/APwBvav//fxNtT5gx8ArgH5OcAWwC3NxVkOW9f10T7gEWeJpSknVp/nF8paq66ns+pbY738mMxzOMjwH2SnIpcBTwxCRf7jbS7dqWn2VdX75J86xU15YASwZaYI+lKfjGydOAM6vqqq6DtJ4M/KaqllbVLTTPhPxtx5kAqKovVNXOVfV44HqaZ7bGyVVJ7gvQ/nfGu4HNRkleBjwdeGGN7xfnfoUOuoJN4kE0H76c3d4LNgfOTHKfTlMBVXVV++HobcDnGI97wDJLgG+0j2CcRtMLZsYHp5lM2xX+2cDXus4ywUu5/ZnAYxiT32dV/bKqnlpVO9MUxb/uIscU71/XqHuABZ4m1X569gXggqr6SNd5lkkyd9lIbkk2AJ4C/LLbVFBV76iqzatqHk2Xvh9UVectKwBJNkqyybJpmof/Ox+1tar+B7gsyTbtoicB53cYaTLj9snt74BHJ9mw/Tf6JMZkYJok92r/uyXNG6KvdpvoThbSvCmi/e9/dphlVkiyO023872q6sau8wxKstXA7N6Mx33g3Kq6V1XNa+8FS4Cd2r91nVr2xrb1LMbgHjDgOGA3gCRbA+sB13Sa6HZPBn5ZVUu6DjLBFcAT2uknAmPRfXTgPrAW8C7gMx1kmOr965p1D6gqfzr8oXnzeCVwC83NYL+uM7W5HkvTfH0OTXers4A9xiDXdsAv2lznAQd1nWmSjLsC3+46x0CeBwJntz+LgXd2nWkg2w7Aovb3eRxw964zDWTbCLgW2LTrLBNyvYfmzex5wBHAXbrO1Ob6CU2BfjbwpI6z3OnvKnBPmpHTLgK+B9xjjLI9q52+CbgKOHFMcl0MXDZwD/jMGL1mX2//DZwDfAvYbBxyTVh/KTBnHHK1fyvObV+vhcB9x+h3uR7w5fb3eSbNSNmd52qXLwBe08VrtYLX7LHAGe3f21OBncck1xtoem/8CjgESAe5Jn3/Oi73gJn6SftiSJIkSZJmObtoSpIkSVJPWOBJkiRJUk9Y4EmSJElST1jgSZIkSVJPWOBJkiRJUk9Y4EmSOpOkknx4YP4tSd69mo69IMlzV8ex2uM9L8kFSU5eXceUJGl1s8CTJHXpJuDZSVburRcAAAO5SURBVOZ0HWRQknUmWbwf8Kqq2m2IbSVJ6oQFniSpS7cChwFvmrhiYgtckj+2/901yY+S/GeSS5IckuSFSU5Lcm6SBw0c5slJFiX5VZKnt/uvneSDSU5Pck6SVw8c9ydJFtJ8aftgloNovkD3C+2+L0uyMMkPgO8n2SjJ4W2GXyTZu91vgyRHtS1/30xyapL5g9fTTj83yYJ2em6Sr7f5Tk/ymHb5u9tz/LC97tcP7P+S9lrOTnJEkk2S/CbJuu36uw7OS5L6y08dJUldOxQ4J8kHprHP9sBDgeuAS4DPV9UuSd4AvA54Y7vdPGAX4EHAyUkeDLwE+H1VPTLJXYCfJjmp3X4n4OFV9ZvBk1XVwUmeCLylqhYleVm77XZVdV2SfwN+UFWvSHI34LQk3wNeDdxYVQ9Nsh1w5hDX9nHgo1X130m2BE5srxXgIcBuwCbAhUk+DWwNvAv426q6Jsk9quoPSX4I7AkcB+wDfKOqbhni/JKkWcwCT5LUqaq6IcmXgNcDfx5yt9Or6kqAJL8GlhVo59IUQMscXVW3ARcluYSmQHoqsN1A6+CmwFbAzcBpE4u75fhuVV3XTj8V2CvJW9r59YEtgccD/95e5zlJzhniuE8Gtk2ybP6uSTZup4+vqpuAm5JcDdwbeCJwTFVd055nWabPA2+jKfBeDrxqyOuSJM1iFniSpHHwMZrWrf8YWHYr7aMESdYC1htYd9PA9G0D87dxx3tbTThPAQFeV1UnDq5Isivwp2lkHtw2wHOq6sIJx1ze/oPZ1h+YXgt4dFX9ZZJjDV73X1nOfbyqfppkXntda1fVecsLI0nqB5/BkyR1rm11OppmIJNlLgV2bqf3Albm+bHnJVmrfS7vgcCFNF0e/2Hg+bStk2y0stlbJwKvS1uFJdmxXf5j4O/bZQ8HthvY56okD22L12cNLD+Jppsp7X47rODcP6C5znu2299jYN2XgK9yx8JZktRjFniSpHHxYWBwNM3PAU9IcjbwN0yvdW2Z3wGnAd8BXtO2in2eZhCVM5OcB3yWVe/R8l6aAvScJIvbeYBPAxsnuQA4GDhjYJ8DgW8DPwOuHFj+emB+O2jK+cBrlnfiqloMvA/4UftafWRg9VeAuwNHruyFSZJml1RN7L0iSZJGoR345C1VtWiGzvdcYO+qevFMnE+S1D2fwZMkqYeSfAJ4GrBH11kkSTPHFjxJkiRJ6gmfwZMkSZKknrDAkyRJkqSesMCTJEmSpJ6wwJMkSZKknrDAkyRJkqSe+H/u9BS9ZW39XQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 295 }, "id": "8powyDiy50ef", "outputId": "7d6155a4-b4f8-459b-b1c1-6a63b634f28c" }, "source": [ "plot_sparse(df, 'brand')" ], "execution_count": 11, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "NgUmBmmS597f" }, "source": [ "We can observe following pattern:\n", "- There are ~45000 products which appear only once in the dataset\n", "- There are 1200000 users which appear only once in the dataset\n", "- There are 350 brands which appears only once in the dataset" ] }, { "cell_type": "markdown", "metadata": { "id": "YN9112LL8mHy" }, "source": [ "The sparsity is important for understanding which features can be better used in a model. Product_id and User_id have many values which appear only once and the model is less able to learn a good patterns from them.\n", "On the other hand, brands has many observations and can be leveraged for prediction." ] }, { "cell_type": "code", "metadata": { "id": "7Y2MwjKF55ES" }, "source": [ "def plot_top20(df, col):\n", " stats = df[[col, 'target']].groupby(col).agg(['count', 'mean', 'sum'])\n", " stats = stats.reset_index()\n", " stats.columns = [col, 'count', 'mean', 'sum']\n", " stats = stats.sort_values('count', ascending=False)\n", " fig, ax1 = plt.subplots(figsize=(15,4))\n", " ax2 = ax1.twinx()\n", " ax1.bar(stats[col].astype(str).values[0:20], stats['count'].values[0:20])\n", " ax1.set_xticklabels(stats[col].astype(str).values[0:20], rotation='vertical')\n", " ax2.plot(stats['mean'].values[0:20], color='red')\n", " ax2.set_ylim(0,1)\n", " ax2.set_ylabel('Mean Target')\n", " ax1.set_ylabel('Frequency')\n", " ax1.set_xlabel(col)\n", " ax1.set_title('Top20 ' + col + 's based on frequency')" ], "execution_count": 12, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 342 }, "id": "QrCM9B2b8tLV", "outputId": "27b7658e-8cfe-4a20-c2d9-b39a827c7741" }, "source": [ "plot_top20(df, 'product_id')" ], "execution_count": 13, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 343 }, "id": "-Daz2Etw8ywW", "outputId": "197a2029-58ce-46b9-bcb3-c336b47d7cb3" }, "source": [ "plot_top20(df, 'user_id')" ], "execution_count": 14, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 331 }, "id": "IfVtFT4V80yK", "outputId": "3a38366e-6cc6-4d40-d751-14538abd4997" }, "source": [ "plot_top20(df, 'brand')" ], "execution_count": 15, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "id": "TeNKLjJU81_Y" }, "source": [ "df['date'] = pd.to_datetime(df['timestamp']).dt.date" ], "execution_count": 3, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 344 }, "id": "yyUDEaxC9fKc", "outputId": "61e2cb92-fc44-4e3c-9711-e029d0a4682d" }, "source": [ "plt.figure(figsize=(15,4))\n", "plt.plot(df[['date', 'target']].groupby('date').target.mean())\n", "plt.ylabel('average mean')\n", "plt.xlabel('date')\n", "plt.xticks(df[['date', 'target']].groupby('date').target.mean().index[::3], rotation='vertical')\n", "print('')" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p2xBxTn2-3KX", "outputId": "03cdc5af-6190-48c9-80b7-c2a5609f0884" }, "source": [ "df[['date', 'target']].groupby('date').target.mean().sort_values().head(20)" ], "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "date\n", "2019-11-15 0.000000\n", "2020-01-02 0.000000\n", "2020-04-20 0.000096\n", "2020-04-21 0.000147\n", "2020-01-01 0.063013\n", "2020-02-27 0.114378\n", "2019-11-14 0.135596\n", "2019-11-16 0.147902\n", "2020-01-31 0.161678\n", "2020-02-01 0.203322\n", "2020-01-30 0.223492\n", "2020-02-02 0.258615\n", "2020-02-03 0.270665\n", "2020-01-03 0.282014\n", "2020-02-04 0.295918\n", "2020-02-29 0.300146\n", "2020-02-28 0.300432\n", "2020-02-05 0.310188\n", "2020-02-07 0.317174\n", "2020-02-08 0.318376\n", "Name: target, dtype: float64" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "markdown", "metadata": { "id": "LJ7POegh-7rk" }, "source": [ "- We explored the data and saw the different raw features available in the dataset.\n", "- We analzyed basic statistics of the raw features and saw long-tail distribution for categorical features (user, item, brand)\n", "- Some categorical features (categories) have high occurances\n", "- In general, we see that categorical features have variance in the target, which we can leverage to engineer more powerful features" ] } ] }