{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Big data? 🤗 Datasets ao resgate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# zstandard is needed to read the .zst-compressed data files used below\n",
    "%pip install zstandard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       " features: ['meta', 'text'],\n",
       " num_rows: 15518009\n",
       "})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# This takes a few minutes to run, so go grab a tea or coffee while you wait :)\n",
    "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n",
    "pubmed_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n",
    "pubmed_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       " 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first example of the dataset\n",
    "pubmed_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# psutil is used in the next cell to read the memory usage of the current process\n",
    "%pip install psutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RAM used: 5678.33 MB"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import psutil\n",
    "\n",
    "# Process.memory_info is expressed in bytes, so convert to megabytes\n",
    "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Number of bytes in dataset : 20979437051\n",
       "Dataset size (cache file) : 19.54 GB"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# dataset_size is a byte count (not a file count); divide by 1024**3 to get gigabytes\n",
    "print(f\"Number of bytes in dataset : {pubmed_dataset.dataset_size}\")\n",
    "size_gb = pubmed_dataset.dataset_size / (1024**3)\n",
    "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Iterated over 15518009 examples (about 19.5 GB) in 64.2s, i.e. 0.304 GB/s'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import timeit\n",
    "\n",
    "# Statement to time: one full pass over the dataset in slices of 1,000 examples\n",
    "code_snippet = \"\"\"batch_size = 1000\n",
    "\n",
    "for idx in range(0, len(pubmed_dataset), batch_size):\n",
    " _ = pubmed_dataset[idx:idx + batch_size]\n",
    "\"\"\"\n",
    "\n",
    "# Named `elapsed` rather than `time` so the standard-library module name is not shadowed\n",
    "elapsed = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n",
    "print(\n",
    "    f\"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in \"\n",
    "    f\"{elapsed:.1f}s, i.e. {size_gb/elapsed:.3f} GB/s\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same source file as above, loaded with streaming=True this time\n",
    "pubmed_dataset_streamed = load_dataset(\n",
    "    \"json\", data_files=data_files, split=\"train\", streaming=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       " 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A streamed dataset is consumed through an iterator; fetch its first example\n",
    "next(iter(pubmed_dataset_streamed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [101, 4958, 5178, 4328, 6779, ...], 'attention_mask': [1, 1, 1, 1, 1, ...]}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
    "# Tokenize the `text` field of each streamed example\n",
    "tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x[\"text\"]))\n",
    "next(iter(tokenized_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pmid': 11410799, 'language': 'eng'},\n",
       " 'text': 'Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer ...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shuffle the stream with a 10,000-example buffer and a fixed seed\n",
    "shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)\n",
    "next(iter(shuffled_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       " 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'},\n",
       " {'meta': {'pmid': 11409575, 'language': 'eng'},\n",
       " 'text': 'Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...'},\n",
       " {'meta': {'pmid': 11409576, 'language': 'eng'},\n",
       " 'text': \"Hypoxaemia in children with severe pneumonia in Papua New Guinea ...\"},\n",
       " {'meta': {'pmid': 11409577, 'language': 'eng'},\n",
       " 'text': 'Oxygen concentrators and cylinders ...'},\n",
       " {'meta': {'pmid': 11409578, 'language': 'eng'},\n",
       " 'text': 'Oxygen supply in rural africa: a personal experience ...'}]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the first 5 examples of the (unshuffled) stream\n",
    "dataset_head = pubmed_dataset_streamed.take(5)\n",
    "list(dataset_head)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Skip the first 1,000 examples and include the rest in the training set\n",
    "train_dataset = shuffled_dataset.skip(1000)\n",
    "# Take the first 1,000 examples for the validation set\n",
    "validation_dataset = shuffled_dataset.take(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'case_ID': '110921.json',\n",
       " 'case_jurisdiction': 'scotus.tar.gz',\n",
       " 'date_created': '2010-04-28T17:12:49Z'},\n",
       " 'text': '\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stream a second corpus (FreeLaw opinions) the same way\n",
    "law_dataset_streamed = load_dataset(\n",
    "    \"json\",\n",
    "    data_files=\"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst\",\n",
    "    split=\"train\",\n",
    "    streaming=True,\n",
    ")\n",
    "next(iter(law_dataset_streamed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'meta': {'pmid': 11409574, 'language': 'eng'},\n",
       " 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'},\n",
       " {'meta': {'case_ID': '110921.json',\n",
       " 'case_jurisdiction': 'scotus.tar.gz',\n",
       " 'date_created': '2010-04-28T17:12:49Z'},\n",
       " 'text': '\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from itertools import islice\n",
    "from datasets import interleave_datasets\n",
    "\n",
    "# Combine the PubMed and FreeLaw streams, then look at the first two examples\n",
    "combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])\n",
    "list(islice(combined_dataset, 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'meta': {'pile_set_name': 'Pile-CC'},\n",
       " 'text': 'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_url = \"https://the-eye.eu/public/AI/pile/\"\n",
    "# A distinct name keeps the PubMed `data_files` URL from the earlier cell intact,\n",
    "# so re-running the earlier cells still loads PubMed.\n",
    "# The train split is listed as 30 shards: 00.jsonl.zst ... 29.jsonl.zst\n",
    "pile_data_files = {\n",
    "    \"train\": [base_url + \"train/\" + f\"{idx:02d}.jsonl.zst\" for idx in range(30)],\n",
    "    \"validation\": base_url + \"val.jsonl.zst\",\n",
    "    \"test\": base_url + \"test.jsonl.zst\",\n",
    "}\n",
    "pile_dataset = load_dataset(\"json\", data_files=pile_data_files, streaming=True)\n",
    "next(iter(pile_dataset[\"train\"]))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Big data? 🤗 Datasets ao resgate",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}