{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Données massives ? 🤗 Datasets à la rescousse !" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce *notebook*." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install zstandard" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "# Cela prend quelques minutes à exécuter, alors allez prendre un thé ou un café en attendant :)\n", "data_files = \"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n", "pubmed_dataset = load_dataset(\"json\", data_files=data_files, split=\"train\")\n", "pubmed_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pubmed_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install psutil" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import psutil\n", "\n", "# Process.memory_info est exprimé en octets, donc convertir en mégaoctets\n", "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Number of files in dataset : {pubmed_dataset.dataset_size}\")\n", "size_gb = pubmed_dataset.dataset_size / (1024**3)\n", "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import timeit\n", "\n", "code_snippet = \"\"\"batch_size = 1000\n", "\n", "for idx in range(0, len(pubmed_dataset), batch_size):\n", " _ = pubmed_dataset[idx:idx + batch_size]\n", "\"\"\"\n", "\n", "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n", "print(\n", " f\"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in \"\n", " f\"{time:.1f}s, i.e. 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# streaming=True returns an IterableDataset that downloads and decodes examples on the fly\n", "pubmed_dataset_streamed = load_dataset(\n", "    \"json\", data_files=data_files, split=\"train\", streaming=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "next(iter(pubmed_dataset_streamed))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "# On a streamed dataset, map() is applied lazily as examples are consumed\n", "tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x[\"text\"]))\n", "next(iter(tokenized_dataset))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Shuffling a streamed dataset only shuffles within a buffer of buffer_size examples\n", "shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)\n", "next(iter(shuffled_dataset))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# take() yields the first N examples of the stream\n", "dataset_head = pubmed_dataset_streamed.take(5)\n", "list(dataset_head)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Skip the first 1,000 examples and include the rest in the training set\n", "train_dataset = shuffled_dataset.skip(1000)\n", "# Take the first 1,000 examples for the validation set\n", "validation_dataset = shuffled_dataset.take(1000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "law_dataset_streamed = load_dataset(\n", "    \"json\",\n", "    data_files=\"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst\",\n", "    split=\"train\",\n", "    streaming=True,\n", ")\n", "next(iter(law_dataset_streamed))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from itertools import islice\n", "from datasets import interleave_datasets\n", "\n", "# By default, interleave_datasets alternates between the source datasets one example at a time\n", "combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])\n", "list(islice(combined_dataset, 2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "base_url = \"https://the-eye.eu/public/AI/pile/\"\n", "data_files = {\n", "    \"train\": [base_url + \"train/\" + f\"{idx:02d}.jsonl.zst\" for idx in range(30)],\n", "    \"validation\": base_url + \"val.jsonl.zst\",\n", "    \"test\": base_url + \"test.jsonl.zst\",\n", "}\n", "pile_dataset = load_dataset(\"json\", data_files=data_files, streaming=True)\n", "next(iter(pile_dataset[\"train\"]))" ] } ], "metadata": { "colab": { "name": "Big data? 🤗 Datasets to the rescue!", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }