{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [
 "# Création de votre propre jeu de données"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
 "Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce *notebook*."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "!pip install datasets evaluate transformers[sentencepiece]\n",
 "!apt install git-lfs"
] }, { "cell_type": "markdown", "metadata": {}, "source": [
 "Vous aurez besoin de configurer git, adaptez votre email et votre nom dans la cellule suivante."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "!git config --global user.email \"you@example.com\"\n",
 "!git config --global user.name \"Your Name\""
] }, { "cell_type": "markdown", "metadata": {}, "source": [
 "Vous devrez également être connecté au *Hub* d'Hugging Face. Exécutez ce qui suit et entrez vos informations d'identification."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from huggingface_hub import notebook_login\n",
 "\n",
 "notebook_login()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "!pip install requests"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import requests\n",
 "\n",
 "url = \"https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1\"\n",
 "response = requests.get(url)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "response.status_code"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "response.json()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "from getpass import getpass\n",
 "\n",
 "# Read the GitHub token from the environment, or prompt for it, so that the\n",
 "# secret is never stored in the notebook itself.\n",
 "GITHUB_TOKEN = os.environ.get(\"GITHUB_TOKEN\") or getpass(\"GitHub token: \")\n",
 "headers = {\"Authorization\": f\"token {GITHUB_TOKEN}\"}"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "import time\n",
 "import math\n",
 "from pathlib import Path\n",
 "import pandas as pd\n",
 "from tqdm.notebook import tqdm\n",
 "\n",
 "\n",
 "def fetch_issues(\n",
 "    owner=\"huggingface\",\n",
 "    repo=\"datasets\",\n",
 "    num_issues=10_000,\n",
 "    rate_limit=5_000,\n",
 "    issues_path=Path(\".\"),\n",
 "):\n",
 "    \"\"\"Download the issues of a GitHub repository and store them as JSON Lines.\n",
 "\n",
 "    Relies on the `requests` module and the `headers` dict defined in earlier cells.\n",
 "\n",
 "    Args:\n",
 "        owner: Account that owns the repository.\n",
 "        repo: Repository name; also used as the prefix of the output file name.\n",
 "        num_issues: Upper bound used to compute how many pages are requested.\n",
 "        rate_limit: Number of buffered issues after which the function sleeps for one hour.\n",
 "        issues_path: Directory that receives `{repo}-issues.jsonl` (created if missing).\n",
 "\n",
 "    Raises:\n",
 "        requests.HTTPError: If GitHub answers with an error status code.\n",
 "    \"\"\"\n",
 "    issues_path = Path(issues_path)\n",
 "    issues_path.mkdir(parents=True, exist_ok=True)\n",
 "\n",
 "    batch = []\n",
 "    all_issues = []\n",
 "    per_page = 100  # Number of issues to return per page\n",
 "    num_pages = math.ceil(num_issues / per_page)\n",
 "    base_url = \"https://api.github.com/repos\"\n",
 "\n",
 "    # GitHub page numbers start at 1, so iterate over 1..num_pages rather than 0..num_pages-1\n",
 "    for page in tqdm(range(1, num_pages + 1)):\n",
 "        # Query with state=all to get both open and closed issues\n",
 "        query = f\"issues?page={page}&per_page={per_page}&state=all\"\n",
 "        issues = requests.get(f\"{base_url}/{owner}/{repo}/{query}\", headers=headers)\n",
 "        # An error payload is a dict; extending the batch with it would silently corrupt the data\n",
 "        issues.raise_for_status()\n",
 "        page_issues = issues.json()\n",
 "        if not page_issues:\n",
 "            break  # Past the last page: nothing left to download\n",
 "        batch.extend(page_issues)\n",
 "\n",
 "        if len(batch) > rate_limit and len(all_issues) < num_issues:\n",
 "            all_issues.extend(batch)\n",
 "            batch = []  # Flush the batch for the next time period\n",
 "            print(\"Reached GitHub rate limit. Sleeping for one hour ...\")\n",
 "            time.sleep(60 * 60 + 1)\n",
 "\n",
 "    all_issues.extend(batch)\n",
 "    df = pd.DataFrame.from_records(all_issues)\n",
 "    df.to_json(f\"{issues_path}/{repo}-issues.jsonl\", orient=\"records\", lines=True)\n",
 "    print(\n",
 "        f\"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl\"\n",
 "    )"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Depending on your internet connection, this can take several minutes to run...\n",
 "fetch_issues()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from datasets import load_dataset\n",
 "\n",
 "issues_dataset = load_dataset(\"json\", data_files=\"datasets-issues.jsonl\", split=\"train\")\n",
 "issues_dataset"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "sample = issues_dataset.shuffle(seed=666).select(range(3))\n",
 "\n",
 "# Print out the URL and pull request entries\n",
 "for url, pr in zip(sample[\"html_url\"], sample[\"pull_request\"]):\n",
 "    print(f\">> URL: {url}\")\n",
 "    print(f\">> Pull request: {pr}\\n\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "issues_dataset = issues_dataset.map(\n",
 "    lambda x: {\"is_pull_request\": x[\"pull_request\"] is not None}\n",
 ")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "issue_number = 2792\n",
 "url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments\"\n",
 "response = requests.get(url, headers=headers)\n",
 "response.json()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def get_comments(issue_number):\n",
 "    \"\"\"Return the body text of the comments attached to the given issue number.\n",
 "\n",
 "    A single request is made, so at most one page of comments is returned.\n",
 "    Raises requests.HTTPError if GitHub answers with an error status code.\n",
 "    \"\"\"\n",
 "    url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments\"\n",
 "    response = requests.get(url, headers=headers)\n",
 "    # Fail with a clear HTTP error instead of a confusing TypeError on an error payload\n",
 "    response.raise_for_status()\n",
 "    return [r[\"body\"] for r in response.json()]\n",
 "\n",
 "\n",
 "# Check that our function works as expected\n",
 "get_comments(2792)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Depending on your internet connection, this can take a few minutes...\n",
 "issues_with_comments_dataset = 
issues_dataset.map(\n",
 "    lambda x: {\"comments\": get_comments(x[\"number\"])}\n",
 ")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "issues_with_comments_dataset.to_json(\"issues-datasets-with-comments.jsonl\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from huggingface_hub import list_datasets\n",
 "\n",
 "# list_datasets() may return a lazy iterable in recent huggingface_hub releases;\n",
 "# materialize it so that len() and indexing work on every version.\n",
 "all_datasets = list(list_datasets())\n",
 "print(f\"Number of datasets on Hub: {len(all_datasets)}\")\n",
 "print(all_datasets[0])"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from huggingface_hub import notebook_login\n",
 "\n",
 "notebook_login()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from huggingface_hub import create_repo\n",
 "\n",
 "# exist_ok=True keeps this cell re-runnable once the repository has been created\n",
 "repo_url = create_repo(repo_id=\"github-issues\", repo_type=\"dataset\", exist_ok=True)\n",
 "repo_url"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "from huggingface_hub import Repository\n",
 "\n",
 "repo = Repository(local_dir=\"github-issues\", clone_from=repo_url)\n",
 "# Copy the file written by the to_json() call; the file name must match it exactly\n",
 "!cp issues-datasets-with-comments.jsonl github-issues/"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "repo.lfs_track(\"*.jsonl\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "repo.push_to_hub()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "remote_dataset = load_dataset(\"lewtun/github-issues\", split=\"train\")\n",
 "remote_dataset"
] } ], "metadata": { "colab": { "name": "Création de votre propre jeu de données", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }