# Création de votre propre jeu de données

Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce *notebook*.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

Vous aurez besoin de configurer git : adaptez votre e-mail et votre nom dans la cellule suivante.

In [None]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

Vous devrez également être connecté au *Hub* d'Hugging Face. Exécutez ce qui suit et entrez vos informations d'identification.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install requests

In [None]:
import requests

# Ask the GitHub REST API for the first page of issues of huggingface/datasets,
# limited to one result per page. No authentication headers are sent here.
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)

In [None]:
# Show the HTTP status code of the request made above.
response.status_code

In [None]:
# Decode the JSON payload of the response and display it.
response.json()

In [None]:
# NOTE(review): `xxx` is a placeholder, not a defined name; replace it with
# your personal access token written as a string before running this cell.
GITHUB_TOKEN = xxx # Copy your GitHub token here
# Authorization header sent with the authenticated requests made in later cells.
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [None]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    """Download the issues of a GitHub repository into a JSON Lines file.

    Issues are requested page by page (100 per page, open and closed) from the
    GitHub REST API using the module-level ``headers`` for authentication, and
    are written to ``{issues_path}/{repo}-issues.jsonl``.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Name of the repository.
        num_issues: Upper bound on the number of issues to request; it
            determines how many pages are fetched.
        rate_limit: Once more than this many issues have been buffered, the
            function sleeps for one hour before requesting further pages.
        issues_path: Directory that receives the output file. It is created
            (including missing parents) when it does not exist.

    Raises:
        requests.HTTPError: If GitHub answers a page request with an error
            status (for example bad credentials or an exhausted rate limit).
    """
    issues_path = Path(issues_path)
    issues_path.mkdir(parents=True, exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub pagination is 1-indexed: starting at 0 would fetch the first
    # page twice and store duplicated issues.
    for page in tqdm(range(1, num_pages + 1)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        response = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        # An error payload is a dict; extending the batch with it would
        # silently store its keys instead of issues.
        response.raise_for_status()
        page_issues = response.json()
        if not page_issues:
            # The repository has no further issues: stop requesting pages.
            break
        batch.extend(page_issues)

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush the batch for the next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
# Depending on your internet connection, this can take several minutes to run...
fetch_issues()

In [None]:
# `load_dataset` is not imported anywhere earlier in the notebook; import it
# here so this cell does not fail with a NameError.
from datasets import load_dataset

# Load the JSON Lines file written by fetch_issues() as a single "train" split.
issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

In [None]:
# Draw three rows from the dataset after a seeded shuffle.
sample = issues_dataset.shuffle(seed=666).select(range(3))

# Print the URL and the pull request entry of each sampled row
for url, pr in zip(sample["html_url"], sample["pull_request"]):
 print(f">> URL: {url}")
 print(f">> Pull request: {pr}\n")

In [None]:
# Add a boolean "is_pull_request" column: True when the row's "pull_request"
# field is not None, False otherwise.
issues_dataset = issues_dataset.map(
    lambda issue: {"is_pull_request": issue["pull_request"] is not None}
)

In [None]:
# Request the comments of one specific issue, sending the authentication
# headers, and display the decoded JSON payload.
issue_number = 2792
url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

In [None]:
def get_comments(issue_number, owner="huggingface", repo="datasets"):
    """Return the comment bodies of one GitHub issue.

    A single request is sent to the GitHub REST API with the module-level
    ``headers``; up to 100 comments (the largest page size) are returned.

    Args:
        issue_number: Number of the issue whose comments are fetched.
        owner: Account or organisation that owns the repository.
        repo: Name of the repository.

    Returns:
        A list with the ``"body"`` field of each comment in the response.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments"
    # Ask for the largest page so long discussions are not cut at the default size.
    response = requests.get(url, headers=headers, params={"per_page": 100})
    # An error payload is a dict, which would otherwise fail below with an
    # unhelpful TypeError while indexing its keys.
    response.raise_for_status()
    return [comment["body"] for comment in response.json()]


# Check that our function works as expected
get_comments(2792)

In [None]:
# Depending on your internet connection, this can take a few minutes...
# get_comments() is called once per row, with the row's "number" field.
issues_with_comments_dataset = issues_dataset.map(
 lambda x: {"comments": get_comments(x["number"])}
)

In [None]:
# Export under the file name that the later `!cp` step copies into the
# "github-issues" directory; the previous name did not match it.
issues_with_comments_dataset.to_json("datasets-issues-with-comments.jsonl")

In [None]:
from huggingface_hub import list_datasets

# list_datasets() may return a lazy iterable rather than a list; materialise
# it so that len() and indexing below work in either case.
all_datasets = list(list_datasets())
print(f"Number of datasets on Hub: {len(all_datasets)}")
print(all_datasets[0])

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from huggingface_hub import create_repo

# Pass the repository name positionally: the `name` keyword was replaced by
# `repo_id` in huggingface_hub, while the first positional argument is the
# repository name under both spellings.
repo_url = create_repo("github-issues", repo_type="dataset")
repo_url

In [None]:
from huggingface_hub import Repository

# Clone the repository at `repo_url` into the local "github-issues" directory.
# NOTE(review): `Repository` looks deprecated in recent huggingface_hub
# releases; confirm against the installed version.
repo = Repository(local_dir="github-issues", clone_from=repo_url)
# Copy the exported JSON Lines file into the local clone.
!cp datasets-issues-with-comments.jsonl github-issues/

In [None]:
# Register the "*.jsonl" pattern for LFS tracking in the local clone.
repo.lfs_track("*.jsonl")

In [None]:
# Push the contents of the local clone to the Hub.
repo.push_to_hub()

In [None]:
# Load the "train" split of the "lewtun/github-issues" dataset by its
# repository id and display it.
remote_dataset = load_dataset("lewtun/github-issues", split="train")
remote_dataset