{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Criando seu prĂ³prio dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install requests" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "\n", "url = \"https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1\"\n", "response = requests.get(url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "200" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response.status_code" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',\n", " 'repository_url': 'https://api.github.com/repos/huggingface/datasets',\n", " 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}',\n", " 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/comments',\n", " 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/events',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792',\n", " 'id': 968650274,\n", " 'node_id': 'MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0',\n", " 'number': 2792,\n", " 'title': 'Update GooAQ',\n", " 'user': {'login': 'bhavitvyamalik',\n", " 'id': 19718818,\n", " 'node_id': 'MDQ6VXNlcjE5NzE4ODE4',\n", " 'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',\n", " 'gravatar_id': '',\n", " 'url': 'https://api.github.com/users/bhavitvyamalik',\n", " 'html_url': 'https://github.com/bhavitvyamalik',\n", " 'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',\n", " 'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',\n", " 'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',\n", " 'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',\n", " 'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',\n", " 'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',\n", " 'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',\n", " 'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',\n", " 'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',\n", " 'type': 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',\n", " 'repository_url': 'https://api.github.com/repos/huggingface/datasets',\n", " 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}',\n", " 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/comments',\n", " 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/events',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792',\n", " 'id': 968650274,\n", " 'node_id': 'MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0',\n", " 'number': 2792,\n", " 'title': 'Update GooAQ',\n", " 'user': {'login': 'bhavitvyamalik',\n", " 'id': 19718818,\n", " 'node_id': 'MDQ6VXNlcjE5NzE4ODE4',\n", " 'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',\n", " 'gravatar_id': '',\n", " 'url': 'https://api.github.com/users/bhavitvyamalik',\n", " 'html_url': 'https://github.com/bhavitvyamalik',\n", " 'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',\n", " 'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',\n", " 'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',\n", " 'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',\n", " 'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',\n", " 'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',\n", " 'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',\n", " 'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',\n", " 'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',\n", " 'type': 'User',\n", " 'site_admin': False},\n", " 'labels': [],\n", " 'state': 'open',\n", " 'locked': False,\n", " 'assignee': None,\n", " 'assignees': [],\n", " 'milestone': None,\n", " 'comments': 1,\n", " 'created_at': '2021-08-12T11:40:18Z',\n", " 'updated_at': '2021-08-12T12:31:17Z',\n", " 'closed_at': None,\n", " 'author_association': 'CONTRIBUTOR',\n", " 'active_lock_reason': None,\n", " 'pull_request': {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/2792',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792',\n", " 'diff_url': 'https://github.com/huggingface/datasets/pull/2792.diff',\n", " 'patch_url': 'https://github.com/huggingface/datasets/pull/2792.patch'},\n", " 'body': '[GooAQ](https://github.com/allenai/gooaq) dataset was recently updated after splits were added for the same. This PR contains new updated GooAQ with train/val/test splits and updated README as well.',\n", " 'performed_via_github_app': None}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response.json()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "GITHUB_TOKEN = \"xxx\"  # Copy your GitHub token here\n", "headers = {\"Authorization\": f\"token {GITHUB_TOKEN}\"}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "import math\n", "from pathlib import Path\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "\n", "\n", "def fetch_issues(\n", "    owner=\"huggingface\",\n", "    repo=\"datasets\",\n", "    num_issues=10_000,\n", "    rate_limit=5_000,\n", "    issues_path=Path(\".\"),\n", "):\n", "    if not issues_path.is_dir():\n", "        issues_path.mkdir(exist_ok=True)\n", "\n", "    batch = []\n", "    all_issues = []\n", "    per_page = 100  # Number of issues to return per page\n", "    num_pages = math.ceil(num_issues / per_page)\n", "    base_url = \"https://api.github.com/repos\"\n", "\n", "    for page in tqdm(range(num_pages)):\n", "        # Query with state=all to get both open and closed issues\n", "        query = f\"issues?page={page}&per_page={per_page}&state=all\"\n", "        issues = requests.get(f\"{base_url}/{owner}/{repo}/{query}\", headers=headers)\n", "        batch.extend(issues.json())\n", "\n", "        if len(batch) > rate_limit and len(all_issues) < num_issues:\n", "            all_issues.extend(batch)\n", "            batch = []  # Flush batch for next time period\n", "            print(\"Reached GitHub rate limit. Sleeping for one hour ...\")\n", "            time.sleep(60 * 60 + 1)\n", "\n", "    all_issues.extend(batch)\n", "    df = pd.DataFrame.from_records(all_issues)\n", "    df.to_json(f\"{issues_path}/{repo}-issues.jsonl\", orient=\"records\", lines=True)\n", "    print(\n", "        f\"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl\"\n", "    )" ] },
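{ "cell_type": "markdown", "metadata": {}, "source": [ "As an optional sanity check (not in the original notebook), you can smoke-test `fetch_issues()` on a single page of results before launching the full download. Note that this writes the same `datasets-issues.jsonl` file, so the full run below will simply overwrite it." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Fetch a single page of 100 issues to check that authentication and parsing work\n", "fetch_issues(num_issues=100)" ] },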
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Depending on your internet connection, this can take several minutes to run...\n", "fetch_issues()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app'],\n", " num_rows: 3019\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", "issues_dataset = load_dataset(\"json\", data_files=\"datasets-issues.jsonl\", split=\"train\")\n", "issues_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ ">> URL: https://github.com/huggingface/datasets/pull/850\n", ">> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/850', 'html_url': 'https://github.com/huggingface/datasets/pull/850', 'diff_url': 'https://github.com/huggingface/datasets/pull/850.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/850.patch'}\n", "\n", ">> URL: https://github.com/huggingface/datasets/issues/2773\n", ">> Pull request: None\n", "\n", ">> URL: https://github.com/huggingface/datasets/pull/783\n", ">> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/783', 'html_url': 'https://github.com/huggingface/datasets/pull/783', 'diff_url': 'https://github.com/huggingface/datasets/pull/783.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/783.patch'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample = issues_dataset.shuffle(seed=666).select(range(3))\n", "\n", "# Print out the URL and pull request entries\n", "for url, pr in zip(sample[\"html_url\"], sample[\"pull_request\"]):\n", "    print(f\">> URL: {url}\")\n", "    print(f\">> Pull request: {pr}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "issues_dataset = issues_dataset.map(\n", "    lambda x: {\"is_pull_request\": False if x[\"pull_request\"] is None else True}\n", ")" ] },
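{ "cell_type": "markdown", "metadata": {}, "source": [ "With the `is_pull_request` flag in place, you can separate issues from pull requests whenever you need to. The one-liner below is a minimal sketch of this (the `issues_only_dataset` name is our own, not from the original notebook), using `Dataset.filter()`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Keep only genuine issues by dropping the pull requests\n", "issues_only_dataset = issues_dataset.filter(lambda x: not x[\"is_pull_request\"])" ] },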
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',\n", " 'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',\n", " 'id': 897594128,\n", " 'node_id': 'IC_kwDODunzps41gDMQ',\n", " 'user': {'login': 'bhavitvyamalik',\n", " 'id': 19718818,\n", " 'node_id': 'MDQ6VXNlcjE5NzE4ODE4',\n", " 'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',\n", " 'gravatar_id': '',\n", " 'url': 'https://api.github.com/users/bhavitvyamalik',\n", " 'html_url': 'https://github.com/bhavitvyamalik',\n", " 'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',\n", " 'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',\n", " 'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',\n", " 'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',\n", " 'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',\n", " 'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',\n", " 'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',\n", " 'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',\n", " 'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',\n", " 'type': 'User',\n", " 'site_admin': False},\n", " 'created_at': '2021-08-12T12:21:52Z',\n", " 'updated_at': '2021-08-12T12:31:17Z',\n", " 'author_association': 'CONTRIBUTOR',\n", " 'body': \"@albertvillanova my tests are failing here:\\r\\n```\\r\\ndataset_name = 'gooaq'\\r\\n\\r\\n def test_load_dataset(self, dataset_name):\\r\\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\\r\\n> self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\\r\\n\\r\\ntests/test_dataset_common.py:234: \\r\\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \\r\\ntests/test_dataset_common.py:187: in check_load_dataset\\r\\n self.parent.assertTrue(len(dataset[split]) > 0)\\r\\nE AssertionError: False is not true\\r\\n```\\r\\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?\",\n", " 'performed_via_github_app': None}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "issue_number = 2792\n", "url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments\"\n", "response = requests.get(url, headers=headers)\n", "response.json()" ] },
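{ "cell_type": "markdown", "metadata": {}, "source": [ "One caveat worth knowing (our addition, not in the original notebook): the comments endpoint is itself paginated, and GitHub returns 30 comments per page by default, so grabbing a single page can miss comments on long threads. The hypothetical helper below sketches how you could paginate until an empty page comes back." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical helper (our sketch): fetch *all* comments for an issue by paginating\n", "def get_all_comments(issue_number):\n", "    comments = []\n", "    page = 1\n", "    while True:\n", "        url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments?page={page}&per_page=100\"\n", "        batch = requests.get(url, headers=headers).json()\n", "        if not batch:\n", "            return comments\n", "        comments.extend(comment[\"body\"] for comment in batch)\n", "        page += 1" ] },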
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"@albertvillanova my tests are failing here:\\r\\n```\\r\\ndataset_name = 'gooaq'\\r\\n\\r\\n def test_load_dataset(self, dataset_name):\\r\\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\\r\\n> self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\\r\\n\\r\\ntests/test_dataset_common.py:234: \\r\\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \\r\\ntests/test_dataset_common.py:187: in check_load_dataset\\r\\n self.parent.assertTrue(len(dataset[split]) > 0)\\r\\nE AssertionError: False is not true\\r\\n```\\r\\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?\"]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_comments(issue_number):\n", "    url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments\"\n", "    response = requests.get(url, headers=headers)\n", "    return [r[\"body\"] for r in response.json()]\n", "\n", "\n", "# Test that our function works as expected\n", "get_comments(2792)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Depending on your internet connection, this can take a few minutes...\n", "issues_with_comments_dataset = issues_dataset.map(\n", "    lambda x: {\"comments\": get_comments(x[\"number\"])}\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "issues_with_comments_dataset.to_json(\"issues-datasets-with-comments.jsonl\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Number of datasets on Hub: 1487\n", "Dataset Name: acronym_identification, Tags: ['annotations_creators:expert-generated', 'language_creators:found', 'languages:en', 'licenses:mit', 'multilinguality:monolingual', 'size_categories:10K<n<100K', 'source_datasets:original', 'task_categories:structure-prediction', 'task_ids:structure-prediction-other-acronym-identification']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from huggingface_hub import list_datasets\n", "\n", "all_datasets = list_datasets()\n", "print(f\"Number of datasets on Hub: {len(all_datasets)}\")\n", "print(all_datasets[0])" ] },
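{ "cell_type": "markdown", "metadata": {}, "source": [ "Finally, you can push the augmented dataset to the Hugging Face Hub so that others can reuse it. The closing cell is a minimal sketch: the `github-issues` repository name is a placeholder (pick your own), and it assumes you are already logged in via `notebook_login()` above." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload the dataset to the Hub under your account (placeholder repo name)\n", "issues_with_comments_dataset.push_to_hub(\"github-issues\")" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 }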