{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Criando seu prĂ³prio dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install datasets evaluate transformers[sentencepiece]\n", "!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!git config --global user.email \"you@example.com\"\n", "!git config --global user.name \"Your Name\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install requests" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "\n", "url = \"https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1\"\n", "response = requests.get(url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "200" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response.status_code" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',\n", " 'repository_url': 'https://api.github.com/repos/huggingface/datasets',\n", " 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}',\n", " 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/comments',\n", " 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/events',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792',\n", " 'id': 968650274,\n", " 'node_id': 'MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0',\n", " 'number': 2792,\n", " 'title': 'Update GooAQ',\n", " 'user': {'login': 'bhavitvyamalik',\n", " 'id': 19718818,\n", " 'node_id': 'MDQ6VXNlcjE5NzE4ODE4',\n", " 'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',\n", " 'gravatar_id': '',\n", " 'url': 'https://api.github.com/users/bhavitvyamalik',\n", " 'html_url': 'https://github.com/bhavitvyamalik',\n", " 'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',\n", " 'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',\n", " 'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',\n", " 'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',\n", " 'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',\n", " 'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',\n", " 'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',\n", " 'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',\n", " 'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',\n", " 'type': 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',\n", " 'repository_url': 'https://api.github.com/repos/huggingface/datasets',\n", " 'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}',\n", " 'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/comments',\n", " 'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792/events',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792',\n", " 'id': 968650274,\n", " 'node_id': 'MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0',\n", " 'number': 2792,\n", " 'title': 'Update GooAQ',\n", " 'user': {'login': 'bhavitvyamalik',\n", " 'id': 19718818,\n", " 'node_id': 'MDQ6VXNlcjE5NzE4ODE4',\n", " 'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',\n", " 'gravatar_id': '',\n", " 'url': 'https://api.github.com/users/bhavitvyamalik',\n", " 'html_url': 'https://github.com/bhavitvyamalik',\n", " 'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',\n", " 'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',\n", " 'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',\n", " 'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',\n", " 'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',\n", " 'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',\n", " 'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',\n", " 'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',\n", " 'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',\n", " 'type': 'User',\n", " 'site_admin': False},\n", " 'labels': [],\n", " 'state': 'open',\n", " 'locked': False,\n", " 'assignee': None,\n", " 'assignees': [],\n", " 'milestone': None,\n", " 'comments': 1,\n", " 'created_at': '2021-08-12T11:40:18Z',\n", " 'updated_at': '2021-08-12T12:31:17Z',\n", " 'closed_at': None,\n", " 'author_association': 'CONTRIBUTOR',\n", " 'active_lock_reason': None,\n", " 'pull_request': {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/2792',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792',\n", " 'diff_url': 'https://github.com/huggingface/datasets/pull/2792.diff',\n", " 'patch_url': 'https://github.com/huggingface/datasets/pull/2792.patch'},\n", " 'body': '[GooAQ](https://github.com/allenai/gooaq) dataset was recently updated after splits were added for the same. This PR contains new updated GooAQ with train/val/test splits and updated README as well.',\n", " 'performed_via_github_app': None}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response.json()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "GITHUB_TOKEN = \"xxx\"  # Copy your GitHub token here\n", "headers = {\"Authorization\": f\"token {GITHUB_TOKEN}\"}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "import math\n", "from pathlib import Path\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "\n", "\n", "def fetch_issues(\n", "    owner=\"huggingface\",\n", "    repo=\"datasets\",\n", "    num_issues=10_000,\n", "    rate_limit=5_000,\n", "    issues_path=Path(\".\"),\n", "):\n", "    if not issues_path.is_dir():\n", "        issues_path.mkdir(exist_ok=True)\n", "\n", "    batch = []\n", "    all_issues = []\n", "    per_page = 100  # Number of issues to return per page\n", "    num_pages = math.ceil(num_issues / per_page)\n", "    base_url = \"https://api.github.com/repos\"\n", "\n", "    for page in tqdm(range(num_pages)):\n", "        # Query with state=all to get both open and closed issues\n", "        query = f\"issues?page={page}&per_page={per_page}&state=all\"\n", "        issues = requests.get(f\"{base_url}/{owner}/{repo}/{query}\", headers=headers)\n", "        batch.extend(issues.json())\n", "\n", "        if len(batch) > rate_limit and len(all_issues) < num_issues:\n", "            all_issues.extend(batch)\n", "            batch = []  # Flush batch for next time period\n", "            print(\"Reached GitHub rate limit. Sleeping for one hour ...\")\n", "            time.sleep(60 * 60 + 1)\n", "\n", "    all_issues.extend(batch)\n", "    df = pd.DataFrame.from_records(all_issues)\n", "    df.to_json(f\"{issues_path}/{repo}-issues.jsonl\", orient=\"records\", lines=True)\n", "    print(\n", "        f\"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl\"\n", "    )" ] },
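{ "cell_type": "markdown", "metadata": {}, "source": [ "As an optional sanity check (not in the original notebook), you can smoke-test `fetch_issues()` on a single page of results before launching the full download. Note that this writes the same `datasets-issues.jsonl` file, so the full run below will simply overwrite it." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Fetch a single page of 100 issues to check that authentication and parsing work\n", "fetch_issues(num_issues=100)" ] },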
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Depending on your internet connection, this can take several minutes to run...\n", "fetch_issues()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app'],\n", " num_rows: 3019\n", "})" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", "issues_dataset = load_dataset(\"json\", data_files=\"datasets-issues.jsonl\", split=\"train\")\n", "issues_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ ">> URL: https://github.com/huggingface/datasets/pull/850\n", ">> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/850', 'html_url': 'https://github.com/huggingface/datasets/pull/850', 'diff_url': 'https://github.com/huggingface/datasets/pull/850.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/850.patch'}\n", "\n", ">> URL: https://github.com/huggingface/datasets/issues/2773\n", ">> Pull request: None\n", "\n", ">> URL: https://github.com/huggingface/datasets/pull/783\n", ">> Pull request: {'url': 'https://api.github.com/repos/huggingface/datasets/pulls/783', 'html_url': 'https://github.com/huggingface/datasets/pull/783', 'diff_url': 'https://github.com/huggingface/datasets/pull/783.diff', 'patch_url': 'https://github.com/huggingface/datasets/pull/783.patch'}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample = issues_dataset.shuffle(seed=666).select(range(3))\n", "\n", "# Print out the URL and pull request entries\n", "for url, pr in zip(sample[\"html_url\"], sample[\"pull_request\"]):\n", "    print(f\">> URL: {url}\")\n", "    print(f\">> Pull request: {pr}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "issues_dataset = issues_dataset.map(\n", "    lambda x: {\"is_pull_request\": False if x[\"pull_request\"] is None else True}\n", ")" ] },
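{ "cell_type": "markdown", "metadata": {}, "source": [ "With the `is_pull_request` flag in place, you can separate issues from pull requests whenever you need to. The one-liner below is a minimal sketch of this (the `issues_only_dataset` name is our own, not from the original notebook), using `Dataset.filter()`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Keep only genuine issues by dropping the pull requests\n", "issues_only_dataset = issues_dataset.filter(lambda x: not x[\"is_pull_request\"])" ] },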
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128',\n", " 'html_url': 'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128',\n", " 'issue_url': 'https://api.github.com/repos/huggingface/datasets/issues/2792',\n", " 'id': 897594128,\n", " 'node_id': 'IC_kwDODunzps41gDMQ',\n", " 'user': {'login': 'bhavitvyamalik',\n", " 'id': 19718818,\n", " 'node_id': 'MDQ6VXNlcjE5NzE4ODE4',\n", " 'avatar_url': 'https://avatars.githubusercontent.com/u/19718818?v=4',\n", " 'gravatar_id': '',\n", " 'url': 'https://api.github.com/users/bhavitvyamalik',\n", " 'html_url': 'https://github.com/bhavitvyamalik',\n", " 'followers_url': 'https://api.github.com/users/bhavitvyamalik/followers',\n", " 'following_url': 'https://api.github.com/users/bhavitvyamalik/following{/other_user}',\n", " 'gists_url': 'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}',\n", " 'starred_url': 'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}',\n", " 'subscriptions_url': 'https://api.github.com/users/bhavitvyamalik/subscriptions',\n", " 'organizations_url': 'https://api.github.com/users/bhavitvyamalik/orgs',\n", " 'repos_url': 'https://api.github.com/users/bhavitvyamalik/repos',\n", " 'events_url': 'https://api.github.com/users/bhavitvyamalik/events{/privacy}',\n", " 'received_events_url': 'https://api.github.com/users/bhavitvyamalik/received_events',\n", " 'type': 'User',\n", " 'site_admin': False},\n", " 'created_at': '2021-08-12T12:21:52Z',\n", " 'updated_at': '2021-08-12T12:31:17Z',\n", " 'author_association': 'CONTRIBUTOR',\n", " 'body': \"@albertvillanova my tests are failing here:\\r\\n```\\r\\ndataset_name = 'gooaq'\\r\\n\\r\\n def test_load_dataset(self, dataset_name):\\r\\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\\r\\n> self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\\r\\n\\r\\ntests/test_dataset_common.py:234: \\r\\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \\r\\ntests/test_dataset_common.py:187: in check_load_dataset\\r\\n self.parent.assertTrue(len(dataset[split]) > 0)\\r\\nE AssertionError: False is not true\\r\\n```\\r\\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?\",\n", " 'performed_via_github_app': None}]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "issue_number = 2792\n", "url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments\"\n", "response = requests.get(url, headers=headers)\n", "response.json()" ] },
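{ "cell_type": "markdown", "metadata": {}, "source": [ "One caveat worth knowing (our addition, not in the original notebook): the comments endpoint is itself paginated, and GitHub returns 30 comments per page by default, so grabbing a single page can miss comments on long threads. The hypothetical helper below sketches how you could paginate until an empty page comes back." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical helper (our sketch): fetch *all* comments for an issue by paginating\n", "def get_all_comments(issue_number):\n", "    comments = []\n", "    page = 1\n", "    while True:\n", "        url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments?page={page}&per_page=100\"\n", "        batch = requests.get(url, headers=headers).json()\n", "        if not batch:\n", "            return comments\n", "        comments.extend(comment[\"body\"] for comment in batch)\n", "        page += 1" ] },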
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"@albertvillanova my tests are failing here:\\r\\n```\\r\\ndataset_name = 'gooaq'\\r\\n\\r\\n def test_load_dataset(self, dataset_name):\\r\\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\\r\\n> self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\\r\\n\\r\\ntests/test_dataset_common.py:234: \\r\\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \\r\\ntests/test_dataset_common.py:187: in check_load_dataset\\r\\n self.parent.assertTrue(len(dataset[split]) > 0)\\r\\nE AssertionError: False is not true\\r\\n```\\r\\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?\"]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_comments(issue_number):\n", "    url = f\"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments\"\n", "    response = requests.get(url, headers=headers)\n", "    return [r[\"body\"] for r in response.json()]\n", "\n", "\n", "# Test that our function works as expected\n", "get_comments(2792)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Depending on your internet connection, this can take a few minutes...\n", "issues_with_comments_dataset = issues_dataset.map(\n", "    lambda x: {\"comments\": get_comments(x[\"number\"])}\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "issues_with_comments_dataset.to_json(\"issues-datasets-with-comments.jsonl\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Number of datasets on Hub: 1487\n", "Dataset Name: acronym_identification, Tags: ['annotations_creators:expert-generated', 'language_creators:found', 'languages:en', 'licenses:mit', 'multilinguality:monolingual', 'size_categories:10K<n<100K', 'source_datasets:original', 'task_categories:structure-prediction', 'task_ids:structure-prediction-other-acronym-identification']" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from huggingface_hub import list_datasets\n", "\n", "all_datasets = list_datasets()\n", "print(f\"Number of datasets on Hub: {len(all_datasets)}\")\n", "print(all_datasets[0])" ] },
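{ "cell_type": "markdown", "metadata": {}, "source": [ "Finally, you can push the augmented dataset to the Hugging Face Hub so that others can reuse it. The closing cell is a minimal sketch: the `github-issues` repository name is a placeholder (pick your own), and it assumes you are already logged in via `notebook_login()` above." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload the dataset to the Hub under your account (placeholder repo name)\n", "issues_with_comments_dataset.push_to_hub(\"github-issues\")" ] } ], "metadata": {}, "nbformat": 4, "nbformat_minor": 4 }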