{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 如果我的数据集不在 Hub 上怎么办?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "# The extras spec is quoted so the shell does not treat [...] as a glob pattern.\n",
    "%pip install datasets evaluate \"transformers[sentencepiece]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -nc (no-clobber) skips files that are already present, so re-running this cell\n",
    "# does not leave duplicate downloads behind.\n",
    "!wget -nc https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz\n",
    "!wget -nc https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -d decompress, -k keep the .gz archives (they are loaded directly further down),\n",
    "# -v verbose, -f overwrite output left by a previous run so the cell can be re-executed.\n",
    "!gzip -dkvf SQuAD_it-*.json.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# field=\"data\" tells the JSON loader which top-level key holds the records.\n",
    "# With a single path, everything ends up in the default \"train\" split.\n",
    "squad_it_dataset = load_dataset(\"json\", data_files=\"SQuAD_it-train.json\", field=\"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       " train: Dataset({\n",
       " features: ['title', 'paragraphs'],\n",
       " num_rows: 442\n",
       " })\n",
       "})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "squad_it_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{\n",
       " \"title\": \"Terremoto del Sichuan del 2008\",\n",
       " \"paragraphs\": [\n",
       " {\n",
       " \"context\": \"Il terremoto del Sichuan del 2008 o il terremoto...\",\n",
       " \"qas\": [\n",
       " {\n",
       " \"answers\": [{\"answer_start\": 29, \"text\": \"2008\"}],\n",
       " \"id\": \"56cdca7862d2951400fa6826\",\n",
       " \"question\": \"In quale anno si è verificato il terremoto nel Sichuan?\",\n",
       " },\n",
       " ...\n",
       " ],\n",
       " },\n",
       " ...\n",
       " ],\n",
       "}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "squad_it_dataset[\"train\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       " train: Dataset({\n",
       " features: ['title', 'paragraphs'],\n",
       " num_rows: 442\n",
       " })\n",
       " test: Dataset({\n",
       " features: ['title', 'paragraphs'],\n",
       " num_rows: 48\n",
       " })\n",
       "})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map split names to files to get both splits in a single DatasetDict.\n",
    "data_files = {\"train\": \"SQuAD_it-train.json\", \"test\": \"SQuAD_it-test.json\"}\n",
    "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
    "squad_it_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same call, but pointing at the gzip archives instead of the extracted files.\n",
    "data_files = {\"train\": \"SQuAD_it-train.json.gz\", \"test\": \"SQuAD_it-test.json.gz\"}\n",
    "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same call again, this time with remote URLs in place of local paths.\n",
    "url = \"https://github.com/crux82/squad-it/raw/master/\"\n",
    "data_files = {\n",
    "    \"train\": url + \"SQuAD_it-train.json.gz\",\n",
    "    \"test\": url + \"SQuAD_it-test.json.gz\",\n",
    "}\n",
    "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "如果我的数据集不在 Hub 上怎么办?",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}