# OpenAPI description of the Semantic Scholar Academic Graph (S2AG) datasets API.
openapi: 3.0.0
info:
  title: S2AG Datasets
  # Quoted so the version stays a string instead of being read as the float 1.0.
  version: '1.0'
  # Literal block scalar: the lines indented by four extra spaces form a
  # Markdown code block showing a sample session against the API.
  description: |-
    Download full-corpus datasets from the Semantic Scholar Academic Graph (S2AG)

    Some python demonstrating usage of the datasets API:

        r1 = requests.get('https://api.semanticscholar.org/datasets/v1/release').json()
        print(r1[-3:])
        ['2023-03-14', '2023-03-21', '2023-03-28']

        r2 = requests.get('https://api.semanticscholar.org/datasets/v1/release/latest').json()
        print(r2['release_id'])
        2023-03-28

        print(json.dumps(r2['datasets'][0], indent=2))
        {
          "name": "abstracts",
          "description": "Paper abstract text, where available. 100M records in 30 1.8GB files.",
          "README": "Semantic Scholar Academic Graph Datasets The "abstracts" dataset provides..."
        }

        r3 = requests.get('https://api.semanticscholar.org/datasets/v1/release/latest/dataset/abstracts').json()
        print(json.dumps(r3, indent=2))
        {
          "name": "abstracts",
          "description": "Paper abstract text, where available. 100M records in 30 1.8GB files.",
          "README": "Semantic Scholar Academic Graph Datasets The "abstracts" dataset provides...",
          "files": [
            "https://ai2-s2ag.s3.amazonaws.com/dev/staging/2023-03-28/abstracts/20230331_0..."
          ]
        }
# Base URL of the API; the example requests in the descriptions use this prefix.
servers:
- url: https://api.semanticscholar.org/datasets/v1
# Endpoints of the datasets API. Every templated path segment ({...}) is
# declared as a required path parameter on its operation.
paths:
  # Incremental updates: the chain of diffs between two releases of a dataset.
  /diffs/{start_release_id}/to/{end_release_id}/{dataset_name}:
    get:
      summary: Download Links for Incremental Diffs
      operationId: get_diff
      tags:
      - Incremental Updates
      # Literal block scalar: the lines indented by four extra spaces form
      # Markdown code blocks.
      description: |-
        Full datasets can be updated from one release to another to avoid
        downloading and processing data that hasn't changed. This method returns
        a list of all the "diffs" that are required to catch a given dataset up
        from its current release to a newer one.

        Each "diff" represents changes between two sequential releases, and
        contains two lists of files, an "updated" list and a "deleted" list.
        Records in the "updated" list need to be inserted or replaced by their
        primary key. Records in the "deleted" list should be removed.

        Example code for updating a database or key/value store:

            difflist = requests.get('https://api.semanticscholar.org/datasets/v1/diffs/2023-08-01/to/latest/papers').json()
            for diff in difflist['diffs']:
                for url in diff['update_files']:
                    for json_line in requests.get(url).iter_lines():
                        record = json.loads(json_line)
                        datastore.upsert(record['corpusid'], record)
                for url in diff['delete_files']:
                    for json_line in requests.get(url).iter_lines():
                        record = json.loads(json_line)
                        datastore.delete(record['corpusid'])

        Example code for updating via a join in Spark:

            current = sc.textFile('s3://curr-dataset-location').map(json.loads).keyBy(lambda x: x['corpusid'])
            updates = sc.textFile('s3://diff-updates-location').map(json.loads).keyBy(lambda x: x['corpusid'])
            deletes = sc.textFile('s3://diff-deletes-location').map(json.loads).keyBy(lambda x: x['corpusid'])

            updated = current.fullOuterJoin(updates).mapValues(lambda x: x[1] if x[1] is not None else x[0])
            updated = updated.fullOuterJoin(deletes).mapValues(lambda x: None if x[1] is not None else x[0]).filter(lambda x: x[1] is not None)
            updated.values().map(json.dumps).saveAsTextFile('s3://updated-dataset-location')
      parameters:
      - name: start_release_id
        in: path
        required: true
        description: ID of the release currently held by the client.
        schema:
          type: string
        example: '2023-08-01'
      - name: end_release_id
        in: path
        required: true
        description: ID of the release the client wants to update to, or 'latest'.
        schema:
          type: string
        example: '2023-08-29'
      - name: dataset_name
        in: path
        required: true
        description: Name of the dataset to fetch diffs for.
        schema:
          type: string
        example: papers
      responses:
        '200':
          description: List of download links for one dataset between given releases
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Dataset%20Diff%20List'
  # Release listing. The trailing slash is part of the published path.
  /release/:
    get:
      summary: List of Available Releases
      operationId: get_releases
      tags:
      - Release Data
      description: Releases are identified by a date stamp such as "2023-08-01". Each
        release contains full data for each dataset.
      responses:
        '200':
          description: List of Available Releases
          content:
            application/json:
              schema:
                type: array
                items:
                  type: string
                  description: Release ids in the form of date stamps
                  example: '2022-01-17'
  # Metadata for a single release.
  /release/{release_id}:
    get:
      summary: List of Datasets in a Release
      operationId: get_release
      tags:
      - Release Data
      description: Metadata describing a particular release, including a list of datasets
        available.
      parameters:
      - name: release_id
        in: path
        required: true
        description: ID of the release, a date stamp such as "2023-08-01", or 'latest'.
        schema:
          type: string
        example: '2023-08-01'
      responses:
        '200':
          description: Contents of the release with the given ID
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Release%20Metadata'
  # Download links for one dataset within one release.
  /release/{release_id}/dataset/{dataset_name}:
    get:
      summary: Download Links for a Dataset
      operationId: get_dataset
      tags:
      - Release Data
      description: >-
        Datasets are partitioned and stored on S3. Clients can retrieve
        them by requesting this list
        of pre-signed download urls and fetching all the partitions.
      parameters:
      - name: release_id
        in: path
        required: true
        description: ID of the release, a date stamp such as "2023-08-01", or 'latest'.
        schema:
          type: string
        example: '2023-08-01'
      - name: dataset_name
        in: path
        required: true
        description: Name of the dataset to download.
        schema:
          type: string
        example: abstracts
      responses:
        '200':
          description: Description and download links for the given dataset within
            the given release
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Dataset%20Metadata'
# Reusable response schemas referenced from the operations under `paths`.
# NOTE(review): the schema names contain spaces, which falls outside the
# component-key pattern of OpenAPI 3.0 (^[a-zA-Z0-9\.\-_]+$). They are kept
# as-is because renaming would change the published schema names; every $ref
# to them percent-encodes the space as %20.
components:
  schemas:
    # Response body of GET /release/{release_id}.
    Release Metadata:
      type: object
      properties:
        release_id:
          type: string
          example: '2022-01-17'
        README:
          type: string
          description: License and usage
          example: Subject to the following terms ...
        datasets:
          type: array
          description: Dataset metadata
          items:
            $ref: '#/components/schemas/Dataset%20Summary'
    # One entry of the `datasets` list in Release Metadata.
    Dataset Summary:
      type: object
      properties:
        name:
          type: string
          description: Dataset name
          example: papers
        description:
          type: string
          description: Description of the data in the dataset
          example: Core paper metadata
        README:
          type: string
          description: Documentation and attribution for the dataset
          example: This dataset contains ...
    # Response body of GET /release/{release_id}/dataset/{dataset_name}.
    Dataset Metadata:
      type: object
      properties:
        name:
          type: string
          description: Name of the dataset
          example: papers
        description:
          type: string
          description: Description of the data contained in this dataset.
          example: Core paper metadata
        README:
          type: string
          description: License and usage
          example: Subject to terms of use as follows ...
        files:
          type: array
          description: Temporary, pre-signed download links for dataset files
          items:
            type: string
            example: https://...
    # Response body of
    # GET /diffs/{start_release_id}/to/{end_release_id}/{dataset_name}.
    Dataset Diff List:
      type: object
      properties:
        dataset:
          type: string
          description: Dataset these diffs are for.
          example: papers
        start_release:
          type: string
          description: Beginning release, i.e. the release currently held by the client.
          example: '2023-08-01'
        end_release:
          type: string
          description: Ending release, i.e. the release the client wants to update
            to.
          example: '2023-08-29'
        diffs:
          type: array
          description: List of diffs that need to be applied to bring the dataset
            at 'start_release' up to date with 'end_release'.
          items:
            $ref: '#/components/schemas/Dataset%20Diff'
    # One entry of the `diffs` list in Dataset Diff List.
    Dataset Diff:
      type: object
      properties:
        from_release:
          type: string
          description: Baseline release for this diff.
          example: '2023-08-01'
        to_release:
          type: string
          description: Target release for this diff.
          example: '2023-08-07'
        update_files:
          type: array
          description: List of files that contain updates to the dataset. Each record
            in these files needs to be inserted or updated.
          items:
            type: string
            example: https://...
        delete_files:
          type: array
          description: List of files that contain deletes from the dataset. Each record
            in these files needs to be deleted.
          items:
            type: string
            example: https://...
  # No security schemes are declared in this document.
  securitySchemes: {}