---
# Arazzo workflow document for the Adobe PDF Services extract flow.
# Version scalars are quoted so they can never be re-typed by a YAML parser.
arazzo: "1.0.1"
info:
  title: Adobe Extract Content From a PDF
  version: "1.0.0"
  summary: >-
    Upload a PDF, extract text and tables into structured JSON, poll the job,
    and fetch the result.
  description: >-
    Drives the Adobe PDF Services asynchronous extract pipeline that pulls text,
    tables, and figures out of a PDF into structured JSON using Adobe Sensei.
    The workflow registers the source PDF by requesting a pre-signed upload URI
    and asset ID, submits an extractPDF job describing which elements and
    renditions to extract, polls the job until it reports done, and resolves a
    download URI for the resulting ZIP. Each step spells out its request inline.
    The polling job identifier is supplied as a workflow input because the
    submit response exposes only an opaque Location header.
# Single OpenAPI source; the operationId values used by the workflow steps
# are expected to resolve against this description.
sourceDescriptions:
- name: pdfServicesApi
  type: openapi
  url: ../openapi/adobe-pdf-services-api-openapi.yml
workflows:
# Workflow definition: identity, human-readable summary, and the JSON Schema
# describing the inputs a caller must or may supply.
- workflowId: extract-pdf
  summary: Extract structured text and tables from an uploaded PDF.
  description: >-
    Requests an upload slot for the source PDF, submits an extractPDF job for
    the requested elements and table format, polls job status until extraction
    finishes, and retrieves the download URI for the structured output ZIP.
  inputs:
    type: object
    # accessToken authenticates every step; jobID drives the polling step.
    required:
    - accessToken
    - jobID
    properties:
      accessToken:
        type: string
        description: OAuth 2.0 bearer access token from Adobe IMS.
      elementsToExtract:
        type: array
        description: Element types to extract (text and/or tables).
        items:
          type: string
          # Constrain items to the element types named in the description so
          # invalid values are rejected before any request is sent.
          enum:
          - text
          - tables
        default:
        - text
        - tables
      tableOutputFormat:
        type: string
        description: Output format for extracted tables (csv or xlsx).
        # Only the two formats named in the description are accepted.
        enum:
        - csv
        - xlsx
        default: csv
      jobID:
        type: string
        description: The job identifier taken from the extractPDF response Location header, used to poll status.
  steps:
  # Step: obtain an asset ID and a pre-signed upload URI for the source PDF.
  # The actual file upload (PUT to uploadUri) is not part of this workflow.
  - stepId: requestUpload
    description: >-
      Request a pre-signed upload URI and asset ID for the source PDF, which is
      then PUT to the returned uploadUri out of band.
    operationId: uploadAsset
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded in a larger string must be wrapped in
      # curly braces; without them the text is sent literally.
      value: "Bearer {$inputs.accessToken}"
    requestBody:
      contentType: application/json
      payload:
        # Media type of the file that will be uploaded to the returned URI.
        mediaType: application/pdf
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      # assetID feeds the submitExtract payload and the workflow outputs.
      assetID: $response.body#/assetID
      uploadUri: $response.body#/uploadUri
  # Step: submit the asynchronous extractPDF job for the uploaded asset.
  - stepId: submitExtract
    description: >-
      Submit an asynchronous extractPDF job that extracts structured content
      from the uploaded PDF. Returns 201 with an in-progress job status.
    operationId: extractPDF
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded in a larger string must be wrapped in
      # curly braces; without them the text is sent literally.
      value: "Bearer {$inputs.accessToken}"
    requestBody:
      contentType: application/json
      payload:
        # Asset registered by the requestUpload step.
        assetID: $steps.requestUpload.outputs.assetID
        elementsToExtract: $inputs.elementsToExtract
        elementsToExtractRenditions:
        - tables
        tableOutputFormat: $inputs.tableOutputFormat
        getStylingInfo: true
    successCriteria:
    - condition: $statusCode == 201
    outputs:
      status: $response.body#/status
      # Raw Location header of the submit response. Per the jobID input
      # description, the job identifier is taken from this header; exposing it
      # lets a caller derive jobID instead of capturing it by other means.
      jobLocation: $response.header.Location
  # Step: poll the job status, looping on itself while the job is running and
  # moving on to getOutput once the job reports done.
  # NOTE(review): success actions carry no delay setting, so the pacing of this
  # loop is left to the workflow runner - confirm the runner throttles it.
  - stepId: pollStatus
    description: >-
      Poll the extractPDF job until it is no longer in progress, looping back
      while the status remains "in progress".
    operationId: getJobStatus
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded in a larger string must be wrapped in
      # curly braces; without them the text is sent literally.
      value: "Bearer {$inputs.accessToken}"
    - name: operationType
      in: path
      value: extractpdf
    - name: jobID
      in: path
      value: $inputs.jobID
    successCriteria:
    - condition: $statusCode == 200
    # Any status other than the two handled below fails the step, so the
    # workflow stops instead of falling through to getOutput without a usable
    # output asset ID.
    - condition: "$response.body#/status == 'in progress' || $response.body#/status == 'done'"
    outputs:
      status: $response.body#/status
      outputAssetID: $response.body#/asset/assetID
    onSuccess:
    # Simple conditions (the default criterion type) with single-quoted string
    # literals; the first action whose criteria match is the one executed.
    - name: stillRunning
      type: goto
      stepId: pollStatus
      criteria:
      - condition: "$response.body#/status == 'in progress'"
    - name: finished
      type: goto
      stepId: getOutput
      criteria:
      - condition: "$response.body#/status == 'done'"
  # Step: exchange the output asset ID reported by pollStatus for a
  # pre-signed download URI.
  - stepId: getOutput
    description: >-
      Resolve a pre-signed download URI for the extracted output ZIP.
    operationId: getAsset
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded in a larger string must be wrapped in
      # curly braces; without them the text is sent literally.
      value: "Bearer {$inputs.accessToken}"
    - name: assetID
      in: path
      # Asset produced by the finished extract job.
      value: $steps.pollStatus.outputs.outputAssetID
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      downloadUri: $response.body#/downloadUri
  # Values the workflow hands back to its caller, each taken from a step output.
  outputs:
    # Download URI resolved by the getOutput step.
    downloadUri: $steps.getOutput.outputs.downloadUri
    # Asset ID of the extraction result, as reported by the pollStatus step.
    outputAssetID: $steps.pollStatus.outputs.outputAssetID
    # Asset ID assigned to the source PDF by the requestUpload step.
    sourceAssetID: $steps.requestUpload.outputs.assetID