# Arazzo specification version this document is written against.
arazzo: '1.0.1'
# Human-readable metadata describing the workflow document as a whole.
info:
  title: 'Adobe OCR a Scanned PDF'
  # Version of this workflow document (not of the Arazzo specification).
  version: '1.0.0'
  summary: >-
    Upload a scanned PDF, run OCR to make it searchable, poll the job, and
    fetch the result.
  description: >-
    Drives the Adobe PDF Services asynchronous OCR pipeline that converts
    a scanned, image-based PDF into a searchable and selectable document.
    The workflow registers the source PDF by requesting a pre-signed upload
    URI and asset ID, submits an ocrPDF job with the chosen language and
    OCR type, polls the job until it reports done, and resolves a download
    URI for the searchable output. Each step spells out its request
    inline. The polling job identifier is supplied as a workflow input
    because the submit response exposes only an opaque Location header.
# API descriptions the workflow steps resolve their operationIds against.
sourceDescriptions:
- type: openapi
  name: pdfServicesApi
  # Relative path to the OpenAPI document, resolved from this file's location.
  url: '../openapi/adobe-pdf-services-api-openapi.yml'
workflows:
# The only workflow in this document: upload -> submit OCR -> poll -> fetch.
- workflowId: ocr-pdf
  summary: 'Make a scanned PDF searchable with optical character recognition.'
  description: >-
    Requests an upload slot for the source PDF, submits an ocrPDF job,
    polls job status until OCR finishes, and retrieves the download URI
    for the searchable PDF.
  # JSON Schema for the values a caller supplies when starting the workflow.
  inputs:
    type: object
    required: [accessToken, jobID]
    properties:
      # Referenced by the Authorization header parameter of every step.
      accessToken:
        description: 'OAuth 2.0 bearer access token from Adobe IMS.'
        type: string
      # Consumed only by the pollStatus step as its jobID path parameter.
      jobID:
        description: 'The job identifier taken from the ocrPDF response Location header, used to poll status.'
        type: string
      # Optional OCR settings; both are forwarded in the submitOcr request body.
      ocrLanguage:
        description: 'Language for OCR processing (e.g. en-US).'
        type: string
        default: 'en-US'
      ocrType:
        description: 'OCR output type (SEARCHABLE_IMAGE or SEARCHABLE_IMAGE_EXACT).'
        type: string
        default: 'SEARCHABLE_IMAGE'
  steps:
  # Step 1: reserve an upload slot. Calls uploadAsset and exposes the returned
  # assetID and uploadUri as step outputs for later steps and the caller.
  - stepId: requestUpload
    description: >-
      Request a pre-signed upload URI and asset ID for the source PDF, which is
      then PUT to the returned uploadUri out of band.
    operationId: uploadAsset
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded inside a larger string must be wrapped
      # in {} to be evaluated; without the braces the literal text
      # "$inputs.accessToken" would be sent as the token.
      value: "Bearer {$inputs.accessToken}"
    requestBody:
      contentType: application/json
      payload:
        # JSON body field naming the media type of the file to be uploaded.
        mediaType: application/pdf
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      # JSON Pointers into the response body.
      assetID: $response.body#/assetID
      uploadUri: $response.body#/uploadUri
  # Step 2: start the OCR job for the asset registered by requestUpload,
  # passing through the caller's language and OCR type inputs.
  - stepId: submitOcr
    description: >-
      Submit an asynchronous ocrPDF job that makes the uploaded PDF searchable.
      Returns 201 with an in-progress job status.
    operationId: ocrPDF
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded inside a larger string must be wrapped
      # in {} to be evaluated; without the braces the literal text
      # "$inputs.accessToken" would be sent as the token.
      value: "Bearer {$inputs.accessToken}"
    requestBody:
      contentType: application/json
      payload:
        assetID: $steps.requestUpload.outputs.assetID
        ocrLanguage: $inputs.ocrLanguage
        ocrType: $inputs.ocrType
    successCriteria:
    - condition: $statusCode == 201
    outputs:
      status: $response.body#/status
      # The workflow description states that the job identifier is only
      # available through the Location response header, so expose that header
      # here; callers can derive the jobID needed for polling from it.
      statusLocation: $response.header.Location
  # Step 3: poll the job status. Loops back to itself while the job reports
  # "in progress", moves on to getOutput once it reports "done", and fails the
  # step (and therefore the workflow) if the job reports "failed".
  - stepId: pollStatus
    description: >-
      Poll the ocrPDF job until it is no longer in progress, looping back while
      the status remains "in progress".
    operationId: getJobStatus
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded inside a larger string must be wrapped
      # in {} to be evaluated; without the braces the literal text
      # "$inputs.accessToken" would be sent as the token.
      value: "Bearer {$inputs.accessToken}"
    - name: operationType
      in: path
      value: ocr
    - name: jobID
      in: path
      value: $inputs.jobID
    successCriteria:
    - condition: $statusCode == 200
    # Treat a failed job as a step failure. Otherwise neither success action
    # below would match and execution would continue to getOutput without an
    # output asset ID.
    - context: $response.body
      condition: $.status != "failed"
      type: jsonpath
    outputs:
      status: $response.body#/status
      outputAssetID: $response.body#/asset/assetID
    # NOTE(review): the jsonpath conditions in this step use a bare comparison
    # rather than an RFC 9535 filter selector - confirm the target runner
    # accepts this form.
    # NOTE(review): no delay or attempt limit is configured for the re-poll
    # below; pacing is left to the runner.
    onSuccess:
    - name: stillRunning
      type: goto
      stepId: pollStatus
      criteria:
      - context: $response.body
        condition: $.status == "in progress"
        type: jsonpath
    - name: finished
      type: goto
      stepId: getOutput
      criteria:
      - context: $response.body
        condition: $.status == "done"
        type: jsonpath
  # Step 4: look up the output asset reported by pollStatus and expose its
  # download URI.
  - stepId: getOutput
    description: >-
      Resolve a pre-signed download URI for the searchable output PDF.
    operationId: getAsset
    parameters:
    - name: Authorization
      in: header
      # A runtime expression embedded inside a larger string must be wrapped
      # in {} to be evaluated; without the braces the literal text
      # "$inputs.accessToken" would be sent as the token.
      value: "Bearer {$inputs.accessToken}"
    - name: assetID
      in: path
      value: $steps.pollStatus.outputs.outputAssetID
    successCriteria:
    - condition: $statusCode == 200
    outputs:
      downloadUri: $response.body#/downloadUri
  # Workflow-level results, each mapped from the output of an earlier step.
  outputs:
    downloadUri: '$steps.getOutput.outputs.downloadUri'
    outputAssetID: '$steps.pollStatus.outputs.outputAssetID'
    sourceAssetID: '$steps.requestUpload.outputs.assetID'