---
# Arazzo workflow description: make a scanned PDF searchable with the Adobe
# PDF Services OCR operation.
#
# Step order: requestUpload -> submitOcr -> pollStatus (loops on itself while
# the job is "in progress") -> getOutput.
#
# The job identifier used for polling is a workflow input (jobID); it is not
# read from the submitOcr response.
arazzo: '1.0.1'
info:
  title: Adobe OCR a Scanned PDF
  summary: >-
    Upload a scanned PDF, run OCR to make it searchable, poll the job, and
    fetch the result.
  description: >-
    Drives the Adobe PDF Services asynchronous OCR pipeline that converts a
    scanned, image-based PDF into a searchable and selectable document. The
    workflow registers the source PDF by requesting a pre-signed upload URI
    and asset ID, submits an ocrPDF job with the chosen language and OCR type,
    polls the job until it reports done, and resolves a download URI for the
    searchable output. Each step spells out its request inline. The polling
    job identifier is supplied as a workflow input because the submit response
    exposes only an opaque Location header.
  version: '1.0.0'

# OpenAPI description that defines every operationId referenced by the steps.
sourceDescriptions:
  - name: pdfServicesApi
    url: ../openapi/adobe-pdf-services-api-openapi.yml
    type: openapi

workflows:
  - workflowId: ocr-pdf
    summary: Make a scanned PDF searchable with optical character recognition.
    description: >-
      Requests an upload slot for the source PDF, submits an ocrPDF job, polls
      job status until OCR finishes, and retrieves the download URI for the
      searchable PDF.

    # accessToken and jobID are mandatory; the two OCR options carry defaults.
    inputs:
      type: object
      required:
        - accessToken
        - jobID
      properties:
        accessToken:
          type: string
          description: OAuth 2.0 bearer access token from Adobe IMS.
        ocrLanguage:
          type: string
          description: Language for OCR processing (e.g. en-US).
          default: en-US
        ocrType:
          type: string
          description: OCR output type (SEARCHABLE_IMAGE or SEARCHABLE_IMAGE_EXACT).
          default: SEARCHABLE_IMAGE
        jobID:
          type: string
          description: >-
            The job identifier taken from the ocrPDF response Location header,
            used to poll status.

    steps:
      # Step 1: obtain an asset ID and a pre-signed upload URI.
      - stepId: requestUpload
        description: >-
          Request a pre-signed upload URI and asset ID for the source PDF,
          which is then PUT to the returned uploadUri out of band.
        operationId: uploadAsset
        parameters:
          # A runtime expression embedded in a larger string must be wrapped
          # in {} to be interpolated; without the braces the header would be
          # sent as the literal text "$inputs.accessToken".
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
        requestBody:
          contentType: application/json
          payload:
            mediaType: application/pdf
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          assetID: $response.body#/assetID
          uploadUri: $response.body#/uploadUri

      # Step 2: start the OCR job on the asset registered in step 1.
      - stepId: submitOcr
        description: >-
          Submit an asynchronous ocrPDF job that makes the uploaded PDF
          searchable. Returns 201 with an in-progress job status.
        operationId: ocrPDF
        parameters:
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
        requestBody:
          contentType: application/json
          payload:
            assetID: $steps.requestUpload.outputs.assetID
            ocrLanguage: $inputs.ocrLanguage
            ocrType: $inputs.ocrType
        successCriteria:
          - condition: $statusCode == 201
        outputs:
          status: $response.body#/status

      # Step 3: poll the job. The step jumps back to itself while the job is
      # still running and moves on to getOutput once it reports "done".
      - stepId: pollStatus
        description: >-
          Poll the ocrPDF job until it is no longer in progress, looping back
          while the status remains "in progress".
        operationId: getJobStatus
        parameters:
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
          - name: operationType
            in: path
            value: ocr
          - name: jobID
            in: path
            value: $inputs.jobID
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          status: $response.body#/status
          outputAssetID: $response.body#/asset/assetID
        # The criteria are simple conditions over a runtime expression: a bare
        # comparison such as `$.status == "done"` is not a JSONPath query, and
        # Arazzo string literals are written with single quotes.
        # NOTE(review): a status that is neither "in progress" nor "done"
        # matches no action here and presumably falls through to the next
        # sequential step (getOutput) - confirm that is intended.
        onSuccess:
          - name: stillRunning
            type: goto
            stepId: pollStatus
            criteria:
              - condition: "$response.body#/status == 'in progress'"
          - name: finished
            type: goto
            stepId: getOutput
            criteria:
              - condition: "$response.body#/status == 'done'"

      # Step 4: turn the output asset ID reported by pollStatus into a
      # download URI.
      - stepId: getOutput
        description: >-
          Resolve a pre-signed download URI for the searchable output PDF.
        operationId: getAsset
        parameters:
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
          - name: assetID
            in: path
            value: $steps.pollStatus.outputs.outputAssetID
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          downloadUri: $response.body#/downloadUri

    # Values handed back to the caller once the workflow completes.
    outputs:
      sourceAssetID: $steps.requestUpload.outputs.assetID
      outputAssetID: $steps.pollStatus.outputs.outputAssetID
      downloadUri: $steps.getOutput.outputs.downloadUri