---
# Arazzo workflow: extract structured content from a PDF with the Adobe PDF
# Services API. Steps run in order: requestUpload -> submitExtract ->
# pollStatus (loops on itself) -> getOutput.
arazzo: '1.0.1'

info:
  title: Adobe Extract Content From a PDF
  summary: >-
    Upload a PDF, extract text and tables into structured JSON, poll the job,
    and fetch the result.
  description: >-
    Drives the Adobe PDF Services asynchronous extract pipeline that pulls
    text, tables, and figures out of a PDF into structured JSON using Adobe
    Sensei. The workflow registers the source PDF by requesting a pre-signed
    upload URI and asset ID, submits an extractPDF job describing which
    elements and renditions to extract, polls the job until it reports done,
    and resolves a download URI for the resulting ZIP. Each step spells out
    its request inline. The polling job identifier is supplied as a workflow
    input because the submit response exposes only an opaque Location header.
  version: '1.0.0'

sourceDescriptions:
  - name: pdfServicesApi
    url: ../openapi/adobe-pdf-services-api-openapi.yml
    type: openapi

workflows:
  - workflowId: extract-pdf
    summary: Extract structured text and tables from an uploaded PDF.
    description: >-
      Requests an upload slot for the source PDF, submits an extractPDF job
      for the requested elements and table format, polls job status until
      extraction finishes, and retrieves the download URI for the structured
      output ZIP.

    # Workflow inputs (JSON Schema). accessToken and jobID are mandatory;
    # the extraction options fall back to their declared defaults.
    # NOTE(review): jobID is required up front even though the job is only
    # created by submitExtract — confirm how callers are expected to supply it.
    inputs:
      type: object
      required:
        - accessToken
        - jobID
      properties:
        accessToken:
          type: string
          description: OAuth 2.0 bearer access token from Adobe IMS.
        elementsToExtract:
          type: array
          description: Element types to extract (text and/or tables).
          items:
            type: string
          default:
            - text
            - tables
        tableOutputFormat:
          type: string
          description: Output format for extracted tables (csv or xlsx).
          default: csv
        jobID:
          type: string
          description: >-
            The job identifier taken from the extractPDF response Location
            header, used to poll status.

    steps:
      # Step 1: reserve an upload slot; exposes assetID and uploadUri.
      - stepId: requestUpload
        description: >-
          Request a pre-signed upload URI and asset ID for the source PDF,
          which is then PUT to the returned uploadUri out of band.
        operationId: uploadAsset
        parameters:
          # Braces are required to embed a runtime expression in a string;
          # without them the header would carry the literal text.
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
        requestBody:
          contentType: application/json
          payload:
            mediaType: application/pdf
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          assetID: $response.body#/assetID
          uploadUri: $response.body#/uploadUri

      # Step 2: start the asynchronous extraction job for the uploaded asset.
      - stepId: submitExtract
        description: >-
          Submit an asynchronous extractPDF job that extracts structured
          content from the uploaded PDF. Returns 201 with an in-progress job
          status.
        operationId: extractPDF
        parameters:
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
        requestBody:
          contentType: application/json
          payload:
            assetID: $steps.requestUpload.outputs.assetID
            elementsToExtract: $inputs.elementsToExtract
            elementsToExtractRenditions:
              - tables
            tableOutputFormat: $inputs.tableOutputFormat
            getStylingInfo: true
        successCriteria:
          - condition: $statusCode == 201
        outputs:
          status: $response.body#/status

      # Step 3: poll the job identified by the jobID workflow input.
      - stepId: pollStatus
        description: >-
          Poll the extractPDF job until it is no longer in progress, looping
          back while the status remains "in progress".
        operationId: getJobStatus
        parameters:
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
          - name: operationType
            in: path
            value: extractpdf
          - name: jobID
            in: path
            value: $inputs.jobID
        successCriteria:
          - condition: $statusCode == 200
          # A job that reports 'failed' is a step failure, so the workflow
          # stops instead of falling through to getOutput with no output
          # asset. NOTE(review): 'failed' is the status value documented by
          # Adobe — confirm against the referenced OpenAPI description.
          - condition: $response.body#/status != 'failed'
        outputs:
          status: $response.body#/status
          outputAssetID: $response.body#/asset/assetID
        # Simple-type criteria on the status field replace the previous
        # jsonpath criteria, whose condition was not a valid JSONPath query.
        # NOTE(review): the goto loop re-polls with no delay or attempt cap;
        # the runner has to throttle it.
        onSuccess:
          - name: stillRunning
            type: goto
            stepId: pollStatus
            criteria:
              - condition: $response.body#/status == 'in progress'
          - name: finished
            type: goto
            stepId: getOutput
            criteria:
              - condition: $response.body#/status == 'done'

      # Step 4: look up the output asset reported by the finished job.
      - stepId: getOutput
        description: >-
          Resolve a pre-signed download URI for the extracted output ZIP.
        operationId: getAsset
        parameters:
          - name: Authorization
            in: header
            value: 'Bearer {$inputs.accessToken}'
          - name: assetID
            in: path
            value: $steps.pollStatus.outputs.outputAssetID
        successCriteria:
          - condition: $statusCode == 200
        outputs:
          downloadUri: $response.body#/downloadUri

    # Values handed back to the caller once the workflow completes.
    outputs:
      sourceAssetID: $steps.requestUpload.outputs.assetID
      outputAssetID: $steps.pollStatus.outputs.outputAssetID
      downloadUri: $steps.getOutput.outputs.downloadUri