---
# OpenAPI 3.1 description of the Reducto Extract API.
#
# Two POST operations are defined:
#   /extract        - accepts SyncExtractConfig or AsyncExtractConfig
#   /extract_async  - accepts AsyncExtractConfig only
# Both are protected by the SkippableHTTPBearer scheme declared at the bottom.
openapi: '3.1.0'
info:
  title: Reducto Extract API
  version: '1.0.0'
  description: >-
    Extract structured data from documents using a JSON Schema. Supports deep extract, array extraction,
    citations, and per-field optimization.
  contact:
    name: Reducto Support
    email: support@reducto.ai
    url: https://reducto.ai/contact
  license:
    name: Reducto Terms of Service
    url: https://reducto.ai/terms

servers:
  - url: https://platform.reducto.ai
    description: Reducto production platform

# Default security requirement; each operation below repeats it explicitly.
security:
  - SkippableHTTPBearer: []

tags:
  - name: Extract

paths:
  /extract:
    post:
      summary: Extract
      operationId: extract_extract_post
      security:
        - SkippableHTTPBearer: []
      parameters:
        - name: user-id
          in: header
          required: false
          schema:
            anyOf:
              - type: string
              - type: 'null'
            title: User-Id
      requestBody:
        required: true
        content:
          application/json:
            schema:
              # anyOf, not oneOf: both config schemas are open objects that only
              # require `input`, and AsyncExtractConfig merely adds an optional
              # `async` property, so a valid body satisfies both branches.
              # oneOf (exactly one match) would reject such a body.
              anyOf:
                - $ref: '#/components/schemas/SyncExtractConfig'
                - $ref: '#/components/schemas/AsyncExtractConfig'
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                # anyOf, not oneOf: V3ExtractResponse is an unconstrained object
                # (additionalProperties: true), so an AsyncExtractResponse
                # payload matches both branches.
                anyOf:
                  - $ref: '#/components/schemas/V3ExtractResponse'
                  - $ref: '#/components/schemas/AsyncExtractResponse'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
      tags:
        - Extract

  /extract_async:
    post:
      summary: Extract Async
      operationId: extract_async_extract_async_post
      security:
        - SkippableHTTPBearer: []
      parameters:
        - name: user-id
          in: header
          required: false
          schema:
            anyOf:
              - type: string
              - type: 'null'
            title: User-Id
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AsyncExtractConfig'
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/AsyncExtractResponse'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
      tags:
        - Extract

components:
  schemas:
    # Request body for a synchronous extract call; only `input` is required.
    SyncExtractConfig:
      properties:
        input:
          anyOf:
            - type: string
            - items:
                type: string
              type: array
            - $ref: '#/components/schemas/UploadResponse'
          title: Input
          description: "For parse/split/extract pipelines, the URL of the document to be processed. You can provide one of\
            \ the following:\n 1. A publicly available URL\n 2. A presigned S3 URL\n 3. A\
            \ reducto:// prefixed URL obtained from the /upload endpoint after directly uploading a document\n \
            \ 4. A jobid:// prefixed URL obtained from a previous /parse invocation\n 5. A list of URLs (for multi-document\
            \ pipelines, V3 API only)\n\n For edit pipelines, this should be a string containing the edit instructions "
        parsing:
          $ref: '#/components/schemas/ParseOptions'
          description: >-
            The configuration options for parsing the document. If you are passing in a jobid:// URL for the
            file, then this configuration will be ignored.
          default:
            enhance:
              agentic: []
              intelligent_ordering: false
              summarize_figures: true
            retrieval:
              chunking:
                chunk_mode: disabled
                chunk_overlap: 0
              embedding_optimized: false
              filter_blocks: []
            formatting:
              add_page_markers: false
              include: []
              merge_tables: false
              table_output_format: dynamic
            spreadsheet:
              clustering: accurate
              exclude: []
              include: []
              split_large_tables:
                enabled: true
                size: 50
            settings:
              embed_pdf_metadata: false
              embed_pdf_metadata_dpi: 100
              extraction_mode: hybrid
              force_url_result: false
              ocr_system: standard
              persist_results: false
              return_images: []
              return_ocr_data: false
        instructions:
          $ref: '#/components/schemas/Instructions'
          description: The instructions to use for the extraction.
          default:
            schema: {}
            system_prompt: Be precise and thorough.
        settings:
          $ref: '#/components/schemas/ExtractSettings'
          description: The settings to use for the extraction.
          default:
            include_images: false
            optimize_for_latency: false
            array_extract: false
            deep_extract: false
            citations:
              enabled: false
              numerical_confidence: true
      type: object
      required:
        - input
      title: SyncExtractConfig

    # Parse-stage settings; referenced from ParseOptions.settings.
    Settings:
      properties:
        ocr_system:
          type: string
          enum:
            - standard
            - legacy
          title: Ocr System
          description: >-
            Standard is our best multilingual OCR system. Legacy only supports germanic languages and is
            available for backwards compatibility.
          default: standard
        extraction_mode:
          type: string
          enum:
            - ocr
            - hybrid
          title: Extraction Mode
          description: >-
            The mode to use for text extraction from PDFs. OCR mode uses optical character recognition only.
            Hybrid mode combines OCR with embedded PDF text for best accuracy (default).
          default: hybrid
        force_url_result:
          type: boolean
          title: Force Url Result
          description: Force the result to be returned in URL form.
          default: false
        force_file_extension:
          anyOf:
            - type: string
            - type: 'null'
          title: Force File Extension
          description: Force the URL to be downloaded as a specific file extension (e.g. `.png`).
        return_ocr_data:
          type: boolean
          title: Return Ocr Data
          description: If True, return OCR data in the result. Defaults to False.
          default: false
        return_images:
          items:
            type: string
            enum:
              - figure
              - table
              - page
          type: array
          title: Return Images
          description: >-
            Whether to return images for the specified block types. 'page' returns full page images. By
            default, no images are returned.
          default: []
        embed_pdf_metadata:
          type: boolean
          title: Embed Pdf Metadata
          description: If True, embed OCR metadata into the returned PDF. Defaults to False.
          default: false
        embed_pdf_metadata_dpi:
          type: integer
          maximum: 250
          minimum: 50
          title: Embed Pdf Metadata Dpi
          description: >-
            Render DPI used when rasterizing the source PDF before embedding the OCR text layer (only applies
            when ``embed_pdf_metadata`` is True). Lower values produce dramatically smaller output PDFs;
            higher values preserve more detail when zoomed past 200%. Defaults to 100 (good for on-screen
            viewing); raise toward the source scan DPI for crisper output. Min 50, max 250.
          default: 100
        persist_results:
          type: boolean
          title: Persist Results
          description: If True, persist the results indefinitely. Defaults to False.
          default: false
        timeout:
          anyOf:
            - type: number
            - type: 'null'
          title: Timeout
          description: The timeout for the job in seconds.
        page_range:
          anyOf:
            - $ref: '#/components/schemas/PageRange'
            - items:
                $ref: '#/components/schemas/PageRange'
              type: array
            - items:
                type: integer
              type: array
            - items:
                type: string
              type: array
            - type: 'null'
          title: Page Range
          description: >-
            The page range to process (1-indexed). By default, the entire document is processed. For
            spreadsheets, you can also provide a list of sheet names.
        document_password:
          anyOf:
            - type: string
            - type: 'null'
          title: Document Password
          description: Password to decrypt password-protected documents.
      type: object
      title: Settings

    # One entry of HTTPValidationError.detail.
    ValidationError:
      properties:
        loc:
          items:
            anyOf:
              - type: string
              - type: integer
          type: array
          title: Location
        msg:
          type: string
          title: Message
        type:
          type: string
          title: Error Type
        input:
          title: Input
        ctx:
          type: object
          title: Context
      type: object
      required:
        - loc
        - msg
        - type
      title: ValidationError

    Enhance:
      properties:
        agentic:
          items:
            anyOf:
              - $ref: '#/components/schemas/TableAgentic'
              - $ref: '#/components/schemas/FigureAgentic'
              - $ref: '#/components/schemas/TextAgentic'
          type: array
          title: Agentic
          description: >-
            Agentic uses vision language models to enhance the accuracy of the output of different types of
            extraction. This will incur a cost and latency increase.
          default: []
        summarize_figures:
          type: boolean
          title: Summarize Figures
          description: If True, summarize figures using a small vision language model. Defaults to True.
          default: true
        intelligent_ordering:
          type: boolean
          title: Intelligent Ordering
          description: >-
            If True, use an advanced vision language model to improve reading order accuracy, with a small
            increase in latency. Defaults to False.
          default: false
      type: object
      title: Enhance

    Citations:
      properties:
        enabled:
          type: boolean
          title: Enabled
          description: If True, include citations in the extraction.
          default: false
        numerical_confidence:
          type: boolean
          title: Numerical Confidence
          description: If True, enable numeric citation confidence scores. Defaults to True.
          default: true
      type: object
      title: Citations

    # Referenced from AsyncExtractConfig.async.
    config__v3__AsyncConfig:
      properties:
        metadata:
          title: Metadata
          description: JSON metadata included in webhook request body. Defaults to None.
        priority:
          type: boolean
          title: Priority
          description: >-
            If True, attempts to process the job with priority if the user has priority processing budget
            available; by default, sync jobs are prioritized above async jobs.
          default: false
        webhook:
          anyOf:
            - $ref: '#/components/schemas/SvixWebhookConfig'
            - $ref: '#/components/schemas/DirectWebhookConfig'
            - type: 'null'
          title: Webhook
          description: The webhook configuration for the asynchronous processing.
      type: object
      title: AsyncConfig

    # Referenced from the `parsing` property of both extract config schemas.
    ParseOptions:
      properties:
        enhance:
          $ref: '#/components/schemas/Enhance'
          default:
            agentic: []
            summarize_figures: true
            intelligent_ordering: false
        retrieval:
          $ref: '#/components/schemas/Retrieval'
          default:
            chunking:
              chunk_mode: disabled
              chunk_overlap: 0
            filter_blocks: []
            embedding_optimized: false
        formatting:
          $ref: '#/components/schemas/Formatting'
          default:
            add_page_markers: false
            table_output_format: dynamic
            merge_tables: false
            include: []
        spreadsheet:
          $ref: '#/components/schemas/Spreadsheet'
          default:
            split_large_tables:
              enabled: true
              size: 50
            include: []
            clustering: accurate
            exclude: []
        settings:
          $ref: '#/components/schemas/Settings'
          default:
            ocr_system: standard
            extraction_mode: hybrid
            force_url_result: false
            return_ocr_data: false
            return_images: []
            embed_pdf_metadata: false
            embed_pdf_metadata_dpi: 100
            persist_results: false
      type: object
      title: ParseOptions

    Spreadsheet:
      properties:
        split_large_tables:
          $ref: '#/components/schemas/SplitLargeTables'
          default:
            enabled: true
            size: 50
        include:
          items:
            type: string
            enum:
              - cell_colors
              - formula
              - dropdowns
          type: array
          title: Include
          description: Whether to include cell color, formula, and dropdown information in the output.
          default: []
        clustering:
          type: string
          enum:
            - accurate
            - fast
            - disabled
          title: Clustering
          description: "In a spreadsheet with different tables inside, we enable splitting up the tables by default. Accurate\
            \ mode applies more powerful models for superior accuracy, at 5\xD7 the default per-cell rate. Disabling will\
            \ register as one large table."
          default: accurate
        exclude:
          items:
            type: string
            enum:
              - hidden_sheets
              - hidden_rows
              - hidden_cols
              - styling
              - spreadsheet_images
          type: array
          title: Exclude
          description: Whether to exclude hidden sheets, rows, or columns in the output.
          default: []
      type: object
      title: Spreadsheet

    # Enhance.agentic entry with scope fixed to "figure".
    FigureAgentic:
      properties:
        scope:
          type: string
          const: figure
          title: Scope
        prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Prompt
          description: Custom prompt for figure agentic.
        advanced_chart_agent:
          type: boolean
          title: Advanced Chart Agent
          description: If True, use the advanced chart agent. Defaults to False.
          default: false
        return_overlays:
          type: boolean
          title: Return Overlays
          description: >-
            If True, return overlays for the figure. This is so you can use the overlays to double check the
            quality of the extraction
          default: false
      type: object
      required:
        - scope
      title: FigureAgentic

    SvixWebhookConfig:
      properties:
        mode:
          type: string
          const: svix
          title: Mode
          default: svix
        channels:
          items:
            type: string
          type: array
          title: Channels
          description: >-
            A list of Svix channels the message will be delivered down, omit to send to all channels.
      type: object
      title: SvixWebhookConfig

    Chunking:
      properties:
        chunk_mode:
          type: string
          enum:
            - variable
            - section
            - page
            - disabled
            - block
            - page_sections
          title: Chunk Mode
          description: >-
            Choose how to partition chunks. Variable mode chunks by character length and visual context.
            Section mode chunks by section headers. Page mode chunks according to pages. Page sections mode
            chunks first by page, then by sections within each page. Disabled returns one single chunk.
          default: disabled
        chunk_size:
          anyOf:
            - type: integer
            - type: 'null'
          title: Chunk Size
          description: >-
            The approximate size of chunks (in characters) that the document will be split into. Defaults to
            null, in which case the chunk size is variable between 250 - 1500 characters.
        chunk_overlap:
          type: integer
          title: Chunk Overlap
          description: Number of characters of overlap to include from adjacent chunks. Defaults to 0.
          default: 0
      type: object
      title: Chunking

    # Enhance.agentic entry with scope fixed to "text".
    TextAgentic:
      properties:
        scope:
          type: string
          const: text
          title: Scope
        prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Prompt
          description: 'Custom instructions for agentic text. Note: This only applies to form regions (key-value).'
      type: object
      required:
        - scope
      title: TextAgentic

    DirectWebhookConfig:
      properties:
        mode:
          type: string
          const: direct
          title: Mode
          default: direct
        url:
          type: string
          title: Url
      type: object
      required:
        - url
      title: DirectWebhookConfig

    # Open object: no properties are declared and any additional ones are allowed.
    V3ExtractResponse:
      additionalProperties: true
      type: object

    # Enhance.agentic entry with scope fixed to "table".
    TableAgentic:
      properties:
        scope:
          type: string
          const: table
          title: Scope
        prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Prompt
          description: Custom prompt for table agentic.
        mode:
          type: string
          enum:
            - default
            - auto
          title: Mode
          description: >-
            Routing mode for table agentic: 'default' runs enrichment on all tables, 'auto' uses the router to
            skip tables where enrichment is unlikely to help.
          default: default
      type: object
      required:
        - scope
      title: TableAgentic

    SplitLargeTables:
      properties:
        enabled:
          type: boolean
          title: Enabled
          description: If True, split large tables into smaller tables. Defaults to True.
          default: true
        size:
          anyOf:
            - type: integer
            - $ref: '#/components/schemas/SplitLargeTableSizes'
          title: Size
          description: >-
            The size of the tables to split into. Defaults to 50. Use 'row' and 'column' to independently
            specify the number of rows and columns to include when splitting. If you only want to split by
            rows or columns, set the other value to None.
          default: 50
      type: object
      title: SplitLargeTables

    Instructions:
      properties:
        schema:
          title: Schema
          description: The JSON schema to use for the extraction.
          default: {}
        system_prompt:
          type: string
          title: System Prompt
          description: The system prompt to use for the extraction.
          default: Be precise and thorough.
      type: object
      title: Instructions

    # Same properties as SyncExtractConfig plus the optional `async` block.
    AsyncExtractConfig:
      properties:
        async:
          $ref: '#/components/schemas/config__v3__AsyncConfig'
          description: The configuration options for asynchronous processing (default synchronous).
          default:
            priority: false
        input:
          anyOf:
            - type: string
            - items:
                type: string
              type: array
            - $ref: '#/components/schemas/UploadResponse'
          title: Input
          description: "For parse/split/extract pipelines, the URL of the document to be processed. You can provide one of\
            \ the following:\n 1. A publicly available URL\n 2. A presigned S3 URL\n 3. A\
            \ reducto:// prefixed URL obtained from the /upload endpoint after directly uploading a document\n \
            \ 4. A jobid:// prefixed URL obtained from a previous /parse invocation\n 5. A list of URLs (for multi-document\
            \ pipelines, V3 API only)\n\n For edit pipelines, this should be a string containing the edit instructions "
        parsing:
          $ref: '#/components/schemas/ParseOptions'
          description: >-
            The configuration options for parsing the document. If you are passing in a jobid:// URL for the
            file, then this configuration will be ignored.
          default:
            enhance:
              agentic: []
              intelligent_ordering: false
              summarize_figures: true
            retrieval:
              chunking:
                chunk_mode: disabled
                chunk_overlap: 0
              embedding_optimized: false
              filter_blocks: []
            formatting:
              add_page_markers: false
              include: []
              merge_tables: false
              table_output_format: dynamic
            spreadsheet:
              clustering: accurate
              exclude: []
              include: []
              split_large_tables:
                enabled: true
                size: 50
            settings:
              embed_pdf_metadata: false
              embed_pdf_metadata_dpi: 100
              extraction_mode: hybrid
              force_url_result: false
              ocr_system: standard
              persist_results: false
              return_images: []
              return_ocr_data: false
        instructions:
          $ref: '#/components/schemas/Instructions'
          description: The instructions to use for the extraction.
          default:
            schema: {}
            system_prompt: Be precise and thorough.
        settings:
          $ref: '#/components/schemas/ExtractSettings'
          description: The settings to use for the extraction.
          default:
            include_images: false
            optimize_for_latency: false
            array_extract: false
            deep_extract: false
            citations:
              enabled: false
              numerical_confidence: true
      type: object
      required:
        - input
      title: AsyncExtractConfig

    PageRange:
      properties:
        start:
          anyOf:
            - type: integer
            - type: 'null'
          title: Start
          description: The page number to start processing from (1-indexed).
        end:
          anyOf:
            - type: integer
            - type: 'null'
          title: End
          description: The page number to stop processing at (1-indexed).
      type: object
      title: PageRange

    # Accepted as one of the `input` alternatives in the extract config schemas.
    UploadResponse:
      properties:
        file_id:
          type: string
          title: File Id
        presigned_url:
          anyOf:
            - type: string
            - type: 'null'
          title: Presigned Url
      type: object
      required:
        - file_id
      title: UploadResponse

    SplitLargeTableSizes:
      properties:
        row:
          anyOf:
            - type: integer
            - type: 'null'
          title: Row
          description: >-
            The number of rows to include in each chunk when splitting large tables. Does not chunk rows if
            set to None.
        column:
          anyOf:
            - type: integer
            - type: 'null'
          title: Column
          description: >-
            The number of columns to include in each chunk when splitting large tables. Does not chunk columns
            if set to None.
      type: object
      title: SplitLargeTableSizes

    AsyncExtractResponse:
      properties:
        job_id:
          type: string
          title: Job Id
      type: object
      required:
        - job_id
      title: AsyncExtractResponse

    Retrieval:
      properties:
        chunking:
          $ref: '#/components/schemas/Chunking'
          default:
            chunk_mode: disabled
            chunk_overlap: 0
        filter_blocks:
          items:
            type: string
            enum:
              - Header
              - Footer
              - Title
              - Section Header
              - Page Number
              - List Item
              - Figure
              - Table
              - Key Value
              - Text
              - Comment
              - Signature
          type: array
          title: Filter Blocks
          description: >-
            A list of block types to filter out from 'content' and 'embed' fields. By default, no blocks are
            filtered.
          default: []
        embedding_optimized:
          type: boolean
          title: Embedding Optimized
          description: If True, use embedding optimized mode. Defaults to False.
          default: false
      type: object
      title: Retrieval

    # Referenced from the `settings` property of both extract config schemas.
    ExtractSettings:
      properties:
        include_images:
          type: boolean
          title: Include Images
          description: If True, include images in the extraction.
          default: false
        optimize_for_latency:
          type: boolean
          title: Optimize For Latency
          description: >-
            If True, jobs will be processed with a higher throughput and priority at a higher cost. Defaults
            to False.
          default: false
        array_extract:
          type: boolean
          title: Array Extract
          description: If True, use array extraction.
          default: false
        deep_extract:
          type: boolean
          title: Deep Extract
          description: >-
            If True, use Deep Extract, an agentic extraction mode that iteratively refines its output to
            achieve near-perfect accuracy. Best for complex documents where accuracy is critical.
          default: false
        citations:
          $ref: '#/components/schemas/Citations'
          description: The citations to use for the extraction.
          default:
            enabled: false
            numerical_confidence: true
      type: object
      title: ExtractSettings

    Formatting:
      properties:
        add_page_markers:
          type: boolean
          title: Add Page Markers
          description: >-
            If True, add page markers to the output. Defaults to False. Useful for extracting data with page
            specific information.
          default: false
        table_output_format:
          type: string
          enum:
            - html
            - json
            - md
            - jsonbbox
            - dynamic
            - csv
          title: Table Output Format
          description: >-
            The mode to use for table output. Defaults to dynamic, which returns md for simpler tables and
            html for more complex tables.
          default: dynamic
        merge_tables:
          type: boolean
          title: Merge Tables
          description: >-
            A flag to indicate if consecutive tables with the same number of columns should be merged.
            Defaults to False.
          default: false
        include:
          items:
            type: string
            enum:
              - change_tracking
              - highlight
              - comments
              - hyperlinks
              - signatures
              - ignore_watermarks
          type: array
          title: Include
          description: A list of formatting to include in the output.
          default: []
      type: object
      title: Formatting

    # Body of the 422 responses declared on both operations.
    HTTPValidationError:
      properties:
        detail:
          items:
            $ref: '#/components/schemas/ValidationError'
          type: array
          title: Detail
      type: object
      title: HTTPValidationError

  securitySchemes:
    SkippableHTTPBearer:
      type: http
      scheme: bearer