---
# OpenAPI 3.1 description of the Unstructured Partition API.
#
# A single operation (`partition`) accepts a multipart/form-data upload and
# returns the extracted elements either as JSON or as CSV, depending on the
# `output_format` form field. `x-speakeasy-*` keys are vendor extensions read
# by the Speakeasy SDK generator.
openapi: 3.1.0
info:
  title: Unstructured Partition API
  version: 1.5.58
servers:
  - url: https://api.unstructuredapp.io
    description: Serverless SaaS API
    x-speakeasy-server-id: saas-api
  - url: http://localhost:8000
    description: Development server
    x-speakeasy-server-id: development
paths:
  /general/v0/general:
    post:
      tags:
        - general
      summary: Summary
      description: Description
      operationId: partition
      parameters:
        # Optional header; the same header name is also declared as the
        # `ApiKeyAuth` security scheme under components.securitySchemes.
        - name: unstructured-api-key
          in: header
          required: false
          schema:
            anyOf:
              - type: string
              - type: 'null'
            title: Unstructured-Api-Key
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/partition_parameters'
      responses:
        '200':
          description: Successful Response
          content:
            application/json:
              schema:
                items:
                  $ref: '#/components/schemas/Element'
                description: A list of element dictionaries extracted from the file
                title: Elements
                type: array
                # NOTE(review): nesting level reconstructed from a flattened
                # source; confirm this extension belongs on the schema.
                x-speakeasy-name-override: elements
            text/csv:
              schema:
                items:
                  $ref: '#/components/schemas/CSV-Element'
                description: A string containing elements in csv format
                title: CSV-Element
                type: string
                # NOTE(review): same placement caveat as the JSON schema above.
                x-speakeasy-name-override: csv_elements
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
        '5XX':
          description: Server Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ServerError'
      x-speakeasy-name-override: partition
components:
  schemas:
    # Body of a 422 response: either a list of per-field errors or a string.
    HTTPValidationError:
      type: object
      properties:
        detail:
          oneOf:
            - type: array
              items:
                $ref: '#/components/schemas/ValidationError'
            - type: string
      example:
        detail:
          - type: int_parsing
            loc:
              - body
              - combine_under_n_chars
            msg: Input should be a valid integer, unable to parse string as an integer
            input: forty
    ValidationError:
      properties:
        loc:
          items:
            anyOf:
              - type: string
              - type: integer
          type: array
          title: Location
        msg:
          type: string
          title: Message
        type:
          type: string
          title: Error Type
        input:
          title: Input
        ctx:
          type: object
          title: Context
      type: object
      required:
        - loc
        - msg
        - type
      title: ValidationError
    # Form fields of the multipart request body; only `files` is required.
    partition_parameters:
      properties:
        files:
          type: string
          format: binary
          description: The file to extract
          examples:
            - summary: File to be partitioned
              externalValue: https://github.com/Unstructured-IO/unstructured/blob/98d3541909f64290b5efb65a226fc3ee8a7cc5ee/example-docs/layout-parser-paper.pdf
        coordinates:
          type: boolean
          title: Coordinates
          description: 'If `True`, return coordinates for each element extracted via OCR. Default: `False`'
          default: false
        content_type:
          anyOf:
            - type: string
            - type: 'null'
          title: Content type
          description: >-
            A hint about the content type to use (such as text/markdown), when
            there are problems processing a specific file. This value is a MIME
            type in the format type/subtype.
        encoding:
          anyOf:
            - type: string
            - type: 'null'
          title: Encoding
          description: 'The encoding method used to decode the text input. Default: utf-8'
        extract_image_block_types:
          items:
            type: string
          type: array
          title: Image block types to extract
          description: >-
            The types of elements to extract, for use in extracting image blocks
            as base64 encoded data stored in metadata fields.
          default: []
        gz_uncompressed_content_type:
          anyOf:
            - type: string
            - type: 'null'
          title: Uncompressed Content Type
          description: If file is gzipped, use this content type after unzipping.
        hi_res_model_name:
          anyOf:
            - type: string
            - type: 'null'
          title: Hi Res Model Name
          description: The name of the inference model used when strategy is hi_res
        include_page_breaks:
          type: boolean
          title: Include Page Breaks
          description: 'If true, the output will include page breaks if the filetype supports it. Default: false'
          default: false
        languages:
          items:
            type: string
          type: array
          title: OCR Languages
          description: >-
            The languages present in the document, for use in partitioning
            and/or OCR. See the Tesseract documentation for a full list of
            languages.
          default: []
        ocr_languages:
          items:
            type: string
          type: array
          title: OCR Languages
          description: >-
            Deprecated! The languages present in the document, for use in
            partitioning and/or OCR
          default: []
          # Machine-readable counterpart of the "Deprecated!" note above.
          deprecated: true
        output_format:
          type: string
          enum:
            - application/json
            - text/csv
          title: Output Format
          description: 'The format of the response. Supported formats are application/json and text/csv. Default: application/json.'
          default: application/json
          x-speakeasy-unknown-values: allow
        pdf_infer_table_structure:
          type: boolean
          title: Pdf Infer Table Structure
          description: >-
            Deprecated! Use skip_infer_table_types to opt out of table
            extraction for any file type. If False and strategy=hi_res, no Table
            Elements will be extracted from pdf files regardless of
            skip_infer_table_types contents.
          default: true
          # Machine-readable counterpart of the "Deprecated!" note above.
          deprecated: true
        skip_infer_table_types:
          items:
            type: string
          type: array
          title: Skip Infer Table Types
          description: 'The document types that you want to skip table extraction with. Default: []'
          default: []
        starting_page_number:
          anyOf:
            - type: integer
            - type: 'null'
          title: PDF Starting Page Number
          description: >-
            When PDF is split into pages before sending it into the API,
            providing this information will allow the page number to be assigned
            correctly. Introduced in 1.0.27.
        strategy:
          type: string
          enum:
            - fast
            - hi_res
            - auto
            - ocr_only
            - od_only
            - vlm
          title: Strategy
          # The option list mirrors the enum above.
          description: 'The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto, ocr_only, od_only, vlm. Default: hi_res'
          default: hi_res
          examples:
            - auto
            - hi_res
            - vlm
          x-speakeasy-unknown-values: allow
        vlm_model_provider:
          type: string
          enum:
            - openai
            - anthropic
            - bedrock
            - anthropic_bedrock
            - vertexai
            - google
            - azure_openai
          title: VLM Model Provider
          description: The VLM Model provider to use.
          examples:
            - openai
            - anthropic
            - bedrock
          x-speakeasy-unknown-values: allow
        vlm_model:
          type: string
          title: VLM Model
          description: The VLM Model to use.
          examples:
            - gpt-4o
          x-speakeasy-unknown-values: allow
        table_ocr_agent:
          anyOf:
            - type: string
            - type: 'null'
          title: Table OCR Agent
          description: The OCR agent to use for table ocr inference.
          default: unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract
        unique_element_ids:
          type: boolean
          title: Unique Element IDs
          description: 'When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False`'
          default: false
        xml_keep_tags:
          type: boolean
          title: Xml Keep Tags
          description: >-
            If `True`, will retain the XML tags in the output. Otherwise it will
            simply extract the text from within the tags. Only applies to XML
            documents.
          default: false
        chunking_strategy:
          anyOf:
            - type: string
            - type: 'null'
          title: Chunking Strategy
          description: 'Use one of the supported strategies to chunk the returned elements after partitioning. When ''chunking_strategy'' is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: ''basic'', ''by_page'', ''by_similarity'', or ''by_title'''
          examples:
            - by_title
            - basic
          x-speakeasy-unknown-values: allow
        combine_under_n_chars:
          anyOf:
            - type: integer
            - type: 'null'
          title: Combine Under N Chars
          description: 'If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500'
        include_orig_elements:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Include original elements in chunks
          description: 'When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as `.metadata.orig_elements`. Default: true.'
        tracking_enabled:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Tracking enabled
          description: Tracking enabled
        max_characters:
          anyOf:
            - type: integer
            - type: 'null'
          title: Max Characters
          description: 'If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500'
        multipage_sections:
          type: boolean
          title: Multipage Sections
          description: 'If chunking strategy is set, determines if sections can span multiple pages. Default: true'
          default: true
        new_after_n_chars:
          anyOf:
            - type: integer
            - type: 'null'
          title: New after n chars
          description: 'If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500'
        overlap:
          type: integer
          title: Overlap
          description: 'Specifies the length of a string (''tail'') to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0'
          default: 0
        overlap_all:
          type: boolean
          title: Overlap all
          description: 'When `True`, apply overlap between ''normal'' chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of ''pollution'' of otherwise clean semantic chunk boundaries. Default: False'
          default: false
        skip_table_chunking:
          type: boolean
          title: Skip table chunking
          description: 'When `True`, tables are not chunked and always kept unchanged. Default: False'
          default: false
        isolate_table:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Isolate table
          description: 'When `True`, tables are never chunked together with other elements. Default: True'
          default: true
        similarity_threshold:
          anyOf:
            - type: number
            - type: 'null'
          title: similarity-threshold
          description: >-
            A value between 0.0 and 1.0 describing the minimum similarity two
            elements must have to be included in the same chunk. Note that
            similar elements may be separated to meet chunk-size criteria; this
            value can only guarantee that two elements with similarity below the
            threshold will appear in separate chunks.
        do_not_break_similarity_on_footer_header:
          type: boolean
          title: do-not-break-similarity-on-footer-header
          description: >-
            When `True`, footer, header, and page number are always considered
            similar to the text before them for chunk by similarity method. This
            allows chunk by similarity to connect contents across page better.
          default: false
        contextual_chunking_service_name:
          anyOf:
            - type: string
            - type: 'null'
          title: Contextual Chunking Service Name
          description: >-
            Pre-resolved prompt service name for contextual chunking (e.g.
            'BedrockContextualChunking'). When set, uses this service with the
            provided auth instead of the default env-var-based model selection.
        contextual_chunking_auth:
          anyOf:
            - type: string
            - type: 'null'
          title: Contextual Chunking Auth
          description: >-
            JSON-encoded auth credentials for the contextual chunking provider.
            Structure depends on the provider.
        include_slide_notes:
          type: boolean
          title: include_slide_notes
          description: 'When `True`, slide notes from .ppt and .pptx files will be included in the response. Default: `True`'
          default: true
        pdfminer_line_overlap:
          anyOf:
            - type: number
            - type: 'null'
          title: PDFMiner Line Overlap
          description: >-
            If two characters have more overlap than this they are considered to
            be on the same line. The overlap is specified relative to the
            minimum height of both characters.
        pdfminer_char_margin:
          anyOf:
            - type: number
            - type: 'null'
          title: PDFMiner Char Margin
          description: >-
            If two characters are closer together than this margin they are
            considered part of the same line. The margin is specified relative
            to the width of the character.
        pdfminer_line_margin:
          anyOf:
            - type: number
            - type: 'null'
          title: PDFMiner Line Margin
          description: >-
            If two lines are close together they are considered to be part of
            the same paragraph. The margin is specified relative to the height
            of a line.
        pdfminer_word_margin:
          anyOf:
            - type: number
            - type: 'null'
          title: PDFMiner Word Margin
          description: >-
            If two characters on the same line are further apart than this
            margin then they are considered to be two separate words, and an
            intermediate space will be added for readability. The margin is
            specified relative to the width of the character.
          default: 0.185
      type: object
      required:
        - files
      title: Partition Parameters
    # One extracted element in the JSON response; open-ended object.
    Element:
      title: Element
      type: object
      additionalProperties: true
      example:
        type: Title
        element_id: 6aa0ff22f91bbe7e26e8e25ca8052acd
        text: 'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
        metadata:
          languages:
            - eng
          page_number: 1
          filename: layout-parser-paper.pdf
          filetype: application/pdf
    CSV-Element:
      title: CSV-Element
      type: string
      # Double-quoted so `\n` is a real newline. A trailing `\` joins the next
      # physical line and `\ ` is an escaped space, so the value contains a
      # single space at each join.
      example: "type,element_id,text,languages,page_number,filename,filetype,parent_id,links\n\
        \ Title,b7f58c2fd9c15949a55a62eb84e39575,LayoutParser: A Unified Toolkit\
        \ for Deep Learning Based Document Image Analysis,['eng'],1,layout-parser-paper.pdf,application/pdf,,"
    ServerError:
      type: object
      properties:
        detail:
          type: string
      example:
        detail: An error occurred
  securitySchemes:
    ApiKeyAuth:
      type: apiKey
      name: unstructured-api-key
      in: header
      x-speakeasy-example: YOUR_API_KEY
tags:
  - name: general
# Either the API key scheme or no authentication (`{}`) satisfies a request.
security:
  - ApiKeyAuth: []
  - {}
# Retry configuration read by the Speakeasy generator; applies to 5xx
# responses and to connection errors.
x-speakeasy-retries:
  strategy: backoff
  backoff:
    initialInterval: 3000
    maxInterval: 720000
    maxElapsedTime: 1800000
    exponent: 1.88
  statusCodes:
    - 5xx
  retryConnectionErrors: true