# OpenAPI 3.1 description of the Unstructured Partition API.
# Version-like values are quoted so they can never be read as numbers.
openapi: '3.1.0'
info:
  version: '1.5.58'
  title: Unstructured Partition API
# Base URLs. Every entry also carries an `x-speakeasy-server-id` extension.
servers:
- x-speakeasy-server-id: saas-api
  description: Serverless SaaS API
  url: 'https://api.unstructuredapp.io'
- x-speakeasy-server-id: development
  description: Development server
  url: 'http://localhost:8000'
# API surface: a single POST operation that takes a multipart/form-data body
# and answers with the extracted elements, a validation error, or a server
# error.
paths:
  /general/v0/general:
    post:
      tags:
      - general
      summary: Summary
      description: Description
      operationId: partition
      parameters:
      # Optional request header; its schema admits a string or null.
      - name: unstructured-api-key
        in: header
        required: false
        schema:
          anyOf:
          - type: string
          - type: 'null'
          title: Unstructured-Api-Key
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/partition_parameters'
      responses:
        '200':
          description: Successful Response
          content:
            # JSON representation: an array whose items are Element objects.
            application/json:
              schema:
                items:
                  $ref: '#/components/schemas/Element'
                description: A list of element dictionaries extracted from the file
                title: Elements
                type: array
                x-speakeasy-name-override: elements
            # CSV representation: the whole payload is one string. `items`
            # only constrains arrays, so it is intentionally absent here.
            text/csv:
              schema:
                description: A string containing elements in csv format
                title: CSV-Element
                type: string
                x-speakeasy-name-override: csv_elements
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
        # Range key covering every 5xx status code.
        5XX:
          description: Server Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ServerError'
      x-speakeasy-name-override: partition
components:
  schemas:
    # Error payload whose `detail` is either a list of ValidationError items
    # or a plain string.
    HTTPValidationError:
      example:
        detail:
        - type: int_parsing
          loc:
          - body
          - combine_under_n_chars
          msg: 'Input should be a valid integer, unable to parse string as an integer'
          input: 'forty'
      properties:
        detail:
          oneOf:
          - items:
              $ref: '#/components/schemas/ValidationError'
            type: array
          - type: string
      type: object
    # A single validation failure. `loc`, `msg` and `type` are mandatory;
    # `input` has no type constraint and `ctx` is an unconstrained object.
    ValidationError:
      title: ValidationError
      type: object
      required:
      - loc
      - msg
      - type
      properties:
        # Path to the offending value; segments are strings or integers.
        loc:
          title: Location
          type: array
          items:
            anyOf:
            - type: string
            - type: integer
        msg:
          title: Message
          type: string
        type:
          title: Error Type
          type: string
        input:
          title: Input
        ctx:
          title: Context
          type: object
    # Form fields of the multipart request body. Property order is kept as
    # authored; only the keyword order inside each property is normalised
    # (title, description, type information, default).
    partition_parameters:
      properties:
        files:
          description: The file to extract
          type: string
          format: binary
          # NOTE(review): this entry is shaped like an Example Object
          # (summary/externalValue) rather than a literal example value;
          # confirm that is intended for a schema-level `examples` list.
          examples:
          - externalValue: https://github.com/Unstructured-IO/unstructured/blob/98d3541909f64290b5efb65a226fc3ee8a7cc5ee/example-docs/layout-parser-paper.pdf
            summary: File to be partitioned
        coordinates:
          title: Coordinates
          description: >-
            If `True`, return coordinates for each element extracted via
            OCR. Default: `False`
          type: boolean
          default: false
        content_type:
          title: Content type
          description: >-
            A hint about the content type to use (such as text/markdown),
            when there are problems processing a specific file. This value is a MIME
            type in the format type/subtype.
          anyOf:
          - type: string
          - type: 'null'
        encoding:
          title: Encoding
          description: >-
            The encoding method used to decode the text input. Default:
            utf-8
          anyOf:
          - type: string
          - type: 'null'
        extract_image_block_types:
          title: Image block types to extract
          description: >-
            The types of elements to extract, for use in extracting image
            blocks as base64 encoded data stored in metadata fields.
          type: array
          items:
            type: string
          default: []
        gz_uncompressed_content_type:
          title: Uncompressed Content Type
          description: If file is gzipped, use this content type after unzipping.
          anyOf:
          - type: string
          - type: 'null'
        hi_res_model_name:
          title: Hi Res Model Name
          description: The name of the inference model used when strategy is hi_res
          anyOf:
          - type: string
          - type: 'null'
        include_page_breaks:
          title: Include Page Breaks
          description: >-
            If true, the output will include page breaks if the filetype
            supports it. Default: false
          type: boolean
          default: false
        languages:
          title: OCR Languages
          description: >-
            The languages present in the document, for use in partitioning
            and/or OCR. See the Tesseract documentation for a full list of languages.
          type: array
          items:
            type: string
          default: []
        # The description already calls this field deprecated; the
        # `deprecated` keyword makes that visible to generators and doc
        # tools. Presumably superseded by `languages`, whose description
        # is otherwise the same (confirm).
        ocr_languages:
          items:
            type: string
          type: array
          title: OCR Languages
          description: Deprecated! The languages present in the document, for use
            in partitioning and/or OCR
          default: []
          deprecated: true
        # Response representation; the two enum values match the media
        # types declared on the 200 response.
        output_format:
          title: Output Format
          description: >-
            The format of the response. Supported formats are application/json
            and text/csv. Default: application/json.
          type: string
          enum:
          - application/json
          - text/csv
          default: application/json
          x-speakeasy-unknown-values: allow
        # The description already calls this field deprecated in favour of
        # `skip_infer_table_types`; the `deprecated` keyword exposes that
        # to generators and doc tools.
        pdf_infer_table_structure:
          type: boolean
          title: Pdf Infer Table Structure
          description: Deprecated! Use skip_infer_table_types to opt out of table
            extraction for any file type. If False and strategy=hi_res, no Table Elements
            will be extracted from pdf files regardless of skip_infer_table_types
            contents.
          default: true
          deprecated: true
        # List of string document types; empty by default.
        skip_infer_table_types:
          title: Skip Infer Table Types
          description: >-
            The document types that you want to skip table extraction
            with. Default: []
          type: array
          items:
            type: string
          default: []
        # Nullable integer with no declared default.
        starting_page_number:
          title: PDF Starting Page Number
          description: >-
            When PDF is split into pages before sending it into the API,
            providing this information will allow the page number to be assigned correctly.
            Introduced in 1.0.27.
          anyOf:
          - type: integer
          - type: 'null'
        # Partitioning strategy. The description lists every value in
        # `enum`, so the prose and the machine-readable list agree.
        strategy:
          type: string
          enum:
          - fast
          - hi_res
          - auto
          - ocr_only
          - od_only
          - vlm
          title: Strategy
          description: 'The strategy to use for partitioning PDF/image. Options are
            fast, hi_res, auto, ocr_only, od_only, vlm. Default: hi_res'
          default: hi_res
          examples:
          - auto
          - hi_res
          - vlm
          x-speakeasy-unknown-values: allow
        # VLM provider selection: closed list of known providers.
        vlm_model_provider:
          title: VLM Model Provider
          description: The VLM Model provider to use.
          type: string
          enum:
          - openai
          - anthropic
          - bedrock
          - anthropic_bedrock
          - vertexai
          - google
          - azure_openai
          examples:
          - openai
          - anthropic
          - bedrock
          x-speakeasy-unknown-values: allow
        # Free-form model name (no enum).
        vlm_model:
          title: VLM Model
          description: The VLM Model to use.
          type: string
          examples:
          - gpt-4o
          x-speakeasy-unknown-values: allow
        table_ocr_agent:
          title: Table OCR Agent
          description: The OCR agent to use for table ocr inference.
          anyOf:
          - type: string
          - type: 'null'
          default: unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract
        unique_element_ids:
          title: Unique Element IDs
          description: >-
            When `True`, assign UUIDs to element IDs, which guarantees
            their uniqueness (useful when using them as primary keys in database).
            Otherwise a SHA-256 of element text is used. Default: `False`
          type: boolean
          default: false
        xml_keep_tags:
          title: Xml Keep Tags
          description: >-
            If `True`, will retain the XML tags in the output. Otherwise
            it will simply extract the text from within the tags. Only applies to
            XML documents.
          type: boolean
          default: false
        # Chunking options start here. The strategy is a nullable free-form
        # string; supported names are listed in the description only.
        chunking_strategy:
          title: Chunking Strategy
          description: >-
            Use one of the supported strategies to chunk the returned
            elements after partitioning. When 'chunking_strategy' is not specified,
            no chunking is performed and any other chunking parameters provided are
            ignored. Supported strategies: 'basic', 'by_page', 'by_similarity',
            or 'by_title'
          anyOf:
          - type: string
          - type: 'null'
          examples:
          - by_title
          - basic
          x-speakeasy-unknown-values: allow
        combine_under_n_chars:
          title: Combine Under N Chars
          description: >-
            If chunking strategy is set, combine elements until a section
            reaches a length of n chars. Default: 500
          anyOf:
          - type: integer
          - type: 'null'
        include_orig_elements:
          title: Include original elements in chunks
          description: >-
            When a chunking strategy is specified, each returned chunk
            will include the elements consolidated to form that chunk as `.metadata.orig_elements`.
            Default: true.
          anyOf:
          - type: boolean
          - type: 'null'
        tracking_enabled:
          title: Tracking enabled
          description: Tracking enabled
          anyOf:
          - type: boolean
          - type: 'null'
        max_characters:
          title: Max Characters
          description: >-
            If chunking strategy is set, cut off new sections after reaching
            a length of n chars (hard max). Default: 500
          anyOf:
          - type: integer
          - type: 'null'
        # Controls whether a chunked section may cross a page boundary
        # (the description previously said "multiple sections").
        multipage_sections:
          type: boolean
          title: Multipage Sections
          description: 'If chunking strategy is set, determines if sections can span
            multiple pages. Default: true'
          default: true
        # NOTE(review): the documented soft-max default here (1500) is larger
        # than the hard-max default documented on `max_characters` (500);
        # confirm which figure is current.
        new_after_n_chars:
          title: New after n chars
          description: >-
            If chunking strategy is set, cut off new sections after reaching
            a length of n chars (soft max). Default: 1500
          anyOf:
          - type: integer
          - type: 'null'
        overlap:
          title: Overlap
          description: >-
            Specifies the length of a string ('tail') to be drawn from
            each chunk and prefixed to the next chunk as a context-preserving mechanism.
            By default, this only applies to split-chunks where an oversized element
            is divided into multiple chunks by text-splitting. Default: 0
          type: integer
          default: 0
        overlap_all:
          title: Overlap all
          description: >-
            When `True`, apply overlap between 'normal' chunks formed
            from whole elements and not subject to text-splitting. Use this with caution
            as it entails a certain level of 'pollution' of otherwise clean semantic
            chunk boundaries. Default: False
          type: boolean
          default: false
        skip_table_chunking:
          title: Skip table chunking
          description: >-
            When `True`, tables are not chunked and always kept unchanged.
            Default: False
          type: boolean
          default: false
        # Nullable boolean, unlike the plain booleans above; defaults to true.
        isolate_table:
          title: Isolate table
          description: >-
            When `True`, tables are never chunked together with other
            elements. Default: True
          anyOf:
          - type: boolean
          - type: 'null'
          default: true
        # Nullable number. The 0.0 to 1.0 range is stated in the description
        # only; no minimum/maximum keywords are declared on the schema.
        similarity_threshold:
          anyOf:
          - type: number
          - type: 'null'
          title: similarity-threshold
          description: A value between 0.0 and 1.0 describing the minimum similarity
            two elements must have to be included in the same chunk. Note that similar
            elements may be separated to meet chunk-size criteria; this value can
            only guarantee that two elements with similarity below the threshold
            will appear in separate chunks.
        do_not_break_similarity_on_footer_header:
          title: do-not-break-similarity-on-footer-header
          description: >-
            When `True`, footer, header, and page number are always considered
            similar to the text before them for chunk by similarity method. This allows
            chunk by similarity to connect contents across page better.
          type: boolean
          default: false
        contextual_chunking_service_name:
          title: Contextual Chunking Service Name
          description: >-
            Pre-resolved prompt service name for contextual chunking (e.g.
            'BedrockContextualChunking'). When set, uses this service with the provided
            auth instead of the default env-var-based model selection.
          anyOf:
          - type: string
          - type: 'null'
        # Carried as a string; the description says it holds JSON.
        contextual_chunking_auth:
          title: Contextual Chunking Auth
          description: >-
            JSON-encoded auth credentials for the contextual chunking provider.
            Structure depends on the provider.
          anyOf:
          - type: string
          - type: 'null'
        include_slide_notes:
          title: include_slide_notes
          description: >-
            When `True`, slide notes from .ppt and .pptx files will be
            included in the response. Default: `True`
          type: boolean
          default: true
        # PDFMiner layout knobs: four nullable numbers. Only the word margin
        # declares a default.
        pdfminer_line_overlap:
          title: PDFMiner Line Overlap
          description: >-
            If two characters have more overlap than this they are considered
            to be on the same line. The overlap is specified relative to the minimum
            height of both characters.
          anyOf:
          - type: number
          - type: 'null'
        pdfminer_char_margin:
          title: PDFMiner Char Margin
          description: >-
            If two characters are closer together than this margin they
            are considered part of the same line. The margin is specified relative
            to the width of the character.
          anyOf:
          - type: number
          - type: 'null'
        pdfminer_line_margin:
          title: PDFMiner Line Margin
          description: >-
            If two lines are close together they are considered to be part
            of the same paragraph. The margin is specified relative to the height
            of a line.
          anyOf:
          - type: number
          - type: 'null'
        pdfminer_word_margin:
          title: PDFMiner Word Margin
          description: >-
            If two characters on the same line are further apart than this
            margin then they are considered to be two separate words, and an intermediate
            space will be added for readability. The margin is specified relative
            to the width of the character.
          anyOf:
          - type: number
          - type: 'null'
          default: 0.185
      # Object-level keywords for the form schema: `files` is the only
      # required field.
      title: Partition Parameters
      type: object
      required:
      - files
    # Open-ended element object: no properties are declared and additional
    # properties are allowed. The example shows one possible shape.
    Element:
      type: object
      title: Element
      additionalProperties: true
      example:
        type: Title
        element_id: '6aa0ff22f91bbe7e26e8e25ca8052acd'
        text: >-
          LayoutParser: A Unified Toolkit for Deep Learning Based Document Image
          Analysis
        metadata:
          languages:
          - eng
          page_number: 1
          filename: layout-parser-paper.pdf
          filetype: application/pdf
    # CSV output modelled as a single string. The example is a header row
    # followed by one data row; the data row's four leading spaces are part
    # of the value.
    CSV-Element:
      type: string
      title: CSV-Element
      example: |-
        type,element_id,text,languages,page_number,filename,filetype,parent_id,links
            Title,b7f58c2fd9c15949a55a62eb84e39575,LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis,['eng'],1,layout-parser-paper.pdf,application/pdf,,
    # Error payload with a single string field, `detail`.
    ServerError:
      example:
        detail: An error occurred
      properties:
        detail:
          type: string
      type: object
  # API-key authentication: the key is sent in the `unstructured-api-key`
  # request header.
  securitySchemes:
    ApiKeyAuth:
      in: header
      name: unstructured-api-key
      type: apiKey
      x-speakeasy-example: YOUR_API_KEY
# Tag referenced by the partition operation.
tags:
- name: general
# Global security requirement: alternatives are listed in order. The first
# requires ApiKeyAuth; the empty object means a request with no credentials
# also satisfies the requirement.
security:
- ApiKeyAuth: []
- {}
# Speakeasy retry extension: backoff strategy, applied to 5xx status codes
# and to connection errors.
x-speakeasy-retries:
  strategy: backoff
  retryConnectionErrors: true
  statusCodes:
  - '5xx'
  # Interval and elapsed-time values are copied verbatim; units are not
  # stated in this file.
  backoff:
    exponent: 1.88
    initialInterval: 3000
    maxInterval: 720000
    maxElapsedTime: 1800000