# Schema catalog: each dataset lists its tables, and each table lists its
# columns. A column entry carries `name`, `type` and `description`, plus the
# optional keys `primary`, `autoincrement` and `values` (allowed enum values).
#
# Columns are written as block mappings on purpose: inside a flow mapping
# ({ ... }) an unquoted comma ends the entry, which truncates descriptions
# such as "heading, paragraph, table, ..." and adds stray null-valued keys.
# Long descriptions use `>-` so they fold back into a single-line string.
datasets:
  - name: structure
    description: >-
      Markdown files containing structural information about datasets,
      including their schemas, relationships, and metadata.
    tables:
      - name: documents
        description: >-
          One row per source document with extracted metadata,
          relationships, and embedded content references. Additional
          document metadata fields are stored either as scalar columns or
          serialized *_list columns as needed.
        columns:
          - name: id
            type: int
            primary: true
            autoincrement: true
            description: Autoincrement row id for the versioned document
          - name: version_id
            type: string
            description: >-
              Encoded seconds-since-2000 identifier for the collection run
              that produced the row
          - name: sid
            type: string
            description: Stable short hash for the document
          - name: uid
            type: string
            description: >-
              Human-friendly identifier derived from the file path or slug
          - name: path
            type: string
            description: Relative source path inside the content tree
          - name: url
            type: string
            description: Public route exposed for the document
          - name: url_type
            type: string
            description: >-
              Indicates whether the URL maps to a directory or a file
          - name: slug
            type: string
            description: Slug used for building anchors and nested references
          - name: title
            type: string
            description: >-
              Document title resolved from folder names or front matter
          - name: level
            type: int
            description: Depth of the document in the navigation tree
          - name: order
            type: int
            description: >-
              Ordering index scoped to siblings sharing the same directory
              and level; defaults to alphabetical order when omitted
          - name: tags
            type: string_list
            description: >-
              JSON list of tags assigned to the document via front matter
              or metadata
          - name: date
            type: string
            description: >-
              ISO 8601 date assigned to the document via front matter or
              metadata
          - name: lastmod
            type: string
            description: >-
              ISO 8601 timestamp of the last modification time of the
              source file
          - name: format
            type: string
            values:
              - markdown
              - markdown_card
            description: >-
              Format of the source document such as standard markdown or
              markdown_card
          - name: meta_data
            type: string
            description: >-
              JSON string of metadata fields not mapped to schema columns

      - name: items
        description: >-
          Flattened AST items representing headings, paragraphs, and
          asset-backed nodes in reading order.
        columns:
          - name: id
            type: int
            primary: true
            autoincrement: true
            description: Autoincrement row id for the versioned item
          - name: version_id
            type: string
            description: >-
              Encoded seconds-since-2000 identifier for the collection run
          - name: doc_sid
            type: string
            description: SID of the parent document
          - name: slug
            type: string
            description: Page-unique slug usable as intra-page anchor
          - name: asset_uid
            type: string
            description: UID of a single referenced asset when applicable
          - name: level
            type: int
            description: >-
              Approximate nesting or heading depth derived from the AST
          - name: order_index
            type: int
            description: Order of appearance within the document
          - name: type
            type: string
            description: >-
              Item type such as heading, paragraph, table, code, or image
          - name: body_text
            type: string
            description: >-
              Textual body for the item; asset-linked items reference their
              asset via asset_uid
          - name: ast
            type: string
            description: >-
              Serialized AST subtree for complex items, otherwise null

      - name: assets
        description: >-
          Version-specific rows that connect each document asset to the
          collection run for downstream joins.
        columns:
          - name: id
            type: int
            primary: true
            autoincrement: true
            description: Autoincrement row id for the versioned asset
          - name: version_id
            type: string
            description: >-
              Encoded seconds-since-2000 identifier for the collection run
          # NOTE(review): the description says "item" although this is the
          # assets table; confirm whether "asset" was intended.
          - name: doc_sid
            type: string
            description: SID of the document that owns the item
          - name: asset_uid
            type: string
            description: >-
              UID of the referenced asset as stored in the asset_info table
          - name: blob_uid
            type: string
            description: UID of the blob row referenced by this asset
          - name: type
            type: string
            description: >-
              Asset type recorded for this version (mirrors asset_info.type)

      - name: asset_info
        description: >-
          Asset catalog linking documents to their concrete payloads via
          blob hashes derived from code blocks, tables, image references,
          and loose files.
        columns:
          - name: id
            type: int
            primary: true
            autoincrement: true
            description: Autoincrement row id for the asset catalog
          - name: uid
            type: string
            description: >-
              Stable identifier composed from the document uid and asset
              slug
          - name: type
            type: string
            values:
              - file
              - codeblock
              - table
              - model
              - image
              - linked_file
              - gallery_asset
              - found
            description: Asset type such as codeblock, table, file, or model
          - name: blob_uid
            type: string
            description: UID joining to the blob_store table entry
          - name: parent_doc_uid
            type: string
            description: UID of the document that introduced the asset
          - name: path
            type: string
            description: Source-relative path for file-backed assets
          - name: ext
            type: string
            description: File extension for referenced files when available
          - name: params
            type: string
            description: >-
              Raw params associated with the asset (e.g., code block meta)
          - name: first_seen
            type: string
            description: ISO timestamp when the asset was first observed
          - name: last_seen
            type: string
            description: >-
              ISO timestamp when the asset was most recently observed

      - name: blob_store
        description: >-
          Unique blob payloads materialized on disk and shared across
          assets via their SHA-512 hash.
        columns:
          - name: blob_uid
            type: string
            primary: true
            description: Monotonic hex identifier for the blob row
          - name: hash
            type: string
            description: SHA-512 hash identifying the blob
          - name: path
            type: string
            description: >-
              Directory path within the blob store hierarchy such as
              YYYY/MM/ff
          - name: first_seen
            type: string
            description: ISO timestamp when the blob was first observed
          - name: last_seen
            type: string
            description: >-
              ISO timestamp when the blob was most recently observed
          - name: size
            type: int
            description: Size of the stored blob in bytes
          - name: compression
            type: boolean
            description: >-
              True when payload is gzip-compressed, false otherwise, null
              for external blobs
          - name: payload
            type: blob
            description: Inline blob payload when stored directly in the table

      - name: images
        description: >-
          Image metadata derived from on-disk files for image and gallery
          assets.
        columns:
          - name: id
            type: int
            primary: true
            autoincrement: true
            description: Autoincrement row id for the image metadata
          - name: uid
            type: string
            description: UID of the backing asset (image or gallery_asset)
          - name: blob_uid
            type: string
            description: UID of the blob backing this image when available
          - name: type
            type: string
            description: Asset type used when the image was extracted
          - name: name
            type: string
            description: >-
              Filename without extension derived from the asset path
          - name: extension
            type: string
            description: File extension for the image
          - name: width
            type: int
            description: Pixel width after orientation correction
          - name: height
            type: int
            description: Pixel height after orientation correction
          - name: ratio
            type: string
            description: Aspect ratio width/height as a decimal string

      - name: versions
        description: One row per collection run to track version metadata.
        columns:
          - name: version_id
            type: string
            primary: true
            description: >-
              Encoded seconds-since-2000 identifier for the collection run
          - name: created_at
            type: string
            description: ISO timestamp of when the version was produced
          # NOTE(review): `early` sits among cadence names; confirm it is
          # intended rather than a typo (e.g. for `yearly`).
          - name: type
            type: string
            values:
              - daily
              - weekly
              - monthly
              - early
              - baseline
            description: Run cadence classification
          - name: tags
            type: string_list
            description: Free-form tags associated with the version