# LLaDA architecture definition.
#
# Dense diffusion-based decoder with bidirectional (non-causal) attention.
# Standard multi-head attention (MHA), standard (interleaved) RoPE, SwiGLU FFN.
# No QK-norm (unlike LLaDA-MoE), no gated attention output.
#
# Known model sizes:
#   32 layers → 8B

# Architecture identity and top-level behavior switches.
[architecture]
name = "llada"
# Attention is bidirectional (non-causal), as described in the file header.
non_causal = true
# Diffusion-based decoding, per the file header.
# NOTE(review): the set of accepted values is defined by the loader — confirm there.
generation = "diffusion"

# Hyperparameters. Each value is a string naming where the number comes from,
# not the number itself.
# NOTE(review): the "llada.*" strings look like model-file metadata keys
# resolved by the loader — confirm against the loader.
[params]
n_layers       = "llada.block_count"
n_heads        = "llada.attention.head_count"
# The trailing "?" presumably marks this key as optional; a fallback for
# n_kv_heads is declared under [params.defaults]. TODO confirm the "?" syntax.
n_kv_heads     = "llada.attention.head_count_kv?"
n_embd         = "llada.embedding_length"
n_ff           = "llada.feed_forward_length"
rms_eps        = "llada.attention.layer_norm_rms_epsilon"
rope_freq_base = "llada.rope.freq_base"

# Parameters computed from other values rather than read directly.
# NOTE(review): the expression syntax is evaluated by the loader; whether
# entries are evaluated in file order is not visible here — rope_n_rot
# refers to head_dim, so keep the current order unless confirmed otherwise.
[params.derived]
# Presumably the second dimension of the token_embd tensor — TODO confirm
# the meaning of ".ne[1]" against the loader.
n_vocab    = "token_embd.ne[1]"
head_dim   = "n_embd / n_heads"
# Rotary dimension count is set equal to the per-head dimension.
rope_n_rot = "head_dim"

# Fallback values for parameters that may be absent.
# n_kv_heads falls back to n_heads, i.e. as many KV heads as query heads,
# which matches the multi-head attention (MHA) noted in the file header.
[params.defaults]
n_kv_heads = "n_heads"

# Tensors declared once for the whole model rather than per layer.
# Left side: the name used inside this definition; right side: the tensor
# name string. NOTE(review): presumably these names are looked up without
# the per-layer prefix from [layers] — confirm against the loader.
[weights.global]
token_embd  = "token_embd.weight"
output_norm = "output_norm.weight"
output      = "output.weight"

# Per-layer settings: how many layers there are and how their tensors are named.
[layers]
# Refers to the n_layers entry declared under [params].
count  = "n_layers"
# "@{layer_idx}" is a placeholder — presumably replaced with the layer index
# and the result prepended to each per-layer tensor name; TODO confirm
# (including whether the index is 0-based).
prefix = "blk.@{layer_idx}."

# Selects which block definition each layer uses.
# "attention" matches the [blocks.attention] table in this file.
# NOTE(review): "uniform" presumably means every layer uses the same block — confirm.
[layers.routing]
uniform = "attention"

# Per-layer norm tensors declared at the layer level rather than inside a
# specific block. NOTE(review): presumably resolved with the [layers] prefix
# applied — confirm.
[layers.common_weights]
attn_norm = "attn_norm.weight"
ffn_norm  = "ffn_norm.weight"

# The attention block referenced by [layers.routing].
[blocks.attention]
# NOTE(review): names a builder implemented by the loader — confirm it exists there.
builder = "attention"

  # Projection tensors for the block. Only Q, K, V and output projections are
  # listed; there are no QK-norm or output-gate tensors, consistent with the
  # file header.
  [blocks.attention.weights]
  attn_q      = "attn_q.weight"
  attn_k      = "attn_k.weight"
  attn_v      = "attn_v.weight"
  attn_output = "attn_output.weight"

  [blocks.attention.config]
  # The file header describes this as standard (interleaved) RoPE.
  rope = "standard"

  # K and V cache buffers; both use the same dims and dtype.
  # head_dim and n_kv_heads are declared under [params.derived] / [params];
  # max_seq_len is not defined in this file — presumably supplied at runtime,
  # TODO confirm. The axis-order convention of "dims" is defined by the loader.
  [blocks.attention.cache]
  k = { dims = ["head_dim", "max_seq_len", "n_kv_heads"], dtype = "f32" }
  v = { dims = ["head_dim", "max_seq_len", "n_kv_heads"], dtype = "f32" }

# Feed-forward network: SwiGLU, per the file header.
[ffn]
# NOTE(review): names a builder implemented by the loader — confirm it exists there.
builder = "swiglu"

  # The three projection tensors of the FFN: gate, up and down.
  [ffn.weights]
  gate = "ffn_gate.weight"
  up   = "ffn_up.weight"
  down = "ffn_down.weight"

# Tokenizer-related overrides.
[tokens]
# Token strings treated as additional end-of-sequence markers.
# NOTE(review): presumably matched against the vocabulary by exact text, in
# addition to the model's own EOS token — confirm against the loader.
extra_eos = ["<|role_end|>", "<|eot_id|>"]

# Diagram-only example values; no effect on inference.
# 32 layers corresponds to the 8B size listed in the file header.
[example]
n_layers = 32