# Qwen3.5 hybrid architecture definition.
#
# Hybrid structure: 32 layers total. Every 4th layer (indices 3,7,11,...,31) is
# full softmax attention; the other 24 are delta-net SSM (recurrent) layers.
# This is controlled by full_attn_interval — the routing rule below implements it.
#
# Full attention layers: joint Q+gate projection, separate K/V, QK-norm, MRoPE
# (sections [11,11,10,0]), GQA (16Q/4KV heads), sigmoid-gated output.
#
# Delta-net (SSM) layers: combined QKV projection, conv1d, L2-normalized Q/K,
# fused gated-delta-net op, gated RMSNorm output (rms_norm × silu(z)).
#
# Common to all layers: RMSNorm (eps=1e-6), SwiGLU FFN, post-attention norm.
#
# Tied embeddings: the LM head reuses token_embd.weight (no separate output.weight).
# MRoPE sections [11,11,10,0] allow the model to encode different positional
# signals for different head-dim slices (temporal, spatial, etc.).

# Architecture identity. `name` matches the "qwen35." prefix of the metadata
# keys bound under [params].
[architecture]
name = "qwen35"
# The LM head reuses token_embd.weight rather than a separate output matrix.
tied_embeddings = true

# Metadata bindings: each param on the left is resolved from the GGUF metadata
# key named on the right.
# NOTE(review): a trailing `?` appears to mark a key that may be absent, with
# its fallback declared in [params.defaults] — confirm against the loader.
[params]
# --- layer counts ---
block_count          = "qwen35.block_count"
# Qwen3-Next ships an MTP (multi-token prediction, "nextn") head as extra
# block(s) past the transformer stack, and newer converters fold those blocks
# into block_count. bench has no MTP/speculative-decode path, so the derived
# n_layers subtracts them.
nextn_predict_layers = "qwen35.nextn_predict_layers?"

# --- attention / embedding geometry ---
n_heads              = "qwen35.attention.head_count"
n_kv_heads           = "qwen35.attention.head_count_kv"
n_embd               = "qwen35.embedding_length"
n_ff                 = "qwen35.feed_forward_length"
rms_eps              = "qwen35.attention.layer_norm_rms_epsilon"
head_dim             = "qwen35.attention.key_length"

# --- delta-net (SSM) geometry ---
ssm_d_conv           = "qwen35.ssm.conv_kernel"
ssm_d_inner          = "qwen35.ssm.inner_size"
ssm_d_state          = "qwen35.ssm.state_size"
ssm_dt_rank          = "qwen35.ssm.time_step_rank"
ssm_n_group          = "qwen35.ssm.group_count"

# --- hybrid routing: consumed by the rule in [layers.routing] ---
full_attn_interval   = "qwen35.full_attention_interval"

# --- rotary position embedding ---
rope_n_rot           = "qwen35.rope.dimension_count"
rope_freq_base       = "qwen35.rope.freq_base"
rope_sections        = "qwen35.rope.dimension_sections"
# NOTE(review): "neox" reads as a literal mode name, not a metadata key like
# the entries above — confirm how the loader tells the two apart.
rope_mode            = "neox"

# Params computed from expressions over other params rather than read directly
# from metadata.
[params.derived]
n_layers      = "block_count - nextn_predict_layers"   # exclude MTP head(s) from the layer loop
# NOTE(review): presumably dimension 1 of the token_embd tensor, i.e. the vocabulary size — confirm.
n_vocab       = "token_embd.ne[1]"
# Used for the first two dims of ssm_state in [blocks.recurrent_ssm.cache], with
# ssm_dt_rank as the third.
head_v_dim    = "ssm_d_inner / ssm_dt_rank"
# Second dim of conv_state in [blocks.recurrent_ssm.cache].
conv_channels = "ssm_d_inner + 2 * ssm_n_group * ssm_d_state"

# Fallback expressions for params declared in [params].
# NOTE(review): presumably applied when the metadata key is missing (the `?`
# keys) or, per the n_kv_heads note, when it reads as 0 — confirm in the loader.
[params.defaults]
n_kv_heads           = "n_heads"   # GGUF convention: 0 means same as n_heads (no GQA)
nextn_predict_layers = "0"         # non-MTP models: no nextn head → n_layers == block_count

# Model-level (non-per-layer) tensors: logical name → GGUF tensor name.
[weights.global]
token_embd  = "token_embd.weight"
output_norm = "output_norm.weight"
# NOTE(review): [architecture] sets tied_embeddings = true and the file header
# says there is no separate output.weight, yet this binding names one.
# Presumably the loader falls back to token_embd when the tensor is absent —
# confirm, or mark the binding optional if the loader supports that.
output      = "output.weight"

# Decoder layer loop: `count` layers, each with tensor names under `prefix`.
[layers]
count  = "n_layers"
prefix = "blk.@{layer_idx}."

# Chooses the block type per layer. With full_attn_interval = 4 the rule is
# false for layer indices 3, 7, 11, ... (every 4th layer), which therefore get
# full_attention; all other layers get recurrent_ssm.
[layers.routing]
# @{layer_idx} = layer index (0-based), a builtin. ${name} dereferences a resolved param.
rule     = "(@{layer_idx} + 1) % ${full_attn_interval} != 0"
if_true  = "recurrent_ssm"
if_false = "full_attention"

# Weights present in every layer regardless of the routed block type.
# Note the name mismatch: the ffn_norm param binds the tensor called
# post_attention_norm.weight.
[layers.common_weights]
attn_norm      = "attn_norm.weight"
ffn_norm       = "post_attention_norm.weight"

# Full softmax-attention block; selected by [layers.routing] when its rule is
# false.
[blocks.full_attention]
builder = "full_attention_gated"

  # Per-layer tensors for this block type.
  # NOTE(review): names are presumably resolved under the [layers] prefix
  # ("blk.@{layer_idx}.") — confirm.
  [blocks.full_attention.weights]
  attn_q      = "attn_q.weight"
  attn_k      = "attn_k.weight"
  attn_v      = "attn_v.weight"
  attn_output = "attn_output.weight"
  attn_q_norm = "attn_q_norm.weight"
  attn_k_norm = "attn_k_norm.weight"

  # Builder options. Per the file header: the Q projection is joint Q+gate
  # (q_has_gate), Q/K are normalized (qk_norm), and the output is sigmoid-gated
  # (output_gate).
  [blocks.full_attention.config]
  q_has_gate    = true
  qk_norm       = "rms"
  # imrope: interleaved multi-section M-RoPE (GGML_ROPE_TYPE_IMROPE, Qwen3-VL).
  # For text (no image) it reduces to the plain NEOX `multi` result; image spans
  # get 2-D grid positions via the decoder get_rope_index analogue. Requires the
  # [4*n_tokens] InpPos buffer (built in graph.go::buildDecoderPositions).
  rope          = "imrope"
  output_gate   = "sigmoid"

  # Per-layer KV cache tensors, stored as f32.
  # NOTE(review): max_seq_len is not declared in this file — presumably a
  # runtime-supplied builtin; confirm.
  [blocks.full_attention.cache]
  k = { dims = ["head_dim", "max_seq_len", "n_kv_heads"], dtype = "f32" }
  v = { dims = ["head_dim", "max_seq_len", "n_kv_heads"], dtype = "f32" }

# Delta-net (recurrent SSM) block; selected by [layers.routing] when its rule
# is true.
[blocks.recurrent_ssm]
builder = "gated_delta_net"

  # Per-layer tensors for this block type. Note that ssm_a has no ".weight"
  # suffix and ssm_dt_bias binds "ssm_dt.bias"; both are kept exactly as written.
  [blocks.recurrent_ssm.weights]
  attn_qkv    = "attn_qkv.weight"
  attn_gate   = "attn_gate.weight"
  ssm_a       = "ssm_a"
  ssm_alpha   = "ssm_alpha.weight"
  ssm_beta    = "ssm_beta.weight"
  ssm_conv1d  = "ssm_conv1d.weight"
  ssm_dt_bias = "ssm_dt.bias"
  ssm_norm    = "ssm_norm.weight"
  ssm_out     = "ssm_out.weight"

  # Builder options. Per the file header: Q/K are L2-normalized and the output
  # is a gated RMSNorm (rms_norm × silu(z)).
  [blocks.recurrent_ssm.config]
  conv_activation = "silu"
  qk_norm         = "l2"
  gate_norm       = "rms"
  gate_activation = "silu"

  # Recurrent state carried between steps, stored as f32. Dimensions are
  # expressions over [params] / [params.derived].
  [blocks.recurrent_ssm.cache]
  conv_state = { dims = ["ssm_d_conv - 1", "conv_channels"], dtype = "f32" }
  ssm_state  = { dims = ["head_v_dim", "head_v_dim", "ssm_dt_rank"], dtype = "f32" }

# Feed-forward network shared by all decoder layers (SwiGLU, per the file header).
[ffn]
builder = "swiglu"

  # Gate, up and down projection tensors.
  # NOTE(review): presumably resolved under the [layers] prefix like the block
  # weights — confirm.
  [ffn.weights]
  gate = "ffn_gate.weight"
  up   = "ffn_up.weight"
  down = "ffn_down.weight"

# Special token strings for this architecture.
# NOTE(review): think_open/think_close presumably delimit the reasoning span and
# extra_eos lists additional stop tokens beyond the tokenizer's own EOS —
# confirm against the consumer.
[tokens]
think_open  = "<think>"
think_close = "</think>"
extra_eos   = ["<|endoftext|>"]

# ===========================================================================
# Vision tower (Qwen3.5-VL — PROJECTOR_TYPE_QWEN3VL mmproj).
#
# Runs through the same generic block/FFN builders as the decoder via
# vision.go::BuildVisionGraph's §4 dispatch loop. Differences from Gemma 4's
# tower are all data-driven (no model-specific Go): LayerNorm-with-bias (not
# RMS), fused QKV (split into views in-graph), M-RoPE GGML_ROPE_TYPE_VISION
# (not axial 2D), dual-conv merge-grouped patch embed, bilinearly-interpolated
# learned position grid, and a 2-layer MLP projector (not single-linear+norm).
#
# Hparams source: models/mmproj-Qwen3.5-9B.gguf metadata (clip.vision.*):
#   block_count 27, embedding_length 1152, feed_forward_length 4304,
#   head_count 16 (head_dim 72), patch 16, image 768, spatial_merge 2,
#   projection_dim 4096, GELU (tanh), eps 1e-6, mean/std [0.5].
# Full attention in all 27 layers; no window/deepstack/temporal for this mmproj.
# ===========================================================================

# Vision tower settings given as literal values (not metadata bindings).
# NOTE(review): patch_size is set here and also bound to "vision.patch_size" in
# [vision.params] — confirm which one the loader prefers.
[vision]
patch_size       = 16
image_token      = "<|image_pad|>"
# 0 = variable per image (smart-resize); splice computes the run length.
n_image_tokens   = 0
norm_type        = "layernorm"

# Per-architecture constants that the mmproj metadata block does not carry:
#   n_merge          — spatial_merge_size (2)
#   rope_theta       — 10000 (qwen3vl.cpp ggml_rope_multi)
#   image_*_tokens   — bounds from clip.cpp set_limit_image_tokens(8, 4096)
n_merge          = 2
rope_theta       = 10000.0
image_min_tokens = 8
image_max_tokens = 4096

# Metadata bindings for the vision tower (param name → mmproj metadata key).
# NOTE(review): the banner above cites clip.vision.* keys while these are
# written as vision.* — presumably the loader adds the "clip." prefix; confirm.
[vision.params]
n_layers   = "vision.block_count"
n_heads    = "vision.attention.head_count"
n_embd     = "vision.embedding_length"
n_ff       = "vision.feed_forward_length"
# NOTE(review): no derived/default rule for head_dim is declared in this file,
# so the n_embd/n_heads fallback mentioned below presumably lives in the loader.
head_dim   = "vision.attention.key_length?"  # mmproj omits it; derived from n_embd/n_heads
# Named rms_eps, but bound to the LayerNorm epsilon key ([vision] norm_type = "layernorm").
rms_eps    = "vision.attention.layer_norm_epsilon"
patch_size = "vision.patch_size"

# Vision-tower tensors that are not per-layer: patch embedding (two conv
# kernels plus bias), learned position grid, and the post-LayerNorm
# weight/bias.
[vision.weights.global]
patch_embd      = "v.patch_embd.weight"
patch_embd_1    = "v.patch_embd.weight.1"   # second conv kernel (summed)
patch_embd_bias = "v.patch_embd.bias"
position_embd   = "v.position_embd.weight"  # learned 48x48 grid, bilinearly interpolated
post_ln         = "v.post_ln.weight"
post_ln_bias    = "v.post_ln.bias"

# Vision encoder layer loop: `count` layers with tensor names under `prefix`.
[vision.layers]
count  = "n_layers"
prefix = "v.blk.@{layer_idx}."

# Every vision layer uses the same block type (no per-layer rule).
[vision.layers.routing]
uniform = "encoder_layer"

# Per-layer vision weights (LayerNorm carries a bias; fused QKV is one tensor).
# The keys defined here (attn_output, ffn_up, ...) are the names that
# [vision.blocks.encoder_layer.weights] and [vision.ffn.weights] refer to.
# Note that attn_output binds the tensor named "attn_out", not "attn_output".
[vision.layers.common_weights]
ln1                = "ln1.weight"
ln1_bias           = "ln1.bias"
ln2                = "ln2.weight"
ln2_bias           = "ln2.bias"
attn_qkv           = "attn_qkv.weight"   # fused QKV — split into q/k/v views in-graph (qkv_fused)
attn_qkv_bias      = "attn_qkv.bias"
attn_output        = "attn_out.weight"
attn_output_bias   = "attn_out.bias"
ffn_up             = "ffn_up.weight"
ffn_up_bias        = "ffn_up.bias"
ffn_down           = "ffn_down.weight"
ffn_down_bias      = "ffn_down.bias"

# Vision encoder block — generic `attention` builder.
#   rope=mrope_vision → GGML_ROPE_TYPE_VISION M-RoPE (theta 10000 via rope_freq_base).
#   kq_prec=native    → no forced-F32 KQ accumulation (clip-encoder parity).
#   non_causal=true   → bidirectional patch attention.
#   qkv_fused=true    → split the single attn_qkv weight/bias into q/k/v views.
# No kq_scale override → default 1/sqrt(head_dim). No qk_norm, no v_norm.
# NOTE(review): the theta value is declared as rope_theta in [vision], not as
# rope_freq_base — presumably the loader maps one onto the other; confirm.
[vision.blocks.encoder_layer]
builder = "attention"

  # Values here match key names in [vision.layers.common_weights] rather than
  # GGUF tensor names.
  [vision.blocks.encoder_layer.weights]
  attn_output      = "attn_output"
  attn_output_bias = "attn_output_bias"

  # Builder options; each is explained in the comment above the block header.
  [vision.blocks.encoder_layer.config]
  rope       = "mrope_vision"
  kq_prec    = "native"
  non_causal = true
  qkv_fused  = true

# Vision FFN — generic plain MLP: up → GELU → down (no gate), with biases.
[vision.ffn]
builder = "mlp"

  # NOTE(review): the banner above describes the activation as "GELU (tanh)";
  # confirm that "gelu" selects the tanh approximation in the mlp builder.
  [vision.ffn.config]
  activation = "gelu"

  # Values here match key names in [vision.layers.common_weights] rather than
  # GGUF tensor names.
  [vision.ffn.weights]
  up        = "ffn_up"
  up_bias   = "ffn_up_bias"
  down      = "ffn_down"
  down_bias = "ffn_down_bias"

# Projector — 2-layer MLP: mm.0 → GELU → mm.2 (with biases, no norm).
# Input [n_embd*4=4608, n_pos/4]; output [projection_dim=4096, n_pos/4].
# (4608 = 1152 * 4, using the embedding_length quoted in the vision banner.)
[projector]
type = "mlp"

# First linear layer (proj, mm.0) and second linear layer (proj2, mm.2).
[projector.weights]
proj       = "mm.0.weight"
proj_bias  = "mm.0.bias"
proj2      = "mm.2.weight"
proj2_bias = "mm.2.bias"

# Diagram-only example values; no effect on inference.
# They mirror the figures quoted in the comments of this file: 32 decoder
# layers with full attention every 4th layer, and a 27-layer vision tower.
[example]
n_layers        = 32
vision_n_layers = 27   # Qwen3.5-VL vision_config.depth
full_attn_every = 4