# Gemma 4 architecture definition (ISWA — Interleaved Sliding Window Attention).
#
# Dense transformer with two attention types: full attention (512-dim heads, proportional RoPE)
# and sliding-window attention (256-dim heads, standard RoPE). Layers are assigned by a bool
# array from GGUF metadata. The last N layers share KV cache with earlier layers (driven by
# n_kv_shared_layers param, handled at cache allocation time — no TOML-level cache sharing).
#
# Novel features: GeGLU FFN, post-attention/post-FFN norms, per-layer embeddings with GELU
# gating, embedding scaling, logit softcapping, V-norm (raw RMS), layer output scaling,
# RoPE frequency factors for full attention layers.
#
# MoE variant (e.g. 26B-A4B): same GGUF architecture name ("gemma4"), auto-detected via
# [ffn_alt] weights. All layers use routed MoE + shared expert. n_kv_heads is a per-layer
# array in the GGUF (8 for SWA, 2 for full-attn); param fallback = first element (8).
# Full-attn n_kv_heads inferred at runtime from K tensor shape.
#
# Known model sizes:
#   30 layers → 26B-A4B (MoE: 128 experts, top-8)
#   35 layers → E2B
#   42 layers → E4B

[architecture]
name = "gemma4"
tied_embeddings = true
embed_scale = true

# Logical parameter name → metadata key it is read from. A trailing `?` is
# used on keys that may be absent from the model file.
[params]
n_layers = "gemma4.block_count"
n_heads = "gemma4.attention.head_count"
n_kv_heads = "gemma4.attention.head_count_kv"
n_embd = "gemma4.embedding_length"
n_ff = "gemma4.feed_forward_length"
rms_eps = "gemma4.attention.layer_norm_rms_epsilon"
head_dim = "gemma4.attention.key_length"
rope_n_rot = "gemma4.rope.dimension_count"
rope_freq_base = "gemma4.rope.freq_base"
head_dim_swa = "gemma4.attention.key_length_swa"
rope_n_rot_swa = "gemma4.rope.dimension_count_swa"
rope_freq_base_swa = "gemma4.rope.freq_base_swa"
sliding_window = "gemma4.attention.sliding_window"
n_kv_shared_layers = "gemma4.attention.shared_kv_layers"
n_embd_per_layer = "gemma4.embedding_length_per_layer_input"
logit_softcapping = "gemma4.final_logit_softcapping"
swa_pattern = "gemma4.attention.sliding_window_pattern"
rope_mode = "standard"
# MoE params (absent in dense models)
n_expert = "gemma4.expert_count?"
n_expert_used = "gemma4.expert_used_count?"
n_ff_exp = "gemma4.expert_feed_forward_length?"

[params.derived]
n_vocab = "token_embd.ne[1]"

[weights.global]
token_embd = "token_embd.weight"
output_norm = "output_norm.weight"
output = "output.weight"
tok_embd_per_layer = "per_layer_token_embd.weight"
per_layer_model_proj = "per_layer_model_proj.weight"
per_layer_proj_norm = "per_layer_proj_norm.weight"

# Per-layer tensor names are formed as prefix + suffix; the suffixes are the
# values listed in the per-layer weight tables.
[layers]
count = "n_layers"
prefix = "blk.@{layer_idx}."
# Each layer is routed to one of the two attention blocks below according to
# the boolean swa_pattern parameter.
[layers.routing]
pattern = "swa_pattern"
if_true = "swa_attention"
if_false = "full_attention"

[layers.common_weights]
attn_norm = "attn_norm.weight"
ffn_norm = "ffn_norm.weight"
attn_post_norm = "post_attention_norm.weight"
ffn_post_norm = "post_ffw_norm.weight"
pe_inp_gate = "inp_gate.weight"
pe_proj = "proj.weight"
pe_post_norm = "post_norm.weight"
layer_output_scale = "layer_output_scale.weight"

[blocks.full_attention]
builder = "attention"

[blocks.full_attention.weights]
attn_q = "attn_q.weight"
attn_k = "attn_k.weight"
attn_v = "attn_v.weight"
attn_output = "attn_output.weight"
attn_q_norm = "attn_q_norm.weight"
attn_k_norm = "attn_k_norm.weight"
rope_freqs = "rope_freqs.weight"

[blocks.full_attention.config]
rope = "neox"
v_norm = "rms"
kq_scale = 1.0
shared_kv_group = "full"

[blocks.full_attention.cache]
k = { dims = ["head_dim", "max_seq_len", "n_kv_heads"], dtype = "f32" }
v = { dims = ["head_dim", "max_seq_len", "n_kv_heads"], dtype = "f32" }

[blocks.swa_attention]
builder = "attention"

[blocks.swa_attention.weights]
attn_q = "attn_q.weight"
attn_k = "attn_k.weight"
attn_v = "attn_v.weight"
attn_output = "attn_output.weight"
attn_q_norm = "attn_q_norm.weight"
attn_k_norm = "attn_k_norm.weight"

# The SWA block points head_dim / rope_* at the *_swa parameters declared in
# [params]; the full-attention block sets no such overrides.
[blocks.swa_attention.config]
rope = "neox"
v_norm = "rms"
sliding_window = true
kq_scale = 1.0
shared_kv_group = "swa"
head_dim = "head_dim_swa"
n_kv_heads = "n_kv_heads"
rope_n_rot = "rope_n_rot_swa"
rope_freq_base = "rope_freq_base_swa"

[blocks.swa_attention.cache]
k = { dims = ["head_dim_swa", "max_seq_len", "n_kv_heads"], dtype = "f32" }
v = { dims = ["head_dim_swa", "max_seq_len", "n_kv_heads"], dtype = "f32" }

[ffn]
builder = "geglu"

[ffn.weights]
gate = "ffn_gate.weight"
up = "ffn_up.weight"
down = "ffn_down.weight"

# --- MoE FFN (auto-detected from weight presence in 26B-A4B and similar) ---
[ffn_alt]
builder = "moe"

[ffn_alt.config]
activation = "gelu"
self_normed = true
norm_w = true

[ffn_alt.weights]
# Router (normalized: rms_norm + learned scale)
gate_inp = "ffn_gate_inp.weight"
gate_inp_s = "ffn_gate_inp.scale"
# Fused gate+up experts (3D: stacked by expert)
gate_up_exps = "ffn_gate_up_exps.weight"
down_exps = "ffn_down_exps.weight"
# Per-expert output scaling
down_exps_s = "ffn_down_exps.scale"
# Shared expert (plain FFN, same activation as experts)
gate = "ffn_gate.weight"
up = "ffn_up.weight"
down = "ffn_down.weight"
# Self-normed: separate pre/post norms for shared and expert paths
# (norm = shared expert pre-norm, same tensor as common ffn_norm)
norm = "ffn_norm.weight"
pre_norm_2 = "pre_ffw_norm_2.weight"
post_norm_1 = "post_ffw_norm_1.weight"
post_norm_2 = "post_ffw_norm_2.weight"

[tokens]
think_open = "<|channel>"
# NOTE(review): think_close is an empty string while think_open is
# "<|channel>". The closing token text looks like it was stripped (token names
# are also missing from the comment below). Presumably "<channel|>" — confirm
# against the tokenizer vocab before changing the value.
think_close = ""

# Gemma 4's generation_config.json declares three EOS-class tokens (ids 1,
# 106, 50). The GGUF format spec carries only the canonical EOS slot (token 1
# = ), so the other two need to be listed here as single-vocab-entry
# stop strings. They are semantically distinct turn-yield signals:
#    (id 106)                — end-of-turn (chat hand-off)
#   <|tool_response> (id 50) — model emitted a tool_call and is yielding
#                              until the orchestrator returns a result
# Both are CONTROL / USER_DEFINED type tokens in the vocab so they tokenize
# as single entries (no BPE-decomposition warning at load time).
# NOTE(review): the first entry below is empty although the comment describes
# an id-106 end-of-turn token; restore the real token string after confirming
# it against the vocab.
extra_eos = ["", "<|tool_response>"]

# ===========================================================================
# Vision tower (multimodal models only).
#
# The per-layer encoder runs through the same generic block/FFN builders as
# the decoder (vision.go::BuildVisionGraph's §4 dispatch loop): the
# [vision.blocks.*] / [vision.ffn] sections below select and configure them,
# exactly as [blocks.*] / [ffn] do for the decoder. The tower's I/O — patch
# embedding, 2D position embedding, pooler, and projector — stays procedural
# in BuildVisionGraph and shares the existing ggml op wrappers (Conv2D,
# RmsNorm, Pool2D, etc.); it mirrors llama.cpp's clip_graph_gemma4v::build().
#
# The vision tensors are loaded into the same WeightStore as the decoder
# via the existing TensorNames()/uploadWeights pipeline (their GGUF
# names — `v.blk.<layer_idx>.*`, `v.patch_embd.weight`, `v.position_embd.weight`,
# `mm.input_projection.weight` — are disjoint from the decoder's
# `blk.<layer_idx>.*` / global names). ResolveVisionWeights in vision.go
# rebuilds the per-layer logical→GGUF name map at load time from the
# templates below.
#
# Unimodal Gemma 4 variants (E2B for example) simply don't have a
# `vision_config` in their config.json, so vision.* params won't
# resolve, and BuildVisionGraph is never invoked.
# ===========================================================================

[vision]
patch_size = 16
image_token = "<|image|>"
n_image_tokens = 280  # vision_soft_tokens_per_image (max after pooling)

# Architectural constants. These match upstream llama.cpp's
# clip.cpp PROJECTOR_TYPE_GEMMA4V case (n_merge = pooling_kernel_size,
# rope_theta from rope_parameters.rope_theta, image-token bounds from
# set_limit_image_tokens(252, 280)). They aren't in the model file's
# own metadata, but they're fixed per-arch — declaring them here is the
# difference between data-driven configuration and model-specific Go.
n_merge = 3
rope_theta = 100.0
image_min_tokens = 252
image_max_tokens = 280

# Gemma decodes the image ubatch with causal_attn=false (llama-mtmd
# mtmd_decode_use_non_causal is true for GEMMA4V) → the decoder attends to each
# image's soft-token span bidirectionally. Qwen3-VL and the default keep image
# tokens causal, so this flag is Gemma-only.
decoder_non_causal = true

# NOTE(review): patch_size is set literally in [vision] (16) and also mapped
# here to a metadata key; which one takes precedence is decided by the loader
# — confirm.
[vision.params]
n_layers = "vision.block_count"
n_heads = "vision.attention.head_count"
n_embd = "vision.embedding_length"
n_ff = "vision.feed_forward_length"
# optional: mmproj GGUFs omit it (clip.* namespace lacks key_length);
# ResolveVisionParams derives from n_embd/n_heads
head_dim = "vision.attention.key_length?"
rms_eps = "vision.attention.layer_norm_epsilon"
patch_size = "vision.patch_size"

[vision.weights.global]
patch_embd = "v.patch_embd.weight"
position_embd = "v.position_embd.weight"

[vision.layers]
count = "n_layers"
prefix = "v.blk.@{layer_idx}."

# Gemma's tower is uniform: every layer is the same encoder block.
[vision.layers.routing]
uniform = "encoder_layer"

# Per-layer vision weights. These logical names are resolved to GGUF tensor
# names (prefix + suffix) and stored in VisionTensors.Layers[il]; the §4
# dispatch frame applies the norms (ln1/ln2/*_post_norm) directly and the
# block/FFN builders read the rest via the weight remaps in
# [vision.blocks.*.weights] / [vision.ffn.weights] below.
[vision.layers.common_weights]
ln1 = "ln1.weight"  # pre-attention norm (RmsNorm in Gemma 4)
attn_post_norm = "attn_post_norm.weight"
ln2 = "ln2.weight"  # pre-FFN norm
ffn_post_norm = "ffn_post_norm.weight"
attn_q = "attn_q.weight"
attn_k = "attn_k.weight"
attn_v = "attn_v.weight"
attn_output = "attn_out.weight"  # upstream GGUF mmproj convention (vs decoder's "attn_output")
attn_q_norm = "attn_q_norm.weight"  # per-head Q-norm (head_dim-wide)
attn_k_norm = "attn_k_norm.weight"
ffn_gate = "ffn_gate.weight"
ffn_up = "ffn_up.weight"
ffn_down = "ffn_down.weight"

# Vision encoder block — generic `attention` builder, configured to reproduce
# the old hand-coded Gemma-vision attention op-for-op:
#   rope=axial2d   → 2D split RoPE (first half of head_dim by PosX, second by
#                    PosY); theta comes from [vision].rope_theta via the
#                    synthetic vision ResolvedParams (rope_freq_base).
#   v_norm=rms     → raw RMS norm on V (no learned scale).
#   qk_norm        → applied because attn_q_norm/attn_k_norm weights are present.
#   kq_scale=1.0   → Gemma vision overrides the 1/sqrt(d_head) default.
#   kq_prec=native → skip the forced-F32 KQ accumulation (clip-encoder parity).
#   non_causal=true → no mask (bidirectional patch attention).
# The .weights map is builder-key → per-layer logical name (above), since the
# tensors are already resolved into VisionTensors.Layers[il] by that name.
[vision.blocks.encoder_layer]
builder = "attention"

[vision.blocks.encoder_layer.weights]
attn_q = "attn_q"
attn_k = "attn_k"
attn_v = "attn_v"
attn_output = "attn_output"
attn_q_norm = "attn_q_norm"
attn_k_norm = "attn_k_norm"

[vision.blocks.encoder_layer.config]
rope = "axial2d"
v_norm = "rms"
kq_scale = 1.0
kq_prec = "native"
non_causal = true

# Vision FFN — generic geglu builder: gelu(gate·x) * (up·x) → down. The
# builder reads gate/up/down; the remap pulls them from the ffn_* per-layer
# logical names.
[vision.ffn]
# Quick-GELU (x*sigmoid(1.702x)), not tanh-GELU: the Gemma 4 mmproj sets
# neither use_gelu nor use_silu, so llama.cpp's clip falls through to
# FFN_GELU_QUICK (clip.cpp; dump shows `ffn_geglu_quick`). The text decoder's
# [ffn] stays on plain "geglu" (tanh) — only the ViT encoder is quick-GELU.
builder = "geglu_quick"

[vision.ffn.weights]
gate = "ffn_gate"
up = "ffn_up"
down = "ffn_down"

# Projector — single linear matmul + post-projection RmsNorm.
# Output: [n_decoder_embd=2560, n_image_tokens=280].
[projector]
type = "linear_post_norm"

[projector.weights]
proj = "mm.input_projection.weight"

# Diagram-only example values; no effect on inference.
[example]
n_layers = 30
vision_n_layers = 16  # gemma-4 vision_config.num_hidden_layers
attn_pattern_false_every = 5