// Package main implements a minimal GPT-2 in a single, self-contained Go file.
//
// This is a Go port of Andrej Karpathy's microgpt, built to study transformer
// internals with zero external dependencies. It is a simple, personal learning
// implementation meant to help a Go developer understand the core ideas.
//
// For Quick Start, configuration, and benchmarks, see README.md.
//
// - Requirements: Go 1.22+ (uses range over integer)
// - Repository: https://github.com/KEINOS/go-microgpt/
// - References:
//   - microgpt (Python): https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
//   - microgpt (blog): https://karpathy.github.io/2026/02/12/microgpt/
//   - Training data: https://github.com/karpathy/makemore
package main

import (
	"fmt"
	"io"
	"math"
	"math/rand"
	"net/http"
	"os"
	"slices"
	"sort"
	"strings"
)

// ============================================================================
//  Configuration & Constants
// ============================================================================

// Numerical constants shared by the model code.
const (
	zeroDivisionEps   = 1e-5 // Small constant added to the mean square in rmsnorm to prevent division by zero
	mlpExpansionRatio = 4    // MLP hidden layer expansion ratio (nEmbd -> 4*nEmbd -> nEmbd)
)

// Model architecture configuration.
const (
	nLayer    = 1             // Number of transformer layers
	nEmbd     = 16            // Embedding dimension (length of every token/position vector)
	blockSize = 16            // Maximum context window length (number of rows in the position table)
	nHead     = 4             // Number of attention heads
	headDim   = nEmbd / nHead // Dimension per head = 4 (integer division: nEmbd should be a multiple of nHead)
)

// Training configuration.
const (
	numSteps     = 1000 // Training steps; also the horizon of the linear learning-rate decay in adamOptimizer.step
	learningRate = 0.01 // Initial learning rate
	beta1        = 0.85 // Adam first-moment decay
	beta2        = 0.99 // Adam second-moment decay
	epsAdam      = 1e-8 // Adam epsilon
)

// Data and inference configuration.
const (
	randomSeed          = 42 // Global RNG seed
	dataURL             = "https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt"
	temperature         = 0.5  // Sampling temperature for inference (logits are divided by this value)
	initStd             = 0.08 // Std dev for weight initialization
	numInferenceSamples = 20   // Number of samples to generate during inference
)

// rng is the global random number generator. It is nil until initRNG has been
// called (main does so once at startup); shuffleDocs, matrix and
// weightedChoice all draw from it.
var rng *rand.Rand

// ============================================================================
//  Main Function (Training & Inference)
// ============================================================================

// main wires the whole program together: it seeds the global RNG, fetches and
// loads the dataset, builds the character vocabulary, initializes the model
// parameters and the Adam optimizer, trains for numSteps steps, and finally
// prints numInferenceSamples generated samples. A dataset download or load
// failure is reported on stderr and terminates the process with exit code 1.
//
// Shuffling, weight initialization and sampling all draw from the same global
// RNG, so the order of these steps determines the output for a given seed.
func main() {
	// Initialize global Random Number Generator
	initRNG(randomSeed)

	// ------------------------------------------------------------------------
	//  Let there be a Dataset `docs`: []string of documents (e.g. a list of names)
	// ------------------------------------------------------------------------

	// Download dataset if needed (skipped when input.txt already exists)
	err := downloadDataset(dataURL, "input.txt")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error downloading dataset: %v\n", err)
		os.Exit(1)
	}

	// Load documents (1 sample per line)
	docs, err := loadDocs("input.txt")
	if err != nil {
		fmt.Fprintf(os.Stderr, "error loading docs: %v\n", err)
		os.Exit(1)
	}

	// Shuffle documents
	shuffleDocs(docs)
	fmt.Printf("num docs: %d\n", len(docs))

	// ------------------------------------------------------------------------
	//  Let there be a Tokenizer to translate strings to sequences of integers
	// ("tokens") and back
	// ------------------------------------------------------------------------

	// Build vocabulary.
	// * `uchars` is the sorted list of unique characters in the dataset
	// * `BOS` is a special token ID for "beginning of sequence". It doubles as
	//   the "end of sequence" (a.k.a. EOS) marker
	// * `vocabSize` is the total number of tokens (unique chars + BOS)
	uchars, BOS, vocabSize := buildVocab(docs)
	fmt.Printf("vocab size: %d\n", vocabSize)

	// ------------------------------------------------------------------------
	//  Initialize the parameters, to store the knowledge of the model
	// ------------------------------------------------------------------------

	stateDict := newStateDict(vocabSize) // initialized model weight matrices

	// Flat view of every weight; the *Value pointers are shared with stateDict,
	// so optimizer updates are visible to the model.
	params := flattenParams(stateDict)
	fmt.Printf("num params: %d\n", len(params))

	// ------------------------------------------------------------------------
	//  Let there be Adam, the blessed optimizer and its buffers
	// ------------------------------------------------------------------------

	// Create Adam optimizer
	optimizer := newAdamOptimizer(params, learningRate, beta1, beta2, epsAdam)

	// ------------------------------------------------------------------------
	//  Repeat in sequence (training loop)
	// ------------------------------------------------------------------------

	train(numSteps, docs, uchars, BOS, vocabSize, stateDict, optimizer)

	// ------------------------------------------------------------------------
	//  Inference: may the model babble back to us
	// ------------------------------------------------------------------------

	fmt.Println("\n--- inference (new, hallucinated names) ---")

	for sampleIdx := range numInferenceSamples {
		result := sample(temperature, blockSize, uchars, BOS, vocabSize, stateDict)
		fmt.Printf("sample %2d: %s\n", sampleIdx+1, result)
	}
}

// ============================================================================
//  Type: Value - Autograd Node
// ============================================================================
// Let there be Autograd to recursively apply the chain rule through a computation
// graph (a.k.a. backpropagation).

// Value represents a scalar node in a computation graph for automatic differentiation.
// All scalars in computation must be *Value (not Value) to support gradient accumulation
// on shared weight nodes during backward pass.
//
// children and localGrads are parallel slices: localGrads[i] is the partial
// derivative of this node with respect to children[i]. Leaf nodes (weights and
// constants) are created with both slices nil.
type Value struct {
	Data       float64   // Forward pass scalar value
	Grad       float64   // Gradient accumulated during backward pass
	children   []*Value  // Child nodes (operands)
	localGrads []float64 // Local derivatives w.r.t. children (chain rule coefficients)
}

// ----------------------------------------------------------------------------
//  Constructor
// ----------------------------------------------------------------------------

// newValue allocates a graph node holding data. children are the operands the
// node was computed from and localGrads the matching partial derivatives; pass
// nil for both to create a leaf. The slices are stored as given, not copied.
func newValue(data float64, children []*Value, localGrads []float64) *Value {
	node := new(Value) // Grad starts at its zero value
	node.Data = data
	node.children = children
	node.localGrads = localGrads

	return node
}

// ----------------------------------------------------------------------------
//  Methods
// ----------------------------------------------------------------------------

// backward runs reverse-mode automatic differentiation starting at v: it
// orders every node reachable from v topologically, seeds v.Grad with 1, and
// then pushes gradients from each node to its children via the chain rule.
//
// Gradients are accumulated with +=, so Grad fields of reachable nodes must be
// zero (or intentionally carried over) before calling.
func (v *Value) backward() {
	// Depth-first post-order walk: a node is emitted only after all of its
	// operands, which yields a topological order of the graph.
	seen := map[*Value]struct{}{}

	var order []*Value

	var visit func(n *Value)

	visit = func(n *Value) {
		if _, done := seen[n]; done {
			return
		}

		seen[n] = struct{}{}

		for _, operand := range n.children {
			visit(operand)
		}

		order = append(order, n)
	}

	visit(v)

	// d(v)/d(v) = 1
	v.Grad = 1.0

	// Walk from the root towards the leaves so that a node's gradient is
	// complete before it is distributed to its operands.
	for idx := len(order) - 1; idx >= 0; idx-- {
		n := order[idx]
		for k := range n.children {
			n.children[k].Grad += n.localGrads[k] * n.Grad
		}
	}
}

// log returns a new node holding the natural logarithm of v.
// The local derivative is d/dx ln(x) = 1/x.
func (v *Value) log() *Value {
	x := v.Data
	operands := []*Value{v}
	partials := []float64{1 / x}

	return newValue(math.Log(x), operands, partials)
}

// exp returns a new node holding e raised to the power v.
// The local derivative of e^x is e^x itself.
func (v *Value) exp() *Value {
	result := math.Exp(v.Data)
	slope := math.Exp(v.Data)

	return newValue(result, []*Value{v}, []float64{slope})
}

// relu returns a new node holding max(0, v).
// The local derivative is 1 where v is strictly positive and 0 elsewhere.
func (v *Value) relu() *Value {
	var slope float64 // stays 0 for non-positive inputs
	if v.Data > 0 {
		slope = 1.0
	}

	activated := math.Max(0, v.Data)

	return newValue(activated, []*Value{v}, []float64{slope})
}

// ============================================================================
//  Type: StateDict - Model Parameters
// ============================================================================

// StateDict stores all model weight matrices indexed by name.
// Each matrix is a slice of rows of *Value. newStateDict fills in the keys
// "wte", "wpe" and "lm_head" plus, for every layer i, "layer<i>.attn_wq",
// ".attn_wk", ".attn_wv", ".attn_wo", ".mlp_fc1" and ".mlp_fc2".
type StateDict map[string][][]*Value

// ----------------------------------------------------------------------------
//  Constructor
// ----------------------------------------------------------------------------

// newStateDict builds every weight matrix of the model, randomly initialized
// by matrix with standard deviation initStd.
//
// vocabSize is the number of rows of the token-embedding and output tables.
// Matrices are created in a fixed order (embeddings, output head, then each
// layer's attention and MLP weights) so that the values drawn from the global
// RNG are reproducible for a given seed.
func newStateDict(vocabSize int) StateDict {
	const hidden = mlpExpansionRatio * nEmbd // width of the MLP hidden layer

	weights := make(StateDict)

	// Embedding tables: one row per token and one row per position.
	weights["wte"] = matrix(vocabSize, nEmbd, initStd)
	weights["wpe"] = matrix(blockSize, nEmbd, initStd)

	// Output logits projection.
	//
	// Note: Modern GPTs use weight tying (lm_head = wte.T) to reduce parameters
	// and improve performance, but this implementation keeps them separate to
	// match the Python reference (microgpt.py) for 1:1 parity.
	weights["lm_head"] = matrix(vocabSize, nEmbd, initStd)

	// Per-layer matrices, listed in creation order as [rows, cols].
	type shape struct {
		suffix     string
		rows, cols int
	}

	perLayer := []shape{
		{".attn_wq", nEmbd, nEmbd}, // query projection
		{".attn_wk", nEmbd, nEmbd}, // key projection
		{".attn_wv", nEmbd, nEmbd}, // value projection
		{".attn_wo", nEmbd, nEmbd}, // output projection
		{".mlp_fc1", hidden, nEmbd}, // expand layer
		{".mlp_fc2", nEmbd, hidden}, // contract layer
	}

	for layer := 0; layer < nLayer; layer++ {
		prefix := fmt.Sprintf("layer%d", layer)
		for _, s := range perLayer {
			weights[prefix+s.suffix] = matrix(s.rows, s.cols, initStd)
		}
	}

	return weights
}

// ============================================================================
//  Type: AdamOptimizer - Optimization Algorithm
// ============================================================================

// adamOptimizer implements the Adam optimization algorithm.
// m and v hold one running moment estimate per parameter and are indexed in
// lockstep with params.
type adamOptimizer struct {
	params []*Value  // Parameters updated in place by step
	m      []float64 // First moment (momentum)
	v      []float64 // Second moment (variance)
	lr     float64   // Learning rate
	beta1  float64   // First moment decay
	beta2  float64   // Second moment decay
	eps    float64   // Epsilon for numerical stability
}

// ----------------------------------------------------------------------------
//  Constructor
// ----------------------------------------------------------------------------

// newAdamOptimizer returns an Adam optimizer for params with zeroed moment
// buffers. lr is the initial learning rate, beta1/beta2 are the decay rates of
// the first and second moment estimates, and eps is the stabilizer added to
// the denominator of the update. The params slice is stored, not copied.
func newAdamOptimizer(params []*Value, lr, beta1, beta2, eps float64) *adamOptimizer {
	count := len(params)

	opt := new(adamOptimizer)
	opt.params = params
	opt.m = make([]float64, count)
	opt.v = make([]float64, count)
	opt.lr, opt.eps = lr, eps
	opt.beta1, opt.beta2 = beta1, beta2

	return opt
}

// ----------------------------------------------------------------------------
//  Method
// ----------------------------------------------------------------------------

// step applies one Adam update to every parameter and then clears its
// gradient. stepNum is the zero-based index of the current training step.
//
// The learning rate decays linearly: lr_t = lr * (1 - stepNum/numSteps), where
// numSteps is the package-level constant.
func (o *adamOptimizer) step(stepNum int) {
	// Convert to float64 before dividing; integer division would truncate to 0.
	progress := float64(stepNum) / float64(numSteps)
	lrNow := o.lr * (1.0 - progress)

	// Bias-correction denominators depend only on the step, not the parameter.
	t := float64(stepNum + 1)
	correction1 := 1 - math.Pow(o.beta1, t)
	correction2 := 1 - math.Pow(o.beta2, t)

	for idx := range o.params {
		param := o.params[idx]

		// Exponential moving averages of the gradient and the squared gradient.
		o.m[idx] = o.beta1*o.m[idx] + (1-o.beta1)*param.Grad
		o.v[idx] = o.beta2*o.v[idx] + (1-o.beta2)*param.Grad*param.Grad

		// Undo the bias towards zero caused by the zero-initialized averages.
		mHat := o.m[idx] / correction1
		vHat := o.v[idx] / correction2

		param.Data -= lrNow * mHat / (math.Sqrt(vHat) + o.eps)

		// Gradients accumulate during backward, so start the next step clean.
		param.Grad = 0
	}
}

// ============================================================================
//  Arithmetic Operations/Functions
// ============================================================================

// add returns a new node holding a + b.
// The sum has a partial derivative of 1 with respect to each operand.
func add(a, b *Value) *Value {
	operands := []*Value{a, b}
	partials := []float64{1, 1}

	return newValue(a.Data+b.Data, operands, partials)
}

// mul returns a new node holding a * b.
// By the product rule the partial with respect to a is b, and vice versa.
func mul(a, b *Value) *Value {
	operands := []*Value{a, b}
	partials := []float64{b.Data, a.Data}

	return newValue(a.Data*b.Data, operands, partials)
}

// pow returns a new node holding a raised to the constant power exp.
// exp is a plain float64, so no gradient flows into it; the local derivative
// with respect to a is exp * a^(exp-1).
func pow(a *Value, exp float64) *Value {
	base := a.Data
	result := math.Pow(base, exp)
	slope := exp * math.Pow(base, exp-1)

	return newValue(result, []*Value{a}, []float64{slope})
}

// neg returns a new node holding -a, built as a multiplication by a constant
// -1 leaf so that mul supplies the gradient.
func neg(a *Value) *Value {
	minusOne := newValue(-1, nil, nil)

	return mul(a, minusOne)
}

// sub returns a new node holding a - b, expressed as a + (-b) so that the
// gradient rules of add and neg are reused.
func sub(a, b *Value) *Value {
	negated := neg(b)

	return add(a, negated)
}

// div returns a new node holding a / b, expressed as a * b^-1 so that the
// gradient rules of mul and pow are reused.
func div(a, b *Value) *Value {
	reciprocal := pow(b, -1)

	return mul(a, reciprocal)
}

// ============================================================================
//  Model Architecture
// ============================================================================
// Function mapping tokens and parameters to logits ([]*Value) over what comes
// next. Follows GPT-2, blessed among the GPTs, with minor differences:
//   layernorm -> rmsnorm, no biases, GeLU -> ReLU

// linear computes the matrix-vector product y = W @ x (no bias).
// w has shape [nOut, nIn] and x has shape [nIn]; the result has shape [nOut].
// Each output element is a freshly built chain of add/mul nodes, so gradients
// flow back to both the weights and the inputs.
func linear(x []*Value, w [][]*Value) []*Value {
	result := make([]*Value, 0, len(w))

	for _, row := range w {
		// Dot product of one weight row with x, accumulated left to right.
		acc := newValue(0, nil, nil)
		for col, xj := range x {
			acc = add(acc, mul(row[col], xj))
		}

		result = append(result, acc)
	}

	return result
}

// softmax turns logits into a probability distribution, returning one node
// per input logit.
//
// The largest logit is subtracted before exponentiation so that exp never
// overflows; the maximum enters the graph as a constant leaf. logits must be
// non-empty.
func softmax(logits []*Value) []*Value {
	// Find the maximum on the raw float64 values.
	peak := logits[0].Data
	for i := 1; i < len(logits); i++ {
		if logits[i].Data > peak {
			peak = logits[i].Data
		}
	}

	shift := newValue(peak, nil, nil)
	denom := newValue(0, nil, nil)
	numer := make([]*Value, 0, len(logits))

	// Exponentiate the shifted logits and accumulate their sum.
	for _, logit := range logits {
		e := sub(logit, shift).exp()
		numer = append(numer, e)
		denom = add(denom, e)
	}

	// Divide every exponential by the shared sum.
	probs := make([]*Value, len(numer))
	for i, e := range numer {
		probs[i] = div(e, denom)
	}

	return probs
}

// rmsnorm rescales x so that its root mean square is approximately 1:
// every element is multiplied by (mean(x_i^2) + zeroDivisionEps)^-0.5.
//
// Note: Standard RMSNorm includes a learnable gamma scale parameter that
// scales the normalized output. This implementation uses parameter-free
// RMSNorm for educational clarity to match the original microgpt.py.
func rmsnorm(x []*Value) []*Value {
	// Sum of squares, accumulated left to right.
	acc := newValue(0, nil, nil)
	for i := range x {
		acc = add(acc, mul(x[i], x[i]))
	}

	// Mean square: multiply by the constant 1/len(x).
	invCount := newValue(1.0/float64(len(x)), nil, nil)
	meanSq := mul(acc, invCount)

	// Inverse root mean square; the epsilon keeps it finite for an all-zero x.
	epsilon := newValue(zeroDivisionEps, nil, nil)
	invRMS := pow(add(meanSq, epsilon), -0.5)

	normed := make([]*Value, 0, len(x))
	for _, xi := range x {
		normed = append(normed, mul(xi, invRMS))
	}

	return normed
}

// gpt computes the forward pass of the GPT model for a single token at a
// single position and returns the logits of shape [vocabSize].
//
// tokenID selects a row of the "wte" table and posID a row of the "wpe" table,
// so posID must be smaller than the number of rows of "wpe".
//
// keys and values are the per-layer KV caches, indexed [layer][time][nEmbd].
// The key and value vectors of the current token are appended to keys[li] and
// values[li]; because the element is assigned in place, the caller observes the
// growth. Call gpt with posID = 0, 1, 2, ... on the same caches to process a
// sequence token by token.
func gpt(tokenID, posID int, keys, values [][][]*Value, stateDict StateDict) []*Value {
	// Step 1: Token + Position Embedding
	tokEmb := stateDict["wte"][tokenID] // [nEmbd]
	posEmb := stateDict["wpe"][posID]   // [nEmbd]

	// Element-wise add embeddings
	x := make([]*Value, nEmbd)
	for i := range nEmbd {
		x[i] = add(tokEmb[i], posEmb[i])
	}

	// Apply RMSNorm
	x = rmsnorm(x)

	// Step 2: Transformer Layers
	for li := range nLayer {
		layerKey := fmt.Sprintf("layer%d", li)
		// ============================================================
		//  Attention Sub-block
		// ============================================================
		// Keep the un-normalized input for the residual connection below;
		// rmsnorm returns a new slice, so xRes is not overwritten.
		xRes := x
		x = rmsnorm(x)

		// Compute Q, K, V
		q := linear(x, stateDict[layerKey+".attn_wq"]) // [nEmbd]
		k := linear(x, stateDict[layerKey+".attn_wk"]) // [nEmbd]
		v := linear(x, stateDict[layerKey+".attn_wv"]) // [nEmbd]

		// Append to KV cache
		keys[li] = append(keys[li], k)
		values[li] = append(values[li], v)

		// Per-head attention; head outputs are concatenated into xAttn
		xAttn := make([]*Value, 0, nEmbd)

		for h := range nHead {
			// hs is the offset of this head's slice within the nEmbd-wide vectors
			hs := h * headDim
			qH := q[hs : hs+headDim]
			kH := make([][]*Value, len(keys[li]))

			vH := make([][]*Value, len(values[li]))
			// We only have past tokens in KV cache, so no explicit causal mask is needed.
			// If you later batch full sequences, add a mask to block future tokens.
			for t := range len(keys[li]) {
				kH[t] = keys[li][t][hs : hs+headDim]
				vH[t] = values[li][t][hs : hs+headDim]
			}

			// Compute scaled dot-product attention scores:
			// scores[t] = (qH . kH[t]) / sqrt(headDim)
			scores := make([]*Value, len(keys[li]))
			for t := range len(keys[li]) {
				sum := newValue(0, nil, nil)
				for j := range headDim {
					sum = add(sum, mul(qH[j], kH[t][j]))
				}

				scores[t] = mul(sum, newValue(1.0/math.Sqrt(float64(headDim)), nil, nil))
			}

			attnW := softmax(scores) // get attention weights

			// Weighted value aggregation:
			// headOut[j] = sum over t of attnW[t] * vH[t][j]
			headOut := make([]*Value, headDim)
			for j := range headDim {
				headOut[j] = newValue(0, nil, nil)
				for t := range len(values[li]) {
					headOut[j] = add(headOut[j], mul(attnW[t], vH[t][j]))
				}
			}

			xAttn = append(xAttn, headOut...)
		}

		// Output projection
		x = linear(xAttn, stateDict[layerKey+".attn_wo"])

		// Residual connection
		for i := range nEmbd {
			x[i] = add(x[i], xRes[i])
		}

		// ============================================================
		//  MLP Sub-block
		// ============================================================
		xRes = x
		x = rmsnorm(x)

		// Expand layer
		x = linear(x, stateDict[layerKey+".mlp_fc1"]) // [4*nEmbd]

		// ReLU activation
		for i := range x {
			x[i] = x[i].relu()
		}

		// Contract layer
		x = linear(x, stateDict[layerKey+".mlp_fc2"]) // [nEmbd]

		// Residual connection
		for i := range nEmbd {
			x[i] = add(x[i], xRes[i])
		}
	}

	// Step 3: Output logits (one per row of lm_head)
	logits := linear(x, stateDict["lm_head"])

	return logits
}

// ============================================================================
//  Training Loop
// ============================================================================

// train runs numSteps optimization steps, one document per step, and prints
// the running loss on a single, continuously overwritten line.
//
// docs are visited round-robin and must be non-empty. uchars is the sorted
// vocabulary used to tokenize each document and BOS is the token that both
// opens and closes every sequence. The unnamed int parameter (the vocabulary
// size at the call site) is unused. stateDict holds the model weights, which
// optimizer updates in place.
//
// NOTE(review): the learning-rate decay inside optimizer.step uses the
// package-level numSteps constant, not this function's numSteps parameter;
// the two only agree when the caller passes the constant.
func train(numSteps int, docs []string, uchars []rune, BOS, _ int, stateDict StateDict, optimizer *adamOptimizer) {
	const bosPadding = 2 // one BOS in front of the document, one behind

	lookup := buildVocabIndex(uchars)

	for step := 0; step < numSteps; step++ {
		// Pick the next document, wrapping around at the end of the list.
		text := docs[step%len(docs)]
		body := encode(text, lookup)

		// Token sequence: BOS, document tokens, BOS.
		seq := make([]int, 0, len(body)+bosPadding)
		seq = append(seq, BOS)
		seq = append(seq, body...)
		seq = append(seq, BOS)

		// Number of (input, target) pairs, capped by the context window.
		n := min(len(seq)-1, blockSize)

		// Forward pass
		// Note: We build one full graph for the whole sequence (clear but not fast).
		// This creates a large autograd graph in memory. Garbage collection handles cleanup,
		// but performance will degrade with larger blockSize. For production use, consider
		// chunking sequences or truncated backpropagation through time (TBPTT).
		keys := make([][][]*Value, nLayer)
		values := make([][][]*Value, nLayer)

		for layer := range keys {
			keys[layer] = [][]*Value{}
			values[layer] = [][]*Value{}
		}

		losses := make([]*Value, 0, n)

		for pos := 0; pos < n; pos++ {
			logits := gpt(seq[pos], pos, keys, values, stateDict)
			probs := softmax(logits)

			// Cross-entropy loss: -log(probability assigned to the next token)
			// Note: This keeps the softmax graph for clarity; log-sum-exp is a faster alternative.
			target := seq[pos+1]
			losses = append(losses, neg(probs[target].log()))
		}

		// Mean loss: sum everything first, then scale once by the float 1/n.
		// n >= 1 because seq always contains at least the two BOS tokens.
		total := losses[0]
		for _, l := range losses[1:] {
			total = add(total, l)
		}

		mean := mul(total, newValue(1.0/float64(n), nil, nil))

		// Backward pass, then Adam update (which also clears the gradients).
		mean.backward()
		optimizer.step(step)

		// Log progress (overwrite same line)
		fmt.Printf("\rstep %4d / %4d | loss %.4f", step+1, numSteps, mean.Data)
	}

	fmt.Println()
}

// ============================================================================
//  Inference / Sampling
// ============================================================================

// sample generates a single sample from the model.
//
// Generation starts from the BOS token and repeatedly feeds the previously
// sampled token back into gpt. It stops when BOS is sampled again or when the
// length limit is reached, and returns the generated characters.
//
// temperature divides the logits before the softmax; values below 1 sharpen
// the distribution and values above 1 flatten it. maxLen is the maximum number
// of tokens to generate; it is clamped to blockSize because gpt has no
// position embedding beyond that. uchars maps token IDs to characters. The
// unnamed int parameter (the vocabulary size at the call site) is unused.
func sample(temperature float64, maxLen int, uchars []rune, BOS, _ int, stateDict StateDict) string {
	keys := make([][][]*Value, nLayer)

	values := make([][][]*Value, nLayer)
	for i := range nLayer {
		keys[i] = make([][]*Value, 0)
		values[i] = make([][]*Value, 0)
	}

	// gpt indexes the position-embedding table by posID and that table only
	// has blockSize rows, so never step past it even if maxLen asks for more.
	steps := min(maxLen, blockSize)

	// The scaling factor is the same for every logit at every position, so
	// build the constant node once.
	invTemperature := newValue(1.0/temperature, nil, nil)

	tokenID := BOS

	var result strings.Builder

	for posID := range steps {
		logits := gpt(tokenID, posID, keys, values, stateDict)

		// Temperature scaling: divide each logit by temperature
		scaledLogits := make([]*Value, len(logits))
		for i, l := range logits {
			scaledLogits[i] = mul(l, invTemperature)
		}

		// Softmax
		probs := softmax(scaledLogits)

		// Extract probabilities as float64 for weighted choice
		weights := make([]float64, len(probs))
		for i, p := range probs {
			weights[i] = p.Data
		}

		// Sample token
		tokenID = weightedChoice(weights)

		// Stop on BOS, which doubles as the end-of-sequence marker
		if tokenID == BOS {
			break
		}

		// Append character to result
		if tokenID < len(uchars) {
			result.WriteRune(uchars[tokenID])
		}
	}

	return result.String()
}

// ----------------------------------------------------------------------------
//  Weighted Random Sampling
// ----------------------------------------------------------------------------

// weightedChoice draws a random index with probability proportional to its
// weight, using the global RNG. The weights do not need to sum to 1, but the
// slice must be non-empty. If the weights sum to zero or less, index 0 is
// returned.
func weightedChoice(weights []float64) int {
	count := len(weights)

	// running[i] is the sum of weights[0..i].
	running := make([]float64, count)
	running[0] = weights[0]

	for k := 1; k < count; k++ {
		running[k] = running[k-1] + weights[k]
	}

	sum := running[count-1]
	if sum <= 0 {
		// Nothing to choose from: all weights are zero or negative.
		return 0
	}

	// Uniform draw in [0, sum); the first prefix sum exceeding it wins.
	threshold := rng.Float64() * sum

	for idx := 0; idx < count; idx++ {
		if threshold < running[idx] {
			return idx
		}
	}

	// Only reachable through floating-point edge cases (e.g. NaN weights).
	return count - 1
}

// ============================================================================
//  Dataset & Tokenization
// ============================================================================

// downloadDataset downloads url to filename unless filename already exists.
//
// The response must have status 200 OK; any other status is reported as an
// error instead of being saved as if it were the dataset. The body is streamed
// into a temporary "<filename>.part" file that is renamed to filename only
// after it has been written and closed successfully, so an interrupted or
// failed download never leaves a truncated file that a later run would
// mistake for a complete dataset.
//
// It returns nil if the file already existed or was downloaded successfully,
// and a wrapped error describing the failing step otherwise.
func downloadDataset(url, filename string) error {
	if _, err := os.Stat(filename); err == nil {
		// File already exists
		return nil
	}

	resp, err := http.Get(url)
	if err != nil {
		return fmt.Errorf("downloading %s: %w", url, err)
	}

	defer func() {
		_ = resp.Body.Close() // A close error on a body we only read from is not actionable
	}()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("downloading %s: unexpected HTTP status %s", url, resp.Status)
	}

	partial := filename + ".part"

	file, err := os.Create(partial)
	if err != nil {
		return fmt.Errorf("creating %s: %w", partial, err)
	}

	// Close explicitly rather than via defer: a failed close can mean that
	// buffered data never reached the disk, so its error must be checked.
	_, err = io.Copy(file, resp.Body)
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}

	if err == nil {
		err = os.Rename(partial, filename)
	}

	if err != nil {
		_ = os.Remove(partial) // Best-effort cleanup; the original error is the one worth reporting

		return fmt.Errorf("saving %s: %w", filename, err)
	}

	return nil
}

// loadDocs reads filename and returns its lines with surrounding whitespace
// removed, skipping lines that are empty after trimming. The returned slice is
// nil when the file contains no such line. A read failure is returned as-is.
func loadDocs(filename string) ([]string, error) {
	raw, err := os.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	var docs []string

	// Peel off one newline-terminated line at a time; the final line may lack
	// a trailing newline.
	for rest := string(raw); rest != ""; {
		var line string

		line, rest, _ = strings.Cut(rest, "\n")

		if trimmed := strings.TrimSpace(line); trimmed != "" {
			docs = append(docs, trimmed)
		}
	}

	return docs, nil
}

// shuffleDocs randomly permutes docs in place, drawing from the global RNG so
// that the resulting order is reproducible for a given seed.
func shuffleDocs(docs []string) {
	swap := func(a, b int) {
		docs[a], docs[b] = docs[b], docs[a]
	}

	rng.Shuffle(len(docs), swap)
}

// encode converts doc into token IDs, one per character (rune), using the
// precomputed vocabIndex.
//
// It panics if doc contains a character that is not in vocabIndex; callers
// build the index from the same data, so a miss indicates a programming error.
func encode(doc string, vocabIndex map[rune]int) []int {
	// Ranging over a string yields byte offsets, not rune counts, so the
	// tokens are appended rather than stored at that offset; otherwise a
	// multi-byte character would leave spurious zero tokens in the output.
	// len(doc) (the byte length) is an upper bound on the number of runes.
	tokens := make([]int, 0, len(doc))

	for _, ch := range doc {
		index, ok := vocabIndex[ch]
		if !ok {
			panic(fmt.Sprintf("character %c not in vocabulary", ch))
		}

		tokens = append(tokens, index)
	}

	return tokens
}

// decode turns token IDs back into text by looking each one up in uchars.
// BOS tokens, and any ID at or beyond len(uchars), are silently dropped.
func decode(tokens []int, uchars []rune, BOS int) string {
	var text strings.Builder

	for i := 0; i < len(tokens); i++ {
		id := tokens[i]
		if id == BOS || id >= len(uchars) {
			continue
		}

		text.WriteRune(uchars[id])
	}

	return text.String()
}

// ============================================================================
//  Initialization Helpers
// ============================================================================

// initRNG replaces the global rng with a fresh generator seeded with seed.
// Calling it again with the same seed restarts the same random sequence.
func initRNG(seed int64) {
	rng = rand.New(rand.NewSource(seed))
}

// buildVocab scans docs and returns, in order:
//   - uchars: every distinct character (rune) that occurs, sorted ascending;
//     a character's token ID is its index in this slice
//   - BOS: the ID of the special sequence-boundary token
//   - vocabSize: the total number of token IDs (characters plus BOS)
func buildVocab(docs []string) ([]rune, int, int) {
	seen := make(map[rune]struct{})
	uchars := make([]rune, 0)

	// Record each character the first time it is encountered.
	for _, doc := range docs {
		for _, ch := range doc {
			if _, known := seen[ch]; known {
				continue
			}

			seen[ch] = struct{}{}
			uchars = append(uchars, ch)
		}
	}

	slices.Sort(uchars)

	// Characters occupy IDs 0..len(uchars)-1, so BOS takes the next free ID.
	// Example: 'a'..'z' get IDs 0..25, BOS = 26 and vocabSize = 27.
	count := len(uchars)

	return uchars, count, count + 1
}

// buildVocabIndex inverts uchars into a map from character to its position,
// giving encode a constant-time lookup of token IDs.
func buildVocabIndex(uchars []rune) map[rune]int {
	lookup := make(map[rune]int, len(uchars))

	for pos := 0; pos < len(uchars); pos++ {
		lookup[uchars[pos]] = pos
	}

	return lookup
}

// matrix returns an [nOut, nIn] matrix of fresh leaf nodes whose values are
// drawn from a normal distribution with mean 0 and standard deviation std.
// Elements are filled row by row from the global RNG.
func matrix(nOut, nIn int, std float64) [][]*Value {
	rows := make([][]*Value, nOut)

	for r := 0; r < nOut; r++ {
		row := make([]*Value, nIn)

		for c := 0; c < nIn; c++ {
			// NormFloat64 samples the standard normal; scaling by std sets the spread.
			// Note: Go and Python use different Gaussian generators, so exact weights differ.
			row[c] = newValue(rng.NormFloat64()*std, nil, nil)
		}

		rows[r] = row
	}

	return rows
}

// flattenParams collects every parameter of stateDict into one flat slice.
// Matrices are visited in ascending key order and each matrix row by row, so
// a parameter keeps the same index from run to run (map iteration order alone
// would be random). The returned pointers are the ones stored in stateDict,
// so updating them updates the model.
func flattenParams(stateDict StateDict) []*Value {
	names := make([]string, 0, len(stateDict))
	for name := range stateDict {
		names = append(names, name)
	}

	// Deterministic order keeps the optimizer's per-parameter buffers stable.
	sort.Strings(names)

	var flat []*Value

	for _, name := range names {
		weights := stateDict[name]
		for r := range weights {
			flat = append(flat, weights[r]...)
		}
	}

	return flat
}