/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// @ts-check

/**
 * @import { PipelineOptions } from "chrome://global/content/ml/EngineProcess.sys.mjs"
 * @import { BackendError } from "./Pipeline.mjs"
 * @import { MLEngineWorker } from "../MLEngine.worker.mjs"
 * @import { EmbeddingDType, EmbeddingRequest, EmbeddingResponse, PreTrainedTokenizer } from "./StaticEmbeddingsPipeline.d.ts"
 */

/**
 * @typedef {object} Lazy
 * @property {typeof import("chrome://global/content/ml/Utils.sys.mjs").createFileUrl} createFileUrl
 * @property {typeof import("chrome://global/content/ml/Utils.sys.mjs").parseNpy} parseNpy
 * @property {typeof import("chrome://global/content/ml/OPFS.sys.mjs").OPFS} OPFS
 * @property {typeof import("chrome://global/content/ml/backends/ONNXPipeline.mjs").importTransformers} importTransformers
 * @property {typeof import("chrome://global/content/ml/EngineProcess.sys.mjs").QuantizationLevel} QuantizationLevel
 */

/** @type {Lazy} */
const lazy = /** @type {any} */ ({});

ChromeUtils.defineESModuleGetters(
  lazy,
  {
    createFileUrl: "chrome://global/content/ml/Utils.sys.mjs",
    parseNpy: "chrome://global/content/ml/Utils.sys.mjs",
    OPFS: "chrome://global/content/ml/OPFS.sys.mjs",
    importTransformers: "chrome://global/content/ml/backends/ONNXPipeline.mjs",
    QuantizationLevel: "chrome://global/content/ml/EngineProcess.sys.mjs",
  },
  { global: "current" }
);

/**
 * Mock out a response object for tests. This should have the same type as @see {Response}
 */
class MockedResponse {
  /** @type {any} */
  #value;

  /**
   * @param {any} value
   */
  constructor(value) {
    this.#value = value;
  }

  /**
   * @returns {ReturnType<Response["json"]>}
   */
  json() {
    return this.#value;
  }

  /**
   * @returns {ReturnType<Response["arrayBuffer"]>}
   */
  arrayBuffer() {
    return this.#value;
  }
}

/**
 * Embeddings are typically generated by running text through a BERT-like model
 * that is an encoder-only transformer. However, this is expensive and slow. Static
 * embeddings allow for a cheaper way to generate an embedding by just averaging the
 * values of each token's embedding vector. This involves a simple lookup per token, and
 * then some vector math. These embeddings are often good enough for looking up
 * semantically similar values.
 */
export class StaticEmbeddingsPipeline {
  /** @type {PreTrainedTokenizer} */
  #tokenizer;

  /**
   * The embedding dimensions size, e.g. 128, 256, 512
   *
   * @type {number}
   */
  #dimensions;

  /** @type {number | null} */
  #initializeStart;

  /**
   * Get a native JS double out of the backing data array.
   *
   * @type {(index: number) => number}
   */
  #getFloat;

  /**
   * @param {PreTrainedTokenizer} tokenizer
   * @param {ArrayBuffer} npyData
   * @param {EmbeddingDType} dtype
   * @param {number} dimensions
   * @param {number} initializeStart
   */
  constructor(tokenizer, npyData, dtype, dimensions, initializeStart) {
    this.#tokenizer = tokenizer;
    this.#dimensions = dimensions;
    this.#initializeStart = initializeStart;

    const {
      data: embeddings,
      shape: [vocabSize, dimActual],
    } = lazy.parseNpy(npyData);

    if (dimActual != this.#dimensions) {
      throw new Error(
        `The dimensions requested (${this.#dimensions}) and the dimensions received (${dimActual}) did not match`
      );
    }

    if (tokenizer.model.vocab.length != vocabSize) {
      throw new Error(
        `The tokenizer vocab size (${tokenizer.model.vocab.length}) did not match the data vocab size (${vocabSize})`
      );
    }

    switch (dtype) {
      case lazy.QuantizationLevel.FP32:
      case lazy.QuantizationLevel.FP16:
        // No processing is needed.
        this.#getFloat = index => this.embeddings[index];
        break;
      case lazy.QuantizationLevel.FP8_E5M2:
        this.#getFloat = this.#getFp8_E5M2;
        break;
      case lazy.QuantizationLevel.FP8_E4M3:
        this.#getFloat = this.#getFp8_E4M3;
        break;
      default:
        throw new Error("Unsupported dtype: " + dtype);
    }

    /** @type {ArrayBufferLike} */
    this.embeddings = embeddings;
  }

  /**
   * @param {MLEngineWorker} worker
   * @param {null} _wasm
   * @param {PipelineOptions} pipelineOptions
   * @param {(error: any) => BackendError} _createError
   */
  static async initialize(worker, _wasm, pipelineOptions, _createError) {
    let initializeStart = ChromeUtils.now();
    const {
      backend,
      modelHubRootUrl,
      modelHubUrlTemplate,
      modelId,
      modelRevision,
      staticEmbeddingsOptions,
    } = pipelineOptions;

    // These are the options that are specific to this engine.
    const { subfolder, dtype, dimensions, compression, mockedValues } =
      staticEmbeddingsOptions;

    const extension = compression ? ".zst" : "";
    const files = [
      `${subfolder}/tokenizer.json${extension}`,
      `${subfolder}/${dtype}.d${dimensions}.npy${extension}`,
    ];

    /**
     * @param {string} fileName
     * @returns {Promise<Response | MockedResponse>}
     */
    async function getResponse(fileName) {
      const url = lazy.createFileUrl({
        file: fileName,
        model: modelId,
        revision: modelRevision,
        urlTemplate: modelHubUrlTemplate,
        rootUrl: modelHubRootUrl,
      });

      if (mockedValues) {
        const mockedValue = mockedValues[url];
        if (!mockedValue) {
          throw new Error(
            "Could not find mocked value for requested url: " + url
          );
        }
        if (url.endsWith(`.json${extension}`)) {
          return new MockedResponse(mockedValue);
        }
        return new MockedResponse(new Uint8Array(mockedValue).buffer);
      }

      const modelFile = await worker.getModelFile({ url });
      const filePath = modelFile.ok[2];
      const fileHandle = await lazy.OPFS.getFileHandle(filePath);
      const file = await fileHandle.getFile();

      let stream = file.stream();
      if (compression) {
        const decompressionStream = new DecompressionStream("zstd");
        stream = stream.pipeThrough(decompressionStream);
      }
      return new Response(stream);
    }

    const [tokenizerJsonResponse, npyDataResponse] = await Promise.all(
      files.map(getResponse)
    );
    const npyData = await npyDataResponse.arrayBuffer();
    const tokenizerJson = await tokenizerJsonResponse.json();

    let assetsLoad = ChromeUtils.now();
    ChromeUtils.addProfilerMarker(
      "StaticEmbeddingsPipeline",
      initializeStart,
      "Assets load"
    );

    const { PreTrainedTokenizer } = await lazy.importTransformers(backend);
    const tokenizer = new PreTrainedTokenizer(tokenizerJson, {});

    ChromeUtils.addProfilerMarker(
      "StaticEmbeddingsPipeline",
      assetsLoad,
      "Tokenizer load"
    );

    return new StaticEmbeddingsPipeline(
      tokenizer,
      npyData,
      dtype,
      dimensions,
      initializeStart
    );
  }
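
  // The two decoders below expand a single FP8 byte into a JS number. As an
  // illustrative (not normative) worked example for the E5M2 layout
  // | S E E E | E E M M | with bias 15: the byte 0b1_10001_10 has sign = 1,
  // exponent = 0b10001 = 17, and mantissa = 0b10 = 2, so the decoded value is
  // -(1 + 2/4) * 2 ** (17 - 15) = -6. A byte with a zero exponent is decoded
  // as a subnormal, e.g. 0b0_00000_01 decodes to (1/4) * 2 ** (1 - 15) ≈ 1.5e-5.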

  /**
   * @param {number} index
   */
  #getFp8_E5M2 = index => {
    const byte = this.embeddings[index];

    // Do some bit manipulation to extract the sign (S), the exponent (E), and the
    // mantissa (M).
    // This is the format: | S E E E | E E M M |
    // To do the manipulation, shift the bits to the right (>>) and mask off the
    // relevant bits with an & operation.
    const sign = (byte >> 7) & 0b0000_0001;
    const exponent = (byte >> 2) & 0b0001_1111;
    const mantissa = byte & 0b0000_0011;
    const bias = 15;

    if (exponent === 0) {
      if (mantissa === 0) {
        // Zero
        return sign ? -0 : 0;
      }
      // Subnormal: exponent = 1 - bias, no implicit leading 1
      const frac = mantissa / 4; // 2 mantissa bits → divide by 2^2
      const value = frac * Math.pow(2, 1 - bias);
      return sign ? -value : value;
    } else if (exponent === 0x1f) {
      if (mantissa === 0) {
        return sign ? -Infinity : Infinity;
      }
      return NaN;
    }

    // Normalized
    const frac = 1 + mantissa / 4;
    const value = frac * Math.pow(2, exponent - bias);
    return sign ? -value : value;
  };

  /**
   * @param {number} index
   */
  #getFp8_E4M3 = index => {
    const byte = this.embeddings[index];

    // Do some bit manipulation to extract the sign (S), the exponent (E), and the
    // mantissa (M).
    // This is the format: | S E E E | E M M M |
    // To do the manipulation, shift the bits to the right (>>) and mask off the
    // relevant bits with an & operation.
    const sign = (byte >> 7) & 0b0000_0001;
    const exponent = (byte >> 3) & 0b0000_1111;
    const mantissa = byte & 0b0000_0111;
    const bias = 7;

    if (exponent === 0) {
      if (mantissa === 0) {
        return sign ? -0 : 0;
      }
      // Subnormal
      const frac = mantissa / 8; // 3 mantissa bits → divide by 2^3
      const value = frac * Math.pow(2, 1 - bias);
      return sign ? -value : value;
    } else if (exponent === 0xf) {
      if (mantissa === 0) {
        return sign ? -Infinity : Infinity;
      }
      return NaN;
    }

    // Normalized
    const frac = 1 + mantissa / 8;
    const value = frac * Math.pow(2, exponent - bias);
    return sign ? -value : value;
  };
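
  // An illustrative sketch of the pooling performed below (the numbers are made
  // up, not taken from a real model): with #dimensions = 2 and a sequence that
  // tokenizes to two ids whose embedding rows are [1, 3] and [3, 5], the summed
  // vector is [4, 8] and the mean-pooled vector is [2, 4]. With normalization
  // enabled it is then divided by its magnitude sqrt(2*2 + 4*4) ≈ 4.47, giving
  // roughly [0.447, 0.894].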

  /**
   * @param {EmbeddingRequest} request
   * @param {number} _requestId
   * @param {null} _engineRunOptions
   * @returns {EmbeddingResponse}
   */
  run(request, _requestId, _engineRunOptions) {
    if (request.options.pooling != "mean") {
      throw new Error(
        `Only "mean" pooling is currently supported, please add support for "${request.options.pooling}" here.`
      );
    }

    let tokenCount = 0;
    const sequenceCount = request.args.length;
    let beforeResponse = ChromeUtils.now();

    const response = {
      metrics: [],
      output: request.args.map(text => {
        // Always do the vector math in f32 space, even if the underlying precision
        // is lower.
        const embedding = new Float32Array(this.#dimensions);

        /** @type {number[]} */
        const tokenIds = this.#tokenizer.encode(text);
        tokenCount += tokenIds.length;

        // Sum up the embeddings.
        for (const tokenId of tokenIds) {
          for (let i = 0; i < this.#dimensions; i++) {
            // Inflate the value into a JavaScript double, then add it.
            embedding[i] += this.#getFloat(tokenId * this.#dimensions + i);
          }
        }

        if (request.options.normalize) {
          // Compute the average by dividing by the tokens provided.
          // Also compute the sum of the squares while we're here.
          let sumSquares = 0;
          for (let i = 0; i < this.#dimensions; i++) {
            const n = embedding[i] / tokenIds.length;
            embedding[i] = n;
            sumSquares += n * n;
          }

          // Apply the normalization.
          const magnitude = Math.sqrt(sumSquares);
          if (magnitude != 0) {
            for (let i = 0; i < this.#dimensions; i++) {
              embedding[i] = embedding[i] / magnitude;
            }
          }
        } else {
          // Only compute the average by dividing by the tokens provided.
          for (let i = 0; i < this.#dimensions; i++) {
            embedding[i] = embedding[i] / tokenIds.length;
          }
        }

        return embedding;
      }),
    };

    ChromeUtils.addProfilerMarker(
      "StaticEmbeddingsPipeline",
      beforeResponse,
      `Processed ${sequenceCount} sequences with ${tokenCount} tokens.`
    );

    if (this.#initializeStart) {
      ChromeUtils.addProfilerMarker(
        "StaticEmbeddingsPipeline",
        this.#initializeStart,
        "Time to first response"
      );
      this.#initializeStart = null;
    }

    return response;
  }
}
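
// A rough usage sketch (hypothetical values, not part of this module): the
// engine process is expected to drive the pipeline roughly like
//
//   const pipeline = await StaticEmbeddingsPipeline.initialize(
//     worker,
//     null,
//     pipelineOptions,
//     createError
//   );
//   const { output } = pipeline.run(
//     { args: ["some text"], options: { pooling: "mean", normalize: true } },
//     requestId,
//     null
//   );
//
// where `output[0]` is a Float32Array of length `dimensions`.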