/* Any copyright is dedicated to the Public Domain.
   http://creativecommons.org/publicdomain/zero/1.0/ */

"use strict";

const rootDataUrl =
  "chrome://mochitests/content/browser/toolkit/components/ml/tests/browser/data/articles";

async function fetchArticle(url) {
  const response = await fetch(url);
  return await response.text();
}

const testData = [];

const smollm2Model = {
  taskName: "text-generation",
  modelId: "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
  modelFile: "smollm2-360m-instruct-q8_0.gguf",
  kvCacheDtype: "q8_0",
  flashAttn: true,
  useMmap: true,
  useMlock: false,
  perfModelId: "HuggingFaceTB/SmolLM2-360M-Instruct",
  backend: "llama.cpp",
};

const qwen3Model = {
  taskName: "text-generation",
  modelId: "unsloth/Qwen3-0.6B-GGUF",
  modelFile: "Qwen3-0.6B-Q8_0.gguf",
  kvCacheDtype: "q8_0",
  flashAttn: true,
  useMmap: true,
  useMlock: false,
  perfModelId: "unsloth/Qwen3-0.6B-GGUF",
};

const qwen3ModelNative = {
  taskName: "text-generation",
  modelId: "unsloth/Qwen3-0.6B-GGUF",
  modelFile: "Qwen3-0.6B-Q8_0.gguf",
  kvCacheDtype: "q8_0",
  flashAttn: false,
  useMmap: false,
  useMlock: true,
  perfModelId: "unsloth/Qwen3-0.6B-GGUF",
  backend: "llama.cpp",
};

const articles = [
  {
    data: `${rootDataUrl}/tiny.txt`,
    type: "tiny",
    numTokens: 200,
  },
  {
    data: `${rootDataUrl}/medium.txt`,
    type: "medium",
    numTokens: 568,
  },
  {
    data: `${rootDataUrl}/big.txt`,
    type: "big",
    numTokens: 1100,
  },
];

let numEngines = 0;
for (const model of [qwen3ModelNative]) {
  for (const article of articles) {
    // Replace "/" in the model id with "-" so the perf name
    // contains no path separators.
    const perfName = `${model.perfModelId.replace(/\//g, "-")}_${article.type}`;
    const engineId = `engine-${numEngines}`;
    const options = {
      ...model,
      article: article.data,
      engineId,
      perfName,
      numTokens: article.numTokens,
    };
    numEngines += 1;
    testData.push(options);
  }
}

const perfMetadata = {
  owner: "GenAI Team",
  name: "browser_ml_llama_summarizer_perf.js",
  description:
    "Template test for latency for Summarizer model using Llama.cpp WASM",
  options: {
    default: {
      perfherder: true,
      perfherder_metrics: [
        {
          name: "latency",
          unit: "ms",
          shouldAlert: false,
        },
        {
          name: "memory",
          unit: "MB",
          shouldAlert: false,
        },
        {
          name: "tokenSpeed",
          unit: "tokens/s",
          shouldAlert: false,
          lowerIsBetter: false,
        },
        {
          name: "charactersSpeed",
          unit: "chars/s",
          shouldAlert: false,
          lowerIsBetter: false,
        },
      ],
      verbose: true,
      manifest: "perftest.toml",
      manifest_flavor: "browser-chrome",
      try_platform: ["linux", "mac", "win"],
    },
  },
};

requestLongerTimeout(20);

// To run locally
// pip install huggingface-hub
// huggingface-cli download {model_id} --local-dir MOZ_ML_LOCAL_DIR/onnx-models/{model_id}/{revision}
// Update your test in
// Then run: ./mach lint -l perfdocs --fix .
// This will auto-generate docs
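
/**
 * Fetches an article, wraps it in a summarization chat prompt, and runs
 * one generation pass through the engine, reporting latency, memory and
 * token/character speed via perfTest.
 *
 * @param {object} config - One entry from testData, plus an optional
 *   trackPeakMemory flag; any remaining properties are forwarded to
 *   PipelineOptions as llama.cpp options.
 */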
async function run_summarizer_with_perf({
  taskName,
  modelId,
  article,
  engineId,
  perfName,
  numTokens,
  trackPeakMemory,
  ...llamaOptions
}) {
  let chatInput = await fetchArticle(article);
  const numNewTokens = 80;

  // Round up to the next power of two, e.g. 290 -> 512.
  const nextPowerOf2 = n => {
    if (n <= 1) {
      return 1;
    }
    n--;
    n |= n >> 1;
    n |= n >> 2;
    n |= n >> 4;
    n |= n >> 8;
    n |= n >> 16;
    return n + 1;
  };

  // Size the context window to fit the prompt tokens, the generated
  // tokens, and a small safety margin.
  const numContext = nextPowerOf2(numTokens + numNewTokens + 10);

  const options = new PipelineOptions({
    engineId,
    taskName,
    modelHubUrlTemplate: "{model}/{revision}",
    modelId,
    modelRevision: "main",
    numContext,
    numBatch: Math.min(numContext, 64),
    numUbatch: Math.min(numContext, 64),
    backend: "wllama",
    timeoutMS: -1,
    ...llamaOptions,
  });

  console.log("detected concurrency", navigator.hardwareConcurrency);

  if (taskName.includes("text-generation")) {
    chatInput = [
      {
        role: "system",
        content:
          "Your role is to summarize the provided content as succinctly as possible while retaining the most important information",
      },
      {
        role: "user",
        content: chatInput,
      },
    ];
  }

  const request = {
    prompt: chatInput,
    nPredict: numNewTokens,
    skipPrompt: false,
    stopOnEndOfGenerationTokens: false,
    context: { swaFull: false, flashAttn: false },
  };

  await perfTest({
    name: `sum-${perfName}`,
    options,
    request,
    trackPeakMemory,
  });
}

add_task(async function test_ml_smollm_tiny_article() {
  await run_summarizer_with_perf(testData[0]);
});

add_task(async function test_ml_smollm_medium_article() {
  await run_summarizer_with_perf(testData[1]);
});

add_task(async function test_ml_smollm_big_article() {
  await run_summarizer_with_perf(testData[2]);
});

add_task(async function test_ml_smollm_tiny_article_with_mem() {
  await run_summarizer_with_perf({ ...testData[0], trackPeakMemory: true });
});

add_task(async function test_ml_smollm_medium_article_with_mem() {
  await run_summarizer_with_perf({ ...testData[1], trackPeakMemory: true });
});

add_task(async function test_ml_smollm_big_article_with_mem() {
  await run_summarizer_with_perf({ ...testData[2], trackPeakMemory: true });
});