/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef mozilla_dom_LlamaRunner_h
#define mozilla_dom_LlamaRunner_h

// Primary WebIDL interface for llama.cpp-based streaming chat model
// integration. Exposes LLM-backed ReadableStream generation and chat prompt
// formatting to JavaScript via LlamaRunner.

#include "MediaInfo.h"
#include "mozilla/dom/LlamaRunnerBinding.h"
#include "mozilla/dom/MessagePort.h"
#include "mozilla/dom/ReadableStream.h"
#include "nsIFileStreams.h"
#include "nsISupports.h"
#include "nsWrapperCache.h"
#include "mozilla/llama/LlamaBackend.h"
#include "prsystem.h"
#include "mozilla/Casting.h"
#include "mozilla/SPSCQueue.h"
#include "mozilla/Array.h"
#include "mozilla/dom/Promise.h"
#include "mozilla/dom/ReadableStreamDefaultController.h"
#include "mozilla/dom/UnderlyingSourceCallbackHelpers.h"
#include "nsThreadUtils.h"
#include "mozilla/TaskQueue.h"
#include "mozilla/ScopeExit.h"
#include "mozilla/RefPtr.h"
#include "nsIThread.h"
#include "nsError.h"
#include "mozilla/dom/ContentChild.h"
#include "mozilla/Atomics.h"
#include "mozilla/Logging.h"
#include "nsFmtString.h"
#include "nsThread.h"
#include "nsThreadManager.h"
#include "mozilla/ThreadEventQueue.h"
#include "mozilla/EventQueue.h"
#include "nsDeque.h"
#include "mozilla/dom/Blob.h"

namespace mozilla::dom {
class LlamaStreamSource;
}  // namespace mozilla::dom

namespace mozilla::llama {

// When the Maybe has no value, it indicates the model has finished generating.
// Otherwise, it contains a LlamaChatResponse with partial or final content.
using LlamaGenerateTaskPromise =
    MozPromise<Maybe<mozilla::dom::LlamaChatResponse>, nsCString,
               /* IsExclusive = */ true>;

/**
 * LlamaGenerateTask runs the orchestration of model inference on a background
 * thread, but delegates all compute-intensive operations (e.g., token
 * generation, prompt processing, decoding) to the LlamaBackend's internal
 * threadpool.
 *
 * The task itself manages:
 * - Calling LlamaBackend::Generate with user options and callbacks.
 * - Buffering responses and applying streaming heuristics (e.g., flush size,
 *   phase boundaries).
 * - Monitoring for cancellation requests.
 *
 * This task is launched during the first JS stream pull and executes
 * independently. It posts all intermediate/final results back to the
 * LlamaStreamSource on the JS stream thread.
 *
 * Note: This class does not do the heavy lifting; it schedules and collects
 * results from backend-owned threads. This enables concurrency across multiple
 * LlamaRunner instances with minimal blocking.
 */
class LlamaGenerateTask final : public mozilla::CancelableRunnable {
 public:
  using LlamaChatResponse = mozilla::dom::LlamaChatResponse;

  enum class TaskState : uint32_t {
    // Task has not started yet.
    Idle,
    // Task is actively running.
    Running,
    // Task completed successfully.
    CompletedSuccess,
    // Task failed due to an error.
    CompletedFailure,
    // Task was externally cancelled (if not already completed).
    Cancelled
  };

  LlamaGenerateTask(RefPtr<LlamaBackend> aBackend,
                    const LlamaChatOptions& aOptions);

  NS_IMETHOD Run() override;

  nsresult Cancel() override;
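  // Dispatch sketch (illustrative only, not part of the API): as a
  // CancelableRunnable, the task is expected to be handed to a dedicated
  // background thread on the first JS pull, roughly as LlamaStreamSource
  // does. The thread name below is hypothetical.
  //
  //   RefPtr<LlamaGenerateTask> task =
  //       new LlamaGenerateTask(backend, options);
  //   nsCOMPtr<nsIThread> thread;
  //   NS_NewNamedThread("LlamaGenerate", getter_AddRefs(thread), task);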
  // Returns the next message if available, or a promise that will resolve
  // once a message is ready. Rejects immediately if the task has failed.
  RefPtr<LlamaGenerateTaskPromise> GetMessage();

 private:
  // Attempts to push a message only if a consumer is actively waiting.
  // Returns true if the message was consumed immediately or queued to resolve
  // a pending promise.
  bool MaybePushMessage(mozilla::Maybe<LlamaChatResponse> aMessage);

  // Unconditionally pushes a message. First tries MaybePushMessage(); if that
  // fails, enqueues the message into the internal queue. Returns true if the
  // message was accepted.
  bool PushMessage(mozilla::Maybe<LlamaChatResponse> aMessage);

  // Thread-safe task state.
  mozilla::Atomic<TaskState> mState{TaskState::Idle};

  // Error message set if the task fails. We store it separately instead of
  // using the message-passing queue, so that errors can still be surfaced
  // even if the queue mechanism fails.
  nsCString mErrorMessage;

  // Shared reference to the backend (not the exclusive owner).
  RefPtr<LlamaBackend> mBackend;

  const LlamaChatOptions mChatOptions;

  // Index of the currently active promise holder (0 or 1).
  // The producer resolves the promise at this index, then toggles it.
  // The consumer uses the *other* index to safely create a new promise.
  int mCurrentPromiseHolderIdx{0};

  // Double-buffered promise holders (index toggles between 0 and 1) to avoid
  // a rare race where a resolved promise might be consumed by the other
  // thread *before* we've officially marked it as resolved. While this race
  // is highly unlikely (it would require the consumer thread to request a new
  // promise between the resolve() call and its internal state update), this
  // design fully eliminates the possibility. The producer resolves the
  // current index, then switches to the next; the consumer always creates new
  // promises from the non-current index.
  Array<MozPromiseHolder<LlamaGenerateTaskPromise>, 2> mPromiseHolders;

  // Thread-safe flag indicating whether a consumer is waiting for data.
  Atomic<bool> mHasPendingConsumer{false};

  // Thread-safe buffer for messages to be sent back to the consumer.
  SPSCQueue<Maybe<LlamaChatResponse>> mMessagesQueue;

  ~LlamaGenerateTask();
};
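// Consumer-side sketch (illustrative only) of how GetMessage() is meant to be
// chained with standard MozPromise usage. The real driver is
// LlamaStreamSource::PullCallbackImpl; `task` is a hypothetical
// RefPtr<LlamaGenerateTask>.
//
//   task->GetMessage()->Then(
//       GetCurrentSerialEventTarget(), __func__,
//       [](Maybe<dom::LlamaChatResponse>&& aMessage) {
//         if (aMessage.isNothing()) {
//           // Generation finished; close the JS stream.
//           return;
//         }
//         // Enqueue the partial response, then request the next message.
//       },
//       [](nsCString&& aError) {
//         // Surface the error to the JS stream consumer.
//       });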
}  // namespace mozilla::llama

namespace mozilla::dom {

using LlamaBackend = ::mozilla::llama::LlamaBackend;

/**
 * LlamaStreamSource is a bridge between LlamaGenerateTask and a JS
 * ReadableStream.
 *
 * Implements UnderlyingSourceAlgorithmsWrapper so it can be used with
 * ReadableStream::CreateNative. This object owns the lifecycle of the
 * generation task (LlamaGenerateTask) and buffers intermediate results
 * for consumption by JS pull callbacks.
 *
 * It holds a shared strong reference to the backend (LlamaBackend),
 * which may also be retained by other components (e.g., LlamaRunner).
 * All compute-heavy work is performed by the backend’s internal threadpool.
 *
 * Generation results are delivered via LlamaGenerateTask::GetMessage(), which
 * returns a promise. Once resolved, the result is forwarded to the JS
 * consumer.
 *
 * The stream starts when PullCallbackImpl is first called from JS, launching
 * a background generation task and associating it with a thread.
 */
class LlamaStreamSource final : public UnderlyingSourceAlgorithmsWrapper {
 public:
  MOZ_DECLARE_REFCOUNTED_TYPENAME(LlamaStreamSource)
  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(LlamaStreamSource,
                                           UnderlyingSourceAlgorithmsWrapper)

  LlamaStreamSource(RefPtr<LlamaBackend> aBackend,
                    const LlamaChatOptions& aOptions);

  MOZ_CAN_RUN_SCRIPT already_AddRefed<Promise> CancelCallbackImpl(
      JSContext* aCx, const Optional<JS::Handle<JS::Value>>& aReason,
      ErrorResult& aRv) override;

  MOZ_CAN_RUN_SCRIPT already_AddRefed<Promise> PullCallbackImpl(
      JSContext* aCx, ReadableStreamControllerBase& aController,
      ErrorResult& aRv) override;

  // Links the JS-side stream controller to this source.
  void SetControllerStream(RefPtr<ReadableStream> aStream);

 private:
  ~LlamaStreamSource();

  RefPtr<LlamaBackend> mBackend;

  const LlamaChatOptions mChatOptions;

  // Background generation task.
  RefPtr<llama::LlamaGenerateTask> mTask;

  // Background worker thread.
  nsCOMPtr<nsIThread> mGenerateThread;

  // Holds the event target of the queue that PullCallbackImpl is called from.
  nsCOMPtr<nsISerialEventTarget> mOriginalEventTarget;

  // Associated JS stream object.
  RefPtr<ReadableStream> mControllerStream;
};

class MetadataCallback;
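// End-to-end JS usage sketch for the LlamaRunner API declared below,
// consolidating the per-method examples. The exact JS-visible member names
// and dictionary fields are defined by the WebIDL binding, so treat the
// identifiers here as assumptions.
//
//   const runner = new LlamaRunner();
//   await runner.Initialize(modelOptions, modelBlob);
//   const prompt = await runner.FormatChat({
//     messages: [{ role: "user", content: "Hello!" }],
//     addAssistant: true,
//   });
//   const stream = runner.CreateGenerationStream({ /* LlamaChatOptions */ });
//   const reader = stream.getReader();
//   while (true) {
//     const { value, done } = await reader.read();
//     if (done) break;
//     console.log(value); // `value` is a LlamaChatResponse
//   }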
/**
 * LlamaRunner is the primary WebIDL-exposed controller for llama.cpp chat.
 *
 * It provides JavaScript with an API to format prompts, launch inference, and
 * receive output as a `ReadableStream`. It delegates inference to a
 * thread-safe LlamaBackend and manages stream logic via LlamaStreamSource.
 *
 * This class is designed for use from JS.
 */
class LlamaRunner final : public nsISupports, public nsWrapperCache {
 public:
  MOZ_DECLARE_REFCOUNTED_TYPENAME(LlamaRunner)
  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
  NS_DECL_CYCLE_COLLECTION_WRAPPERCACHE_CLASS(LlamaRunner)

  explicit LlamaRunner(const GlobalObject& aGlobal);

  nsISupports* GetParentObject() const { return mGlobal; }

  static already_AddRefed<LlamaRunner> Constructor(const GlobalObject& aGlobal,
                                                   ErrorResult& aRv);

  already_AddRefed<Promise> Initialize(const LlamaModelOptions& aOptions,
                                       Blob& aModelBlob, ErrorResult& aRv);

  virtual JSObject* WrapObject(JSContext* aCx,
                               JS::Handle<JSObject*> aGivenProto) override;

  /**
   * Creates a readable stream that incrementally yields language model
   * responses.
   *
   * This function initiates a new generation session using the provided
   * options and returns a JavaScript `ReadableStream`. The stream will
   * asynchronously emit `LlamaChatResponse` chunks as the model produces
   * output.
   *
   * @param aOptions Configuration for the generation session.
   * @param aRv Used to report any errors during stream setup or execution.
   *
   * @returns A `ReadableStream` that yields `LlamaChatResponse` objects from
   *   the language model, suitable for consumption in JavaScript via async
   *   iteration or stream readers.
   *
   * @note This function is designed for use in JavaScript via WebIDL. It
   *   supports streaming output for real-time use cases such as chat UIs or
   *   progressive rendering.
   *
   * @example JavaScript usage:
   *   const stream = CreateGenerationStream(chatOptions);
   *   const reader = stream.getReader();
   *
   *   while (true) {
   *     const { value, done } = await reader.read();
   *     if (done) break;
   *     console.log(value); // `value` is a LlamaChatResponse
   *   }
   */
  already_AddRefed<ReadableStream> CreateGenerationStream(
      const LlamaChatOptions& aOptions, ErrorResult& aRv);

  /**
   * Formats a sequence of chat messages into a prompt string for LLM
   * inference.
   *
   * This function takes a structured list of chat messages (user, assistant,
   * or system roles) and formats them into a prompt string suitable for
   * processing by a llama.cpp-based language model. The function is
   * asynchronous and returns a JavaScript Promise that resolves to the
   * formatted string.
   *
   * @param aOptions An object containing the input messages and formatting
   *   options.
   * @param aRv Used to report any errors that occur during formatting.
   *
   * @returns A Promise that resolves to the final prompt string formatted for
   *   LLM input.
   *
   * @throws DOMException via `aRv` on failure (e.g., invalid message roles or
   *   empty input).
   *
   * @example JavaScript usage:
   *   FormatChat({
   *     messages: [
   *       { role: "user", content: "What's the weather like?" },
   *       { role: "assistant", content: "It's sunny and 25°C." }
   *     ],
   *     addAssistant: true
   *   }).then(prompt => {
   *     // Pass prompt to LLM for inference
   *   });
   */
  already_AddRefed<Promise> FormatChat(const LlamaFormatChatOptions& aOptions,
                                       ErrorResult& aRv);

  static bool InInferenceProcess(JSContext*, JSObject*);

  void OnMetadataReceived();

 private:
  ~LlamaRunner() = default;

  RefPtr<LlamaBackend> mBackend;

  RefPtr<LlamaStreamSource> mStreamSource;

 protected:
  LlamaModelOptions mModelOptions;
  nsCOMPtr<nsIGlobalObject> mGlobal;
  RefPtr<Promise> mInitPromise;
  nsCOMPtr<nsIInputStream> mStream;
  RefPtr<MetadataCallback> mMetadataCallback;
};

}  // namespace mozilla::dom

#endif  // mozilla_dom_LlamaRunner_h