/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef wasm_WasmHeuristics_h
#define wasm_WasmHeuristics_h

#include <algorithm>

#include "js/Prefs.h"
#include "threading/ExclusiveData.h"
#include "vm/MutexIDs.h"
#include "wasm/WasmConstants.h"

namespace js {
namespace wasm {

// Classes LazyTieringHeuristics and InliningHeuristics allow answering simple
// questions relating to lazy tiering and inlining, eg, "is this function
// small enough to inline?"  They do not answer questions that involve
// carrying state (eg, remaining inlining budget) across multiple queries.
//
// Note also, they may be queried in parallel without locking, by multiple
// instantiating / compilation threads, and so must be immutable once created.
//
// For both LazyTieringHeuristics and InliningHeuristics, the default `level_`
// is set to 5 in modules/libpref/init/StaticPrefList.yaml.  The scaling
// factors and tables defined in this file have been set so as to give
// near-optimal performance on Barista-3 and another benchmark; they are
// generally within 2% of the best value that can be found by changing the
// `level_` numbers.  Further performance gains may depend on improving the
// accuracy of estimateIonCompilationCost().
//
// Performance was measured on a mid/high-end Intel CPU (Core i5-1135G7 --
// Tiger Lake) and a low-end Intel (Celeron N3050 -- Goldmont).

class LazyTieringHeuristics {
  static constexpr uint32_t MIN_LEVEL = 1;
  static constexpr uint32_t MAX_LEVEL = 9;
  static constexpr uint32_t SMALL_MODULE_THRESH = 150000;

  // A scaling table for levels 2 .. 8.  Levels 1 and 9 are special-cased.
  // In this table, each value differs from its neighbour by a factor of 3,
  // giving a dynamic range in the table of 3 ^ 6 == 729, hence a wide
  // selection of tier-up aggressiveness.
  static constexpr float scale_[7] = {27.0,  9.0,   3.0,
                                      1.0,  // default
                                      0.333, 0.111, 0.037};

 public:
  // 1 = min (almost never, set tiering threshold to max possible, == 2^31-1)
  // 5 = default
  // 9 = max (request tier up at first call, set tiering threshold to zero)
  //
  // Don't use this directly, except for logging etc.
  static uint32_t rawLevel() {
    uint32_t level = JS::Prefs::wasm_lazy_tiering_level();
    return std::clamp(level, MIN_LEVEL, MAX_LEVEL);
  }

  // Estimate the cost of compiling a function of bytecode size `bodyLength`
  // using Ion, in terms of arbitrary work-units.  The baseline code for the
  // function counts down from the returned value as it runs.  When the value
  // goes negative it requests tier-up.  See "[SMDOC] WebAssembly baseline
  // compiler -- Lazy Tier-Up mechanism" in WasmBaselineCompile.cpp.
  static int32_t estimateIonCompilationCost(uint32_t bodyLength,
                                            size_t codeSectionSize) {
    uint32_t level = rawLevel();
    // Increase the aggressiveness of tiering for small modules, since they
    // don't generate much optimised-tier compilation work, so we might as
    // well try to get them into optimized code sooner.  But don't overdo it,
    // since we don't want to lose indirect-target resolution as a result.
    // See bug 1965195.
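    //
    // For example (illustrative numbers, not measurements): a module with a
    // 100000-byte code section at the default level 5 is treated as level 6
    // here, which via `scale_[]` lowers the resulting tier-up threshold by a
    // factor of about 3.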
    MOZ_ASSERT(codeSectionSize > 0);
    if (codeSectionSize <= SMALL_MODULE_THRESH && level < MAX_LEVEL) {
      level += 1;
    }

    if (MOZ_LIKELY(MIN_LEVEL < level && level < MAX_LEVEL)) {
      // The estimated cost, in X86_64 insns, for Ion compilation:
      //   30k up-front cost + 4k per bytecode byte.
      //
      // This is derived from measurements of an optimized build of Ion
      // compiling about 99000 functions.  Each estimate is pretty bad, but
      // averaged over a number of functions it's often within 20% of
      // correct.  However, this is with no inlining; that causes a much
      // wider variance of costs.  This will need to be revisited at some
      // point.
      float thresholdF = 30000.0 + 4000.0 * float(bodyLength);

      // Rescale to step-down work units, so that the default `level` setting
      // (5) gives pretty good results.
      thresholdF *= 0.25;

      // Rescale again to take into account `level`.
      thresholdF *= scale_[level - (MIN_LEVEL + 1)];

      // Clamp and convert.
      constexpr float thresholdHigh = 2.0e9f;  // at most 2 billion
      int32_t thresholdI =
          int32_t(std::clamp(thresholdF, 10.0f, thresholdHigh));
      MOZ_RELEASE_ASSERT(thresholdI >= 0);
      return thresholdI;
    }
    if (level == MIN_LEVEL) {
      // "almost never tier up"; produce our closest approximation to infinity
      return INT32_MAX;
    }
    if (level == MAX_LEVEL) {
      // request tier up at the first call; return the lowest possible value
      return 0;
    }
    MOZ_CRASH();
  }
};

// [SMDOC] Per-function and per-module inlining limits
//
// `class InliningHeuristics` makes inlining decisions on a per-call-site
// basis.  Even with that in place, it is still possible to create a small
// input function for which inlining produces a huge (1000 x) expansion.
// Hence we also need a backstop mechanism to limit growth of functions and
// of modules as a whole.
//
// The following scheme is therefore implemented:
//
// * no function can have an inlining-based expansion of more than a constant
//   factor (here, 99 x).
//
// * for a module as a whole there is also a max expansion factor, and this
//   is much lower, perhaps 1 x.
//
// This means that
//
// * no individual function can cause too much trouble (due to the 99 x
//   limit), yet any function that needs a lot of inlining can still get it.
//   In practice most functions have an inlining expansion, at default
//   settings, of much less than 5 x.
//
// * the module as a whole cannot chew up excessive resources.
//
// Once a limit is exhausted, Ion compilation is still possible, but no
// inlining will be done.
//
// The per-module limit needs to be interpreted in the light of lazy tiering.
// Many modules only tier up a small subset of their functions.  Hence the
// relatively low per-module limit still allows a high level of expansion of
// the functions that do get tiered up.
//
// In effect, the tiering mechanism gives hot functions (early tierer-uppers)
// preferential access to the module-level inlining budget.  Colder functions
// that tier up later may find the budget to be exhausted, in which case they
// get no inlining.  It would be feasible to gradually reduce inlining
// aggressiveness as the budget is used up, rather than have cliff-edge
// behaviour, but it hardly seems worth the hassle.
//
// To implement this, we have:
//
// * `int64_t WasmCodeMetadata::ProtectedOptimizationStats::inliningBudget`:
//   this is initially set as the maximum copied-in bytecode length allowable
//   for the module.  Inlining of individual call sites decreases the value
//   and may drive it negative.  Once the value is negative, no more inlining
//   is allowed.
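//
//   (Illustrative example, with assumed numbers: if `PerModuleMaxInliningRatio`
//   is 1 and the code section is 2 MB, the budget starts at roughly
//   2,000,000 bytes of copied-in bytecode, and each inlined call site then
//   subtracts roughly the callee's bytecode size from it.)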
//
// * `int64_t FunctionCompiler::inliningBudget_` does the same at a
//   per-function level.  Its initial value takes into account the current
//   value of the module-level budget; hence if the latter is exhausted, the
//   function-level budget will be zero and so no inlining occurs.
//
// If either limit is exceeded, a message is printed on the
// `MOZ_LOG=wasmCodeMetaStats:3` channel.
//
// Allowing budgets to be driven negative means we slightly overshoot them.
// An alternative would be to ensure they can never be driven negative, in
// which case we would slightly undershoot them instead, given that the sum
// of inlined function sizes is unlikely to exactly match the budget.  We use
// the overshoot scheme only because it makes it simple to decide when to log
// a budget-overshoot message and not emit any duplicates.
//
// There is a (logical, not-TSan-detectable) race condition in that the
// inlining budget for a function is set in part from the module-level budget
// at the time that compilation of the function begins, and the module-level
// budget is updated when compilation of a function ends -- see
// FunctionCompiler::initToplevel and ::finish.  If there are multiple
// compilation threads, it can happen that multiple threads individually
// overrun the module-level budget, and so collectively overshoot the budget
// multiple times.
//
// The worst-case total overshoot is equal to the worst-case per-function
// overshoot multiplied by the max number of functions that can be
// concurrently compiled:
//
//   (largest function size we will inline, == the max `baseSize[]` value)
//     * MaxPartialTier2CompileTasks
//
// which with current settings is 320 * 1 == 320.
//
// We never expect to hit either limit in normal operation -- they exist only
// to protect against the worst case.  So the imprecision doesn't matter.

// Setting the multiplier here to 1 means that inlining can copy in at
// maximum the same amount of bytecode as is in the module; 2 means twice as
// much, etc, and setting it to 0 would completely disable inlining.
static constexpr int64_t PerModuleMaxInliningRatio = 1;

// Same meaning as above, except at a per-function level.
static constexpr int64_t PerFunctionMaxInliningRatio = 99;

class InliningHeuristics {
  static constexpr uint32_t MIN_LEVEL = 1;
  static constexpr uint32_t MAX_LEVEL = 9;
  static constexpr uint32_t LARGE_FUNCTION_THRESH_1 = 400000;
  static constexpr uint32_t LARGE_FUNCTION_THRESH_2 = 800000;
  static constexpr uint32_t LARGE_FUNCTION_THRESH_3 = 1200000;

 public:
  // 1 = no inlining allowed
  // 2 = min (minimal inlining)
  // 5 = default
  // 9 = max (very aggressive inlining)
  //
  // Don't use these directly, except for logging etc.
  static uint32_t rawLevel() {
    uint32_t level = JS::Prefs::wasm_inlining_level();
    return std::clamp(level, MIN_LEVEL, MAX_LEVEL);
  }
  static bool rawDirectAllowed() { return JS::Prefs::wasm_direct_inlining(); }
  static bool rawCallRefAllowed() {
    return JS::Prefs::wasm_call_ref_inlining();
  }

  // For a call_ref site, returns the percentage of total calls made by that
  // site that any single target has to make in order to be considered as a
  // candidate for speculative inlining.
  static uint32_t rawCallRefPercent() {
    uint32_t percent = JS::Prefs::wasm_call_ref_inlining_percent();
    // Clamp to range 10 .. 100 (%).
    return std::clamp(percent, 10u, 100u);
  }

  // Calculate the total inlining budget for a module, based on the size of
  // the code section.
  static int64_t moduleInliningBudget(size_t codeSectionSize) {
    int64_t budget = int64_t(codeSectionSize) * PerModuleMaxInliningRatio;
    // Don't be overly stingy for tiny modules.
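    // (Illustrative: with `PerModuleMaxInliningRatio == 1`, even a module
    // whose code section is only a few hundred bytes still gets a budget of
    // 1000 bytes of copied-in bytecode.)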
    // Function-level inlining limits will still protect us from excessive
    // inlining.
    return std::max<int64_t>(budget, 1000);
  }

  // Given a call of kind `callKind` to a function of bytecode size
  // `bodyLength` at `inliningDepth`, decide whether it is allowable to
  // inline the call.  Note that `inliningDepth` starts at zero, not one.  In
  // other words, a value of zero means the query relates to a function which
  // (if approved) would be inlined into the top-level function currently
  // being compiled.
  //
  // `rootFunctionBodyLength` is the bytecode size of the function at the
  // root of this inlining stack.  If that is (very) large, we back off
  // somewhat on inlining.  `*largeFunctionBackoff` indicates whether or not
  // that happened.
  enum class CallKind { Direct, CallRef };
  static bool isSmallEnoughToInline(CallKind callKind, uint32_t inliningDepth,
                                    uint32_t bodyLength,
                                    uint32_t rootFunctionBodyLength,
                                    bool* largeFunctionBackoff) {
    *largeFunctionBackoff = false;
    // If this fails, something's seriously wrong; bail out.
    MOZ_RELEASE_ASSERT(inliningDepth <= 10);  // because 10 > (320 / 40)
    MOZ_ASSERT(rootFunctionBodyLength > 0 &&
               rootFunctionBodyLength <= wasm::MaxFunctionBytes);

    // Check whether calls of this kind are currently allowed.
    if ((callKind == CallKind::Direct && !rawDirectAllowed()) ||
        (callKind == CallKind::CallRef && !rawCallRefAllowed())) {
      return false;
    }

    // Check the size is allowable.  This depends on how deep we are in the
    // inlining stack and on the setting of `level_`.  We allow inlining of
    // functions of size up to the `baseSize[]` value at depth zero, but
    // reduce the allowable size by 40 for each further level of inlining, so
    // that only smaller and smaller functions are allowed as we inline
    // deeper.  (At the default level 5, for example, that allows callees of
    // up to 160 bytes at depth zero, 120 at depth one, 80 at depth two, 40
    // at depth three, and none deeper.)
    //
    // At some point `allowedSize` goes negative and thereby disallows all
    // further inlining.  Note that the `baseSize` entry for
    // `level_ == MIN_LEVEL` (== 1) is set so as to disallow inlining even at
    // depth zero.  Hence `level_ == MIN_LEVEL` disallows all inlining.
    static constexpr int32_t baseSize[9] = {0,   40,  80,  120,
                                            160,  // default
                                            200, 240, 280, 320};
    uint32_t level = rawLevel();

    // If the root function is large, back off somewhat on inlining, so as to
    // limit its further growth.  The limits are set so high that almost all
    // functions will be unaffected by this.  See bug 1967644.
    if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_1 &&
        level > MIN_LEVEL) {
      level--;
      *largeFunctionBackoff = true;
    }
    if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_2 &&
        level > MIN_LEVEL) {
      level--;
      *largeFunctionBackoff = true;
    }
    if (rootFunctionBodyLength > LARGE_FUNCTION_THRESH_3 &&
        level > MIN_LEVEL) {
      level--;
      *largeFunctionBackoff = true;
    }

    // Having established `level`, check whether the callee is small enough.
    MOZ_RELEASE_ASSERT(level >= MIN_LEVEL && level <= MAX_LEVEL);
    int32_t allowedSize = baseSize[level - MIN_LEVEL];
    allowedSize -= int32_t(40 * inliningDepth);
    return allowedSize > 0 && bodyLength <= uint32_t(allowedSize);
  }
};

}  // namespace wasm
}  // namespace js

#endif /* wasm_WasmHeuristics_h */