#pragma once
// ============================================================================
// == ./include/llama/Tuple.hpp ==
// ==
// Copyright 2023 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once
// ============================================================================
// == ./include/llama/Meta.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once
// ============================================================================
// == ./include/llama/macros.hpp ==
// ==
// Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once

#ifdef __INTEL_COMPILER
#    error LLAMA has stopped supporting the Intel Classic Compiler after Intel announced its planned deprecation and \
    replacement by the Intel LLVM-based compiler. Please migrate to the Intel LLVM-based compiler.
#endif

#if defined(__INTEL_LLVM_COMPILER)
// icx supports #pragma ivdep, but it will issue a diagnostic that it needs vectorize(assume_safety) to vectorize.
// Let's keep both pragmas for now.
#    define LLAMA_INDEPENDENT_DATA                                                                                    \
        _Pragma("ivdep") _Pragma("clang loop vectorize(assume_safety) interleave(assume_safety)")
#elif defined(__clang__)
#    define LLAMA_INDEPENDENT_DATA _Pragma("clang loop vectorize(assume_safety) interleave(assume_safety)")
#elif defined(__NVCOMPILER)
#    define LLAMA_INDEPENDENT_DATA _Pragma("ivdep")
#elif defined(__GNUC__)
#    define LLAMA_INDEPENDENT_DATA _Pragma("GCC ivdep")
#elif defined(_MSC_VER)
#    define LLAMA_INDEPENDENT_DATA __pragma(loop(ivdep))
#else
/// May be put in front of a loop statement. Indicates that all (!) data access inside the loop is independent, so
/// the loop can be safely vectorized. Example:
/// \code{.cpp}
///     LLAMA_INDEPENDENT_DATA
///     for(int i = 0; i < N; ++i)
///         // because of LLAMA_INDEPENDENT_DATA the compiler knows that a and b
///         // do not overlap and the operation can safely be vectorized
///         a[i] += b[i];
/// \endcode
#    define LLAMA_INDEPENDENT_DATA
#endif

#ifndef LLAMA_FORCE_INLINE
#    if defined(__NVCC__) || defined(__HIP__)
#        define LLAMA_FORCE_INLINE __forceinline__
#    elif defined(__GNUC__) || defined(__clang__)
#        define LLAMA_FORCE_INLINE inline __attribute__((always_inline))
#    elif defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#        define LLAMA_FORCE_INLINE __forceinline
#    else
/// Forces the compiler to inline a function annotated with this macro
#        define LLAMA_FORCE_INLINE inline
#        warning LLAMA_FORCE_INLINE is only defined to "inline" for this compiler
#    endif
#endif

#ifndef LLAMA_PRAGMA
#    define LLAMA_PRAGMA(tokens) _Pragma(#tokens)
#endif

#ifndef LLAMA_UNROLL
#    if defined(__HIP__) || defined(__NVCC__) || defined(__NVCOMPILER) || defined(__clang__)                          \
        || defined(__INTEL_LLVM_COMPILER)
#        define LLAMA_UNROLL(...) LLAMA_PRAGMA(unroll __VA_ARGS__)
#    elif defined(__GNUG__)
#        define LLAMA_UNROLL(...) LLAMA_PRAGMA(GCC unroll __VA_ARGS__)
#    elif defined(_MSC_VER)
// MSVC does not support a pragma for unrolling
#        define LLAMA_UNROLL(...)
#    else
/// Requests the compiler to unroll the loop following this directive. An optional unrolling count may be provided
/// as argument, which must be a constant expression.
#        define LLAMA_UNROLL(...)
#        warning LLAMA_UNROLL is not implemented for your compiler
#    endif
#endif
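// Usage sketch (illustrative addition, not from the original source): LLAMA_UNROLL is placed directly in front of
// the loop it should unroll; the optional argument is the unroll factor.
//
//     LLAMA_UNROLL(4)
//     for(int i = 0; i < n; i++)
//         sum += data[i];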
#ifndef LLAMA_ACC
#    if defined(__HIP__) || defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
#        define LLAMA_ACC __device__
#    elif defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#        define LLAMA_ACC
#    else
#        define LLAMA_ACC
#        warning LLAMA_ACC is only defined empty for this compiler
#    endif
#endif

#ifndef LLAMA_HOST_ACC
#    if defined(__HIP__) || defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
#        define LLAMA_HOST_ACC __host__ __device__
#    elif defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#        define LLAMA_HOST_ACC
#    else
/// Some offloading parallelization language extensions such as CUDA, OpenACC or OpenMP 4.5 need to specify whether
/// a class, struct, function or method "resides" on the host, the accelerator (the offloading device) or both.
/// LLAMA supports this by marking every function needed on an accelerator with `LLAMA_HOST_ACC`.
#        define LLAMA_HOST_ACC
#        warning LLAMA_HOST_ACC is only defined empty for this compiler
#    endif
#endif

#define LLAMA_FN_HOST_ACC_INLINE LLAMA_FORCE_INLINE LLAMA_HOST_ACC
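// Usage sketch (illustrative addition, not from the original source): a function that must be callable from both
// host and device code and should always be inlined combines both macros via LLAMA_FN_HOST_ACC_INLINE.
//
//     LLAMA_FN_HOST_ACC_INLINE constexpr auto square(int x) -> int
//     {
//         return x * x;
//     }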
#ifndef LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS
#    if defined(__clang__) || defined(__INTEL_LLVM_COMPILER)
#        define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __attribute__((always_inline)) __VA_ARGS__
#    elif defined(__GNUC__) || (defined(__NVCC__) && !defined(_MSC_VER))
#        define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__ __attribute__((always_inline))
#    elif defined(_MSC_VER)
#        define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...)                                                             \
            __VA_ARGS__ /* FIXME: MSVC cannot combine constexpr and [[msvc::forceinline]] */
#    else
#        define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__
#        warning LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS not defined for this compiler
#    endif
#endif
#ifndef LLAMA_LAMBDA_INLINE
/// Gives strong indication to the compiler to inline the attributed lambda.
#    define LLAMA_LAMBDA_INLINE LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS()
#endif

#ifndef LLAMA_SUPPRESS_HOST_DEVICE_WARNING
#    if defined(__NVCC__) && !defined(__clang__)
#        define LLAMA_SUPPRESS_HOST_DEVICE_WARNING _Pragma("nv_exec_check_disable")
#    else
/// Suppresses the nvcc warning: 'calling a __host__ function from __host__ __device__ function.'
/// This macro can be applied to function declarations
#        define LLAMA_SUPPRESS_HOST_DEVICE_WARNING
#    endif
#endif

#ifndef LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
#    ifdef __NVCC__
#        ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#            define LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING                                                         \
                _Pragma("nv_diag_suppress 20011") _Pragma("nv_diag_suppress 20014")
#        else
#            define LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING                                                         \
                _Pragma("diag_suppress 20011") _Pragma("diag_suppress 20014")
#        endif
#    else
/// Suppresses the nvcc warnings:
/// 'calling a __host__ function from a __host__ __device__ function is not allowed' and
/// 'calling a __host__ function("...") from a __host__ __device__ function("...") is not allowed'
/// This macro can be applied before the concerned code block, which then needs to be ended with \ref
/// LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING.
#        define LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
#    endif
#endif
#ifndef LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
#    ifdef __NVCC__
#        ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#            define LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING                                                           \
                _Pragma("nv_diag_default 20011") _Pragma("nv_diag_default 20014")
#        else
#            define LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING _Pragma("diag_default 20011") _Pragma("diag_default 20014")
#        endif
#    else
#        define LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
#    endif
#endif

/// Forces a copy of a value. This is useful to prevent ODR usage of constants when compiling for GPU targets.
#define LLAMA_COPY(x) decltype(x)(x)

// https://devblogs.microsoft.com/cppblog/optimizing-the-layout-of-empty-base-classes-in-vs2015-update-2-3/
#if defined(_MSC_VER)
#    define LLAMA_DECLSPEC_EMPTY_BASES __declspec(empty_bases)
#else
#    define LLAMA_DECLSPEC_EMPTY_BASES
#endif

/// Expands to likely if [[likely]] is supported by the compiler. Use as [[LLAMA_LIKELY]].
#if __has_cpp_attribute(likely)
#    define LLAMA_LIKELY likely
#else
#    define LLAMA_LIKELY
#endif

/// Expands to unlikely if [[unlikely]] is supported by the compiler. Use as [[LLAMA_UNLIKELY]].
#if __has_cpp_attribute(unlikely)
#    define LLAMA_UNLIKELY unlikely
#else
#    define LLAMA_UNLIKELY
#endif

/// Expands to consteval if the compiler supports the keyword, otherwise to constexpr.
// TODO(bgruber): reevaluate with nvhpc in the future or find workaround
#if defined(__cpp_consteval) && !defined(__NVCOMPILER)
#    define LLAMA_CONSTEVAL consteval
#else
#    define LLAMA_CONSTEVAL constexpr
#endif

#ifndef LLAMA_EXPORT
/// Annotation of all LLAMA public APIs. Expands to nothing by default. Can be defined to 'export' when building
/// LLAMA as a C++20 module.
#    define LLAMA_EXPORT
#endif

// TODO(bgruber): clang 12-15 (libstdc++ from gcc 11.2 or gcc 12.1) fail to compile this currently with the issue
// described here:
// https://stackoverflow.com/questions/64300832/why-does-clang-think-gccs-subrange-does-not-satisfy-gccs-ranges-begin-functi
// Intel LLVM compiler is also using the clang frontend
#define CAN_USE_RANGES 0
#if __has_include(<version>)
#    include <version>
#    if defined(__cpp_concepts) && defined(__cpp_lib_ranges) && (!defined(__clang__) || __clang_major__ >= 16)        \
        && !defined(__INTEL_LLVM_COMPILER) && (!defined(_MSC_VER) || _MSC_VER > 1932) && !defined(__NVCOMPILER)
#        undef CAN_USE_RANGES
#        define CAN_USE_RANGES 1
#    endif
#endif
// ==
// == ./include/llama/macros.hpp ==
// ============================================================================

#include <boost/mp11.hpp>

namespace llama
{
    // make mp11 directly available in the llama namespace
    using namespace boost::mp11;

    namespace internal
    {
        // adapted from boost::mp11, but with LLAMA_FN_HOST_ACC_INLINE
        template<template<typename...> typename L, typename... T, typename F>
        // NOLINTNEXTLINE(readability-identifier-naming)
        LLAMA_FN_HOST_ACC_INLINE constexpr void mp_for_each_inline_impl(L<T...>, F&& f)
        {
            using A = int[sizeof...(T)];
            (void) A{((void) f(T{}), 0)...};
        }

        template<typename FromList, template<auto...> class ToList>
        struct mp_unwrap_values_into_impl;

        template<template<class...> class FromList, typename... Values, template<auto...> class ToList>
        struct mp_unwrap_values_into_impl<FromList<Values...>, ToList>
        {
            using type = ToList<Values::value...>;
        };

        template<typename FromList, template<auto...> class ToList>
        using mp_unwrap_values_into = typename mp_unwrap_values_into_impl<FromList, ToList>::type;

        template<typename E, typename... Args>
        struct ReplacePlaceholdersImpl
        {
            using type = E;
        };

        template<std::size_t I, typename... Args>
        struct ReplacePlaceholdersImpl<mp_arg<I>, Args...>
        {
            using type = mp_at_c<mp_list<Args...>, I>;
        };

        template<template<typename...> typename E, typename... Ts, typename... Args>
        struct ReplacePlaceholdersImpl<E<Ts...>, Args...>
        {
            using type = E<typename ReplacePlaceholdersImpl<Ts, Args...>::type...>;
        };
    } // namespace internal

    /// Like boost::mp11::mp_for_each, but marked with LLAMA_FN_HOST_ACC_INLINE
    template<typename L, typename F>
    // NOLINTNEXTLINE(readability-identifier-naming)
    LLAMA_FN_HOST_ACC_INLINE constexpr void mp_for_each_inline(F&& f)
    {
        internal::mp_for_each_inline_impl(mp_rename<L, mp_list>{}, std::forward<F>(f));
    }

    LLAMA_EXPORT
    template<typename Expression, typename... Args>
    using ReplacePlaceholders = typename internal::ReplacePlaceholdersImpl<Expression, Args...>::type;
} // namespace llama
// ==
// == ./include/llama/Meta.hpp ==
// ============================================================================
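// Usage sketch (illustrative addition, not from the original source): mp_for_each_inline visits each type of a type
// list by passing a default-constructed value of it to the functor, like boost::mp11::mp_for_each.
//
//     llama::mp_for_each_inline<llama::mp_list<int, float, double>>(
//         [](auto element) { using T = decltype(element); /* use T */ });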
// #include "macros.hpp" // amalgamate: file already inlined

namespace llama
{
    namespace internal
    {
        template<std::size_t I, typename T, bool = std::is_empty_v<T> && !std::is_final_v<T>>
        struct LLAMA_DECLSPEC_EMPTY_BASES TupleLeaf
        {
            T val;

            LLAMA_FN_HOST_ACC_INLINE constexpr auto value() -> T& { return val; }
            LLAMA_FN_HOST_ACC_INLINE constexpr auto value() const -> const T& { return val; }
        };

        template<std::size_t I, typename T>
        struct LLAMA_DECLSPEC_EMPTY_BASES TupleLeaf<I, T, true>
        {
            static_assert(!std::is_reference_v<T>, "llama::Tuple cannot store references to stateless types");

            LLAMA_FN_HOST_ACC_INLINE constexpr explicit TupleLeaf(T) {}
            LLAMA_FN_HOST_ACC_INLINE constexpr auto value() const -> T { return {}; }
        };
    } // namespace internal

    LLAMA_EXPORT
    template<typename... Elements>
    struct LLAMA_DECLSPEC_EMPTY_BASES Tuple
    {
    };

    /// Tuple class like `std::tuple` but suitable for use with offloading devices like GPUs.
    LLAMA_EXPORT
    template<typename TFirstElement, typename... RestElements>
    struct LLAMA_DECLSPEC_EMPTY_BASES Tuple<TFirstElement, RestElements...>
        : internal::TupleLeaf<1 + sizeof...(RestElements), TFirstElement>
        , Tuple<RestElements...>
    {
    private:
        using Leaf = internal::TupleLeaf<1 + sizeof...(RestElements), TFirstElement>;

    public:
        using FirstElement = TFirstElement;
        using RestTuple = Tuple<RestElements...>;

        constexpr Tuple() = default;

        /// Construct a tuple from values of the same types as the tuple stores.
        LLAMA_FN_HOST_ACC_INLINE constexpr explicit Tuple(FirstElement first, RestElements... rest)
            : Leaf{std::move(first)}
            , RestTuple(std::move(rest)...)
        {
        }

        /// Construct a tuple from forwarded values of potentially different types as the tuple stores.
        // SFINAE away this ctor if tuple elements cannot be constructed from ctor arguments
        template<
            typename T,
            typename... Ts,
            std::enable_if_t<
                sizeof...(RestElements) == sizeof...(Ts) && std::is_constructible_v<TFirstElement, T>
                    && (std::is_constructible_v<RestElements, Ts> && ...),
                int> = 0>
        LLAMA_FN_HOST_ACC_INLINE constexpr explicit Tuple(T&& firstArg, Ts&&... restArgs)
            : Leaf{static_cast<FirstElement>(std::forward<T>(firstArg))}
            , RestTuple(std::forward<Ts>(restArgs)...)
        {
        }

        /// Returns the first element of the tuple
        LLAMA_FN_HOST_ACC_INLINE constexpr auto first() -> decltype(auto) { return Leaf::value(); }

        /// Returns the first element of the tuple
        LLAMA_FN_HOST_ACC_INLINE constexpr auto first() const -> decltype(auto) { return Leaf::value(); }

        /// Returns a tuple of all but the first element
        LLAMA_FN_HOST_ACC_INLINE constexpr auto rest() -> RestTuple& { return static_cast<RestTuple&>(*this); }

        /// Returns a tuple of all but the first element
        LLAMA_FN_HOST_ACC_INLINE constexpr auto rest() const -> const RestTuple&
        {
            return static_cast<const RestTuple&>(*this);
        }
    };

    LLAMA_EXPORT
    template<typename... Elements>
    LLAMA_HOST_ACC Tuple(Elements...) -> Tuple<std::remove_cv_t<std::remove_reference_t<Elements>>...>;

    LLAMA_EXPORT
    template<std::size_t I, typename... Elements>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto get(Tuple<Elements...>& tuple) -> auto&
    {
        using Base [[maybe_unused]] // clang claims Base is unused ...
            = internal::TupleLeaf<sizeof...(Elements) - I, mp_at_c<mp_list<Elements...>, I>>;
        return tuple.Base::value();
    }

    LLAMA_EXPORT
    template<std::size_t I, typename... Elements>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto get(const Tuple<Elements...>& tuple) -> const auto&
    {
        using Base [[maybe_unused]] // clang claims Base is unused ...
            = internal::TupleLeaf<sizeof...(Elements) - I, mp_at_c<mp_list<Elements...>, I>>;
        return tuple.Base::value();
    }
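    // Usage sketch (illustrative addition, not from the original source): llama::Tuple supports CTAD and positional
    // access.
    //
    //     llama::Tuple t{1, 2.0f, 3.0}; // deduces llama::Tuple<int, float, double>
    //     t.first() = 42;               // access the head element
    //     float f = llama::get<1>(t);   // positional access
    //     auto& r = t.rest();           // llama::Tuple<float, double> of the remaining elements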
} // namespace llama

LLAMA_EXPORT
template<typename... Elements>
struct std::tuple_size<llama::Tuple<Elements...>> // NOLINT(cert-dcl58-cpp)
{
    static constexpr auto value = sizeof...(Elements);
};

LLAMA_EXPORT
template<std::size_t I, typename... Elements>
struct std::tuple_element<I, llama::Tuple<Elements...>> // NOLINT(cert-dcl58-cpp)
{
    using type = boost::mp11::mp_at_c<llama::Tuple<Elements...>, I>;
};

namespace llama
{
    namespace internal
    {
        template<typename... ElementsA, typename... ElementsB, std::size_t... Is>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto areEqual(
            const Tuple<ElementsA...>& a,
            const Tuple<ElementsB...>& b,
            std::index_sequence<Is...>) -> bool
        {
            return ((get<Is>(a) == get<Is>(b)) && ...);
        }
    } // namespace internal

    LLAMA_EXPORT
    template<typename... ElementsA, typename... ElementsB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(const Tuple<ElementsA...>& a, const Tuple<ElementsB...>& b)
        -> bool
    {
        if constexpr(sizeof...(ElementsA) == sizeof...(ElementsB))
            if constexpr(mp_apply<
                             mp_all,
                             mp_transform<std::is_same, mp_list<ElementsA...>, mp_list<ElementsB...>>>::value)
                return internal::areEqual(a, b, std::make_index_sequence<sizeof...(ElementsA)>{});
        return false;
    }

    LLAMA_EXPORT
    template<typename... ElementsA, typename... ElementsB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(const Tuple<ElementsA...>& a, const Tuple<ElementsB...>& b)
        -> bool
    {
        return !(a == b);
    }

    namespace internal
    {
        template<typename Tuple1, typename Tuple2, std::size_t... Is1, std::size_t... Is2>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleCatImpl(
            const Tuple1& t1,
            const Tuple2& t2,
            std::index_sequence<Is1...>,
            std::index_sequence<Is2...>)
        {
            return Tuple{get<Is1>(t1)..., get<Is2>(t2)...};
        }
    } // namespace internal

    LLAMA_EXPORT
    template<typename Tuple1, typename Tuple2>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleCat(const Tuple1& t1, const Tuple2& t2)
    {
        return internal::tupleCatImpl(
            t1,
            t2,
            std::make_index_sequence<std::tuple_size_v<Tuple1>>{},
            std::make_index_sequence<std::tuple_size_v<Tuple2>>{});
    }

    namespace internal
    {
        template<
            std::size_t Pos,
            typename Tuple,
            typename Replacement,
            std::size_t... IsBefore,
            std::size_t... IsAfter>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleReplaceImpl(
            Tuple&& tuple,
            Replacement&& replacement,
            std::index_sequence<IsBefore...>,
            std::index_sequence<IsAfter...>)
        {
            return llama::Tuple{
                get<IsBefore>(std::forward<Tuple>(tuple))...,
                std::forward<Replacement>(replacement),
                get<Pos + 1 + IsAfter>(std::forward<Tuple>(tuple))...};
        }
    } // namespace internal

    /// Creates a copy of a tuple with the element at position Pos replaced by replacement.
    LLAMA_EXPORT
    template<std::size_t Pos, typename Tuple, typename Replacement>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleReplace(Tuple&& tuple, Replacement&& replacement)
    {
        return internal::tupleReplaceImpl<Pos>(
            std::forward<Tuple>(tuple),
            std::forward<Replacement>(replacement),
            std::make_index_sequence<Pos>{},
            std::make_index_sequence<std::tuple_size_v<std::decay_t<Tuple>> - Pos - 1>{});
    }

    namespace internal
    {
        template<std::size_t... Is, typename... Elements, typename Functor>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleTransformHelper(
            std::index_sequence<Is...>,
            const Tuple<Elements...>& tuple,
            const Functor& functor)
        {
            return Tuple{functor(get<Is>(tuple))...};
        }
    } // namespace internal

    /// Applies a functor to every element of a tuple, creating a new tuple with the result of the element
    /// transformations. The functor needs to implement a template `operator()` to which all tuple elements are
    /// passed.
    LLAMA_EXPORT
    template<typename... Elements, typename Functor>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleTransform(const Tuple<Elements...>& tuple, const Functor& functor)
    {
        // note: cannot use mp11::tuple_transform since that returns a std::tuple
        return internal::tupleTransformHelper(std::make_index_sequence<sizeof...(Elements)>{}, tuple, functor);
    }

    /// Returns a copy of the tuple without the first element.
    LLAMA_EXPORT
    template<typename... Elements>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto popFront(const Tuple<Elements...>& tuple)
    {
        return tuple.rest();
    }
} // namespace llama
// ==
// == ./include/llama/Tuple.hpp ==
// ============================================================================
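// Usage sketch (illustrative addition, not from the original source) of the tuple algorithms above:
//
//     llama::Tuple a{1, 2.0f};
//     llama::Tuple b{3.0};
//     auto c = llama::tupleCat(a, b);                                  // Tuple<int, float, double>{1, 2.0f, 3.0}
//     auto d = llama::tupleReplace<1>(c, 'x');                         // Tuple<int, char, double>
//     auto e = llama::tupleTransform(a, [](auto v) { return v + 1; }); // Tuple<int, float>{2, 3.0f}
//     auto f = llama::popFront(a);                                     // Tuple<float>{2.0f}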
// ============================================================================
// == ./include/llama/DumpMapping.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once

#if __has_include(<fmt/format.h>)
// ============================================================================
// == ./include/llama/ArrayIndexRange.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once
// ============================================================================
// == ./include/llama/ArrayExtents.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once
// ============================================================================
// == ./include/llama/Array.hpp ==
// ==
// Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once

// #include "macros.hpp" // amalgamate: file already inlined

#include <ostream>
#include <stdexcept>
#include <tuple>

namespace llama
{
    /// Array class like `std::array` but suitable for use with offloading devices like GPUs.
    /// \tparam T type of array elements.
    /// \tparam N number of array elements.
    LLAMA_EXPORT
    template<typename T, std::size_t N>
    // NOLINTNEXTLINE(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
    struct Array
    {
        using value_type = T;
        T element[N];

        LLAMA_FN_HOST_ACC_INLINE constexpr auto size() const { return N; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool { return N == 0; }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T* { return &element[0]; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const -> const T* { return &element[0]; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto end() -> T* { return &element[0] + N; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const -> const T* { return &element[0] + N; }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto front() -> T& { return element[0]; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto front() const -> const T& { return element[0]; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto back() -> T& { return element[N - 1]; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto back() const -> const T& { return element[N - 1]; }

        template<typename IndexType>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) -> T&
        {
            return element[idx];
        }

        template<typename IndexType>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) const -> const T&
        {
            return element[idx];
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto data() -> T* { return &element[0]; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto data() const -> const T* { return &element[0]; }

        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array& a, const Array& b) -> bool
        {
            for(std::size_t i = 0; i < N; ++i)
                if(a.element[i] != b.element[i])
                    return false;
            return true;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator!=(const Array& a, const Array& b) -> bool
        {
            return !(a == b);
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array& a, const Array& b) -> Array
        {
            Array temp{};
            for(std::size_t i = 0; i < N; ++i)
                temp[i] = a[i] + b[i];
            return temp;
        }

        template<std::size_t I>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto get() -> T&
        {
            return element[I];
        }

        template<std::size_t I>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto get() const -> const T&
        {
            return element[I];
        }
    };

    LLAMA_EXPORT
    template<typename T>
    struct Array<T, 0>
    {
        using value_type = T;

        LLAMA_FN_HOST_ACC_INLINE constexpr auto size() const { return 0; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto empty() const -> bool { return true; }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T* { return nullptr; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const -> const T* { return nullptr; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto end() -> T* { return nullptr; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const -> const T* { return nullptr; }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto front() -> T& { outOfRange(); }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto front() const -> const T& { outOfRange(); }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto back() -> T& { outOfRange(); }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto back() const -> const T& { outOfRange(); }

        template<typename IndexType>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&&) -> T&
        {
            outOfRange();
        }

        template<typename IndexType>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&&) const -> const T&
        {
            outOfRange();
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto data() -> T* { return nullptr; }
        LLAMA_FN_HOST_ACC_INLINE constexpr auto data() const -> const T* { return nullptr; }

        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array&, const Array&) -> bool
        {
            return true;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator!=(const Array&, const Array&) -> bool
        {
            return false;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array&, const Array&) -> Array { return {}; }

        template<std::size_t I>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto get() -> T&
        {
            outOfRange();
        }

        template<std::size_t I>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto get() const -> const T&
        {
            outOfRange();
        }

    private:
        [[noreturn]] void outOfRange() const { throw std::out_of_range{"Array has zero length"}; }
    };

    LLAMA_EXPORT
    template<typename First, typename... Args>
    Array(First, Args... args) -> Array<First, sizeof...(Args) + 1>;
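    // Usage sketch (illustrative addition, not from the original source): llama::Array mirrors std::array,
    // including CTAD and structured bindings via the tuple_size/tuple_element specializations below.
    //
    //     llama::Array a{1, 2, 3}; // deduces llama::Array<int, 3>
    //     a[0] = 42;
    //     auto [x, y, z] = a;      // structured bindings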
    LLAMA_EXPORT
    template<typename T, std::size_t N>
    auto operator<<(std::ostream& os, const Array<T, N>& a) -> std::ostream&
    {
        os << "Array{";
        bool first = true;
        for(auto e : a)
        {
            if(first)
                first = false;
            else
                os << ", ";
            os << e;
        }
        os << "}";
        return os;
    }

    LLAMA_EXPORT
    template<typename T, std::size_t N>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto pushFront([[maybe_unused]] Array<T, N> a, T v) -> Array<T, N + 1>
    {
        Array<T, N + 1> r{};
        r[0] = v;
        if constexpr(N > 0)
            for(std::size_t i = 0; i < N; i++)
                r[i + 1] = a[i];
        return r;
    }

    LLAMA_EXPORT
    template<typename T, std::size_t N>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto pushBack([[maybe_unused]] Array<T, N> a, T v) -> Array<T, N + 1>
    {
        Array<T, N + 1> r{};
        if constexpr(N > 0)
            for(std::size_t i = 0; i < N; i++)
                r[i] = a[i];
        r[N] = v;
        return r;
    }

    LLAMA_EXPORT
    template<typename T, std::size_t N>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto popBack([[maybe_unused]] Array<T, N> a)
    {
        static_assert(N > 0);
        Array<T, N - 1> r{};
        if constexpr(N > 1)
            for(std::size_t i = 0; i < N - 1; i++)
                r[i] = a[i];
        return r;
    }

    LLAMA_EXPORT
    template<typename T, std::size_t N>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto popFront([[maybe_unused]] Array<T, N> a)
    {
        static_assert(N > 0);
        Array<T, N - 1> r{};
        if constexpr(N > 1)
            for(std::size_t i = 0; i < N - 1; i++)
                r[i] = a[i + 1];
        return r;
    }

    LLAMA_EXPORT
    template<typename T, std::size_t N>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto product(Array<T, N> a) -> T
    {
        T prod = 1;
        for(auto s : a)
            prod *= s;
        return prod;
    }

    LLAMA_EXPORT
    template<typename T, std::size_t N>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto dot([[maybe_unused]] Array<T, N> a, [[maybe_unused]] Array<T, N> b) -> T
    {
        T r = 0;
        if constexpr(N > 0)
            for(std::size_t i = 0; i < N; i++)
                r += a[i] * b[i];
        return r;
    }
} // namespace llama

LLAMA_EXPORT
template<typename T, std::size_t N>
struct std::tuple_size<llama::Array<T, N>> : std::integral_constant<std::size_t, N> // NOLINT(cert-dcl58-cpp)
{
};

LLAMA_EXPORT
template<std::size_t I, typename T, std::size_t N>
struct std::tuple_element<I, llama::Array<T, N>> // NOLINT(cert-dcl58-cpp)
{
    using type = T;
};
// ==
// == ./include/llama/Array.hpp ==
// ============================================================================
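// Usage sketch (illustrative addition, not from the original source) of the Array utilities above:
//
//     constexpr llama::Array a{2, 3, 4};
//     constexpr auto b = llama::pushBack(a, 5); // Array<int, 4>{2, 3, 4, 5}
//     constexpr auto c = llama::popFront(a);    // Array<int, 2>{3, 4}
//     static_assert(llama::product(a) == 24);
//     static_assert(llama::dot(a, a) == 29);    // 4 + 9 + 16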
// #include "Meta.hpp" // amalgamate: file already inlined

#include <limits>
#include <type_traits>

namespace llama
{
    // TODO(bgruber): make this an alias in C++20, when we have CTAD for aliases
    /// Represents a run-time index into the array dimensions.
    /// \tparam Dim Compile-time number of dimensions.
    LLAMA_EXPORT
    template<typename T, std::size_t Dim>
    struct ArrayIndex : Array<T, Dim>
    {
        static constexpr std::size_t rank = Dim;
    };

    // allow comparing ArrayIndex with different size types:
    LLAMA_EXPORT
    template<std::size_t Dim, typename TA, typename TB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(ArrayIndex<TA, Dim> a, ArrayIndex<TB, Dim> b) -> bool
    {
        for(std::size_t i = 0; i < Dim; ++i)
            if(a[i] != b[i])
                return false;
        return true;
    }

    LLAMA_EXPORT
    template<std::size_t Dim, typename TA, typename TB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(ArrayIndex<TA, Dim> a, ArrayIndex<TB, Dim> b) -> bool
    {
        return !(a == b);
    }

    static_assert(
        std::is_trivially_default_constructible_v<ArrayIndex<int, 1>>); // so ArrayIndex<1>{} will produce a zeroed
                                                                        // index. Should hold for all dimensions,
                                                                        // but just checking for <1> here.
    static_assert(std::is_trivially_copy_constructible_v<ArrayIndex<int, 1>>);
    static_assert(std::is_trivially_move_constructible_v<ArrayIndex<int, 1>>);
    static_assert(std::is_trivially_copy_assignable_v<ArrayIndex<int, 1>>);
    static_assert(std::is_trivially_move_assignable_v<ArrayIndex<int, 1>>);

    namespace internal
    {
        template<typename Default, typename... Args>
        struct IndexTypeFromArgs
        {
            using type = Default;
        };

        template<typename Default, typename FirstInt, typename... Ints>
        struct IndexTypeFromArgs<Default, FirstInt, Ints...>
        {
            static_assert(std::conjunction_v<std::is_same<FirstInt, Ints>...>, "All index types must be the same");
            using type = FirstInt;
        };
    } // namespace internal

    LLAMA_EXPORT
    template<typename... Args>
    ArrayIndex(Args...)
        -> ArrayIndex<typename internal::IndexTypeFromArgs<std::size_t, Args...>::type, sizeof...(Args)>;
} // namespace llama

LLAMA_EXPORT
template<typename V, std::size_t N>
struct std::tuple_size<llama::ArrayIndex<V, N>> : std::integral_constant<std::size_t, N> // NOLINT(cert-dcl58-cpp)
{
};

LLAMA_EXPORT
template<std::size_t I, typename V, std::size_t N>
struct std::tuple_element<I, llama::ArrayIndex<V, N>> // NOLINT(cert-dcl58-cpp)
{
    using type = V;
};

namespace llama
{
    namespace internal
    {
        struct Dyn
        {
            template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
            // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
            LLAMA_FN_HOST_ACC_INLINE constexpr operator T() const
            {
                return static_cast<T>(-1);
            }

            template<typename T>
            LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator==(T i, Dyn) -> bool
            {
                return i == static_cast<T>(-1);
            }

            template<typename T>
            LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator==(Dyn d, T i) -> bool
            {
                return i == d;
            }

            template<typename T>
            LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator!=(T i, Dyn d) -> bool
            {
                return !(i == d);
            }

            template<typename T>
            LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator!=(Dyn d, T i) -> bool
            {
                return !(i == d);
            }
        };
    } // namespace internal

    LLAMA_EXPORT
    /// Used as a template argument to \ref ArrayExtents to mark a dynamic extent.
    inline constexpr auto dyn = internal::Dyn{};

    /// ArrayExtents holding compile and runtime indices. This is conceptually equivalent to the std::extent of
    /// std::mdspan (\see: https://wg21.link/P0009) including the changes to make the size_type controllable (\see:
    /// https://wg21.link/P2553).
    LLAMA_EXPORT
    template<typename T = std::size_t, T... Sizes>
    struct ArrayExtents : Array<T, ((Sizes == dyn) + ... + 0)>
    {
        static constexpr std::size_t rank = sizeof...(Sizes);
        static constexpr auto rankDynamic = ((Sizes == dyn) + ... + 0);
        static constexpr auto rankStatic = rank - rankDynamic;

        using Index = ArrayIndex<T, rank>;
        using value_type = T;

        template<std::size_t I>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto get() const -> value_type
        {
            using TypeList = mp_list_c<T, Sizes...>;
            constexpr auto extent = mp_at_c<TypeList, I>::value;
            if constexpr(extent != dyn)
                return extent;
            else
                return static_cast<const Array<T, rankDynamic>&>(
                    *this)[+mp_count<mp_take_c<TypeList, I>, std::integral_constant<T, dyn>>::value];
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](T i) const -> value_type
        {
            return mp_with_index<rank>(i, [&](auto ic) LLAMA_LAMBDA_INLINE { return get<decltype(ic)::value>(); });
        }

    private:
        template<std::size_t... Is>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray(std::index_sequence<Is...>) const -> Index
        {
            return {get<Is>()...};
        }

    public:
        LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray() const -> Index
        {
            return toArray(std::make_index_sequence<rank>{});
        }
    };

    LLAMA_EXPORT
    template<typename T>
    struct ArrayExtents<T>
    {
        static constexpr std::size_t rank = 0;
        static constexpr auto rankDynamic = 0;
        static constexpr auto rankStatic = 0;

        using Index = ArrayIndex<T, 0>;
        using value_type = T;

        LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray() const -> Index { return {}; }
    };

    LLAMA_EXPORT
    template<typename... Args>
    ArrayExtents(Args...) -> ArrayExtents<
        typename internal::IndexTypeFromArgs<std::size_t, Args...>::type,
        (Args{}, dyn)...>; // we just use Args to repeat dyn once per argument

    static_assert(std::is_trivially_default_constructible_v<ArrayExtents<std::size_t, 1>>);
    static_assert(std::is_trivially_copy_constructible_v<ArrayExtents<std::size_t, 1>>);
    static_assert(std::is_trivially_move_constructible_v<ArrayExtents<std::size_t, 1>>);
    static_assert(std::is_trivially_copy_assignable_v<ArrayExtents<std::size_t, 1>>);
    static_assert(std::is_trivially_move_assignable_v<ArrayExtents<std::size_t, 1>>);
    static_assert(std::is_empty_v<ArrayExtents<std::size_t, 1>>);

    LLAMA_EXPORT
    template<typename SizeTypeA, SizeTypeA... SizesA, typename SizeTypeB, SizeTypeB... SizesB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(
        ArrayExtents<SizeTypeA, SizesA...> a,
        ArrayExtents<SizeTypeB, SizesB...> b) -> bool
    {
        return a.toArray() == b.toArray();
    }

    LLAMA_EXPORT
    template<typename SizeTypeA, SizeTypeA... SizesA, typename SizeTypeB, SizeTypeB... SizesB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(
        ArrayExtents<SizeTypeA, SizesA...> a,
        ArrayExtents<SizeTypeB, SizesB...> b) -> bool
    {
        return !(a == b);
    }

    LLAMA_EXPORT
    template<typename SizeType, SizeType... Sizes>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto product(ArrayExtents<SizeType, Sizes...> e) -> SizeType
    {
        return product(e.toArray());
    }

    namespace internal
    {
        template<typename SizeType, SizeType Extent, std::size_t... Is>
        constexpr auto makeArrayExtents(std::index_sequence<Is...>)
        {
            return ArrayExtents<SizeType, (static_cast<void>(Is), Extent)...>{};
        }
    } // namespace internal

    LLAMA_EXPORT
    /// N-dimensional ArrayExtents where all N extents are Extent.
    template<typename SizeType, std::size_t N, SizeType Extent>
    using ArrayExtentsNCube = decltype(internal::makeArrayExtents<SizeType, Extent>(std::make_index_sequence<N>{}));

    LLAMA_EXPORT
    /// N-dimensional ArrayExtents where all values are dynamic.
    template<typename SizeType, std::size_t N>
    using ArrayExtentsDynamic = ArrayExtentsNCube<SizeType, N, dyn>;

    LLAMA_EXPORT
    template<typename SizeType, std::size_t Dim, typename Func, typename... OuterIndices>
    LLAMA_FN_HOST_ACC_INLINE void forEachArrayIndex(
        [[maybe_unused]] const ArrayIndex<SizeType, Dim>& extents,
        Func&& func,
        OuterIndices... outerIndices)
    {
        constexpr auto fixedIndices = sizeof...(outerIndices);
        LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
        if constexpr(fixedIndices < Dim)
        {
            for(SizeType i = 0; i < extents[fixedIndices]; i++)
                forEachArrayIndex(extents, std::forward<Func>(func), outerIndices..., i);
        }
        else
        {
            std::forward<Func>(func)(ArrayIndex{outerIndices...});
        }
        LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
    }

    LLAMA_EXPORT
    template<typename SizeType, SizeType... Sizes, typename Func>
    LLAMA_FN_HOST_ACC_INLINE void forEachArrayIndex(ArrayExtents<SizeType, Sizes...> extents, Func&& func)
    {
        forEachArrayIndex(extents.toArray(), std::forward<Func>(func));
    }
} // namespace llama

LLAMA_EXPORT
template<typename SizeType, SizeType... Sizes>
struct std::tuple_size<llama::ArrayExtents<SizeType, Sizes...>> // NOLINT(cert-dcl58-cpp)
    : std::integral_constant<std::size_t, sizeof...(Sizes)>
{
};

LLAMA_EXPORT
template<std::size_t I, typename SizeType, SizeType... Sizes>
struct std::tuple_element<I, llama::ArrayExtents<SizeType, Sizes...>> // NOLINT(cert-dcl58-cpp)
{
    using type = SizeType;
};
// ==
// == ./include/llama/ArrayExtents.hpp ==
// ============================================================================
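// Usage sketch (illustrative addition, not from the original source): extents can mix static sizes and llama::dyn;
// runtime values are passed for the dynamic extents. forEachArrayIndex then visits every index in the iteration
// space.
//
//     llama::ArrayExtents<std::size_t, 4, llama::dyn> extents{10}; // 4 x 10, second extent is dynamic
//     llama::forEachArrayIndex(extents, [](llama::ArrayIndex<std::size_t, 2> ai) { /* visit ai */ });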
// ============================================================================
// == ./include/llama/Core.hpp ==
// ==
// Copyright 2023 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once

// #include "ArrayExtents.hpp" // amalgamate: file already inlined
// #include "Meta.hpp" // amalgamate: file already inlined
// ============================================================================
// == ./include/llama/RecordCoord.hpp ==
// ==
// Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//
#pragma once

// #include "Meta.hpp" // amalgamate: file already inlined
// #include "macros.hpp" // amalgamate: file already inlined

#include <array>
// #include <ostream> // amalgamate: file already included
// #include <type_traits> // amalgamate: file already included

namespace llama
{
    /// Represents a coordinate for a record inside the record dimension tree.
    /// \tparam Coords... the compile time coordinate.
    LLAMA_EXPORT
    template<std::size_t... Coords>
    struct RecordCoord
    {
        /// The list of integral coordinates as `mp_list`.
        using List = mp_list_c<std::size_t, Coords...>;

        static constexpr std::size_t front = mp_front<List>::value;
        static constexpr std::size_t back = mp_back<List>::value;
        static constexpr std::size_t size = sizeof...(Coords);
    };

    LLAMA_EXPORT
    template<>
    struct RecordCoord<>
    {
        using List = mp_list_c<std::size_t>;
        static constexpr std::size_t size = 0;
    };

    LLAMA_EXPORT
    template<std::size_t... CoordsA, std::size_t... CoordsB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(RecordCoord<CoordsA...>, RecordCoord<CoordsB...>)
    {
        return false;
    }

    LLAMA_EXPORT
    template<std::size_t... Coords>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(RecordCoord<Coords...>, RecordCoord<Coords...>)
    {
        return true;
    }

    LLAMA_EXPORT
    template<std::size_t... CoordsA, std::size_t... CoordsB>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(RecordCoord<CoordsA...> a, RecordCoord<CoordsB...> b)
    {
        return !(a == b);
    }

    LLAMA_EXPORT
    template<typename T>
    inline constexpr bool isRecordCoord = false;

    LLAMA_EXPORT
    template<std::size_t... Coords>
    inline constexpr bool isRecordCoord<RecordCoord<Coords...>> = true;

    LLAMA_EXPORT
    template<std::size_t... RCs>
    auto operator<<(std::ostream& os, RecordCoord<RCs...>) -> std::ostream&
    {
        os << "RecordCoord<";
        bool first = true;
        for(auto rc : std::array{RCs...})
        {
            if(first)
                first = false;
            else
                os << ", ";
            os << rc;
        }
        os << ">";
        return os;
    }

    inline namespace literals
    {
        /// Literal operator for converting a numeric literal into a \ref RecordCoord.
        LLAMA_EXPORT
        template<char... Digits>
        constexpr auto operator"" _RC()
        {
            constexpr auto coord = []() constexpr
            {
                const char digits[] = {(Digits - 48)...};
                std::size_t acc = 0;
                std::size_t powerOf10 = 1;
                for(int i = sizeof...(Digits) - 1; i >= 0; i--)
                {
                    acc += digits[i] * powerOf10;
                    powerOf10 *= 10;
                }
                return acc;
            }();
            return RecordCoord<coord>{};
        }
    } // namespace literals

    /// Converts a type list of integral constants into a \ref RecordCoord.
    LLAMA_EXPORT
    template<typename L>
    using RecordCoordFromList = internal::mp_unwrap_values_into<L, RecordCoord>;

    /// Concatenate a set of \ref RecordCoord%s.
    LLAMA_EXPORT
    template<typename... RecordCoords>
    using Cat = RecordCoordFromList<mp_append<typename RecordCoords::List...>>;

    /// Concatenate a set of \ref RecordCoord%s instances.
    LLAMA_EXPORT
    template<typename... RecordCoords>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto cat(RecordCoords...)
    {
        return Cat<RecordCoords...>{};
    }

    /// RecordCoord without first coordinate component.
    LLAMA_EXPORT
    template<typename RecordCoord>
    using PopFront = RecordCoordFromList<mp_pop_front<typename RecordCoord::List>>;

    namespace internal
    {
        template<std::size_t... Coords1, std::size_t... Coords2>
        constexpr auto recordCoordCommonPrefixIsBiggerImpl(RecordCoord<Coords1...>, RecordCoord<Coords2...>) -> bool
        {
            // CTAD does not work if Coords1/2 is an empty pack
            std::array<std::size_t, sizeof...(Coords1)> a1{Coords1...};
            std::array<std::size_t, sizeof...(Coords2)> a2{Coords2...};
            for(std::size_t i = 0; i < std::min(a1.size(), a2.size()); i++)
            {
                if(a1[i] > a2[i])
                    return true;
                if(a1[i] < a2[i])
                    return false;
            }
            return false;
        }
    } // namespace internal

    /// Checks whether the first RecordCoord is bigger than the second.
    LLAMA_EXPORT
    template<typename First, typename Second>
    inline constexpr auto recordCoordCommonPrefixIsBigger
        = internal::recordCoordCommonPrefixIsBiggerImpl(First{}, Second{});

    namespace internal
    {
        template<std::size_t... Coords1, std::size_t... Coords2>
        constexpr auto recordCoordCommonPrefixIsSameImpl(RecordCoord<Coords1...>, RecordCoord<Coords2...>) -> bool
        {
            // CTAD does not work if Coords1/2 is an empty pack
            std::array<std::size_t, sizeof...(Coords1)> a1{Coords1...};
            std::array<std::size_t, sizeof...(Coords2)> a2{Coords2...};
            for(std::size_t i = 0; i < std::min(a1.size(), a2.size()); i++)
                if(a1[i] != a2[i])
                    return false;
            return true;
        }
    } // namespace internal

    /// Checks whether two \ref RecordCoord%s are the same or one is the prefix of the other.
    LLAMA_EXPORT
    template<typename First, typename Second>
    inline constexpr auto recordCoordCommonPrefixIsSame
        = internal::recordCoordCommonPrefixIsSameImpl(First{}, Second{});
} // namespace llama
// ==
// == ./include/llama/RecordCoord.hpp ==
// ============================================================================
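// Usage sketch (illustrative addition, not from the original source): record coordinates address nodes in a record
// dimension by child indices; they can be written as literals and concatenated.
//
//     using namespace llama::literals;
//     auto rc = 1_RC;                    // llama::RecordCoord<1>
//     auto rc2 = llama::cat(1_RC, 0_RC); // llama::RecordCoord<1, 0>
//     static_assert(llama::RecordCoord<1, 0>{} == llama::Cat<llama::RecordCoord<1>, llama::RecordCoord<0>>{});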
#include <cassert>
// #include <type_traits> // amalgamate: file already included

#if __has_include(<boost/describe/members.hpp>)
#    include <boost/describe/members.hpp>
#endif

namespace llama
{
    /// Anonymous naming for a \ref Field.
    LLAMA_EXPORT
    struct NoName
    {
    };

    /// @brief Tells whether the given type is allowed as a field type in LLAMA. Such types need to be trivially
    /// constructible and trivially destructible.
    LLAMA_EXPORT
    template<typename T>
    inline constexpr bool isAllowedFieldType = std::is_trivially_destructible_v<T>;

    /// Record dimension tree node which may either be a leaf or refer to a child tree presented as another \ref
    /// Record.
    /// \tparam Tag Name of the node. May be any type (struct, class).
    /// \tparam Type Type of the node. May be one of three cases. 1. another sub tree consisting of a nested \ref
    /// Record. 2. an array of static size of any type, in which case a Record with as many \ref Field%s as the
    /// array size is created, named \ref RecordCoord specialized on consecutive numbers I. 3. A scalar type
    /// different from \ref Record, making this node a leaf of this type.
    LLAMA_EXPORT
    template<typename Tag, typename Type>
    struct Field
    {
        static_assert(isAllowedFieldType<Type>, "This field's type is not allowed");
    };

    LLAMA_EXPORT
    template<typename T>
    inline constexpr bool isField = false;

    LLAMA_EXPORT
    template<typename Tag, typename Type>
    inline constexpr bool isField<Field<Tag, Type>> = true;

    /// A type list of \ref Field%s which may be used to define a record dimension.
    LLAMA_EXPORT
    template<typename... Fields>
#if __cpp_concepts
    // Cannot use a fold expression here, because clang/nvcc/icx cannot handle more than 256 arguments.
    // If you get an error here, then you passed a type which is not llama::Field as argument to Record
    requires(mp_all<mp_bool<isField<Fields>>...>::value)
#endif
    struct Record
    {
    };

#if __cpp_nontype_template_args >= 201911L && !defined(__EDG__)
/// Defined when string literals are supported in the record dimension. See also \ref NamedField.
#    define LLAMA_HAS_STRING_FIELDS

    namespace internal
    {
        // N includes the char to store the null-terminator
        template<std::size_t N>
        struct FixedString
        {
            constexpr FixedString(const char* str) { std::copy(str, str + N, data); }

            char data[N];
        };

        template<std::size_t N>
        FixedString(const char (&str)[N]) -> FixedString<N>;

        template<FixedString Name>
        struct StringTag
        {
        };
    } // namespace internal

    inline namespace literals
    {
        /// Literal operator for converting a string literal "abc"_Name to a StringTag<"Name">.
        LLAMA_EXPORT
        template<internal::FixedString Name>
        auto operator"" _Name()
        {
            return internal::StringTag<Name>{};
        }
    } // namespace literals

    /// Alternative to \ref Field. Use with string literals, e.g. NamedField<"x", float>. Access at the \ref View
    /// requires to use "x"_Name then.
    LLAMA_EXPORT
    template<internal::FixedString Tag, typename Type>
    using NamedField = Field<internal::StringTag<Tag>, Type>;
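    // Usage sketch (illustrative addition, not from the original source; the tag and record names are made up): a
    // record dimension is a tree of Fields built from empty tag types.
    //
    //     struct Pos {};
    //     struct X {};
    //     struct Y {};
    //     struct Mass {};
    //
    //     using Vec2 = llama::Record<llama::Field<X, float>, llama::Field<Y, float>>;
    //     using Particle = llama::Record<llama::Field<Pos, Vec2>, llama::Field<Mass, float>>;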
#    if __has_include(<boost/describe/members.hpp>)
/// Defined when LLAMA has support to reflect a C++ struct into a record dimension using Boost.Describe.
#        define LLAMA_CAN_REFLECT_RECORD_DIM

    namespace internal
    {
        template<typename T>
        auto reflectToRecordDim();

        template<typename C, typename T>
        auto memberPointerPointeeType(T C::*) -> T;

        constexpr auto constexpr_strlen(const char* s)
        {
            const char* end = s;
            while(*end != 0)
                end++;
            return end - s;
        }

        template<typename Member>
        using MakeFieldFromMemberDescriptor = NamedField<
            FixedString<constexpr_strlen(Member::name) + 1>(Member::name),
            decltype(reflectToRecordDim<decltype(memberPointerPointeeType(Member::pointer))>())>;

        template<typename T>
        auto reflectToRecordDim()
        {
            if constexpr(boost::describe::has_describe_members<T>::value)
            {
                using MemberList = boost::describe::describe_members<T, boost::describe::mod_public>;
                return mp_rename<mp_transform<MakeFieldFromMemberDescriptor, MemberList>, llama::Record>{};
            }
            else
                return T{};
        }
    } // namespace internal

    /// Reflects the given type T using Boost.Describe and creates a record dimension for it.
    LLAMA_EXPORT
    template<typename T>
    using ReflectToRecordDim = decltype(internal::reflectToRecordDim<T>());
#    endif
#endif

    LLAMA_EXPORT
    template<typename T>
    struct NrAndOffset
    {
        T nr;
        T offset;

        friend auto operator<<(std::ostream& os, const NrAndOffset& value) -> std::ostream&
        {
            return os << "NrAndOffset{" << value.nr << ", " << value.offset << "}";
        }
    };

    LLAMA_EXPORT
    template<typename Int>
    NrAndOffset(Int, Int) -> NrAndOffset<Int>;

    LLAMA_EXPORT
    template<typename TA, typename TB>
    auto operator==(const NrAndOffset<TA>& a, const NrAndOffset<TB>& b) -> bool
    {
        return a.nr == b.nr && a.offset == b.offset;
    }

    LLAMA_EXPORT
    template<typename TA, typename TB>
    auto operator!=(const NrAndOffset<TA>& a, const NrAndOffset<TB>& b) -> bool
    {
        return !(a == b);
    }

    /// Get the tag from a \ref Field.
    LLAMA_EXPORT
    template<typename Field>
    using GetFieldTag = mp_first<Field>;

    /// Get the type from a \ref Field.
    LLAMA_EXPORT
    template<typename Field>
    using GetFieldType = mp_second<Field>;

    LLAMA_EXPORT
    template<typename T>
    inline constexpr auto isRecord = false;

    LLAMA_EXPORT
    template<typename... Fields>
    inline constexpr auto isRecord<Record<Fields...>> = true;

    namespace internal
    {
        template<typename RecordDim, typename RecordCoord>
        struct GetTagsImpl;

        template<typename... Fields, std::size_t FirstCoord, std::size_t... Coords>
        struct GetTagsImpl<Record<Fields...>, RecordCoord<FirstCoord, Coords...>>
        {
            using Field = mp_at_c<mp_list<Fields...>, FirstCoord>;
            using ChildTag = GetFieldTag<Field>;
            using ChildType = GetFieldType<Field>;
            using type = mp_push_front<typename GetTagsImpl<ChildType, RecordCoord<Coords...>>::type, ChildTag>;
        };

        template<typename ChildType, std::size_t Count, std::size_t FirstCoord, std::size_t... Coords>
        struct GetTagsImpl<ChildType[Count], RecordCoord<FirstCoord, Coords...>>
        {
            using ChildTag = RecordCoord<FirstCoord>;
            using type = mp_push_front<typename GetTagsImpl<ChildType, RecordCoord<Coords...>>::type, ChildTag>;
        };

        template<typename T>
        struct GetTagsImpl<T, RecordCoord<>>
        {
            using type = mp_list<>;
        };
    } // namespace internal

    /// Get the tags of all \ref Field%s from the root of the record dimension tree down to the node identified by
    /// \ref RecordCoord.
    LLAMA_EXPORT
    template<typename RecordDim, typename RecordCoord>
    using GetTags = typename internal::GetTagsImpl<RecordDim, RecordCoord>::type;

    namespace internal
    {
        template<typename RecordDim, typename RecordCoord>
        struct GetTagImpl
        {
            using type = mp_back<GetTags<RecordDim, RecordCoord>>;
        };

        template<typename RecordDim>
        struct GetTagImpl<RecordDim, RecordCoord<>>
        {
            using type = NoName;
        };
    } // namespace internal

    /// Get the tag of the \ref Field at a \ref RecordCoord inside the record dimension tree.
    LLAMA_EXPORT
    template<typename RecordDim, typename RecordCoord>
    using GetTag = typename internal::GetTagImpl<RecordDim, RecordCoord>::type;

    /// Is true if, starting at two coordinates in two record dimensions, all subsequent nodes in the record
    /// dimension tree have the same tag.
    /// \tparam RecordDimA First record dimension.
    /// \tparam RecordCoordA \ref RecordCoord based on RecordDimA along which the tags are compared.
    /// \tparam RecordDimB Second record dimension.
    /// \tparam RecordCoordB \ref RecordCoord based on RecordDimB along which the tags are compared.
    /// \hideinitializer
    LLAMA_EXPORT
    template<typename RecordDimA, typename RecordCoordA, typename RecordDimB, typename RecordCoordB>
    inline constexpr auto hasSameTags = []() constexpr
    {
        if constexpr(RecordCoordA::size != RecordCoordB::size)
            return false;
        else if constexpr(RecordCoordA::size == 0 && RecordCoordB::size == 0)
            return true;
        else
            return std::is_same_v<GetTags<RecordDimA, RecordCoordA>, GetTags<RecordDimB, RecordCoordB>>;
    }();

    namespace internal
    {
        template<typename FieldList, typename Tag>
        struct FindFieldByTag
        {
            template<typename Field>
            using HasTag = std::is_same<GetFieldTag<Field>, Tag>;

            static constexpr auto value = mp_find_if<FieldList, HasTag>::value;
        };

        template<typename RecordDim, typename RecordCoord, typename... Tags>
        struct GetCoordFromTagsImpl
        {
            static_assert(mp_size<RecordDim>::value != 0, "Tag combination is not valid");
        };

        template<typename... Fields, std::size_t... ResultCoords, typename FirstTag, typename... Tags>
        struct GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<ResultCoords...>, FirstTag, Tags...>
        {
            static constexpr auto tagIndex = FindFieldByTag<mp_list<Fields...>, FirstTag>::value;
            static_assert(
                tagIndex < sizeof...(Fields),
                "FirstTag was not found inside this Record. Does your record dimension contain the tag you access "
                "with?");

            using ChildType = GetFieldType<mp_at_c<mp_list<Fields...>, tagIndex>>;
            using type =
                typename GetCoordFromTagsImpl<ChildType, RecordCoord<ResultCoords..., tagIndex>, Tags...>::type;
        };
        template<typename ChildType, std::size_t Count, std::size_t... ResultCoords, typename FirstTag, typename... Tags>
        struct GetCoordFromTagsImpl<ChildType[Count], RecordCoord<ResultCoords...>, FirstTag, Tags...>
        {
            static_assert(isRecordCoord<FirstTag>, "Please use a RecordCoord to index into static arrays");
            static_assert(FirstTag::size == 1, "Expected RecordCoord with 1 coordinate");
            static_assert(FirstTag::front < Count, "Index out of bounds");

            using type =
                typename GetCoordFromTagsImpl<ChildType, RecordCoord<ResultCoords..., FirstTag::front>, Tags...>::type;
        };

        template<typename RecordDim, std::size_t... ResultCoords>
        struct GetCoordFromTagsImpl<RecordDim, RecordCoord<ResultCoords...>>
        {
            using type = RecordCoord<ResultCoords...>;
        };

        // unpack a list of tags
        template<typename... Fields, typename... Tags>
        struct GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<>, mp_list<Tags...>>
            : GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<>, Tags...>
        {
        };

        template<typename ChildType, std::size_t Count, typename... Tags>
        struct GetCoordFromTagsImpl<ChildType[Count], RecordCoord<>, mp_list<Tags...>>
            : GetCoordFromTagsImpl<ChildType[Count], RecordCoord<>, Tags...>
        {
        };

        // pass through a RecordCoord
        template<typename RecordDim, std::size_t... RecordCoords>
        struct GetCoordFromTagsImpl<RecordDim, RecordCoord<>, RecordCoord<RecordCoords...>>
        {
            using type = RecordCoord<RecordCoords...>;
        };
    } // namespace internal

    /// Converts a series of tags, or a list of tags, navigating down a record dimension into a \ref RecordCoord. A
    /// RecordCoord will be passed through unmodified.
    LLAMA_EXPORT
    template<typename RecordDim, typename... TagsOrTagList>
    using GetCoordFromTags = typename internal::GetCoordFromTagsImpl<RecordDim, RecordCoord<>, TagsOrTagList...>::type;

    namespace internal
    {
        template<typename RecordDim, typename... RecordCoordOrTags>
        struct GetTypeImpl
        {
            using type = typename GetTypeImpl<RecordDim, GetCoordFromTags<RecordDim, RecordCoordOrTags...>>::type;
        };

        template<typename... Children, std::size_t HeadCoord, std::size_t... TailCoords>
        struct GetTypeImpl<Record<Children...>, RecordCoord<HeadCoord, TailCoords...>>
        {
            using ChildType = GetFieldType<mp_at_c<mp_list<Children...>, HeadCoord>>;
            using type = typename GetTypeImpl<ChildType, RecordCoord<TailCoords...>>::type;
        };

        template<typename ChildType, std::size_t N, std::size_t HeadCoord, std::size_t... TailCoords>
        struct GetTypeImpl<ChildType[N], RecordCoord<HeadCoord, TailCoords...>>
        {
            using type = typename GetTypeImpl<ChildType, RecordCoord<TailCoords...>>::type;
        };

        template<typename T>
        struct GetTypeImpl<T, RecordCoord<>>
        {
            static_assert(isAllowedFieldType<T>);
            using type = T;
        };
    } // namespace internal

    /// Returns the type of a node in a record dimension tree identified by a given \ref RecordCoord or a series of
    /// tags.
    LLAMA_EXPORT
    template<typename RecordDim, typename... RecordCoordOrTags>
    using GetType = typename internal::GetTypeImpl<RecordDim, RecordCoordOrTags...>::type;

    namespace internal
    {
        template<typename RecordDim, typename RecordCoord>
        struct LeafRecordCoordsImpl;

        template<typename T, std::size_t... RCs>
        struct LeafRecordCoordsImpl<T, RecordCoord<RCs...>>
        {
            using type = mp_list<RecordCoord<RCs...>>;
        };

        template<typename... Fields, std::size_t... RCs>
        struct LeafRecordCoordsImpl<Record<Fields...>, RecordCoord<RCs...>>
        {
            template<std::size_t... Is>
            static auto help(std::index_sequence<Is...>)
            {
                return mp_append<
                    typename LeafRecordCoordsImpl<GetFieldType<Fields>, RecordCoord<RCs..., Is>>::type...>{};
            }
            using type = decltype(help(std::make_index_sequence<sizeof...(Fields)>{}));
        };

        template<typename Child, std::size_t N, std::size_t... RCs>
        struct LeafRecordCoordsImpl<Child[N], RecordCoord<RCs...>>
        {
            template<std::size_t... Is>
            static auto help(std::index_sequence<Is...>)
            {
                return mp_append<typename LeafRecordCoordsImpl<Child, RecordCoord<RCs..., Is>>::type...>{};
            }
            using type = decltype(help(std::make_index_sequence<N>{}));
        };
    } // namespace internal

    /// Returns a flat type list containing all record coordinates to all leaves of the given record dimension.
    LLAMA_EXPORT
    template<typename RecordDim>
    using LeafRecordCoords = typename internal::LeafRecordCoordsImpl<RecordDim, RecordCoord<>>::type;

    /// Iterates over the record dimension tree and calls a functor on each element.
    /// \param functor Functor to execute at each element. Needs to have `operator()` with a template parameter for
    /// the \ref RecordCoord in the record dimension tree.
    /// \param baseCoord \ref RecordCoord at which the iteration should be started. The functor is called on
    /// elements beneath this coordinate.
    LLAMA_EXPORT
    template<typename RecordDim, typename Functor, std::size_t... Coords>
    LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeafCoord(Functor&& functor, RecordCoord<Coords...> baseCoord)
    {
        LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
        mp_for_each_inline<LeafRecordCoords<GetType<RecordDim, RecordCoord<Coords...>>>>(
            [&](auto innerCoord) LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr)
            { std::forward<Functor>(functor)(cat(baseCoord, innerCoord)); });
        LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
    }
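    // Usage sketch (illustrative addition, not from the original source; reuses the hypothetical Particle record
    // from above):
    //
    //     using PosY = llama::GetCoordFromTags<Particle, Pos, Y>; // llama::RecordCoord<0, 1>
    //     using T = llama::GetType<Particle, PosY>;               // float
    //     llama::forEachLeafCoord<Particle>([](auto rc) { /* called with RecordCoord<0,0>, <0,1>, <1> */ });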
    /// Iterates over the record dimension tree and calls a functor on each element.
    /// \param functor Functor to execute at each element. Needs to have `operator()` with a template parameter for
    /// the \ref RecordCoord in the record dimension tree.
    /// \param baseTags Tags used to define where the iteration should be started. The functor is called on
    /// elements beneath this coordinate.
    LLAMA_EXPORT
    template<typename RecordDim, typename Functor, typename... Tags>
    LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeafCoord(Functor&& functor, Tags... /*baseTags*/)
    {
        forEachLeafCoord<RecordDim>(std::forward<Functor>(functor), GetCoordFromTags<RecordDim, Tags...>{});
    }

    namespace internal
    {
        template<typename T>
        struct FlattenRecordDimImpl
        {
            using type = mp_list<T>;
        };

        template<typename... Fields>
        struct FlattenRecordDimImpl<Record<Fields...>>
        {
            using type = mp_append<typename FlattenRecordDimImpl<GetFieldType<Fields>>::type...>;
        };

        template<typename Child, std::size_t N>
        struct FlattenRecordDimImpl<Child[N]>
        {
            using type = mp_repeat_c<typename FlattenRecordDimImpl<Child>::type, N>;
        };
    } // namespace internal

    /// Returns a flat type list containing all leaf field types of the given record dimension.
    LLAMA_EXPORT
    template<typename RecordDim>
    using FlatRecordDim = typename internal::FlattenRecordDimImpl<RecordDim>::type;

    /// The total number of fields in the recursively expanded record dimension.
    LLAMA_EXPORT
    template<typename RecordDim>
    inline constexpr std::size_t flatFieldCount = 1;

    LLAMA_EXPORT
    template<typename... Children>
    inline constexpr std::size_t flatFieldCount<Record<Children...>>
        = (flatFieldCount<GetFieldType<Children>> + ... + 0);

    LLAMA_EXPORT
    template<typename Child, std::size_t N>
    inline constexpr std::size_t flatFieldCount<Child[N]> = flatFieldCount<Child> * N;

    namespace internal
    {
        template<std::size_t I, typename RecordDim>
        inline constexpr std::size_t flatFieldCountBefore = 0;

        template<typename... Children>
        inline constexpr std::size_t flatFieldCountBefore<0, Record<Children...>> = 0;

        // recursive formulation to benefit from template instantiation memoization
        // this massively improves compilation time when this template is instantiated with a lot of different I
        template<std::size_t I, typename... Children>
        inline constexpr std::size_t flatFieldCountBefore<I, Record<Children...>>
            = flatFieldCountBefore<I - 1, Record<Children...>>
            + flatFieldCount<GetFieldType<mp_at_c<mp_list<Children...>, I - 1>>>;
    } // namespace internal

    /// The equivalent zero based index into a flat record dimension (\ref FlatRecordDim) of the given hierarchical
    /// record coordinate.
    LLAMA_EXPORT
    template<typename RecordDim, typename RecordCoord>
    inline constexpr std::size_t flatRecordCoord = 0;

    LLAMA_EXPORT
    template<typename T>
    inline constexpr std::size_t flatRecordCoord<T, RecordCoord<>> = 0;

    LLAMA_EXPORT
    template<typename... Children, std::size_t I, std::size_t... Is>
    inline constexpr std::size_t flatRecordCoord<Record<Children...>, RecordCoord<I, Is...>>
        = internal::flatFieldCountBefore<I, Record<Children...>>
        + flatRecordCoord<GetFieldType<mp_at_c<mp_list<Children...>, I>>, RecordCoord<Is...>>;

    LLAMA_EXPORT
    template<typename Child, std::size_t N, std::size_t I, std::size_t... Is>
    inline constexpr std::size_t flatRecordCoord<Child[N], RecordCoord<I, Is...>>
        = flatFieldCount<Child> * I + flatRecordCoord<Child, RecordCoord<Is...>>;
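    // Usage sketch (illustrative addition, not from the original source; reuses the hypothetical Particle record
    // from above). The flat record dimension enumerates the leaves depth-first: Pos.X, Pos.Y, Mass.
    //
    //     static_assert(llama::flatFieldCount<Particle> == 3);
    //     // llama::FlatRecordDim<Particle> is mp_list<float, float, float>
    //     static_assert(llama::flatRecordCoord<Particle, llama::RecordCoord<1>> == 2); // Mass comes after Pos.X/Y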
    namespace internal
    {
        template<typename TypeList>
        constexpr auto flatAlignOfImpl()
        {
            std::size_t maxAlign = 0;
            mp_for_each_inline<mp_transform<mp_identity, TypeList>>(
                [&](auto e) constexpr
                {
                    using T = typename decltype(e)::type;
                    maxAlign = std::max(maxAlign, alignof(T));
                });
            return maxAlign;
        }
    } // namespace internal

    /// The alignment of a type list if its elements would be in a normal struct. Effectively returns the maximum
    /// alignment value in the type list.
    LLAMA_EXPORT
    template<typename TypeList>
    inline constexpr std::size_t flatAlignOf = internal::flatAlignOfImpl<TypeList>();

    /// The alignment of a type T.
    LLAMA_EXPORT
    template<typename T>
    inline constexpr std::size_t alignOf = alignof(T);

    /// The alignment of a record dimension if its fields would be in a normal struct. Effectively returns the
    /// maximum alignment value in the type list.
    LLAMA_EXPORT
    template<typename... Fields>
    inline constexpr std::size_t alignOf<Record<Fields...>> = flatAlignOf<FlatRecordDim<Record<Fields...>>>;

    /// Returns the ceiling of a / b.
    LLAMA_EXPORT
    template<typename Integral>
    [[nodiscard]] LLAMA_FN_HOST_ACC_INLINE constexpr auto divCeil(Integral a, Integral b) -> Integral
    {
        return (a + b - 1) / b;
    }

    /// Returns the integral n rounded up to be a multiple of mult.
    LLAMA_EXPORT
    template<typename Integral>
    [[nodiscard]] LLAMA_FN_HOST_ACC_INLINE constexpr auto roundUpToMultiple(Integral n, Integral mult) -> Integral
    {
        return divCeil(n, mult) * mult;
    }

    namespace internal
    {
        template<typename TypeList, bool Align, bool IncludeTailPadding>
        constexpr auto sizeOfImpl() -> std::size_t
        {
            std::size_t size = 0;
            std::size_t maxAlign = 0; // NOLINT(misc-const-correctness)
            mp_for_each_inline<mp_transform<mp_identity, TypeList>>(
                [&](auto e) constexpr
                {
                    using T = typename decltype(e)::type;
                    if constexpr(Align)
                    {
                        size = roundUpToMultiple(size, alignof(T));
                        maxAlign = std::max(maxAlign, alignof(T));
                    }
                    // NOLINTNEXTLINE(readability-misleading-indentation)
                    size += sizeof(T);
                });

            // final padding, so next struct can start right away
            if constexpr(Align && IncludeTailPadding)
                if(maxAlign > 0)
                    size = roundUpToMultiple(size, maxAlign); // TODO(bgruber): we could use flatAlignOf
                                                              // here, at the cost of more template instantiations
            return size;
        }

        template<typename TypeList, std::size_t I, bool Align>
        constexpr auto offsetOfImplWorkaround() -> std::size_t;
    } // namespace internal

    /// The size of a type list if its elements would be in a normal struct.
    LLAMA_EXPORT
    template<typename TypeList, bool Align, bool IncludeTailPadding = true>
    inline constexpr std::size_t flatSizeOf = internal::sizeOfImpl<TypeList, Align, IncludeTailPadding>();

    /// The size of a type T.
    LLAMA_EXPORT
    template<typename T, bool Align = false, bool IncludeTailPadding = true>
    inline constexpr std::size_t sizeOf = sizeof(T);

    /// The size of a record dimension if its fields would be in a normal struct.
    LLAMA_EXPORT
    template<typename... Fields, bool Align, bool IncludeTailPadding>
    inline constexpr std::size_t sizeOf<Record<Fields...>, Align, IncludeTailPadding>
        = flatSizeOf<FlatRecordDim<Record<Fields...>>, Align, IncludeTailPadding>;

    /// The byte offset of an element in a type list if its elements would be in a normal struct.
    LLAMA_EXPORT
    template<typename TypeList, std::size_t I, bool Align>
    inline constexpr std::size_t flatOffsetOf = internal::offsetOfImplWorkaround<TypeList, I, Align>();

    namespace internal
    {
        // unfortunately, we cannot inline this function as an IILE, as MSVC complains:
        // fatal error C1202: recursive type or function dependency context too complex
        template<typename TypeList, std::size_t I, bool Align>
        constexpr auto offsetOfImplWorkaround() -> std::size_t
        {
            if constexpr(I == 0)
                return 0;
            else
            {
                std::size_t offset // NOLINT(misc-const-correctness)
                    = flatOffsetOf<TypeList, I - 1, Align> + sizeof(mp_at_c<TypeList, I - 1>);
                if constexpr(Align)
                    offset = roundUpToMultiple(offset, alignof(mp_at_c<TypeList, I>));
                return offset;
            }
        }
    } // namespace internal

    /// The byte offset of an element in a record dimension if it would be a normal struct.
    /// \tparam RecordDim Record dimension tree.
    /// \tparam RecordCoord Record coordinate of an element in the record dimension tree.
    LLAMA_EXPORT
    template<typename RecordDim, typename RecordCoord, bool Align = false>
    inline constexpr std::size_t offsetOf
        = flatOffsetOf<FlatRecordDim<RecordDim>, flatRecordCoord<RecordDim, RecordCoord>, Align>;
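    // Usage sketch (illustrative addition, not from the original source; reuses the hypothetical Particle record
    // from above). With Align = false the fields are packed; with Align = true padding is inserted as in a normal
    // struct (for an all-float record both layouts coincide).
    //
    //     static_assert(llama::sizeOf<Particle> == 3 * sizeof(float));
    //     static_assert(llama::offsetOf<Particle, llama::RecordCoord<1>> == 2 * sizeof(float)); // Mass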
    namespace internal
    {
        // Such a class is also known as arrow_proxy: https://quuxplusone.github.io/blog/2019/02/06/arrow-proxy/
        template<typename T>
        struct IndirectValue
        {
            T value;

            LLAMA_FN_HOST_ACC_INLINE auto operator->() -> T* { return &value; }
            LLAMA_FN_HOST_ACC_INLINE auto operator->() const -> const T* { return &value; }
        };

        // TODO(bgruber): replace in C++20
        template<typename T>
        struct IsBoundedArray : std::false_type
        {
        };

        template<typename T, std::size_t N>
        struct IsBoundedArray<T[N]> : std::true_type
        {
        };
    } // namespace internal

    /// True if the T is a record dimension. That is, T is either a llama::Record or a bounded array.
    LLAMA_EXPORT
    template<typename T>
    inline constexpr bool isRecordDim = isRecord<T> || internal::IsBoundedArray<T>::value;

    namespace internal
    {
        template<typename Coord, typename T, template<typename, typename> typename TypeFunctor>
        struct TransformLeavesWithCoordImpl
        {
            using type = TypeFunctor<Coord, T>;
        };

        template<std::size_t... Is, typename... Fields, template<typename, typename> typename TypeFunctor>
        struct TransformLeavesWithCoordImpl<RecordCoord<Is...>, Record<Fields...>, TypeFunctor>
        {
            template<std::size_t... Js>
            static auto f(std::index_sequence<Js...>)
            {
                return Record<Field<
                    GetFieldTag<mp_at_c<mp_list<Fields...>, Js>>,
                    typename TransformLeavesWithCoordImpl<
                        RecordCoord<Is..., Js>,
                        GetFieldType<mp_at_c<mp_list<Fields...>, Js>>,
                        TypeFunctor>::type>...>{};
            }

            using type = decltype(f(std::index_sequence_for<Fields...>{}));
        };

        template<std::size_t... Is, typename Child, std::size_t N, template<typename, typename> typename TypeFunctor>
        struct TransformLeavesWithCoordImpl<RecordCoord<Is...>, Child[N], TypeFunctor>
        {
            template<std::size_t... Js>
            static void f(std::index_sequence<Js...>)
            {
                static_assert(
                    mp_same<typename TransformLeavesWithCoordImpl<RecordCoord<Is..., Js>, Child, TypeFunctor>::
                                type...>::value,
                    "Leaf transformations beneath an array node must return the same type");
            }

            using dummy = decltype(f(std::make_index_sequence<N>{}));
            using type = typename TransformLeavesWithCoordImpl<RecordCoord<Is..., 0>, Child, TypeFunctor>::type[N];
        };

        template<template<typename> typename F>
        struct MakePassSecond
        {
            template<typename A, typename B>
            using fn = F<B>;
        };
    } // namespace internal

    /// Creates a new record dimension where each new leaf field's type is the result of applying FieldTypeFunctor
    /// to the original leaf's \ref RecordCoord and field's type.
    LLAMA_EXPORT
    template<typename RecordDim, template<typename, typename> typename FieldTypeFunctor>
    using TransformLeavesWithCoord =
        typename internal::TransformLeavesWithCoordImpl<RecordCoord<>, RecordDim, FieldTypeFunctor>::type;

    /// Creates a new record dimension where each new leaf field's type is the result of applying FieldTypeFunctor
    /// to the original leaf field's type.
    LLAMA_EXPORT
    template<typename RecordDim, template<typename> typename FieldTypeFunctor>
    using TransformLeaves
        = TransformLeavesWithCoord<RecordDim, internal::MakePassSecond<FieldTypeFunctor>::template fn>;

    namespace internal
    {
        // TODO(bgruber): we might implement this better by expanding a record dim into a list of tag lists and
        // then computing a real set union of the two tag list lists

        template<typename A, typename B>
        auto mergeRecordDimsImpl(mp_identity<A> a, mp_identity<B>)
        {
            static_assert(std::is_same_v<A, B>, "Cannot merge record and non-record or fields with different types");
            return a;
        }

        template<typename A, std::size_t NA, typename B, std::size_t NB>
        auto mergeRecordDimsImpl([[maybe_unused]] mp_identity<A[NA]> a, [[maybe_unused]] mp_identity<B[NB]> b)
        {
            static_assert(std::is_same_v<A, B>, "Cannot merge arrays of different type");
            if constexpr(NA < NB)
                return b;
            else
                return a;
        }

        template<typename... FieldsA>
        auto mergeRecordDimsImpl(mp_identity<Record<FieldsA...>> a, mp_identity<Record<>>)
        {
            return a;
        }

        template<
            typename... FieldsA,
            typename FieldB,
            typename... FieldsB,
            auto Pos = FindFieldByTag<mp_list<FieldsA...>, GetFieldTag<FieldB>>::value>
        auto mergeRecordDimsImpl(mp_identity<Record<FieldsA...>>, mp_identity<Record<FieldB, FieldsB...>>)
        {
            if constexpr(Pos == sizeof...(FieldsA))
            {
                return mergeRecordDimsImpl(
                    mp_identity<Record<FieldsA..., FieldB>>{},
                    mp_identity<Record<FieldsB...>>{});
            }
            else
            {
                using OldFieldA = mp_at_c<mp_list<FieldsA...>, Pos>;
                using NewFieldA = Field<
                    GetFieldTag<OldFieldA>,
                    typename decltype(mergeRecordDimsImpl(
                        mp_identity<GetFieldType<OldFieldA>>{},
                        mp_identity<GetFieldType<FieldB>>{}))::type>;
                using NewRecordA = mp_replace_at_c<Record<FieldsA...>, Pos, NewFieldA>;
                return mergeRecordDimsImpl(mp_identity<NewRecordA>{}, mp_identity<Record<FieldsB...>>{});
            }
        }
    } // namespace internal

    /// Creates a merged record dimension, where duplicated, nested fields are unified.
    LLAMA_EXPORT
    template<typename RecordDimA, typename RecordDimB>
    using MergedRecordDims =
        typename decltype(internal::mergeRecordDimsImpl(mp_identity<RecordDimA>{}, mp_identity<RecordDimB>{}))::type;

    /// Alias for ToT, adding `const` if FromT is const qualified.
    LLAMA_EXPORT
    template<typename FromT, typename ToT>
    using CopyConst = std::conditional_t<std::is_const_v<FromT>, const ToT, ToT>;
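    // Usage sketch (illustrative addition, not from the original source; reuses the hypothetical Particle record
    // from above): TransformLeaves rewrites every leaf type while keeping the tree structure.
    //
    //     template<typename T>
    //     using ToDouble = double; // hypothetical unary type functor
    //
    //     using ParticleD = llama::TransformLeaves<Particle, ToDouble>; // all leaves become double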
    /// Used as template argument to specify a constant/compile-time value.
    LLAMA_EXPORT
    template<auto V>
    using Constant = std::integral_constant<decltype(V), V>;

    namespace internal
    {
        template<typename T>
        struct IsConstant : std::false_type
        {
        };

        template<typename T, T V>
        struct IsConstant<std::integral_constant<T, V>> : std::true_type
        {
        };
    } // namespace internal

    LLAMA_EXPORT
    template<typename T>
    inline constexpr bool isConstant = internal::IsConstant<T>::value;

    namespace internal
    {
        /// Holds a value of type T. Is useful as a base class. Is specialized for llama::Constant to not store the
        /// value at runtime.
        /// \tparam T Type of value to store.
        /// \tparam I Is used to disambiguate multiple BoxedValue base classes.
        template<typename T, int I = 0>
        struct BoxedValue
        {
            BoxedValue() = default;

            // we don't make this ctor explicit so a Value appearing in a ctor list can just be created by passing
            // a T
            // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
            LLAMA_FN_HOST_ACC_INLINE BoxedValue(T value) : val(value)
            {
            }

            LLAMA_FN_HOST_ACC_INLINE constexpr auto value() const { return val; }

        private:
            T val = {};
        };

        template<auto V, int I>
        struct BoxedValue<Constant<V>, I>
        {
            BoxedValue() = default;

            // we don't make this ctor explicit so a Value appearing in a ctor list can just be created by passing
            // a T
            // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
            LLAMA_FN_HOST_ACC_INLINE BoxedValue(Constant<V>)
            {
            }

            LLAMA_FN_HOST_ACC_INLINE static constexpr auto value() { return V; }
        };
    } // namespace internal

    LLAMA_EXPORT
    struct PrettySize
    {
        double size;
        const char* unit;
    };

    /// Repeatedly divides the given size (in bytes) until it fits below 1000. Returns the new size and a string
    /// literal with the corresponding unit.
    LLAMA_EXPORT
    inline auto prettySize(double size) -> PrettySize
    {
        static const char* unit[] = {"B ", "KB", "MB", "GB", "TB", "PB", "EB"};
        unsigned unitIndex = 0;
        while(size > 1000.0)
        {
            size /= 1000.0;
            unitIndex++;
        }
        assert(unitIndex < std::size(unit));
        return {size, unit[unitIndex]};
    }
} // namespace llama
// ==
// == ./include/llama/Core.hpp ==
// ============================================================================
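// Usage sketch (illustrative addition, not from the original source):
//
//     const auto [size, unit] = llama::prettySize(123456789.0);
//     // size == 123.456789, unit == "MB"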
// #include "macros.hpp" // amalgamate: file already inlined

#include <algorithm>
#include <iterator>
// #include <type_traits> // amalgamate: file already included
#if CAN_USE_RANGES
#    include <ranges>
#endif

namespace llama
{
    /// Iterator supporting \ref ArrayIndexRange.
    LLAMA_EXPORT
    template<typename ArrayExtents>
    struct ArrayIndexIterator
    {
        static_assert(!std::is_const_v<ArrayExtents>);

        using value_type = typename ArrayExtents::Index;
        using difference_type = std::ptrdiff_t;
        using reference = value_type;
        using pointer = internal::IndirectValue<value_type>;
        using iterator_category = std::random_access_iterator_tag;

        static constexpr std::size_t rank = ArrayExtents::rank;

        constexpr ArrayIndexIterator() noexcept = default;

        LLAMA_FN_HOST_ACC_INLINE constexpr ArrayIndexIterator(ArrayExtents extents, value_type current) noexcept
            : extents(extents)
            , current(current)
        {
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator*() const noexcept -> value_type { return current; }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator->() const noexcept -> pointer { return {**this}; }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator++() noexcept -> ArrayIndexIterator&
        {
            current[rank - 1]++;
            for(auto i = static_cast<int>(rank) - 2; i >= 0; i--)
            {
                if(current[i + 1] != extents[i + 1])
                    return *this;
                current[i + 1] = 0;
                current[i]++;
            }
            return *this;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator++(int) noexcept -> ArrayIndexIterator
        {
            auto tmp = *this;
            ++*this;
            return tmp;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator--() noexcept -> ArrayIndexIterator&
        {
            current[rank - 1]--;
            for(auto i = static_cast<int>(rank) - 2; i >= 0; i--)
            {
                // return if no underflow
                if(current[i + 1] != static_cast<typename ArrayExtents::value_type>(-1))
                    return *this;
                current[i + 1] = extents[i + 1] - 1;
                current[i]--;
            }
            // decrementing beyond [0, 0, ..., 0] is UB
            return *this;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator--(int) noexcept -> ArrayIndexIterator
        {
            auto tmp = *this;
            --*this;
            return tmp;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](difference_type i) const noexcept -> reference
        {
            return *(*this + i);
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator+=(difference_type n) noexcept -> ArrayIndexIterator&
        {
            // add n to all lower dimensions with carry
            for(auto i = static_cast<int>(rank) - 1; i > 0 && n != 0; i--)
            {
                n += static_cast<difference_type>(current[i]);
                const auto s = static_cast<difference_type>(extents[i]);
                auto mod = n % s;
                n /= s;
                if(mod < 0)
                {
                    mod += s;
                    n--;
                }
                current[i] = mod;
                assert(current[i] < extents[i]);
            }

            current[0] = static_cast<difference_type>(current[0]) + n;
            // current is either within bounds or at the end ([last + 1, 0, 0, ..., 0])
            assert(
                (current[0] < extents[0]
                 || (current[0] == extents[0]
                     && std::all_of(std::begin(current) + 1, std::end(current), [](auto c) { return c == 0; })))
                && "Iterator was moved past the end");

            return *this;
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator+(ArrayIndexIterator it, difference_type n) noexcept
            -> ArrayIndexIterator
        {
            it += n;
            return it;
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator+(difference_type n, ArrayIndexIterator it) noexcept
            -> ArrayIndexIterator
        {
            return it + n;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator-=(difference_type n) noexcept -> ArrayIndexIterator&
        {
            return operator+=(-n);
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator-(ArrayIndexIterator it, difference_type n) noexcept
            -> ArrayIndexIterator
        {
            it -= n;
            return it;
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator-(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> difference_type
        {
            assert(a.extents == b.extents);

            difference_type n = a.current[rank - 1] - b.current[rank - 1];
            difference_type size = a.extents[rank - 1];
            for(auto i = static_cast<int>(rank) - 2; i >= 0; i--)
            {
                n += (a.current[i] - b.current[i]) * size;
                size *= a.extents[i];
            }

            return n;
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator==(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> bool
        {
            assert(a.extents == b.extents);
            return a.current == b.current;
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator!=(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> bool
        {
            return !(a == b);
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator<(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> bool
        {
            assert(a.extents == b.extents);
#ifdef __NVCC__
            // from: https://en.cppreference.com/w/cpp/algorithm/lexicographical_compare
            auto first1 = std::begin(a.current);
            auto last1 = std::end(a.current);
            auto first2 = std::begin(b.current);
            auto last2 = std::end(b.current);
            for(; (first1 != last1) && (first2 != last2); ++first1, (void) ++first2)
            {
                if(*first1 < *first2)
                    return true;
                if(*first2 < *first1)
                    return false;
            }
            return (first1 == last1) && (first2 != last2);
#else
            return std::lexicographical_compare(
                std::begin(a.current),
                std::end(a.current),
                std::begin(b.current),
                std::end(b.current));
#endif
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator>(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> bool
        {
            return b < a;
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator<=(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> bool
        {
            return !(a > b);
        }

        LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator>=(
            const ArrayIndexIterator& a,
            const ArrayIndexIterator& b) noexcept -> bool
        {
            return !(a < b);
        }

    private:
        ArrayExtents extents; // TODO(bgruber): we only need to store rank - 1 sizes
        value_type current;
    };

    /// Range allowing to iterate over all indices in an \ref ArrayExtents.
    LLAMA_EXPORT
    template<typename ArrayExtents>
    struct ArrayIndexRange
        : private ArrayExtents
#if CAN_USE_RANGES
        , std::ranges::view_base
#endif
    {
        static_assert(!std::is_const_v<ArrayExtents>);

        constexpr ArrayIndexRange() noexcept = default;

        LLAMA_FN_HOST_ACC_INLINE
        constexpr explicit ArrayIndexRange(ArrayExtents extents) noexcept : ArrayExtents(extents)
        {
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const noexcept -> ArrayIndexIterator<ArrayExtents>
        {
            return {*this, typename ArrayExtents::Index{}};
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const noexcept -> ArrayIndexIterator<ArrayExtents>
        {
            auto endPos = typename ArrayExtents::Index{};
            endPos[0] = this->toArray()[0];
            return {*this, endPos};
        }
    };
} // namespace llama
// ==
// == ./include/llama/ArrayIndexRange.hpp ==
// ============================================================================
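// Usage sketch (illustrative addition, not from the original source): ArrayIndexRange makes the whole index space
// of an ArrayExtents iterable with a range-based for loop.
//
//     llama::ArrayExtentsDynamic<std::size_t, 2> extents{3, 4};
//     for(auto ai : llama::ArrayIndexRange{extents})
//         ; // visits {0,0}, {0,1}, ..., {2,3}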
const ArrayIndexIterator& b) noexcept -> bool { assert(a.extents == b.extents); return a.current == b.current; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator!=( const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool { return !(a == b); } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator<(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool { assert(a.extents == b.extents); #ifdef __NVCC__ // from: https://en.cppreference.com/w/cpp/algorithm/lexicographical_compare auto first1 = std::begin(a.current); auto last1 = std::end(a.current); auto first2 = std::begin(b.current); auto last2 = std::end(b.current); for(; (first1 != last1) && (first2 != last2); ++first1, (void) ++first2) { if(*first1 < *first2) return true; if(*first2 < *first1) return false; } return (first1 == last1) && (first2 != last2); #else return std::lexicographical_compare( std::begin(a.current), std::end(a.current), std::begin(b.current), std::end(b.current)); #endif } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator>(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool { return b < a; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator<=(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool { return !(a > b); } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator>=(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool { return !(a < b); } private: ArrayExtents extents; // TODO(bgruber): we only need to store rank - 1 sizes value_type current; }; /// Range allowing to iterate over all indices in an \ref ArrayExtents. LLAMA_EXPORT template struct ArrayIndexRange : private ArrayExtents #if CAN_USE_RANGES , std::ranges::view_base #endif { static_assert(!std::is_const_v); constexpr ArrayIndexRange() noexcept = default; LLAMA_FN_HOST_ACC_INLINE constexpr explicit ArrayIndexRange(ArrayExtents extents) noexcept : ArrayExtents(extents) { } LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const noexcept -> ArrayIndexIterator { return {*this, typename ArrayExtents::Index{}}; } LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const noexcept -> ArrayIndexIterator { auto endPos = typename ArrayExtents::Index{}; endPos[0] = this->toArray()[0]; return {*this, endPos}; } }; } // namespace llama // == // == ./include/llama/ArrayIndexRange.hpp == // ============================================================================ // # include "Core.hpp" // amalgamate: file already inlined // ============================================================================ // == ./include/llama/StructName.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Core.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included #include namespace llama { namespace internal { // TODO(bgruber): just use std::copy which became constexpr in C++20 template constexpr auto constexprCopy(In f, In l, Out d) -> Out { while(f != l) *d++ = *f++; return d; } // TODO(bgruber): just use std::search which became constexpr in C++20 // from: https://en.cppreference.com/w/cpp/algorithm/search template constexpr auto constexprSearch(ForwardIt1 first, ForwardIt1 last, ForwardIt2 sFirst, ForwardIt2 sLast) -> ForwardIt1 { while(true) { ForwardIt1 it = first; for(ForwardIt2 sIt = sFirst;; ++it, ++sIt) { if(sIt == sLast) return first; if(it == last) return last; if(!(*it == *sIt)) break; } ++first; } } // TODO(bgruber): just use std::remove_copy which became 
constexpr in C++20 // from: https://en.cppreference.com/w/cpp/algorithm/remove_copy template constexpr auto constexprRemoveCopy(InputIt first, InputIt last, OutputIt d_first, const T& value) -> OutputIt { for(; first != last; ++first) { if(!(*first == value)) { *d_first++ = *first; } } return d_first; } // TODO(bgruber): just use std::count which became constexpr in C++20 // from: https://en.cppreference.com/w/cpp/algorithm/count template auto constexprCount(InputIt first, InputIt last, const T& value) -> typename std::iterator_traits::difference_type { typename std::iterator_traits::difference_type ret = 0; for(; first != last; ++first) { if(*first == value) { ret++; } } return ret; } template constexpr auto resizeArray(Array a) { Array r{}; constexprCopy(a.begin(), a.begin() + NewSize, r.begin()); return r; } template constexpr auto typeNameAsArray() { // adapted from Matthew Rodusek: // https://bitwizeshift.github.io/posts/2021/03/09/getting-an-unmangled-type-name-at-compile-time/ // // Boost Software License - Version 1.0 - August 17th, 2003 // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by // this license (the "Software") to use, reproduce, display, distribute, // execute, and transmit the Software, and to prepare derivative works of the // Software, and to permit third-parties to whom the Software is furnished to // do so, all subject to the following: // // The copyright notices in the Software and this entire statement, including // the above license grant, this restriction and the following disclaimer, // must be included in all copies of the Software, in whole or in part, and // all derivative works of the Software, unless such copies or derivative // works are solely in the form of machine-executable object code generated by // a source language processor. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. 
#if defined(__clang__) constexpr auto prefix = std::string_view{"[T = "}; constexpr auto suffix = std::string_view{"]"}; constexpr auto function = std::string_view{__PRETTY_FUNCTION__}; #elif defined(__GNUC__) constexpr auto prefix = std::string_view{"with T = "}; constexpr auto suffix = std::string_view{"]"}; constexpr auto function = std::string_view{__PRETTY_FUNCTION__}; #elif defined(_MSC_VER) constexpr auto prefix = std::string_view{"typeNameAsArray<"}; constexpr auto suffix = std::string_view{">(void)"}; constexpr auto function = std::string_view{__FUNCSIG__}; #else # warning Unsupported compiler constexpr auto prefix = std::string_view{}; constexpr auto suffix = std::string_view{}; constexpr auto function = std::string_view{}; #endif constexpr auto start = function.find(prefix) + prefix.size(); constexpr auto end = function.rfind(suffix); static_assert(start <= end); constexpr auto name = function.substr(start, (end - start)); constexpr auto arrAndSize = [&]() constexpr { Array nameArray{}; constexprCopy(name.begin(), name.end(), nameArray.begin()); #ifdef _MSC_VER // MSVC 19.32 runs into a syntax error if we just capture nameArray. Passing it as argument is a // workaround. Applies to the following lambdas. // strip "struct " and "class ". auto removeAllOccurences = [](auto& nameArray, std::size_t size, std::string_view str) constexpr { auto e = nameArray.begin() + size; while(true) { auto it = constexprSearch(nameArray.begin(), e, str.begin(), str.end()); if(it == e) break; constexprCopy(it + str.size(), e, it); e -= str.size(); } return e - nameArray.begin(); }; auto size = removeAllOccurences(nameArray, nameArray.size(), std::string_view{"struct "}); size = removeAllOccurences(nameArray, size, std::string_view{"class "}); #else auto size = nameArray.size(); #endif if(size > 3) { // remove spaces between closing template angle brackets and after commas auto e = nameArray.begin() + size; for(auto b = nameArray.begin(); b < e - 2; b++) { if((b[0] == '>' && b[1] == ' ' && b[2] == '>') || (b[0] == ',' && b[1] == ' ')) { constexprCopy(b + 2, e, b + 1); e--; } } size = e - nameArray.begin(); } return std::pair{nameArray, size}; }(); return resizeArray(arrAndSize.first); } template inline constexpr auto typeNameStorage = typeNameAsArray(); } // namespace internal LLAMA_EXPORT template inline constexpr auto qualifiedTypeName = [] { constexpr auto& value = internal::typeNameStorage; return std::string_view{value.data(), value.size()}; }(); namespace internal { constexpr auto isIdentChar(char c) -> bool { if(c >= 'A' && c <= 'Z') return true; if(c >= 'a' && c <= 'z') return true; if(c >= '0' && c <= '9') return true; if(c == '_') return true; return false; } template inline constexpr auto structNameStorage = []() constexpr { // strip namespace qualifiers before type names constexpr auto arrAndSize = []() constexpr { auto s = internal::typeNameStorage; auto b = s.begin(); auto e = s.end(); #if defined(__clang__) constexpr auto anonNs = std::string_view{"(anonymous namespace)::"}; #elif defined(__NVCOMPILER) constexpr auto anonNs = std::string_view{"::"}; #elif defined(__GNUG__) constexpr auto anonNs = std::string_view{"{anonymous}::"}; #elif defined(_MSC_VER) constexpr auto anonNs = std::string_view{"`anonymous-namespace'::"}; #else constexpr auto anonNs = std::string_view{"@"}; // just anything we won't find #endif std::size_t pos = 0; while((pos = std::string_view(b, e - b).find(anonNs)) != std::string::npos) { constexprCopy(b + pos + anonNs.size(), e, b + pos); e -= anonNs.size(); } 
while(true) { // find iterator to after "::" auto l = b; while(l + 1 < e && !(l[0] == ':' && l[1] == ':')) l++; if(l + 1 == e) break; l += 2; // find iterator to first identifier char before "::" auto f = l - 3; // start at first char before "::" while(s.begin() < f && isIdentChar(f[-1])) f--; // cut out [f:l[ constexprCopy(l, e, f); e -= (l - f); b = f; } return std::pair{s, e - s.begin()}; }(); return resizeArray(arrAndSize.first); }(); } // namespace internal LLAMA_EXPORT template constexpr auto structName(T = {}) -> std::string_view { constexpr auto& value = internal::structNameStorage; return std::string_view{&value[0], value.size()}; } namespace internal { constexpr auto intToStrSize(std::size_t s) { std::size_t len = 1; while(s >= 10) { len++; s /= 10; } return len; } template LLAMA_ACC inline constexpr auto recordCoordTagsStorage = []() constexpr { using Tags = GetTags>; // precompute char array size constexpr auto size = [&]() constexpr { std::size_t s = 0; mp_for_each_inline( [&](auto tag) { using Tag = decltype(tag); if constexpr(isRecordCoord) { // handle array indices static_assert(Tag::size == 1); s += 2; // for the '[' and ']' s += intToStrSize(Tag::front); } else { if(s != 0) s++; // for the '.'s s += structName(tag).size(); } }); return s; }(); llama::Array a{}; auto it = a.begin(); mp_for_each_inline( [&](auto tag) constexpr { using Tag = decltype(tag); if constexpr(isRecordCoord) { auto n = Tag::front; *it = '['; it++; it += intToStrSize(n); auto it2 = it; // take copy because we write number backward do // NOLINT(cppcoreguidelines-avoid-do-while) { it2--; *it2 = '0' + n % 10; n /= 10; } while(n != 0); *it = ']'; it++; } else { if(it != a.begin()) { *it = '.'; it++; } constexpr auto sn = structName(Tag{}); constexprCopy(sn.begin(), sn.end(), it); it += sn.size(); } }); if(!a.empty() && a.back() == 0) throw std::logic_error{"Implementation error: Array should have been completely overwritten."}; return a; }(); } // namespace internal /// Returns a pretty representation of the record coordinate inside the given record dimension. Tags are /// interspersed by '.' and arrays are represented using subscript notation ("[123]"). 
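/// For example, given an illustrative record dimension (not defined in this header):
/// \code{.cpp}
///     struct X{}; struct Pos{};
///     using Particle = llama::Record<llama::Field<Pos, llama::Record<llama::Field<X, float>>>>;
///     static_assert(llama::prettyRecordCoord<Particle, llama::RecordCoord<0, 0>>() == "Pos.X");
/// \endcode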
LLAMA_EXPORT template constexpr auto prettyRecordCoord(RecordCoord = {}) -> std::string_view { constexpr auto& value = internal::recordCoordTagsStorage; return std::string_view{value.data(), value.size()}; } LLAMA_EXPORT template constexpr auto prettyRecordCoord(RecordCoord<>) -> std::string_view { return {}; } } // namespace llama // == // == ./include/llama/StructName.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/View.hpp == // == // Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // ============================================================================ // == ./include/llama/Accessors.hpp == // == // Copyright 2023 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // ============================================================================ // == ./include/llama/Concepts.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Array.hpp" // amalgamate: file already inlined // #include "Core.hpp" // amalgamate: file already inlined // #include "RecordCoord.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included #if __has_include() # include #endif namespace llama { #ifdef __cpp_lib_concepts LLAMA_EXPORT template concept isConstexpr = requires { std::integral_constant{}; }; LLAMA_EXPORT template concept Mapping = requires(M m) { typename M::ArrayExtents; typename M::RecordDim; { m.extents() } -> std::same_as; { +M::blobCount } -> std::same_as; requires isConstexpr; { m.blobSize(typename M::ArrayExtents::value_type{}) } -> std::same_as; }; LLAMA_EXPORT template concept PhysicalField = requires(M m, typename M::ArrayExtents::Index ai) { { m.blobNrAndOffset(ai, RC{}) } -> std::same_as>; }; template struct MakeIsPhysical { template using fn = mp_bool>; }; LLAMA_EXPORT template inline constexpr bool allFieldsArePhysical = mp_all_of, MakeIsPhysical::template fn>::value; LLAMA_EXPORT template concept PhysicalMapping = Mapping && allFieldsArePhysical; LLAMA_EXPORT template concept LValueReference = std::is_lvalue_reference_v; LLAMA_EXPORT template concept AdlTwoStepSwappable = requires(R a, R b) { swap(a, b); } || requires(R a, R b) { std::swap(a, b); }; LLAMA_EXPORT template concept ProxyReference = std::is_copy_constructible_v && std::is_copy_assignable_v && requires(R r) { typename R::value_type; { static_cast(r) } -> std::same_as; { r = std::declval() } -> std::same_as; } && AdlTwoStepSwappable; LLAMA_EXPORT template concept AnyReference = LValueReference || ProxyReference; LLAMA_EXPORT template concept AnyReferenceTo = (LValueReference && std::is_same_v, T>) || (ProxyReference && std::is_same_v); LLAMA_EXPORT template concept ComputedField = M::isComputed(RC{}) && requires(M m, typename M::ArrayExtents::Index ai, std::byte** blobs) { { m.compute(ai, RC{}, blobs) } -> AnyReferenceTo>; }; template struct MakeIsComputed { template using fn = mp_bool>; }; LLAMA_EXPORT template inline constexpr bool allFieldsAreComputed = mp_all_of, MakeIsComputed::template fn>::value; LLAMA_EXPORT template concept FullyComputedMapping = Mapping && allFieldsAreComputed; LLAMA_EXPORT template< typename M, typename LeafCoords = LeafRecordCoords, std::size_t PhysicalCount = mp_count_if::template fn>::value, std::size_t ComputedCount = mp_count_if::template fn>::value> inline constexpr 
bool allFieldsArePhysicalOrComputed = (PhysicalCount + ComputedCount) >= mp_size::value && PhysicalCount > 0 && ComputedCount > 0; // == instead of >= would be better, but it's not easy to count correctly, // because we cannot check whether the call to blobNrOrOffset() // or compute() is actually valid LLAMA_EXPORT template concept PartiallyComputedMapping = Mapping && allFieldsArePhysicalOrComputed; /// Additional semantic requirement: &b[i] + j == &b[i + j] for any integral i and j in range of the blob LLAMA_EXPORT template concept Blob = requires(B b, std::size_t i) { // according to http://eel.is/c++draft/intro.object#3 only std::byte and unsigned char can // provide storage for // other types requires std::is_lvalue_reference_v; requires std::same_as, std::byte> || std::same_as, unsigned char>; }; LLAMA_EXPORT template concept BlobAllocator = requires(BA ba, std::size_t size) { { ba(std::integral_constant{}, size) } -> Blob; }; LLAMA_EXPORT template concept AnyView = requires(V v, const V cv) { typename V::Mapping; typename V::BlobType; typename V::ArrayExtents; typename V::ArrayIndex; typename V::RecordDim; typename V::Accessor; typename V::iterator; typename V::const_iterator; { v.mapping() } -> std::same_as; { cv.mapping() } -> std::same_as; { v.accessor() } -> std::same_as; { cv.accessor() } -> std::same_as; { cv.extents() } -> std::same_as; { v.begin() } -> std::same_as; { cv.begin() } -> std::same_as; { v.end() } -> std::same_as; { cv.end() } -> std::same_as; { v.blobs() } -> std::same_as&>; { cv.blobs() } -> std::same_as&>; v(std::declval()); cv(std::declval()); v[std::declval()]; cv[std::declval()]; // operator(i0, i1, ...) // operator[i0, i1, ...] }; #endif namespace internal { template struct IsProxyReferenceImpl : std::false_type { }; template struct IsProxyReferenceImpl< R, std::void_t< typename R::value_type, decltype(static_cast(std::declval())), decltype(std::declval() = std::declval())>> : std::bool_constant && std::is_copy_assignable_v> { }; } // namespace internal LLAMA_EXPORT template #ifdef __cpp_lib_concepts inline constexpr bool isProxyReference = ProxyReference; #else inline constexpr bool isProxyReference = internal::IsProxyReferenceImpl::value; #endif } // namespace llama // == // == ./include/llama/Concepts.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/ProxyRefOpMixin.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "macros.hpp" // amalgamate: file already inlined namespace llama { /// CRTP mixin for proxy reference types to support all compound assignment and increment/decrement operators. 
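/// A minimal sketch of a conforming proxy reference (illustrative, not part of LLAMA):
/// \code{.cpp}
///     struct DoubleRef : llama::ProxyRefOpMixin<DoubleRef, double>
///     {
///         using value_type = double;
///         double* p;
///         operator double() const { return *p; }
///         auto operator=(double v) -> DoubleRef& { *p = v; return *this; }
///     };
///     // the mixin now provides +=, -=, *=, ++, -- etc. in terms of the conversion and assignment above
/// \endcode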
LLAMA_EXPORT template struct ProxyRefOpMixin { private: LLAMA_FN_HOST_ACC_INLINE constexpr auto derived() -> Derived& { return static_cast(*this); } // in principle, load() could be const, but we use it only from non-const functions LLAMA_FN_HOST_ACC_INLINE constexpr auto load() -> ValueType { return static_cast(derived()); } LLAMA_FN_HOST_ACC_INLINE constexpr void store(ValueType t) { derived() = std::move(t); } public: LLAMA_FN_HOST_ACC_INLINE constexpr auto operator+=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs += rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator-=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs -= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator*=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs *= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator/=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs /= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator%=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs %= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator<<=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs <<= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator>>=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs >>= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator&=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs &= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator|=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs |= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator^=(const ValueType& rhs) -> Derived& { ValueType lhs = load(); lhs ^= rhs; store(lhs); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator++() -> Derived& { ValueType v = load(); ++v; store(v); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator++(int) -> ValueType { ValueType v = load(); ValueType old = v++; store(v); return old; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator--() -> Derived& { ValueType v = load(); --v; store(v); return derived(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator--(int) -> ValueType { ValueType v = load(); ValueType old = v--; store(v); return old; } LLAMA_FN_HOST_ACC_INLINE friend constexpr void swap(Derived a, Derived b) noexcept { const auto va = static_cast(a); const auto vb = static_cast(b); a = vb; b = va; } }; } // namespace llama // == // == ./include/llama/ProxyRefOpMixin.hpp == // ============================================================================ // #include "macros.hpp" // amalgamate: file already inlined #include #include #include namespace llama::accessor { /// Default accessor. Passes through the given reference. LLAMA_EXPORT struct Default { template LLAMA_FN_HOST_ACC_INLINE auto operator()(Reference&& r) const -> Reference { return std::forward(r); } }; /// Allows only read access and returns values instead of references to memory. LLAMA_EXPORT struct ByValue { template LLAMA_FN_HOST_ACC_INLINE auto operator()(Reference&& r) const { using ValueType = std::decay_t; if constexpr(isProxyReference) return static_cast(r); else return ValueType{r}; } }; /// Allows only read access by qualifying the references to memory with const. 
LLAMA_EXPORT struct Const { // for l-value references template LLAMA_FN_HOST_ACC_INLINE auto operator()(T& r) const -> const T& { return r; } template // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) struct Reference : ProxyRefOpMixin, typename Ref::value_type> { private: Ref ref; public: using value_type = typename Ref::value_type; LLAMA_FN_HOST_ACC_INLINE constexpr explicit Reference(Ref ref) : ref{ref} { } Reference(const Reference&) = default; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(const Reference& other) -> Reference& { *this = static_cast(other); return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE operator value_type() const { return static_cast(ref); } template LLAMA_FN_HOST_ACC_INLINE auto operator=(T) -> Reference& { static_assert(sizeof(T) == 0, "You cannot write through a Const accessor"); return *this; } }; // for proxy references template, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto operator()(ProxyReference r) const { return Reference{std::move(r)}; } }; /// Qualifies references to memory with __restrict. Only works on l-value references. LLAMA_EXPORT struct Restrict { template LLAMA_FN_HOST_ACC_INLINE auto operator()(T& r) const -> T& __restrict { return r; } }; #ifdef __cpp_lib_atomic_ref /// Accessor wrapping a reference into a std::atomic_ref. Can only wrap l-value references. LLAMA_EXPORT struct Atomic { template LLAMA_FORCE_INLINE auto operator()(T& r) const -> std::atomic_ref { return std::atomic_ref{r}; } }; #endif /// Locks a mutex during each access to the data structure. LLAMA_EXPORT template struct Locked { // mutexes are usually not movable, so we put them on the heap, so the accessor is movable std::unique_ptr m = std::make_unique(); template // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) struct Reference : ProxyRefOpMixin, Value> { Ref ref; Mutex& m; using value_type = Value; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FORCE_INLINE constexpr auto operator=(const Reference& other) -> Reference& { const std::lock_guard lock(m); *this = static_cast(other); return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FORCE_INLINE operator value_type() const { const std::lock_guard lock(m); return static_cast(ref); } template LLAMA_FORCE_INLINE auto operator=(T t) -> Reference& { const std::lock_guard lock(m); ref = t; return *this; } }; template LLAMA_FORCE_INLINE auto operator()(PR r) const -> Reference { return {{}, r, *m}; } template LLAMA_FORCE_INLINE auto operator()(T& r) const -> Reference> { return {{}, r, *m}; } }; namespace internal { template struct StackedLeave : Accessor { }; } // namespace internal /// Accessor combining multiple other accessors. The contained accessors are applied in left to right order to the /// memory location when forming the reference returned from a view. LLAMA_EXPORT template struct Stacked : internal::StackedLeave<0, Default> { }; LLAMA_EXPORT template struct Stacked : internal::StackedLeave<1 + sizeof...(MoreAccessors), FirstAccessor> , Stacked { using First = internal::StackedLeave<1 + sizeof...(MoreAccessors), FirstAccessor>; using Rest = Stacked; LLAMA_FN_HOST_ACC_INLINE Stacked() = default; LLAMA_FN_HOST_ACC_INLINE explicit Stacked(FirstAccessor first, MoreAccessors... 
rest) : First{std::move(first)} , Rest{std::move(rest)...} { } template LLAMA_FN_HOST_ACC_INLINE auto operator()(Reference&& r) const -> decltype(auto) { return static_cast(*this)(static_cast(*this)(std::forward(r))); } }; } // namespace llama::accessor // == // == ./include/llama/Accessors.hpp == // ============================================================================ // #include "Array.hpp" // amalgamate: file already inlined // #include "ArrayIndexRange.hpp" // amalgamate: file already inlined // ============================================================================ // == ./include/llama/BlobAllocators.hpp == // == // Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Array.hpp" // amalgamate: file already inlined // #include "Concepts.hpp" // amalgamate: file already inlined // #include "macros.hpp" // amalgamate: file already inlined #include // #include // amalgamate: file already included #include #if __has_include() # include #endif #if __has_include() # include #endif namespace alpaka { template auto allocBuf(const TDev& dev, const TExtent& extent); // NOLINT(readability-redundant-declaration) } // namespace alpaka namespace llama::bloballoc { /// Allocates statically sized memory for a \ref View, which is copied each time a \ref View is copied. /// \tparam BytesToReserve the amount of memory to reserve. LLAMA_EXPORT template struct Array { template struct alignas(Alignment) AlignedArray : llama::Array { }; template LLAMA_FN_HOST_ACC_INLINE auto operator()( std::integral_constant, [[maybe_unused]] std::size_t count) const -> AlignedArray { assert(count == BytesToReserve); return {}; } }; #ifdef __cpp_lib_concepts static_assert(BlobAllocator>); #endif /// Allocates heap memory managed by a `std::unique_ptr` for a \ref View. This memory can only be uniquely owned by /// a single \ref View. LLAMA_EXPORT struct UniquePtr { template auto operator()(std::integral_constant, std::size_t count) const { auto* ptr = static_cast(::operator new[](count * sizeof(std::byte), std::align_val_t{Alignment})); auto deleter = [](std::byte* ptr) { ::operator delete[](ptr, std::align_val_t{Alignment}); }; return std::unique_ptr{ptr, deleter}; } }; #ifdef __cpp_lib_concepts static_assert(BlobAllocator); #endif /// Allocates heap memory managed by a `std::shared_ptr` for a \ref View. This memory is shared between all copies /// of a \ref View. LLAMA_EXPORT struct SharedPtr { template auto operator()(std::integral_constant, std::size_t count) const -> std::shared_ptr { auto* ptr = static_cast(::operator new[](count * sizeof(std::byte), std::align_val_t{Alignment})); auto deleter = [](std::byte* ptr) { ::operator delete[](ptr, std::align_val_t{Alignment}); }; return {ptr, deleter}; } }; #ifdef __cpp_lib_concepts static_assert(BlobAllocator); #endif /// An STL compatible allocator allowing to specify alignment. 
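    /// For example (illustrative), a byte buffer aligned to 64-byte cache lines:
    /// \code{.cpp}
    ///     std::vector<std::byte, llama::bloballoc::AlignedAllocator<std::byte, 64>> buffer(1024);
    /// \endcode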
    LLAMA_EXPORT
    template<typename T, std::size_t Alignment>
    struct AlignedAllocator
    {
        using value_type = T;

        inline AlignedAllocator() noexcept = default;

        template<typename T2>
        inline explicit AlignedAllocator(const AlignedAllocator<T2, Alignment>&) noexcept
        {
        }

        inline auto allocate(std::size_t n) -> T*
        {
            return static_cast<T*>(::operator new[](n * sizeof(T), std::align_val_t{Alignment}));
        }

        inline void deallocate(T* p, std::size_t)
        {
            ::operator delete[](p, std::align_val_t{Alignment});
        }

        template<typename T2>
        struct rebind // NOLINT(readability-identifier-naming)
        {
            using other = AlignedAllocator<T2, Alignment>;
        };

        auto operator!=(const AlignedAllocator& other) const -> bool
        {
            return !(*this == other);
        }

        auto operator==(const AlignedAllocator&) const -> bool
        {
            return true;
        }
    };

    /// Allocates heap memory managed by a `std::vector` for a \ref View, which is copied each time a \ref View is
    /// copied.
    LLAMA_EXPORT
    struct Vector
    {
        template<std::size_t Alignment>
        inline auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t count) const
        {
            return std::vector<std::byte, AlignedAllocator<std::byte, Alignment>>(count);
        }
    };
#ifdef __cpp_lib_concepts
    static_assert(BlobAllocator<Vector>);
#endif

#if __has_include(<cuda_runtime.h>)
    /// Allocates GPU device memory using cudaMalloc. The memory is managed by a std::unique_ptr with a deleter that
    /// calls cudaFree. If you want to use a view created with this allocator in a CUDA kernel, call \ref shallowCopy
    /// on the view before passing it to the kernel.
    LLAMA_EXPORT
    struct CudaMalloc
    {
        inline static const auto deleter = [](void* p)
        {
            if(const auto code = cudaFree(p); code != cudaSuccess)
                throw std::runtime_error(std::string{"cudaFree failed with code "} + cudaGetErrorString(code));
        };

        template<std::size_t FieldAlignment>
        inline auto operator()(std::integral_constant<std::size_t, FieldAlignment>, std::size_t count) const
        {
            std::byte* p = nullptr;
            if(const auto code = cudaMalloc(&p, count); code != cudaSuccess)
                throw std::runtime_error(std::string{"cudaMalloc failed with code "} + cudaGetErrorString(code));
            // parenthesized so the address is tested against the alignment mask, not against a bool
            if((reinterpret_cast<std::uintptr_t>(p) & (FieldAlignment - 1)) != 0u)
                throw std::runtime_error{"cudaMalloc does not align sufficiently"};
            return std::unique_ptr<std::byte[], decltype(deleter)>(p, deleter);
        }
    };
#endif

    /// Allocates alpaka buffers as blobs.
    LLAMA_EXPORT
    template<typename Size, typename Dev>
    struct AlpakaBuf
    {
        Dev& dev;

        template<std::size_t Alignment>
        inline auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t count) const
        {
            return alpaka::allocBuf<std::byte, Size>(dev, static_cast<Size>(count));
        }
    };

#if __has_include(<sycl/sycl.hpp>)
    /// Allocates shared USM memory using sycl::aligned_alloc_shared. The memory is managed by a std::unique_ptr with a
    /// deleter that calls sycl::free. If you want to use a view created with this allocator in a kernel, call \ref
    /// shallowCopy on the view before passing it to the kernel.
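    /// For example (illustrative; `queue` is assumed to be an existing sycl::queue):
    /// \code{.cpp}
    ///     auto view = llama::allocView(mapping, llama::bloballoc::SyclMallocShared{queue});
    /// \endcode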
    LLAMA_EXPORT
    struct SyclMallocShared
    {
        sycl::queue queue;

        static auto makeDeleter(sycl::queue q)
        {
            // create lambda in function independent of FieldAlignment template parameter to avoid different blob types
            return [q](void* p) { sycl::free(p, q); };
        }

        template<std::size_t FieldAlignment>
        inline auto operator()(std::integral_constant<std::size_t, FieldAlignment>, std::size_t count) const
        {
            std::byte* p = sycl::aligned_alloc_shared<std::byte>(FieldAlignment, count, queue);
            // parenthesized so the address is tested against the alignment mask, not against a bool
            if((reinterpret_cast<std::uintptr_t>(p) & (FieldAlignment - 1)) != 0u)
                throw std::runtime_error{"sycl::malloc_shared does not align sufficiently"};
            return std::unique_ptr<std::byte[], decltype(makeDeleter(queue))>(p, makeDeleter(queue));
        }
    };
#endif
} // namespace llama::bloballoc

// ==
// == ./include/llama/BlobAllocators.hpp ==
// ============================================================================

// #include "Concepts.hpp" // amalgamate: file already inlined
// #include "Core.hpp" // amalgamate: file already inlined
// #include "macros.hpp" // amalgamate: file already inlined

// ============================================================================
// == ./include/llama/mapping/One.hpp ==
// ==
// Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//

#pragma once
// #include "../Core.hpp" // amalgamate: file already inlined

// ============================================================================
// == ./include/llama/mapping/Common.hpp ==
// ==
// Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//

#pragma once
// #include "../Core.hpp" // amalgamate: file already inlined
// #include // amalgamate: file already included
#include <climits>
#ifndef __cpp_lib_atomic_ref
#    include <boost/atomic/atomic_ref.hpp>
#endif

namespace llama::mapping
{
    LLAMA_EXPORT
    template<typename TArrayExtents, typename TRecordDim>
    struct MappingBase : protected TArrayExtents
    {
        using ArrayExtents = TArrayExtents;
        using RecordDim = TRecordDim;

    protected:
        using ArrayIndex = typename ArrayExtents::Index;
        using size_type = typename ArrayExtents::value_type;

    public:
        constexpr MappingBase() = default;

        LLAMA_FN_HOST_ACC_INLINE constexpr explicit MappingBase(ArrayExtents extents, RecordDim = {})
            : ArrayExtents(extents)
        {
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
        {
            return static_cast<const ArrayExtents&>(*this);
        }
    };

    /// Functor that maps an \ref ArrayIndex into linear numbers, where the fast moving index should be the rightmost
    /// one, which models how C++ arrays work and is analogous to mdspan's layout_right. E.g. ArrayIndex<3> a; stores 3
    /// indices where a[2] should be incremented in the innermost loop.
    LLAMA_EXPORT
    struct LinearizeArrayIndexRight
    {
        template<typename ArrayExtents>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) ->
            typename ArrayExtents::value_type
        {
            return product(extents);
        }

        /// \param ai Index in the array dimensions.
        /// \param extents Total size of the array dimensions.
        /// \return Linearized index.
        template<typename ArrayExtents>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()(
            const typename ArrayExtents::Index& ai,
            const ArrayExtents& extents) const -> typename ArrayExtents::value_type
        {
            if constexpr(ArrayExtents::rank == 0)
                return 0;
            else
            {
                auto address = ai[0];
                for(int i = 1; i < static_cast<int>(ArrayExtents::rank); i++)
                {
                    address *= extents[i];
                    address += ai[i];
                }
                return address;
            }
        }
    };

    LLAMA_EXPORT
    using LinearizeArrayIndexCpp = LinearizeArrayIndexRight;

    /// Functor that maps an \ref ArrayIndex into linear numbers the way Fortran arrays work. The fast moving index of
    /// the ArrayIndex object should be the first one. E.g. ArrayIndex<3> a; stores 3 indices where a[0] should be
    /// incremented in the innermost loop.
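    /// For example (illustrative): with extents {4, 3} and array index {2, 1}, the linearized index is
    /// 1 * 4 + 2 = 6, since ai[0] has stride 1:
    /// \code{.cpp}
    ///     llama::mapping::LinearizeArrayIndexLeft{}(llama::ArrayIndex{2, 1}, llama::ArrayExtents<int, 4, 3>{}); // 6
    /// \endcode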
LLAMA_EXPORT struct LinearizeArrayIndexLeft { template LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) -> typename ArrayExtents::value_type { return product(extents); } /// \param ai Index in the array dimensions. /// \param extents Total size of the array dimensions. /// \return Linearized index. template LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()( const typename ArrayExtents::Index& ai, const ArrayExtents& extents) const -> typename ArrayExtents::value_type { if constexpr(ArrayExtents::rank == 0) return 0; else { auto address = ai[ArrayExtents::rank - 1]; for(int i = static_cast(ArrayExtents::rank) - 2; i >= 0; i--) { address *= extents[i]; address += ai[i]; } return address; } } }; LLAMA_EXPORT using LinearizeArrayIndexFortran = LinearizeArrayIndexLeft; /// Functor that maps an \ref ArrayIndex into linear numbers using the Z-order space filling curve (Morton codes). LLAMA_EXPORT struct LinearizeArrayIndexMorton { template LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) const -> typename ArrayExtents::value_type { if constexpr(ArrayExtents::rank == 0) return 0; else { auto longest = extents[0]; for(int i = 1; i < static_cast(ArrayExtents::rank); i++) longest = std::max(longest, extents[i]); const auto longestPO2 = bitCeil(longest); return intPow(longestPO2, static_cast(ArrayExtents::rank)); } } /// \param ai Coordinate in the array dimensions. /// \param extents Total size of the array dimensions. /// \return Linearized index. template LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()( const typename ArrayExtents::Index& ai, [[maybe_unused]] const ArrayExtents& extents) const -> typename ArrayExtents::value_type { using size_type = typename ArrayExtents::value_type; constexpr auto rank = static_cast(ArrayExtents::rank); size_type r = 0; for(size_type bit = 0; bit < (static_cast(sizeof(size_type)) * CHAR_BIT) / rank; bit++) for(size_type i = 0; i < rank; i++) r |= (ai[i] & (size_type{1} << bit)) << ((bit + 1) * (rank - 1) - i); return r; } private: template LLAMA_FN_HOST_ACC_INLINE static constexpr auto bitCeil(T n) -> T { T r = 1u; while(r < n) r <<= 1u; return r; } template LLAMA_FN_HOST_ACC_INLINE static constexpr auto intPow(T b, T e) -> T { e--; auto r = b; while(e != 0u) { r *= b; e--; } return r; } }; /// Retains the order of the record dimension's fields. LLAMA_EXPORT template struct PermuteFieldsInOrder { using FlatRecordDim = TFlatRecordDim; template static constexpr std::size_t permute = FlatRecordCoord; }; /// Sorts the record dimension's the fields according to a given predicate on the field types. /// @tparam Less A binary predicate accepting two field types, which exposes a member value. Value must be true if /// the first field type is less than the second one, otherwise false. 
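/// For example (illustrative), a predicate ordering fields by increasing size:
/// \code{.cpp}
///     template<typename A, typename B>
///     using LessSize = std::bool_constant<(sizeof(A) < sizeof(B))>;
/// \endcode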
LLAMA_EXPORT template typename Less> struct PermuteFieldsSorted { private: using FlatSortedRecordDim = mp_sort; template using LessWithIndices = Less, mp_at>; // A permutation from new FlatSortedRecordDim index to old FlatOrigRecordDim index using PermutedIndices = mp_sort>, LessWithIndices>; template using LessInvertPermutation = std::bool_constant<(mp_at::value < mp_at::value)>; // A permutation from old FlatOrigRecordDim index to new FlatSortedRecordDim index using InversePermutedIndices = mp_sort>, LessInvertPermutation>; public: using FlatRecordDim = FlatSortedRecordDim; template static constexpr std::size_t permute = mp_at_c::value; }; namespace internal { template using LessAlignment = std::bool_constant; template using MoreAlignment = std::bool_constant<(alignof(A) > alignof(B))>; } // namespace internal /// Sorts the record dimension fields by increasing alignment of its fields. LLAMA_EXPORT template using PermuteFieldsIncreasingAlignment = PermuteFieldsSorted; /// Sorts the record dimension fields by decreasing alignment of its fields. LLAMA_EXPORT template using PermuteFieldsDecreasingAlignment = PermuteFieldsSorted; /// Sorts the record dimension fields by the alignment of its fields to minimize padding. LLAMA_EXPORT template using PermuteFieldsMinimizePadding = PermuteFieldsIncreasingAlignment; namespace internal { template struct S; template LLAMA_FN_HOST_ACC_INLINE void atomicInc(CountType& i) { #ifdef __CUDA_ARCH__ // if you get an error here that there is no overload of atomicAdd, your CMAKE_CUDA_ARCHITECTURE might be // too low or you need to use a smaller CountType for the FieldAccessCount or Heatmap mapping. if constexpr(mp_contains, CountType>::value) atomicAdd(&i, CountType{1}); else if constexpr(sizeof(CountType) == sizeof(unsigned int)) atomicAdd(reinterpret_cast(&i), 1u); else if constexpr(sizeof(CountType) == sizeof(unsigned long long int)) atomicAdd(reinterpret_cast(&i), 1ull); else static_assert(sizeof(CountType) == 0, "There is no CUDA atomicAdd for your CountType"); #elif defined(__cpp_lib_atomic_ref) ++std::atomic_ref{i}; #else ++boost::atomic_ref{i}; #endif } } // namespace internal LLAMA_EXPORT enum class FieldAlignment { Pack, Align }; } // namespace llama::mapping // == // == ./include/llama/mapping/Common.hpp == // ============================================================================ namespace llama::mapping { /// Maps all array dimension indices to the same location and layouts struct members consecutively. This mapping is /// used for temporary, single element views. /// \tparam TFieldAlignment If Align, padding bytes are inserted to guarantee that struct members are properly /// aligned. If false, struct members are tightly packed. /// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref /// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and /// \ref PermuteFieldsMinimizePadding. 
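/// A \ref One mapping is what backs llama::One (defined further below in this header). E.g., assuming an
/// illustrative record dimension Particle:
/// \code{.cpp}
///     llama::One<Particle> p; // a single record in a stack byte array, fields permuted to minimize padding
/// \endcode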
LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, FieldAlignment TFieldAlignment = FieldAlignment::Align, template typename PermuteFields = PermuteFieldsMinimizePadding> struct One : MappingBase { private: using Base = MappingBase; using size_type = typename Base::size_type; public: inline static constexpr FieldAlignment fieldAlignment = TFieldAlignment; using Permuter = PermuteFields>; static constexpr std::size_t blobCount = 1; #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ >= 12 using Base::Base; #else constexpr One() = default; LLAMA_FN_HOST_ACC_INLINE constexpr explicit One(TArrayExtents extents, TRecordDim = {}) : Base(extents) { } #endif LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type) const -> size_type { return flatSizeOf< typename Permuter::FlatRecordDim, fieldAlignment == FieldAlignment::Align, false>; // no tail padding } template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset( typename Base::ArrayIndex, RecordCoord = {}) const -> NrAndOffset { constexpr std::size_t flatFieldIndex = #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6 *& // mess with nvcc compiler state to workaround bug #endif Permuter::template permute>>; constexpr auto offset = static_cast(flatOffsetOf< typename Permuter::FlatRecordDim, flatFieldIndex, fieldAlignment == FieldAlignment::Align>); return {size_type{0}, offset}; } }; /// One mapping preserving the alignment of the field types by inserting padding. /// \see One LLAMA_EXPORT template using AlignedOne = One; /// One mapping preserving the alignment of the field types by inserting padding and permuting the field order to /// minimize this padding. /// \see One LLAMA_EXPORT template using MinAlignedOne = One; /// One mapping packing the field types tightly, violating the types' alignment requirements. /// \see One LLAMA_EXPORT template using PackedOne = One; /// Binds parameters to a \ref One mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to to prepare this mapping for a meta mapping. LLAMA_EXPORT template< FieldAlignment FieldAlignment = FieldAlignment::Align, template typename PermuteFields = PermuteFieldsMinimizePadding> struct BindOne { template using fn = One; }; LLAMA_EXPORT template inline constexpr bool isOne = false; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, FieldAlignment FieldAlignment, template typename PermuteFields> inline constexpr bool isOne> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/One.hpp == // ============================================================================ // #include // amalgamate: file already included namespace llama { LLAMA_EXPORT #ifdef __cpp_lib_concepts template #else template #endif struct View; namespace internal { template using AllocatorBlobType = decltype(std::declval()(std::integral_constant>{}, 0)); template LLAMA_FN_HOST_ACC_INLINE auto makeBlobArray( const Allocator& alloc, const Mapping& mapping, std::integer_sequence) -> Array, Mapping::blobCount> { [[maybe_unused]] constexpr auto alignment = alignOf; // g++-12 warns that alignment is unused LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return {alloc(std::integral_constant{}, mapping.blobSize(Is))...}; LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) } // namespace internal /// Same as \ref allocView but does not run field constructors. 
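/// Useful when every field is overwritten before being read, e.g. (illustrative; `mapping` is assumed to exist):
/// \code{.cpp}
///     auto view = llama::allocViewUninitialized(mapping); // blobs allocated, no fields constructed
///     // ... assign every field before reading it, or call llama::constructFields(view)
/// \endcode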
LLAMA_EXPORT #ifdef __cpp_lib_concepts template #else template #endif LLAMA_FN_HOST_ACC_INLINE auto allocViewUninitialized( Mapping mapping = {}, const Allocator& alloc = {}, Accessor accessor = {}) { auto blobs = internal::makeBlobArray(alloc, mapping, std::make_index_sequence{}); return View, Accessor>{ std::move(mapping), std::move(blobs), std::move(accessor)}; } namespace internal { template struct IsComputed : std::false_type { }; template struct IsComputed> : std::bool_constant { }; } // namespace internal /// Returns true if the field accessed via the given mapping and record coordinate is a computed value. LLAMA_EXPORT template inline constexpr bool isComputed = internal::IsComputed::value; /// Returns true if any field accessed via the given mapping is a computed value. // TODO(bgruber): harmonize this with LLAMA's concepts from Concepts.hpp LLAMA_EXPORT template inline constexpr bool hasAnyComputedField = mp_any_of< LeafRecordCoords, mp_bind_front::template fn>::value; LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE void constructField( View& view, typename Mapping::ArrayExtents::Index ai, RecordCoord rc) { auto init = [](auto&& ref) LLAMA_LAMBDA_INLINE { using FieldType = GetType>; using RefType = decltype(ref); if constexpr(isProxyReference>) { ref = FieldType{}; } else if constexpr( std::is_lvalue_reference_v && !std::is_const_v>) { new(&ref) FieldType{}; } }; // this handles physical and computed mappings if constexpr(sizeof...(RCs) == 0) { init(view(ai)); } else { init(view(ai)(rc)); } } /// Value-initializes all fields reachable through the given view. That is, constructors are run and fundamental /// types are zero-initialized. Computed fields are constructed if they return l-value references and assigned a /// default constructed value if they return a proxy reference. LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE void constructFields(View& view) { using View = View; using RecordDim = typename View::RecordDim; forEachArrayIndex( view.extents(), [&](typename View::ArrayIndex ai) LLAMA_LAMBDA_INLINE { forEachLeafCoord([&](auto rc) LLAMA_LAMBDA_INLINE { constructField(view, ai, rc); }); }); } /// Creates a view based on the given mapping, e.g. \ref mapping::AoS or \ref mapping::SoA. For allocating the /// view's underlying memory, the specified allocator callable is used (or the default one, which is /// \ref bloballoc::Vector). The allocator callable is called with the alignment and size of bytes to allocate for /// each blob of the mapping. Value-initialization is performed for all fields by calling \ref constructFields. /// This function is the preferred way to create a \ref View. See also \ref allocViewUninitialized. LLAMA_EXPORT #ifdef __cpp_lib_concepts template #else template #endif LLAMA_FN_HOST_ACC_INLINE auto allocView(Mapping mapping = {}, const Allocator& alloc = {}, Accessor accessor = {}) -> View, Accessor> { auto view = allocViewUninitialized(std::move(mapping), alloc, std::move(accessor)); constructFields(view); return view; } /// Same as \ref allocScalarView but does not run field constructors. LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE auto allocScalarViewUninitialized() -> decltype(auto) { constexpr auto mapping = mapping::MinAlignedOne, RecordDim>{}; return allocViewUninitialized(mapping, bloballoc::Array{}); } /// Allocates a \ref View holding a single record backed by a byte array (\ref bloballoc::Array). /// \tparam Dim Dimension of the \ref ArrayExtents of the \ref View. 
LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE auto allocScalarView() -> decltype(auto) { auto view = allocScalarViewUninitialized(); constructFields(view); return view; } LLAMA_EXPORT template, bool OwnView = false> struct RecordRef; /// A \ref RecordRef that owns and holds a single value. LLAMA_EXPORT template using One = RecordRef()), RecordCoord<>, true>; /// Is true, if T is an instance of \ref One. LLAMA_EXPORT template inline constexpr bool isOne = false; LLAMA_EXPORT template inline constexpr bool isOne> = true; // TODO(bgruber): Higher dimensional iterators might not have good codegen. Multiple nested loops seem to be // superior to a single iterator over multiple dimensions. At least compilers are able to produce better code. // std::mdspan also discovered similar difficulties and there was a discussion in WG21 in Oulu 2016 to // remove/postpone iterators from the design. In std::mdspan's design, the iterator iterated over the co-domain. LLAMA_EXPORT template struct Iterator { using ArrayIndexIterator = llama::ArrayIndexIterator; using iterator_category = std::random_access_iterator_tag; using value_type = One; using difference_type = typename ArrayIndexIterator::difference_type; using pointer = internal::IndirectValue>; using reference = RecordRef; constexpr Iterator() = default; LLAMA_FN_HOST_ACC_INLINE constexpr Iterator(ArrayIndexIterator arrayIndex, View* view) : arrayIndex(arrayIndex) , view(view) { } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator++() -> Iterator& { ++arrayIndex; return *this; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator++(int) -> Iterator { auto tmp = *this; ++*this; return tmp; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator--() -> Iterator& { --arrayIndex; return *this; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator--(int) -> Iterator { auto tmp{*this}; --*this; return tmp; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator*() const -> reference { return (*view)(*arrayIndex); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator->() const -> pointer { return {**this}; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](difference_type i) const -> reference { return *(*this + i); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator+=(difference_type n) -> Iterator& { arrayIndex += n; return *this; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator+(Iterator it, difference_type n) -> Iterator { it += n; return it; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator+(difference_type n, Iterator it) -> Iterator { return it + n; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator-=(difference_type n) -> Iterator& { arrayIndex -= n; return *this; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator-(Iterator it, difference_type n) -> Iterator { it -= n; return it; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator-(const Iterator& a, const Iterator& b) -> difference_type { assert(a.view == b.view); return static_cast(a.arrayIndex - b.arrayIndex); } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator==(const Iterator& a, const Iterator& b) -> bool { assert(a.view == b.view); return a.arrayIndex == b.arrayIndex; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator!=(const Iterator& a, const Iterator& b) -> bool { return !(a == b); } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator<(const Iterator& a, const Iterator& b) -> bool { assert(a.view == b.view); return a.arrayIndex < b.arrayIndex; } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator>(const Iterator& a, const Iterator& b) -> bool { return b < a; } 
LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator<=(const Iterator& a, const Iterator& b) -> bool { return !(a > b); } LLAMA_FN_HOST_ACC_INLINE friend constexpr auto operator>=(const Iterator& a, const Iterator& b) -> bool { return !(a < b); } ArrayIndexIterator arrayIndex; View* view; }; /// Using a mapping, maps the given array index and record coordinate to a memory reference onto the given blobs. /// \return Either an l-value reference if the record coord maps to a physical field or a proxy reference if mapped /// to a computed field. LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE auto mapToMemory( Mapping& mapping, typename Mapping::ArrayExtents::Index ai, RecordCoord rc, Blobs& blobs) -> decltype(auto) { if constexpr(llama::isComputed) return mapping.compute(ai, rc, blobs); else { const auto [nr, offset] = mapping.blobNrAndOffset(ai, rc); using Type = GetType; LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return reinterpret_cast, Type>&>( blobs[nr][offset]); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } } /// Central LLAMA class holding memory for storage and giving access to values stored there defined by a /// mapping. A view should be created using \ref allocView. \tparam TMapping The mapping used by the view to /// map accesses into memory. \tparam TBlobType The storage type used by the view holding memory. \tparam /// TAccessor The accessor to use when an access is made through this view. LLAMA_EXPORT #ifdef __cpp_lib_concepts template #else template #endif struct LLAMA_DECLSPEC_EMPTY_BASES View : private TMapping , private TAccessor #if CAN_USE_RANGES , std::ranges::view_base #endif { static_assert(!std::is_const_v); using Mapping = TMapping; using BlobType = TBlobType; using ArrayExtents = typename Mapping::ArrayExtents; using ArrayIndex = typename ArrayExtents::Index; using RecordDim = typename Mapping::RecordDim; using Accessor = TAccessor; using iterator = Iterator; using const_iterator = Iterator; private: using size_type = typename ArrayExtents::value_type; public: static_assert( std::is_same_v>, "Mapping must not be const qualified or a reference. Are you using decltype(...) as View template " "argument?"); static_assert( std::is_same_v>, "Mapping::ArrayExtents must not be const qualified or a reference. Are you using decltype(...) as " "mapping " "template argument?"); /// Performs default initialization of the blob array. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,hicpp-member-init) View() = default; /// Creates a LLAMA View manually. Prefer the allocations functions \ref allocView and \ref /// allocViewUninitialized if possible. /// \param mapping The mapping used by the view to map accesses into memory. /// \param blobs An array of blobs providing storage space for the mapped data. /// \param accessor The accessor to use when an access is made through this view. 
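/// A minimal sketch (illustrative; `mapping` and `blobs` are assumed to exist and to be compatible):
/// \code{.cpp}
///     llama::View view{mapping, blobs}; // prefer llama::allocView in most code
/// \endcode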
LLAMA_FN_HOST_ACC_INLINE explicit View(Mapping mapping, Array blobs = {}, Accessor accessor = {}) : Mapping(std::move(mapping)) , Accessor(std::move(accessor)) , m_blobs(std::move(blobs)) { } LLAMA_FN_HOST_ACC_INLINE auto mapping() -> Mapping& { return static_cast(*this); } LLAMA_FN_HOST_ACC_INLINE auto mapping() const -> const Mapping& { return static_cast(*this); } LLAMA_FN_HOST_ACC_INLINE auto accessor() -> Accessor& { return static_cast(*this); } LLAMA_FN_HOST_ACC_INLINE auto accessor() const -> const Accessor& { return static_cast(*this); } LLAMA_FN_HOST_ACC_INLINE auto extents() const -> ArrayExtents { return mapping().extents(); } #if !(defined(_MSC_VER) && defined(__NVCC__)) template auto operator()(llama::ArrayIndex) const { static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent"); } #endif /// Retrieves the \ref RecordRef at the given \ref ArrayIndex index. LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto) { if constexpr(isRecordDim) return RecordRef{ai, *this}; else return access(ai, RecordCoord<>{}); } LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) -> decltype(auto) { if constexpr(isRecordDim) return RecordRef{ai, *this}; else return access(ai, RecordCoord<>{}); } /// Retrieves the \ref RecordRef at the \ref ArrayIndex index constructed from the passed component /// indices. template< typename... Indices, std::enable_if_t...>, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) const -> decltype(auto) { static_assert( sizeof...(Indices) == ArrayIndex::rank, "Please specify as many indices as you have array dimensions"); return (*this)(ArrayIndex{static_cast(indices)...}); } template< typename... Indices, std::enable_if_t...>, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) -> decltype(auto) { static_assert( sizeof...(Indices) == ArrayIndex::rank, "Please specify as many indices as you have array dimensions"); return (*this)(ArrayIndex{static_cast(indices)...}); } /// Retrieves the \ref RecordRef at the \ref ArrayIndex index constructed from the passed component /// indices. LLAMA_FN_HOST_ACC_INLINE auto operator[](ArrayIndex ai) const -> decltype(auto) { return (*this)(ai); } LLAMA_FN_HOST_ACC_INLINE auto operator[](ArrayIndex ai) -> decltype(auto) { return (*this)(ai); } #if !(defined(_MSC_VER) && defined(__NVCC__)) template auto operator[](llama::ArrayIndex) const { static_assert(!sizeof(V), "Passed ArrayIndex with SizeType different than Mapping::ArrayExtent"); } #endif /// Retrieves the \ref RecordRef at the 1D \ref ArrayIndex index constructed from the passed index. 
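/// For a one-dimensional view (illustrative):
/// \code{.cpp}
///     auto&& ref = view[5]; // equivalent to view(llama::ArrayIndex{5})
/// \endcode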
LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type index) const -> decltype(auto) { return (*this)(index); } LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type index) -> decltype(auto) { return (*this)(index); } LLAMA_FN_HOST_ACC_INLINE auto begin() -> iterator { return {ArrayIndexRange{extents()}.begin(), this}; } LLAMA_FN_HOST_ACC_INLINE auto begin() const -> const_iterator { return {ArrayIndexRange{extents()}.begin(), this}; } LLAMA_FN_HOST_ACC_INLINE auto end() -> iterator { return {ArrayIndexRange{extents()}.end(), this}; } LLAMA_FN_HOST_ACC_INLINE auto end() const -> const_iterator { return {ArrayIndexRange{extents()}.end(), this}; } LLAMA_FN_HOST_ACC_INLINE auto blobs() -> Array& { return m_blobs; } LLAMA_FN_HOST_ACC_INLINE auto blobs() const -> const Array& { return m_blobs; } private: template friend struct RecordRef; template LLAMA_FN_HOST_ACC_INLINE auto access(ArrayIndex ai, RecordCoord rc = {}) const -> decltype(auto) { return accessor()(mapToMemory(mapping(), ai, rc, m_blobs)); } template LLAMA_FN_HOST_ACC_INLINE auto access(ArrayIndex ai, RecordCoord rc = {}) -> decltype(auto) { return accessor()(mapToMemory(mapping(), ai, rc, m_blobs)); } Array m_blobs; }; LLAMA_EXPORT #ifdef __cpp_lib_concepts template inline constexpr auto isView = AnyView; #else template inline constexpr auto isView = false; // this definition does neither capture SubView nor user defined view's, but the issue resolves itself with a C++20 // upgrade. template inline constexpr auto isView> = true; #endif namespace internal { // remove in C++23, from: https://en.cppreference.com/w/cpp/utility/forward_like // NOLINTBEGIN template [[nodiscard]] LLAMA_FN_HOST_ACC_INLINE constexpr auto&& forward_like(U&& x) noexcept { constexpr bool is_adding_const = std::is_const_v>; if constexpr(std::is_lvalue_reference_v) { if constexpr(is_adding_const) return std::as_const(x); else return static_cast(x); } else { if constexpr(is_adding_const) return std::move(std::as_const(x)); else return std::move(x); } } // NOLINTEND template LLAMA_FN_HOST_ACC_INLINE auto makeTransformedBlobArray( Blobs&& blobs, const TransformBlobFunc& transformBlob, std::integer_sequence) { return llama::Array{transformBlob(forward_like(blobs[Is]))...}; } } // namespace internal /// Applies the given transformation to the blobs of a view and creates a new view with the transformed blobs /// and the same mapping and accessor as the old view. LLAMA_EXPORT template>>> LLAMA_FN_HOST_ACC_INLINE auto transformBlobs(ViewFwd&& view, const TransformBlobFunc& transformBlob) { using View = std::decay_t; constexpr auto blobCount = View::Mapping::blobCount; auto blobs = internal::makeTransformedBlobArray( internal::forward_like(view.blobs()), transformBlob, std::make_index_sequence{}); return llama::View{ view.mapping(), std::move(blobs), view.accessor()}; } /// Creates a shallow copy of a view. This copy must not outlive the view, since it references its blob array. /// \tparam NewBlobType The blob type of the shallow copy. Must be a non owning pointer like type. /// \return A new view with the same mapping as view, where each blob refers to the blob in view. 
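/// A typical GPU pattern (illustrative; `kernel`, `blocks` and `threads` are assumptions of this sketch):
/// \code{.cpp}
///     auto view = llama::allocView(mapping, llama::bloballoc::CudaMalloc{});
///     kernel<<<blocks, threads>>>(llama::shallowCopy(view)); // pass a non-owning copy to the device
/// \endcode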
    LLAMA_EXPORT
    template<
        typename View,
        typename NewBlobType = CopyConst<std::remove_reference_t<View>, std::byte>*,
        typename = std::enable_if_t<isView<std::decay_t<View>>>>
    LLAMA_FN_HOST_ACC_INLINE auto shallowCopy(View&& view)
    {
        if constexpr(std::is_same_v<typename std::decay_t<View>::BlobType, NewBlobType>)
            return view;
        else
            return transformBlobs(
                std::forward<View>(view),
                [](auto& blob)
                {
                    LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
                    return static_cast<NewBlobType>(&blob[0]);
                    LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
                });
    }

    /// Creates a new view from an existing view with the given accessor.
    /// \param view A view whose mapping and blobs are forwarded into a new view with the different accessor.
    LLAMA_EXPORT
    template<typename NewAccessor, typename ViewFwd, typename = std::enable_if_t<isView<std::decay_t<ViewFwd>>>>
    LLAMA_FN_HOST_ACC_INLINE auto withAccessor(ViewFwd&& view, NewAccessor newAccessor = {})
    {
        using OldView = std::decay_t<ViewFwd>;
        return View<typename OldView::Mapping, typename OldView::BlobType, NewAccessor>{
            internal::forward_like<ViewFwd>(view.mapping()),
            internal::forward_like<ViewFwd>(view.blobs()),
            std::move(newAccessor)};
    }

    /// Creates a new view from an existing view with the given mapping.
    /// \param view A view whose accessor and blobs are forwarded into a new view with the different mapping.
    LLAMA_EXPORT
    template<typename NewMapping, typename ViewFwd, typename = std::enable_if_t<isView<std::decay_t<ViewFwd>>>>
    LLAMA_FN_HOST_ACC_INLINE auto withMapping(ViewFwd&& view, NewMapping newMapping = {})
    {
        using OldView = std::decay_t<ViewFwd>;
        static_assert(OldView::Mapping::blobCount == NewMapping::blobCount);
        for(std::size_t i = 0; i < NewMapping::blobCount; i++)
        {
            assert(view.mapping().blobSize(i) == newMapping.blobSize(i));
        }

        return View<NewMapping, typename OldView::BlobType, typename OldView::Accessor>{
            std::move(newMapping),
            internal::forward_like<ViewFwd>(view.blobs()),
            internal::forward_like<ViewFwd>(view.accessor())};
    }

    /// Like a \ref View, but array indices are shifted.
    /// @tparam TStoredParentView Type of the underlying view. May be cv qualified and/or a reference type.
    LLAMA_EXPORT
    template<typename TStoredParentView>
    struct SubView
    {
        using StoredParentView = TStoredParentView;
        using ParentView = std::remove_const_t<std::remove_reference_t<StoredParentView>>; ///< type of the parent view

        using Mapping = typename ParentView::Mapping;
        using ArrayExtents = typename ParentView::ArrayExtents;
        using ArrayIndex = typename ParentView::ArrayIndex;
        using BlobType = typename ParentView::BlobType;
        using RecordDim = typename ParentView::RecordDim;
        using Accessor = typename ParentView::Accessor;
        using iterator = typename ParentView::iterator;
        using const_iterator = typename ParentView::const_iterator;

    private:
        using size_type = typename ArrayExtents::value_type;

    public:
        /// Creates a SubView given an offset. The parent view is default constructed.
        LLAMA_FN_HOST_ACC_INLINE explicit SubView(ArrayIndex offset) : offset(offset)
        {
        }

        /// Creates a SubView given a parent \ref View and offset.
        template<typename StoredParentViewFwd>
        LLAMA_FN_HOST_ACC_INLINE SubView(StoredParentViewFwd&& parentView, ArrayIndex offset)
            : parentView(std::forward<StoredParentViewFwd>(parentView))
            , offset(offset)
        {
        }

        LLAMA_FN_HOST_ACC_INLINE auto mapping() -> Mapping&
        {
            return parentView.mapping();
        }

        LLAMA_FN_HOST_ACC_INLINE auto mapping() const -> const Mapping&
        {
            return parentView.mapping();
        }

        LLAMA_FN_HOST_ACC_INLINE auto accessor() -> Accessor&
        {
            return parentView.accessor();
        }

        LLAMA_FN_HOST_ACC_INLINE auto accessor() const -> const Accessor&
        {
            return parentView.accessor();
        }

        LLAMA_FN_HOST_ACC_INLINE auto extents() const -> ArrayExtents
        {
            return parentView.extents();
        }

        /// Same as \ref View::operator()(ArrayIndex), but shifted by the offset of this \ref SubView.
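        /// For example (a sketch; assumes a 1D parent view, values are illustrative):
        /// \code{.cpp}
        /// llama::SubView subView{view, {10}}; // shift all indices by 10
        /// subView({5});                       // accesses view({15})
        /// \endcode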
        LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto)
        {
            return parentView(ArrayIndex{ai + offset});
        }

        LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) -> decltype(auto)
        {
            return parentView(ArrayIndex{ai + offset});
        }

        /// Same as corresponding operator in \ref View, but shifted by the offset of this \ref SubView.
        template<typename... Indices>
        LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) const -> decltype(auto)
        {
            static_assert(
                sizeof...(Indices) == ArrayIndex::rank,
                "Please specify as many indices as you have array dimensions");
            static_assert(
                std::conjunction_v<std::is_convertible<Indices, size_type>...>,
                "Indices must be convertible to ArrayExtents::size_type");
            return parentView(
                ArrayIndex{ArrayIndex{static_cast<size_type>(indices)...} + offset});
        }

        template<typename... Indices>
        LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) -> decltype(auto)
        {
            static_assert(
                sizeof...(Indices) == ArrayIndex::rank,
                "Please specify as many indices as you have array dimensions");
            static_assert(
                std::conjunction_v<std::is_convertible<Indices, size_type>...>,
                "Indices must be convertible to ArrayExtents::size_type");
            return parentView(
                ArrayIndex{ArrayIndex{static_cast<size_type>(indices)...} + offset});
        }

        template<std::size_t... Coords>
        LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coords...> rc = {}) const -> decltype(auto)
        {
            return parentView(ArrayIndex{} + offset, rc);
        }

        template<std::size_t... Coords>
        LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coords...> rc = {}) -> decltype(auto)
        {
            return parentView(ArrayIndex{} + offset, rc);
        }

        // TODO(bgruber): implement iterators. Those would be transform iterators on top of the parent view's
        // iterators, applying the offset on access.

        LLAMA_FN_HOST_ACC_INLINE auto blobs() -> Array<BlobType, Mapping::blobCount>&
        {
            return parentView.blobs();
        }

        LLAMA_FN_HOST_ACC_INLINE auto blobs() const -> const Array<BlobType, Mapping::blobCount>&
        {
            return parentView.blobs();
        }

        StoredParentView parentView;
        const ArrayIndex offset; ///< offset by which this view's \ref ArrayIndex indices are shifted when passed
                                 ///< to the parent view.
    };

    /// SubView vview(view); will store a reference to view.
    /// SubView vview(std::move(view)); will store the view.
    LLAMA_EXPORT
    template<typename TStoredParentView>
    SubView(TStoredParentView&&, typename std::remove_reference_t<TStoredParentView>::Mapping::ArrayExtents::Index)
        -> SubView<TStoredParentView>;
} // namespace llama

// ==
// == ./include/llama/View.hpp ==
// ============================================================================

# include
# include
# include
// # include // amalgamate: file already included
// # include // amalgamate: file already included
// # include // amalgamate: file already included

namespace llama
{
    namespace internal
    {
        inline auto color(std::string_view recordCoordTags) -> std::uint32_t
        {
            auto c
                = static_cast<std::uint32_t>(std::hash<std::string_view>{}(recordCoordTags) & std::size_t{0xFFFFFF});
            c |= 0x404040u; // ensure color per channel is at least 0x40.
            return c;
        }

        // from: https://stackoverflow.com/questions/5665231/most-efficient-way-to-escape-xml-html-in-c-string
        inline auto xmlEscape(const std::string& str) -> std::string
        {
            std::string result;
            result.reserve(str.size());
            for(const char c : str)
            {
                switch(c)
                {
                case '&':
                    result.append("&amp;");
                    break;
                case '\"':
                    result.append("&quot;");
                    break;
                case '\'':
                    result.append("&apos;");
                    break;
                case '<':
                    result.append("&lt;");
                    break;
                case '>':
                    result.append("&gt;");
                    break;
                default:
                    result += c;
                    break;
                }
            }
            return result;
        }

        template<typename T, std::size_t Dim>
        auto formatArrayIndex(const ArrayIndex<T, Dim>& ai)
        {
            if constexpr(Dim == 1)
                return std::to_string(ai[0]);
            else
            {
                std::string s = "{";
                for(auto v : ai)
                {
                    if(s.size() >= 2)
                        s += ",";
                    s += std::to_string(v);
                }
                s += "}";
                return s;
            }
        }

        template<std::size_t Dim>
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,hicpp-member-init)
        struct FieldBox
        {
            ArrayIndex<std::size_t, Dim> arrayIndex;
            std::size_t flatRecordCoord;
            std::string_view recordCoordTags;
            NrAndOffset<std::size_t> nrAndOffset;
            std::size_t size;
        };

        template<typename View>
        void fillBlobsWithPattern(View& view, uint8_t pattern)
        {
            const auto& mapping = view.mapping();
            for(std::size_t i = 0; i < View::Mapping::blobCount; i++)
                std::memset(&view.blobs()[i][0], pattern, mapping.blobSize(i));
        }

        template<typename View, std::size_t... RecordCoords>
        void boxesFromComputedField(
            View& view,
            typename View::Mapping::ArrayExtents::Index ai,
            RecordCoord<RecordCoords...> rc,
            std::vector<FieldBox<View::Mapping::ArrayExtents::rank>>& infos)
        {
            using Mapping = typename View::Mapping;
            using RecordDim = typename Mapping::RecordDim;

            auto emitInfo = [&](auto nrAndOffset, std::size_t size)
            {
                infos.push_back(
                    {ai,
                     flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>,
                     prettyRecordCoord<RecordDim>(rc),
                     nrAndOffset,
                     size});
            };

            using Type = GetType<RecordDim, RecordCoord<RecordCoords...>>;
            // computed values can come from anywhere, so we can only apply heuristics
            auto& blobs = view.blobs();
            auto&& ref = view.mapping().compute(ai, rc, blobs);

            // if we get a reference, try to find the mapped address in one of the blobs
            if constexpr(std::is_lvalue_reference_v<decltype(ref)>)
            {
                auto address = reinterpret_cast<std::intptr_t>(&ref);
                for(std::size_t i = 0; i < blobs.size(); i++)
                {
                    // TODO(bgruber): this is UB, because we are comparing pointers from unrelated
                    // allocations
                    const auto front = reinterpret_cast<std::intptr_t>(&blobs[i][0]);
                    const auto back = reinterpret_cast<std::intptr_t>(&blobs[i][view.mapping().blobSize(i) - 1]);
                    if(front <= address && address <= back)
                    {
                        emitInfo(NrAndOffset{i, static_cast<std::size_t>(address - front)}, sizeof(Type));
                        return; // a mapping can only map to one location in the blobs
                    }
                }
            }

            if constexpr(std::is_default_constructible_v<Type>)
            {
                const auto infosBefore = infos.size();

                // try to observe written bytes
                const auto pattern = std::uint8_t{0xFF};
                fillBlobsWithPattern(view, pattern);
                ref = Type{}; // a broad range of types is default constructible and should write
                              // something zero-ish
                auto wasTouched = [&](auto b) { return static_cast<std::uint8_t>(b) != pattern; };
                for(std::size_t i = 0; i < Mapping::blobCount; i++)
                {
                    const auto blobSize = view.mapping().blobSize(i);
                    const auto* begin = &blobs[i][0];
                    const auto* end = begin + blobSize;

                    auto* searchBegin = begin;
                    while(true)
                    {
                        const auto* touchedBegin = std::find_if(searchBegin, end, wasTouched);
                        if(touchedBegin == end)
                            break;
                        const auto& touchedEnd = std::find_if_not(touchedBegin + 1, end, wasTouched);
                        emitInfo(
                            NrAndOffset{i, static_cast<std::size_t>(touchedBegin - begin)},
                            touchedEnd - touchedBegin);
                        if(touchedEnd == end)
                            break;
                        searchBegin = touchedEnd + 1;
                    }
                }

                if(infosBefore != infos.size())
                    return;
            }

            // if we come here, we could not find out where the value is coming from
            emitInfo(NrAndOffset{Mapping::blobCount, std::size_t{0}}, sizeof(Type));
        }

        template<typename Mapping>
        auto boxesFromMapping(const Mapping& mapping) -> std::vector<FieldBox<Mapping::ArrayExtents::rank>>
        {
            std::vector<FieldBox<Mapping::ArrayExtents::rank>> infos;
std::optional view; if constexpr(hasAnyComputedField) view = allocView(mapping); using RecordDim = typename Mapping::RecordDim; for(auto ai : ArrayIndexRange{mapping.extents()}) forEachLeafCoord( [&](auto rc) { using Type = GetType; if constexpr(llama::isComputed) boxesFromComputedField(view.value(), ai, rc, infos); else { const auto [nr, off] = mapping.blobNrAndOffset(ai, rc); infos.push_back( {ai, flatRecordCoord, prettyRecordCoord(rc), {static_cast(nr), static_cast(off)}, sizeof(Type)}); } }); return infos; } template auto breakBoxes(std::vector> boxes, std::size_t wrapByteCount) -> std::vector> { for(std::size_t i = 0; i < boxes.size(); i++) { auto& fb = boxes[i]; if(fb.nrAndOffset.offset / wrapByteCount != (fb.nrAndOffset.offset + fb.size - 1) / wrapByteCount) { const auto remainingSpace = wrapByteCount - fb.nrAndOffset.offset % wrapByteCount; auto newFb = fb; newFb.nrAndOffset.offset = fb.nrAndOffset.offset + remainingSpace; newFb.size = fb.size - remainingSpace; fb.size = remainingSpace; boxes.push_back(newFb); } } return boxes; } inline auto cssClass(std::string tags) { std::replace(begin(tags), end(tags), '.', '_'); std::replace(begin(tags), end(tags), '<', '_'); std::replace(begin(tags), end(tags), '>', '_'); return tags; }; } // namespace internal /// Returns an SVG image visualizing the memory layout created by the given mapping. The created memory blocks are /// wrapped after wrapByteCount bytes. /// @param palette RGB colors as 0x00RRGGBB assigned cyclically. When empty, colors are assigned by hashing the /// record coordinate. LLAMA_EXPORT template auto toSvg( const Mapping& mapping, std::size_t wrapByteCount = 64, bool breakBoxes = true, const std::vector& palette = {}, std::string_view textColor = "black") -> std::string { constexpr auto byteSizeInPixel = 30; constexpr auto blobBlockWidth = 60; auto infos = internal::boxesFromMapping(mapping); if(breakBoxes) infos = internal::breakBoxes(std::move(infos), wrapByteCount); std::stable_sort( begin(infos), end(infos), [](const auto& a, const auto& b) { return std::tie(a.nrAndOffset.nr, a.nrAndOffset.offset) < std::tie(b.nrAndOffset.nr, b.nrAndOffset.offset); }); std::string svg; std::array + 1> blobYOffset{}; auto writeBlobHeader = [&](std::size_t i, std::size_t size, std::string_view name) { const auto blobRows = (size + wrapByteCount - 1) / wrapByteCount; blobYOffset[i + 1] = blobYOffset[i] + (blobRows + 1) * byteSizeInPixel; // one row gap between blobs const auto height = blobRows * byteSizeInPixel; svg += fmt::format( R"a( {} )a", blobYOffset[i], blobBlockWidth, height, blobBlockWidth / 2, blobYOffset[i] + height / 2, textColor, name); }; for(std::size_t i = 0; i < Mapping::blobCount; i++) writeBlobHeader(i, mapping.blobSize(i), "Blob: " + std::to_string(i)); svg = fmt::format( R"( )", blobBlockWidth + wrapByteCount * byteSizeInPixel, blobYOffset.back() == 0 ? 
987654321 : blobYOffset.back() - byteSizeInPixel, byteSizeInPixel / 2) + svg; std::size_t computedSizeSoFar = 0; std::size_t lastBlobNr = std::numeric_limits::max(); std::size_t usedBytesInBlobSoFar = 0; for(const auto& info : infos) { if(lastBlobNr != info.nrAndOffset.nr) { usedBytesInBlobSoFar = 0; lastBlobNr = info.nrAndOffset.nr; } const auto blobY = blobYOffset[info.nrAndOffset.nr]; const auto offset = [&] { if(info.nrAndOffset.nr < Mapping::blobCount) return info.nrAndOffset.offset; const auto offset = computedSizeSoFar; computedSizeSoFar += info.size; return offset; }(); auto x = (offset % wrapByteCount) * byteSizeInPixel + blobBlockWidth; auto y = (offset / wrapByteCount) * byteSizeInPixel + blobY; const auto fillColor = [&] { if(palette.empty()) return internal::color(info.recordCoordTags); return palette[info.flatRecordCoord % palette.size()]; }(); const auto width = byteSizeInPixel * info.size; const auto nextOffset = [&] { if(&info == &infos.back()) return std::numeric_limits::max(); const auto& nextInfo = (&info)[1]; if(info.nrAndOffset.nr < Mapping::blobCount && info.nrAndOffset.nr == nextInfo.nrAndOffset.nr) return nextInfo.nrAndOffset.offset; return std::numeric_limits::max(); }(); const auto isOverlapped = offset < usedBytesInBlobSoFar || nextOffset < offset + info.size; usedBytesInBlobSoFar = offset + info.size; constexpr auto cropBoxes = true; if(cropBoxes) { svg += fmt::format( R"( )", x, y, width, byteSizeInPixel); x = 0; y = 0; } svg += fmt::format( R"( )", x, y, width, byteSizeInPixel, fillColor, isOverlapped ? 0.3 : 1.0); for(std::size_t i = 1; i < info.size; i++) { svg += fmt::format( R"( )", x + i * byteSizeInPixel, y + byteSizeInPixel * 2 / 3, x + i * byteSizeInPixel, y + byteSizeInPixel); } svg += fmt::format( R"({} {} )", x + width / 2, y + byteSizeInPixel * 3 / 4, textColor, internal::formatArrayIndex(info.arrayIndex), internal::xmlEscape(std::string{info.recordCoordTags})); if(cropBoxes) svg += R"( )"; } if(hasAnyComputedField) { if(computedSizeSoFar > 0) writeBlobHeader(Mapping::blobCount, computedSizeSoFar, "Comp."); else { const auto blobRows = (wrapByteCount - 1) / wrapByteCount; blobYOffset[Mapping::blobCount + 1] = blobYOffset[Mapping::blobCount] + blobRows * byteSizeInPixel; // fix-up, omit gap } // fix total SVG size const auto i = svg.find("987654321"); assert(i != std::string::npos); svg.replace(i, 9, std::to_string(blobYOffset.back() - byteSizeInPixel)); } svg += ""; return svg; } LLAMA_EXPORT template auto toSvg(const Mapping& mapping, const std::vector& palette, std::string_view textColor = "#000") -> std::string { return toSvg(mapping, 64, true, palette, textColor); } /// Returns an HTML document visualizing the memory layout created by the given mapping. The visualization is /// resizeable. LLAMA_EXPORT template auto toHtml(const Mapping& mapping) -> std::string { constexpr auto byteSizeInPixel = 30; constexpr auto rulerLengthInBytes = 512; constexpr auto rulerByteInterval = 8; auto infos = internal::boxesFromMapping(mapping); std::stable_sort( begin(infos), end(infos), [](const auto& a, const auto& b) { return std::tie(a.nrAndOffset.nr, a.nrAndOffset.offset) < std::tie(b.nrAndOffset.nr, b.nrAndOffset.offset); }); infos.erase( std::unique( begin(infos), end(infos), [](const auto& a, const auto& b) { return a.nrAndOffset == b.nrAndOffset; }), end(infos)); std::string html; html += fmt::format( R"(
)"); for(auto i = 0; i < rulerLengthInBytes; i += rulerByteInterval) html += fmt::format( R"(
{}
)", i * byteSizeInPixel, i); html += fmt::format(R"(
)"); auto currentBlobNr = std::numeric_limits::max(); for(const auto& info : infos) { if(currentBlobNr != info.nrAndOffset.nr) { currentBlobNr = info.nrAndOffset.nr; html += fmt::format("

Blob: {}

", currentBlobNr); } html += fmt::format( R"(
{1} {2}
)", internal::cssClass(std::string{info.recordCoordTags}), internal::formatArrayIndex(info.arrayIndex), internal::xmlEscape(std::string{info.recordCoordTags})); } html += R"( )"; return html; } } // namespace llama #endif // == // == ./include/llama/DumpMapping.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/Copy.hpp == // == // Copyright 2021 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "View.hpp" // amalgamate: file already inlined // ============================================================================ // == ./include/llama/mapping/AoSoA.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Common.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included namespace llama::mapping { /// The maximum number of vector lanes that can be used to fetch each leaf type in the record dimension into a /// vector register of the given size in bits. /// \hideinitializer LLAMA_EXPORT template inline constexpr std::size_t maxLanes = []() constexpr { auto max = std::numeric_limits::max(); forEachLeafCoord( [&](auto rc) { using AttributeType = GetType; max = std::min(max, VectorRegisterBits / (sizeof(AttributeType) * CHAR_BIT)); }); return max; }(); /// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView. /// \tparam Lanes The size of the inner arrays of this array of struct of arrays. /// \tparam TFieldAlignment If Align, padding bytes are inserted to guarantee that struct members are properly /// aligned. If Pack, struct members are tightly packed. /// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref /// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and /// \ref PermuteFieldsMinimizePadding. LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, typename TArrayExtents::value_type Lanes, FieldAlignment TFieldAlignment = FieldAlignment::Align, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder> struct AoSoA : MappingBase { private: using Base = MappingBase; using size_type = typename Base::size_type; public: inline static constexpr typename TArrayExtents::value_type lanes = Lanes; inline static constexpr FieldAlignment fieldAlignment = TFieldAlignment; using LinearizeArrayIndexFunctor = TLinearizeArrayIndexFunctor; using Permuter = PermuteFields>; inline static constexpr std::size_t blobCount = 1; #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ >= 12 using Base::Base; #else constexpr AoSoA() = default; LLAMA_FN_HOST_ACC_INLINE constexpr explicit AoSoA(TArrayExtents extents, TRecordDim = {}) : Base(extents) { } #endif LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type) const -> size_type { const auto rs = static_cast( flatSizeOf); return roundUpToMultiple(LinearizeArrayIndexFunctor{}.size(Base::extents()) * rs, Lanes * rs); } template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset( typename Base::ArrayIndex ai, RecordCoord rc = {}) const -> NrAndOffset { return blobNrAndOffset(LinearizeArrayIndexFunctor{}(ai, Base::extents()), rc); } // Exposed for aosoaCommonBlockCopy. Should be private ... 
template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset( size_type flatArrayIndex, RecordCoord = {}) const -> NrAndOffset { constexpr std::size_t flatFieldIndex = #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6 *& // mess with nvcc compiler state to workaround bug #endif Permuter::template permute>>; const auto blockIndex = flatArrayIndex / Lanes; const auto laneIndex = flatArrayIndex % Lanes; const auto offset = static_cast( flatSizeOf * Lanes) * blockIndex + static_cast(flatOffsetOf< typename Permuter::FlatRecordDim, flatFieldIndex, fieldAlignment == FieldAlignment::Align>) * Lanes + static_cast(sizeof(GetType>)) * laneIndex; return {0, offset}; } }; /// Binds parameters to an \ref AoSoA mapping except for array and record dimension, producing a quoted meta /// function accepting the latter two. Useful to to prepare this mapping for a meta mapping. LLAMA_EXPORT template< std::size_t Lanes, FieldAlignment Alignment = FieldAlignment::Align, typename LinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder> struct BindAoSoA { template using fn = AoSoA; }; LLAMA_EXPORT template inline constexpr bool isAoSoA = false; LLAMA_EXPORT template< typename AD, typename RD, typename AD::value_type L, FieldAlignment A, typename Lin, template typename Perm> inline constexpr bool isAoSoA> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/AoSoA.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/SoA.hpp == // == // Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Common.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included namespace llama::mapping { LLAMA_EXPORT enum class Blobs { Single, OnePerField }; LLAMA_EXPORT enum class SubArrayAlignment { Pack, Align }; /// Struct of array mapping. Used to create a \ref View via \ref allocView. We recommend to use multiple blobs when /// the array extents are dynamic and an aligned single blob version when they are static. /// \tparam TBlobs If OnePerField, every element of the record dimension is mapped to its own blob. /// \tparam TSubArrayAlignment Only relevant when TBlobs == Single, ignored otherwise. If Align, aligns the sub /// arrays created within the single blob by inserting padding. If the array extents are dynamic, this may add some /// overhead to the mapping logic. /// \tparam TLinearizeArrayIndexFunctor Defines how the array dimensions should be mapped into linear numbers and /// how big the linear domain gets. /// \tparam PermuteFieldsSingleBlob Defines how the record dimension's fields should be permuted if Blobs is /// Single. See \ref PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref /// PermuteFieldsDecreasingAlignment and \ref PermuteFieldsMinimizePadding. LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, Blobs TBlobs = Blobs::OnePerField, SubArrayAlignment TSubArrayAlignment = TBlobs == Blobs::Single ? 
SubArrayAlignment::Align : SubArrayAlignment::Pack, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFieldsSingleBlob = PermuteFieldsInOrder> struct SoA : MappingBase { private: using Base = MappingBase; using size_type = typename TArrayExtents::value_type; public: inline static constexpr Blobs blobs = TBlobs; inline static constexpr SubArrayAlignment subArrayAlignment = TSubArrayAlignment; using LinearizeArrayIndexFunctor = TLinearizeArrayIndexFunctor; using Permuter = PermuteFieldsSingleBlob>; inline static constexpr std::size_t blobCount = blobs == Blobs::OnePerField ? mp_size>::value : 1; #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ >= 12 using Base::Base; #else constexpr SoA() = default; LLAMA_FN_HOST_ACC_INLINE constexpr explicit SoA(TArrayExtents extents, TRecordDim = {}) : Base(extents) { } #endif LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize([[maybe_unused]] size_type blobIndex) const -> size_type { const auto flatSize = LinearizeArrayIndexFunctor{}.size(Base::extents()); if constexpr(blobs == Blobs::OnePerField) { constexpr auto typeSizes = []() constexpr { Array r{}; forEachLeafCoord([&r, i = 0](auto rc) mutable constexpr { r[i++] = sizeof(GetType); }); return r; }(); return flatSize * typeSizes[blobIndex]; } else if constexpr(subArrayAlignment == SubArrayAlignment::Align) { size_type size = 0; using FRD = typename Permuter::FlatRecordDim; mp_for_each_inline>( [&](auto ti) LLAMA_LAMBDA_INLINE { using FieldType = typename decltype(ti)::type; size = roundUpToMultiple(size, static_cast(alignof(FieldType))); size += static_cast(sizeof(FieldType)) * flatSize; }); return size; } else { return flatSize * static_cast(sizeOf); } } private: static LLAMA_CONSTEVAL auto computeSubArrayOffsets() { using FRD = typename Permuter::FlatRecordDim; constexpr auto staticFlatSize = LinearizeArrayIndexFunctor{}.size(TArrayExtents{}); constexpr auto subArrays = mp_size::value; Array r{}; // r[0] == 0, only compute the following offsets mp_for_each_inline>( [&](auto ic) { constexpr auto i = decltype(ic)::value; r[i + 1] = r[i]; using ThisFieldType = mp_at_c; r[i + 1] += static_cast(sizeof(ThisFieldType)) * staticFlatSize; using NextFieldType = mp_at_c; r[i + 1] = roundUpToMultiple(r[i + 1], static_cast(alignof(NextFieldType))); }); return r; } public: template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset( typename Base::ArrayIndex ai, RecordCoord rc = {}) const -> NrAndOffset { return blobNrAndOffset(LinearizeArrayIndexFunctor{}(ai, Base::extents()), rc); } // Exposed for aosoaCommonBlockCopy. Should be private ... 
template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset( size_type flatArrayIndex, RecordCoord = {}) const -> NrAndOffset { const size_type elementOffset = flatArrayIndex * static_cast(sizeof(GetType>)); if constexpr(blobs == Blobs::OnePerField) { constexpr auto blob = flatRecordCoord>; return {blob, elementOffset}; } else { constexpr std::size_t flatFieldIndex = #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6 *& // mess with nvcc compiler state to workaround bug #endif Permuter::template permute>>; const size_type flatSize = LinearizeArrayIndexFunctor{}.size(Base::extents()); using FRD = typename Permuter::FlatRecordDim; if constexpr(subArrayAlignment == SubArrayAlignment::Align) { if constexpr(TArrayExtents::rankStatic == TArrayExtents::rank) { // full array extents are known statically, we can precompute the sub array offsets constexpr auto subArrayOffsets = computeSubArrayOffsets(); return {0, subArrayOffsets[flatFieldIndex] + elementOffset}; } else { // TODO(bgruber): we can take a shortcut here if we know that flatSize is a multiple of all // type's alignment. We can also precompute a table of sub array starts (and maybe store it), // or rely on the compiler it out of loops. size_type offset = 0; mp_for_each_inline>( [&](auto ic) LLAMA_LAMBDA_INLINE { constexpr auto i = decltype(ic)::value; using ThisFieldType = mp_at_c; offset += static_cast(sizeof(ThisFieldType)) * flatSize; using NextFieldType = mp_at_c; offset = roundUpToMultiple(offset, static_cast(alignof(NextFieldType))); }); offset += elementOffset; return {0, offset}; } } else { const auto offset = elementOffset + static_cast(flatOffsetOf) * flatSize; return {0, offset}; } } } }; // we can drop this when inherited ctors also inherit deduction guides LLAMA_EXPORT template SoA(TArrayExtents, TRecordDim) -> SoA; /// Struct of array mapping storing the entire layout in a single blob. The starts of the sub arrays are aligned by /// inserting padding. \see SoA LLAMA_EXPORT template using AlignedSingleBlobSoA = SoA; /// Struct of array mapping storing the entire layout in a single blob. The sub arrays are tightly packed, /// violating the type's alignment requirements. \see SoA LLAMA_EXPORT template using PackedSingleBlobSoA = SoA; /// Struct of array mapping storing each attribute of the record dimension in a separate blob. /// \see SoA LLAMA_EXPORT template using MultiBlobSoA = SoA; /// Binds parameters to an \ref SoA mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to to prepare this mapping for a meta mapping. 
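/// For example (a sketch; ArrayExtents and RecordDim are placeholders for user-provided types):
/// \code{.cpp}
/// using Mapping = llama::mapping::BindSoA<>::fn<ArrayExtents, RecordDim>;
/// // with all defaults, this is the same as llama::mapping::SoA<ArrayExtents, RecordDim>
/// \endcode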
LLAMA_EXPORT template< Blobs Blobs = Blobs::OnePerField, SubArrayAlignment SubArrayAlignment = SubArrayAlignment::Pack, typename LinearizeArrayIndexFunctor = LinearizeArrayIndexRight> struct BindSoA { template using fn = SoA; }; LLAMA_EXPORT template inline constexpr bool isSoA = false; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, Blobs Blobs, SubArrayAlignment SubArrayAlignment, typename LinearizeArrayIndexFunctor> inline constexpr bool isSoA> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/SoA.hpp == // ============================================================================ #include #include namespace llama { namespace internal { template void assertTrivialCopyable() { forEachLeafCoord( [](auto rc) { static_assert( std::is_trivially_copyable_v>, "All types in the record dimension must be trivially copyable"); }); } // need a custom memcpy symbol in LLAMA, because with clang+CUDA, there are multiple std::memcpy symbols, so // the address is ambiguous. inline constexpr auto memcpy = [](void* dst, const void* src, std::size_t size) { std::memcpy(dst, src, size); }; template void parallelMemcpy( std::byte* dst, const std::byte* src, std::size_t size, std::size_t threadId = 0, std::size_t threadCount = 1, MemcpyFunc singleThreadMemcpy = memcpy) { const auto sizePerThread = size / threadCount; const auto sizeLastThread = sizePerThread + size % threadCount; const auto sizeThisThread = threadId == threadCount - 1 ? sizeLastThread : sizePerThread; singleThreadMemcpy(dst + threadId * sizePerThread, src + threadId * sizePerThread, sizeThisThread); } } // namespace internal /// Copy the blobs' content from the source view to the destination view in parallel with the given thread /// configuration. Both views need to have the same mappings with the same array extents. /// @param threadId Zero-based id of calling thread for multi-threaded invocations. /// @param threadCount Thread count in case of multi-threaded invocation. /// \param singleThreadMemcpy The implementation of memcpy. By default: std::memcpy. LLAMA_EXPORT template void memcpyBlobs( const View& srcView, View& dstView, std::size_t threadId = 0, std::size_t threadCount = 1, MemcpyFunc singleThreadMemcpy = internal::memcpy) { internal::assertTrivialCopyable(); // TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions if(srcView.extents() != dstView.extents()) throw std::runtime_error{"Array dimensions sizes are different"}; // TODO(bgruber): this is maybe not the best parallel copying strategy for(std::size_t i = 0; i < Mapping::blobCount; i++) internal::parallelMemcpy( &dstView.blobs()[i][0], &srcView.blobs()[i][0], dstView.mapping().blobSize(i), threadId, threadCount, singleThreadMemcpy); } namespace internal { inline constexpr auto copyBlobWithMemcpy = [](const auto& src, auto& dst, std::size_t size) { static_assert(std::is_trivially_copyable_v>); static_assert(std::is_trivially_copyable_v>); std::memcpy(&dst[0], &src[0], size); }; } // namespace internal /// Copy the blobs' content from the source view to the destination view. Both views need to have the same mapping, /// and thus the same blob count and blob sizes. The copy is performed blob by blob. /// \param copyBlob The function to use for copying blobs. Default is \ref internal::copyBlobWithMemcpy, which uses /// std::memcpy. 
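/// A minimal sketch (assuming both views were created from the same `mapping`):
/// \code{.cpp}
/// auto src = llama::allocView(mapping);
/// auto dst = llama::allocView(mapping);
/// llama::copyBlobs(src, dst); // byte-wise copy, blob by blob
/// \endcode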
LLAMA_EXPORT template< typename Mapping, typename SrcBlob, typename DstBlob, typename BlobCopyFunc = decltype(internal::copyBlobWithMemcpy)> void copyBlobs( const View& srcView, View& dstView, BlobCopyFunc copyBlob = internal::copyBlobWithMemcpy) { // TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions if(srcView.extents() != dstView.extents()) throw std::runtime_error{"Array dimensions sizes are different"}; for(std::size_t i = 0; i < Mapping::blobCount; i++) copyBlob(srcView.blobs()[i], dstView.blobs()[i], dstView.mapping().blobSize(i)); } /// Field-wise copy from source to destination view. Both views need to have the same array and record dimensions. /// @param threadId Optional. Thread id in case of multi-threaded copy. /// @param threadCount Optional. Thread count in case of multi-threaded copy. LLAMA_EXPORT template void fieldWiseCopy( const View& srcView, View& dstView, std::size_t threadId = 0, std::size_t threadCount = 1) { // TODO(bgruber): think if we can remove this restriction static_assert( std::is_same_v, "The source and destination record dimensions must be the same"); if(srcView.extents() != dstView.extents()) throw std::runtime_error{"Array dimensions sizes are different"}; auto copyOne = [&](auto ai) LLAMA_LAMBDA_INLINE { forEachLeafCoord([&](auto rc) LLAMA_LAMBDA_INLINE { dstView(ai)(rc) = srcView(ai)(rc); }); }; constexpr auto dims = SrcMapping::ArrayExtents::rank; const auto extents = srcView.extents().toArray(); const auto workPerThread = (extents[0] + threadCount - 1) / threadCount; const auto start = threadId * workPerThread; const auto end = std::min((threadId + 1) * workPerThread, static_cast(extents[0])); for(auto i = start; i < end; i++) { using SrcSizeType = typename SrcMapping::ArrayExtents::value_type; if constexpr(dims > 1) forEachArrayIndex(extents, copyOne, static_cast(i)); else copyOne(ArrayIndex{static_cast(i)}); } } namespace internal { template inline constexpr std::size_t aosoaLanes = 1; template< typename ArrayExtents, typename RecordDim, mapping::Blobs Blobs, mapping::SubArrayAlignment SubArrayAlignment, typename LinearizeArrayIndexFunctor, template typename PermuteSBFields> inline constexpr std::size_t aosoaLanes< mapping:: SoA> = std::numeric_limits::max(); template< typename ArrayExtents, typename RecordDim, typename ArrayExtents::value_type Lanes, mapping::FieldAlignment FA, typename LinearizeArrayIndexFunctor, template typename PermuteFields> inline constexpr std::size_t aosoaLanes> = Lanes; } // namespace internal /// AoSoA copy strategy which transfers data in common blocks. SoA mappings are also allowed for at most 1 /// argument. /// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations. /// @param threadCount Optional. Thread count in case of multi-threaded invocation. 
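/// For example (a sketch; assumes AoSoA mappings with 8 and 4 lanes over the same record dimension):
/// \code{.cpp}
/// llama::aosoaCommonBlockCopy(srcView, dstView); // copies blocks of gcd(8, 4) = 4 elements at a time
/// \endcode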
LLAMA_EXPORT template void aosoaCommonBlockCopy( const View& srcView, View& dstView, std::size_t threadId = 0, std::size_t threadCount = 1) { static_assert( mapping::isAoSoA || mapping::isSoA, "Only AoSoA and SoA mappings allowed as source"); static_assert( mapping::isAoSoA || mapping::isSoA, "Only AoSoA and SoA mappings allowed as destination"); // TODO(bgruber): think if we can remove this restriction static_assert( std::is_same_v, "The source and destination record dimensions must be the same"); static_assert( std::is_same_v< typename SrcMapping::LinearizeArrayIndexFunctor, typename DstMapping::LinearizeArrayIndexFunctor>, "Source and destination mapping need to use the same array dimensions linearizer"); using RecordDim = typename SrcMapping::RecordDim; internal::assertTrivialCopyable(); static constexpr auto lanesSrc = internal::aosoaLanes; static constexpr auto lanesDst = internal::aosoaLanes; if(srcView.extents() != dstView.extents()) throw std::runtime_error{"Array dimensions sizes are different"}; static constexpr auto srcIsAoSoA = lanesSrc != std::numeric_limits::max(); static constexpr auto dstIsAoSoA = lanesDst != std::numeric_limits::max(); static_assert(srcIsAoSoA || dstIsAoSoA, "At least one of the mappings must be an AoSoA mapping"); static_assert(!srcIsAoSoA || SrcMapping::blobCount == 1, "Implementation assumes AoSoA with single blob"); static_assert(!dstIsAoSoA || DstMapping::blobCount == 1, "Implementation assumes AoSoA with single blob"); const auto flatSize = product(dstView.extents()); // TODO(bgruber): implement the following by adding additional copy loops for the remaining elements if(!srcIsAoSoA && flatSize % lanesDst != 0) throw std::runtime_error{"Source SoA mapping's total array elements must be evenly divisible by the " "destination AoSoA Lane count."}; if(!dstIsAoSoA && flatSize % lanesSrc != 0) throw std::runtime_error{"Destination SoA mapping's total array elements must be evenly divisible by the " "source AoSoA Lane count."}; auto mapSrc = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE { const auto [blob, off] = srcView.mapping().blobNrAndOffset(flatArrayIndex, rc); return &srcView.blobs()[blob][off]; }; auto mapDst = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE { const auto [blob, off] = dstView.mapping().blobNrAndOffset(flatArrayIndex, rc); return &dstView.blobs()[blob][off]; }; static constexpr auto l = [] { if constexpr(srcIsAoSoA && dstIsAoSoA) return std::gcd(lanesSrc, lanesDst); return std::min(lanesSrc, lanesDst); }(); if constexpr(lanesSrc < lanesDst) { static_assert(srcIsAoSoA); // optimized for linear reading const auto elementsPerThread = flatSize / lanesSrc / threadCount * lanesSrc; const auto start = threadId * elementsPerThread; const auto stop = threadId == threadCount - 1 ? 
flatSize : (threadId + 1) * elementsPerThread; static constexpr auto packed = SrcMapping::fieldAlignment == mapping::FieldAlignment::Pack; decltype(mapSrc(start, RecordCoord<>{})) src; if constexpr(packed) src = mapSrc(start, RecordCoord<>{}); for(std::size_t i = start; i < stop; i += lanesSrc) forEachLeafCoord( [&](auto rc) LLAMA_LAMBDA_INLINE { if constexpr(!packed) src = mapSrc(i, rc); for(std::size_t j = 0; j < lanesSrc; j += l) { assert(src == mapSrc(i + j, rc)); static constexpr auto bytes = l * sizeof(GetType); std::memcpy(mapDst(i + j, rc), src, bytes); src += bytes; } }); } else { static_assert(dstIsAoSoA); // optimized for linear writing const auto elementsPerThread = flatSize / lanesDst / threadCount * lanesDst; const auto start = threadId * elementsPerThread; const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread; static constexpr auto packed = DstMapping::fieldAlignment == mapping::FieldAlignment::Pack; decltype(mapDst(start, RecordCoord<>{})) dst; if constexpr(packed) dst = mapDst(start, RecordCoord<>{}); for(std::size_t i = start; i < stop; i += lanesDst) forEachLeafCoord( [&](auto rc) LLAMA_LAMBDA_INLINE { if constexpr(!packed) dst = mapDst(i, rc); for(std::size_t j = 0; j < lanesDst; j += l) { assert(dst == mapDst(i + j, rc)); constexpr auto bytes = l * sizeof(GetType); std::memcpy(dst, mapSrc(i + j, rc), bytes); dst += bytes; } }); } } /// @brief Generic implementation of \ref copy defaulting to \ref fieldWiseCopy. LLAMA provides several /// specializations of this construct for specific mappings. Users are encouraged to also specialize this template /// with better copy algorithms for further combinations of mappings, if they can and want to provide a better /// implementation. LLAMA_EXPORT template struct Copy { template void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const { fieldWiseCopy(srcView, dstView, threadId, threadCount); } }; LLAMA_EXPORT template struct Copy { template void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const { // FIXME(bgruber): need to fallback to fieldWiseCopy when elements are not trivially copyable memcpyBlobs(srcView, dstView, threadId, threadCount); } }; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, typename LinearizeArrayIndex, typename ArrayExtents::value_type LanesSrc, typename ArrayExtents::value_type LanesDst, mapping::FieldAlignment AlignSrc, mapping::FieldAlignment AlignDst, template typename PermuteFields> struct Copy< mapping::AoSoA, mapping::AoSoA, std::enable_if_t> { template void operator()( const View< mapping::AoSoA, SrcBlob>& srcView, View< mapping::AoSoA, DstBlob>& dstView, std::size_t threadId, std::size_t threadCount) { aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount); } }; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, typename LinearizeArrayIndex, template typename PermuteFields, typename ArrayExtents::value_type LanesSrc, mapping::FieldAlignment AlignSrc, mapping::Blobs DstBlobs, mapping::SubArrayAlignment DstSubArrayAlignment> struct Copy< mapping::AoSoA, mapping::SoA> { template void operator()( const View< mapping::AoSoA, SrcBlob>& srcView, View< mapping:: SoA, DstBlob>& dstView, std::size_t threadId, std::size_t threadCount) { aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount); } }; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, typename LinearizeArrayIndex, template typename 
PermuteFields, typename ArrayExtents::value_type LanesDst, mapping::FieldAlignment AlignDst, mapping::Blobs SrcBlobs, mapping::SubArrayAlignment SrcSubArrayAlignment> struct Copy< mapping::SoA, mapping::AoSoA> { template void operator()( const View< mapping:: SoA, SrcBlob>& srcView, View< mapping::AoSoA, DstBlob>& dstView, std::size_t threadId, std::size_t threadCount) { aosoaCommonBlockCopy(srcView, dstView, threadId, threadCount); } }; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, mapping::Blobs SrcBlobs, mapping::Blobs DstBlobs, mapping::SubArrayAlignment SrcSubArrayAlignment, mapping::SubArrayAlignment DstSubArrayAlignment, typename LinearizeArrayIndex, template typename PermuteFields> struct Copy< mapping::SoA, mapping::SoA, std::enable_if_t> { template void operator()( const View< mapping:: SoA, SrcBlob>& srcView, View< mapping:: SoA, DstBlob>& dstView, std::size_t threadId, std::size_t threadCount) { if(srcView.extents() != dstView.extents()) throw std::runtime_error{"Array dimensions sizes are different"}; const auto subArrayLength = product(srcView.extents()); forEachLeafCoord( [&](auto rc) LLAMA_LAMBDA_INLINE { auto subArrayStart = [&](auto& view, auto rc) LLAMA_LAMBDA_INLINE { const auto [blob, off] = view.mapping().blobNrAndOffset(0, rc); return &view.blobs()[blob][off]; }; internal::parallelMemcpy( subArrayStart(dstView, rc), subArrayStart(srcView, rc), subArrayLength * sizeof(GetType), threadId, threadCount); }); } }; /// Copy data from source to destination view. Both views need to have the same array and record /// dimensions, but may have different mappings. The blobs need to be read- and writeable. Delegates to \ref Copy /// to choose an implementation. /// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations. /// @param threadCount Optional. Thread count in case of multi-threaded invocation. LLAMA_EXPORT template void copy( const View& srcView, View& dstView, std::size_t threadId = 0, std::size_t threadCount = 1) { Copy{}(srcView, dstView, threadId, threadCount); } } // namespace llama // == // == ./include/llama/Copy.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/Proofs.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "ArrayIndexRange.hpp" // amalgamate: file already inlined // #include "Core.hpp" // amalgamate: file already inlined namespace llama { // FIXME(bgruber): this test is actually not correct, because __cpp_constexpr_dynamic_alloc only guarantees constexpr // std::allocator #ifdef __cpp_constexpr_dynamic_alloc namespace internal { template struct DynArray { constexpr DynArray() = default; constexpr explicit DynArray(std::size_t n) : data(new T[n]{}) { } DynArray(const DynArray&) = delete; DynArray(DynArray&&) = delete; auto operator=(const DynArray&) -> DynArray& = delete; auto operator=(DynArray&&) -> DynArray& = delete; constexpr ~DynArray() { delete[] data; } constexpr void resize(std::size_t n) { delete[] data; data = new T[n]{}; } T* data = nullptr; // TODO(bgruber): replace by std::unique_ptr in C++23 }; } // namespace internal /// Proofs by exhaustion of the array and record dimensions, that all values mapped to memory do not overlap. // Unfortunately, this only works for smallish array dimensions, because of compiler limits on constexpr evaluation // depth. 
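// For example (a sketch; RecordDim is a placeholder for a user-defined record dimension, and the constexpr
// evaluation requires C++20 constexpr dynamic allocation):
// \code{.cpp}
// constexpr auto m = llama::mapping::AoS<llama::ArrayExtents<int, 4>, RecordDim>{};
// static_assert(llama::mapsNonOverlappingly(m));
// \endcode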
LLAMA_EXPORT template constexpr auto mapsNonOverlappingly(const Mapping& m) -> bool { internal::DynArray> blobByteMapped(m.blobCount); for(std::size_t i = 0; i < m.blobCount; i++) blobByteMapped.data[i].resize(divCeil(m.blobSize(i), std::size_t{64})); auto testAndSet = [&](auto blob, auto offset) constexpr { const auto bit = std::uint64_t{1} << (offset % 64); if(blobByteMapped.data[blob].data[offset / 64] & bit) return true; blobByteMapped.data[blob].data[offset / 64] |= bit; return false; }; bool collision = false; forEachLeafCoord( [&](auto rc) constexpr { if(collision) return; for(auto ai : ArrayIndexRange{m.extents()}) { using Type = GetType; const auto [blob, offset] = m.blobNrAndOffset(ai, rc); for(std::size_t b = 0; b < sizeof(Type); b++) if(testAndSet(blob, offset + b)) { collision = true; break; } } }); return !collision; } #endif /// Proofs by exhaustion of the array and record dimensions, that at least PieceLength elements are always stored /// contiguously. // Unfortunately, this only works for smallish array dimensions, because of compiler limits on constexpr evaluation // depth. LLAMA_EXPORT template constexpr auto mapsPiecewiseContiguous(const Mapping& m) -> bool { bool collision = false; forEachLeafCoord( [&](auto rc) constexpr { std::size_t flatIndex = 0; std::size_t lastBlob = std::numeric_limits::max(); std::size_t lastOffset = std::numeric_limits::max(); for(auto ai : ArrayIndexRange{m.extents()}) { using Type = GetType; const auto [blob, offset] = m.blobNrAndOffset(ai, rc); if(flatIndex % PieceLength != 0 && (lastBlob != blob || lastOffset + sizeof(Type) != offset)) { collision = true; break; } lastBlob = blob; lastOffset = offset; flatIndex++; } }); return !collision; } } // namespace llama // == // == ./include/llama/Proofs.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/Vector.hpp == // == // Copyright 2021 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // ============================================================================ // == ./include/llama/RecordRef.hpp == // == // Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Concepts.hpp" // amalgamate: file already inlined // #include "ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "StructName.hpp" // amalgamate: file already inlined // #include "View.hpp" // amalgamate: file already inlined // #include "macros.hpp" // amalgamate: file already inlined #include #include // #include // amalgamate: file already included namespace llama { LLAMA_EXPORT template struct RecordRef; LLAMA_EXPORT template inline constexpr auto isRecordRef = false; LLAMA_EXPORT template inline constexpr auto isRecordRef> = true; /// Returns a \ref One with the same record dimension as the given record ref, with values copyied from rr. 
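/// For example (assuming a 1D view and a valid index i):
/// \code{.cpp}
/// auto one = llama::copyRecord(view(i)); // owning llama::One, detached from the view's storage
/// \endcode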
LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE auto copyRecord(const RecordRef& rr) { using RecordDim = typename RecordRef::AccessibleRecordDim; One temp; temp = rr; return temp; } namespace internal { template< typename Functor, typename LeftRecord, typename RightView, typename RightBoundRecordDim, bool RightOwnView> LLAMA_FN_HOST_ACC_INLINE auto recordRefArithOperator( LeftRecord& left, const RecordRef& right) -> LeftRecord& { using RightRecord = RecordRef; // if the record dimension left and right is the same, a single loop is enough and no tag check is needed. // this safes a lot of compilation time. using LARD = typename LeftRecord::AccessibleRecordDim; using RARD = typename RightRecord::AccessibleRecordDim; if constexpr(std::is_same_v) { forEachLeafCoord([&](auto rc) LLAMA_LAMBDA_INLINE { Functor{}(left(rc), right(rc)); }); } else { forEachLeafCoord( [&](auto leftRC) LLAMA_LAMBDA_INLINE { using LeftInnerCoord = decltype(leftRC); forEachLeafCoord( [&](auto rightRC) LLAMA_LAMBDA_INLINE { using RightInnerCoord = decltype(rightRC); if constexpr(hasSameTags) { Functor{}(left(leftRC), right(rightRC)); } }); }); } return left; } template LLAMA_FN_HOST_ACC_INLINE auto recordRefArithOperator(LeftRecord& left, const T& right) -> LeftRecord& { forEachLeafCoord([&](auto leftRC) LLAMA_LAMBDA_INLINE { Functor{}(left(leftRC), right); }); return left; } template< typename Functor, typename LeftRecord, typename RightView, typename RightBoundRecordDim, bool RightOwnView> LLAMA_FN_HOST_ACC_INLINE auto recordRefRelOperator( const LeftRecord& left, const RecordRef& right) -> bool { using RightRecord = RecordRef; bool result = true; // if the record dimension left and right is the same, a single loop is enough and no tag check is needed. // this safes a lot of compilation time. using LARD = typename LeftRecord::AccessibleRecordDim; using RARD = typename RightRecord::AccessibleRecordDim; if constexpr(std::is_same_v) { forEachLeafCoord([&](auto rc) LLAMA_LAMBDA_INLINE { result &= Functor{}(left(rc), right(rc)); }); } else { forEachLeafCoord( [&](auto leftRC) LLAMA_LAMBDA_INLINE { using LeftInnerCoord = decltype(leftRC); forEachLeafCoord( [&](auto rightRC) LLAMA_LAMBDA_INLINE { using RightInnerCoord = decltype(rightRC); if constexpr(hasSameTags) { result &= Functor{}(left(leftRC), right(rightRC)); } }); }); } return result; } template LLAMA_FN_HOST_ACC_INLINE auto recordRefRelOperator(const LeftRecord& left, const T& right) -> bool { bool result = true; forEachLeafCoord([&](auto leftRC) LLAMA_LAMBDA_INLINE { result &= Functor{}(left(leftRC), right); }); return result; } struct Assign { template LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) { return std::forward
(a) = b; } }; struct PlusAssign { template LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) { return std::forward(a) += b; } }; struct MinusAssign { template LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) { return std::forward(a) -= b; } }; struct MultiplyAssign { template LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) { return std::forward(a) *= b; } }; struct DivideAssign { template LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) { return std::forward(a) /= b; } }; struct ModuloAssign { template LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) { return std::forward(a) %= b; } }; template< typename ProxyReference, typename T, std::enable_if_t>, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(ProxyReference&& leaf, T) -> ProxyReference { return leaf; } template< typename TWithOptionalConst, typename T, std::enable_if_t>, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(TWithOptionalConst& leaf, T) -> std::reference_wrapper { return leaf; } template LLAMA_FN_HOST_ACC_INLINE auto asTupleImplForArray(RecordRef&& vd, T (&&)[N], std::index_sequence) { return std::make_tuple(asTupleImpl(vd(RecordCoord{}), T{})...); } template LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(RecordRef&& vd, T (&&a)[N]) { return asTupleImplForArray(std::forward(vd), std::move(a), std::make_index_sequence{}); } template LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(RecordRef&& vd, Record) { return std::make_tuple(asTupleImpl(vd(GetFieldTag{}), GetFieldType{})...); } template< typename ProxyReference, typename T, std::enable_if_t>, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(ProxyReference&& leaf, T) -> std::tuple { static_assert(!std::is_reference_v); return {std::move(leaf)}; // NOLINT(bugprone-move-forwarding-reference) } template< typename TWithOptionalConst, typename T, std::enable_if_t>, int> = 0> LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(TWithOptionalConst& leaf, T) -> std::tuple { return {leaf}; } template LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImplForArray(RecordRef&& vd, T (&&)[N], std::index_sequence) { return std::tuple_cat(asFlatTupleImpl(vd(RecordCoord{}), T{})...); } template LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(RecordRef&& vd, T (&&a)[N]) { return asFlatTupleImplForArray(std::forward(vd), std::move(a), std::make_index_sequence{}); } template LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(RecordRef&& vd, Record) { return std::tuple_cat(asFlatTupleImpl(vd(GetFieldTag{}), GetFieldType{})...); } template inline constexpr auto isTupleLike = false; // get(t) and std::tuple_size must be available using std::get; // make sure a get<0>() can be found, so the compiler can compile the trait template inline constexpr auto isTupleLike(std::declval())), std::tuple_size>> = true; template inline constexpr auto dependentFalse = false; template LLAMA_FN_HOST_ACC_INLINE void assignTuples(Tuple1&& dst, Tuple2&& src, std::index_sequence); template LLAMA_FN_HOST_ACC_INLINE void assignTupleElement(T1&& dst, T2&& src) { if constexpr(isTupleLike> && isTupleLike>) { static_assert(std::tuple_size_v> == std::tuple_size_v>); assignTuples(dst, src, std::make_index_sequence>>{}); } else if constexpr(!isTupleLike> && !isTupleLike>) std::forward(dst) = std::forward(src); else static_assert( dependentFalse, "Elements to assign are not tuple/tuple or non-tuple/non-tuple."); } template LLAMA_FN_HOST_ACC_INLINE void assignTuples(Tuple1&& 
dst, Tuple2&& src, std::index_sequence) { static_assert(std::tuple_size_v> == std::tuple_size_v>); using std::get; (assignTupleElement(get(std::forward(dst)), get(std::forward(src))), ...); } template LLAMA_FN_HOST_ACC_INLINE auto makeFromTuple(Tuple&& src, std::index_sequence) { using std::get; return T{get(src)...}; // no forward of src, since we call get multiple times on it } template inline constexpr auto isDirectListInitializableImpl = false; template inline constexpr auto isDirectListInitializableImpl()...})>, Args...> = true; template inline constexpr auto isDirectListInitializable = isDirectListInitializableImpl; template inline constexpr auto isDirectListInitializableFromTuple = false; template typename Tuple, typename... Args> inline constexpr auto isDirectListInitializableFromTuple> = isDirectListInitializable; template LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC); template LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC); } // namespace internal /// Record reference type returned by \ref View after resolving an array dimensions coordinate or partially /// resolving a \ref RecordCoord. A record reference does not hold data itself, it just binds enough information /// (array dimensions coord and partial record coord) to retrieve it later from a \ref View. Records references /// should not be created by the user. They are returned from various access functions in \ref View and RecordRef /// itself. LLAMA_EXPORT template struct RecordRef : private TView::Mapping::ArrayExtents::Index { using View = TView; ///< View this record reference points into. using BoundRecordCoord = TBoundRecordCoord; ///< Record coords into View::RecordDim which are already bound by this RecordRef. private: using ArrayIndex = typename View::Mapping::ArrayExtents::Index; using RecordDim = typename View::Mapping::RecordDim; std::conditional_t view; public: /// Subtree of the record dimension of View starting at BoundRecordCoord. If BoundRecordCoord is /// `RecordCoord<>` (default) AccessibleRecordDim is the same as `Mapping::RecordDim`. using AccessibleRecordDim = GetType; /// Creates an empty RecordRef. Only available for if the view is owned. Used by llama::One. LLAMA_FN_HOST_ACC_INLINE RecordRef() /* requires(OwnView) */ : ArrayIndex{} , view{allocScalarView<0, RecordDim>()} { static_assert(OwnView, "The default constructor of RecordRef is only available if it owns the view."); } LLAMA_FN_HOST_ACC_INLINE RecordRef(ArrayIndex ai, std::conditional_t view) : ArrayIndex{ai} , view{static_cast(view)} { } RecordRef(const RecordRef&) = default; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FN_HOST_ACC_INLINE auto operator=(const RecordRef& other) -> RecordRef& { // NOLINTNEXTLINE(cppcoreguidelines-c-copy-assignment-signature,misc-unconventional-assign-operator) return this->operator=(other); } RecordRef(RecordRef&&) noexcept = default; auto operator=(RecordRef&&) noexcept -> RecordRef& = default; ~RecordRef() = default; LLAMA_FN_HOST_ACC_INLINE constexpr auto arrayIndex() const -> ArrayIndex { return static_cast(*this); } /// Create a RecordRef from a different RecordRef. Only available for if the view is owned. Used by /// llama::One. 
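        /// For example (RecordDim is a placeholder for the view's record dimension):
        /// \code{.cpp}
        /// llama::One<RecordDim> one = view(i); // copies the referenced record into an owning One
        /// \endcode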
template // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE RecordRef(const RecordRef& recordRef) /* requires(OwnView) */ : RecordRef() { static_assert( OwnView, "The copy constructor of RecordRef from a different RecordRef is only available if it owns " "the " "view."); *this = recordRef; } // TODO(bgruber): unify with previous in C++20 and use explicit(cond) /// Create a RecordRef from a scalar. Only available for if the view is owned. Used by llama::One. template>> LLAMA_FN_HOST_ACC_INLINE explicit RecordRef(const T& scalar) /* requires(OwnView) */ : RecordRef() { static_assert( OwnView, "The constructor of RecordRef from a scalar is only available if it owns the view."); *this = scalar; } /// Access a record in the record dimension underneath the current record reference using a \ref RecordCoord. /// If the access resolves to a leaf, an l-value reference to a variable inside the \ref View storage is /// returned, otherwise another RecordRef. template LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord) const -> decltype(auto) { using AbsolutCoord = Cat>; using AccessedType = GetType; if constexpr(isRecordDim) return RecordRef{arrayIndex(), this->view}; else return this->view.access(arrayIndex(), AbsolutCoord{}); } // FIXME(bgruber): remove redundancy template LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord) -> decltype(auto) { using AbsolutCoord = Cat>; using AccessedType = GetType; if constexpr(isRecordDim) return RecordRef{arrayIndex(), this->view}; else return this->view.access(arrayIndex(), AbsolutCoord{}); } /// Access a record in the record dimension underneath the current record reference using a series of tags. If /// the access resolves to a leaf, an l-value reference to a variable inside the \ref View storage is returned, /// otherwise another RecordRef. template LLAMA_FN_HOST_ACC_INLINE auto operator()(Tags...) const -> decltype(auto) { using RecordCoord = GetCoordFromTags; return operator()(RecordCoord{}); } // FIXME(bgruber): remove redundancy template LLAMA_FN_HOST_ACC_INLINE auto operator()(Tags...) 
-> decltype(auto) { using RecordCoord = GetCoordFromTags; return operator()(RecordCoord{}); } #ifdef LLAMA_HAS_STRING_FIELDS /// Experimental template LLAMA_FN_HOST_ACC_INLINE auto at() const -> decltype(auto) { using RecordCoord = GetCoordFromTags>; return operator()(RecordCoord{}); } // FIXME(bgruber): remove redundancy /// Experimental template LLAMA_FN_HOST_ACC_INLINE auto at() -> decltype(auto) { using RecordCoord = GetCoordFromTags>; return operator()(RecordCoord{}); } #endif template LLAMA_FN_HOST_ACC_INLINE auto operator=(const T& other) -> RecordRef& { // NOLINTNEXTLINE(cppcoreguidelines-c-copy-assignment-signature,misc-unconventional-assign-operator) return internal::recordRefArithOperator(*this, other); } template LLAMA_FN_HOST_ACC_INLINE auto operator+=(const T& other) -> RecordRef& { return internal::recordRefArithOperator(*this, other); } template LLAMA_FN_HOST_ACC_INLINE auto operator-=(const T& other) -> RecordRef& { return internal::recordRefArithOperator(*this, other); } template LLAMA_FN_HOST_ACC_INLINE auto operator*=(const T& other) -> RecordRef& { return internal::recordRefArithOperator(*this, other); } template LLAMA_FN_HOST_ACC_INLINE auto operator/=(const T& other) -> RecordRef& { return internal::recordRefArithOperator(*this, other); } template LLAMA_FN_HOST_ACC_INLINE auto operator%=(const T& other) -> RecordRef& { return internal::recordRefArithOperator(*this, other); } template LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const RecordRef& vd, const T& t) { return copyRecord(vd) += t; } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const T& t, const RecordRef& vd) { return vd + t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator-(const RecordRef& vd, const T& t) { return copyRecord(vd) -= t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator*(const RecordRef& vd, const T& t) { return copyRecord(vd) *= t; } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator*(const T& t, const RecordRef& vd) { return vd * t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator/(const RecordRef& vd, const T& t) { return copyRecord(vd) /= t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator%(const RecordRef& vd, const T& t) { return copyRecord(vd) %= t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const RecordRef& vd, const T& t) -> bool { return internal::recordRefRelOperator>(vd, t); } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const T& t, const RecordRef& vd) -> bool { return vd == t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const RecordRef& vd, const T& t) -> bool { return !(vd == t); } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const T& t, const RecordRef& vd) -> bool { return !(t == vd); } template LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const RecordRef& vd, const T& t) -> bool { return internal::recordRefRelOperator>(vd, t); } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const T& t, const RecordRef& vd) -> bool { return vd > t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const RecordRef& vd, const T& t) -> bool { return internal::recordRefRelOperator>(vd, t); } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const T& t, const RecordRef& vd) -> bool { return vd >= t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const RecordRef& vd, const T& t) -> bool { return internal::recordRefRelOperator>(vd, t); } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const T& t, const RecordRef& vd) -> bool { return vd < 
t; } template LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const RecordRef& vd, const T& t) -> bool { return internal::recordRefRelOperator>(vd, t); } template>> LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const T& t, const RecordRef& vd) -> bool { return vd <= t; } LLAMA_FN_HOST_ACC_INLINE auto asTuple() { return internal::asTupleImpl(*this, AccessibleRecordDim{}); } LLAMA_FN_HOST_ACC_INLINE auto asTuple() const { return internal::asTupleImpl(*this, AccessibleRecordDim{}); } LLAMA_FN_HOST_ACC_INLINE auto asFlatTuple() { return internal::asFlatTupleImpl(*this, AccessibleRecordDim{}); } LLAMA_FN_HOST_ACC_INLINE auto asFlatTuple() const { return internal::asFlatTupleImpl(*this, AccessibleRecordDim{}); } template LLAMA_FN_HOST_ACC_INLINE auto get() -> decltype(auto) { return operator()(RecordCoord{}); } template LLAMA_FN_HOST_ACC_INLINE auto get() const -> decltype(auto) { return operator()(RecordCoord{}); } template LLAMA_FN_HOST_ACC_INLINE auto loadAs() -> TupleLike { static_assert( internal::isDirectListInitializableFromTuple, "TupleLike must be constructible from as many values as this RecordRef recursively represents " "like " "this: TupleLike{values...}"); return internal::makeFromTuple( asFlatTuple(), std::make_index_sequence>{}); } template LLAMA_FN_HOST_ACC_INLINE auto loadAs() const -> TupleLike { static_assert( internal::isDirectListInitializableFromTuple, "TupleLike must be constructible from as many values as this RecordRef recursively represents " "like " "this: TupleLike{values...}"); return internal::makeFromTuple( asFlatTuple(), std::make_index_sequence>{}); } struct Loader { RecordRef& vd; template // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE operator T() { return vd.loadAs(); } }; struct LoaderConst { const RecordRef& vd; template // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE operator T() const { return vd.loadAs(); } }; LLAMA_FN_HOST_ACC_INLINE auto load() -> Loader { return {*this}; } LLAMA_FN_HOST_ACC_INLINE auto load() const -> LoaderConst { return {*this}; } template LLAMA_FN_HOST_ACC_INLINE void store(const TupleLike& t) { internal::assignTuples(asTuple(), t, std::make_index_sequence>{}); } // swap for equal RecordRef LLAMA_FN_HOST_ACC_INLINE friend void swap( std::conditional_t a, std::conditional_t b) noexcept { forEachLeafCoord( [&](auto rc) LLAMA_LAMBDA_INLINE { using std::swap; LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING // FIXME(bgruber): swap is constexpr in C++20, so nvcc rightfully complains that we call a __host__ // function here. But we must call ADL swap, so we can pick up any swap() for any user defined type // in the record dimension. Let's see if this ever hits us. Moving to C++20 will solve it. swap(a(rc), b(rc)); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING }); } // FIXME(bgruber): the SIMD load/store functions need to navigate back from a record ref to the contained view // to find subsequent elements. This is not a great design for now and the SIMD load/store functions should // probably take iterators to records. 
template friend LLAMA_FN_HOST_ACC_INLINE void internal::loadSimdFromField( const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC); template friend LLAMA_FN_HOST_ACC_INLINE void internal::storeSimdToField( const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC); }; // swap for heterogeneous RecordRef LLAMA_EXPORT template< typename ViewA, typename BoundRecordDimA, bool OwnViewA, typename ViewB, typename BoundRecordDimB, bool OwnViewB> LLAMA_FN_HOST_ACC_INLINE auto swap( RecordRef& a, RecordRef& b) noexcept -> std::enable_if_t::AccessibleRecordDim, typename RecordRef::AccessibleRecordDim>> { using LeftRecord = RecordRef; forEachLeafCoord( [&](auto rc) LLAMA_LAMBDA_INLINE { using std::swap; swap(a(rc), b(rc)); }); } LLAMA_EXPORT template auto operator<<(std::ostream& os, const RecordRef& vr) -> std::ostream& { using RecordDim = typename RecordRef::AccessibleRecordDim; os << "{"; if constexpr(std::is_array_v) { mp_for_each_inline>>( [&](auto ic) { constexpr std::size_t i = decltype(ic)::value; if(i > 0) os << ", "; os << '[' << i << ']' << ": " << vr(RecordCoord{}); }); } else { mp_for_each_inline>>( [&](auto ic) { constexpr std::size_t i = decltype(ic)::value; if(i > 0) os << ", "; using Field = mp_at_c; os << structName>() << ": " << vr(RecordCoord{}); }); } os << "}"; return os; } LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeaf(RecordRefFwd&& vr, Functor&& functor) { using RecordRef = std::remove_reference_t; forEachLeafCoord( [functor = std::forward(functor), &vr = vr](auto rc) LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr mutable) { std::forward(functor)(vr(rc)); }); } namespace internal { // gets the value type for a given T, where T models either a value, an l-value reference, a proxy reference or // a RecordRef. template struct ValueOf { using type = T; }; template struct ValueOf>> { using type = One; }; #ifdef __cpp_lib_concepts template #else template #endif struct ValueOf>> { using type = typename T::value_type; }; template struct ValueOf { using type = std::remove_const_t; }; } // namespace internal /// Pulls a copy of the given value or reference. Proxy references are resolved to their value types. LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE auto decayCopy(T&& valueOrRef) -> typename internal::ValueOf::type { return std::forward(valueOrRef); } /// Scope guard type. ScopedUpdate takes a copy of a value through a reference and stores it internally during /// construction. The stored value is written back when ScopedUpdate is destroyed. ScopedUpdate tries to act like /// the stored value as much as possible, exposing member functions of the stored value and acting like a proxy /// reference if the stored value is a primitive type. LLAMA_EXPORT template struct ScopedUpdate : internal::ValueOf::type { using value_type = typename internal::ValueOf::type; /// Loads a copy of the value referenced by r. Stores r and the loaded value. LLAMA_FN_HOST_ACC_INLINE explicit ScopedUpdate(Reference r) : value_type(r), ref(r) { } ScopedUpdate(const ScopedUpdate&) = delete; auto operator=(const ScopedUpdate&) -> ScopedUpdate& = delete; ScopedUpdate(ScopedUpdate&&) noexcept = default; auto operator=(ScopedUpdate&&) noexcept -> ScopedUpdate& = default; using value_type::operator=; /// Stores the internally stored value back to the referenced value. LLAMA_FN_HOST_ACC_INLINE ~ScopedUpdate() { ref = static_cast(*this); } /// Get access to the stored value. LLAMA_FN_HOST_ACC_INLINE auto get() -> value_type& { return *this; } /// Get access to the stored value. 
LLAMA_FN_HOST_ACC_INLINE auto get() const -> const value_type& { return *this; } private: Reference ref; }; LLAMA_EXPORT template struct ScopedUpdate< Reference, std::enable_if_t::type>>> : ProxyRefOpMixin, typename internal::ValueOf::type> { using value_type = typename internal::ValueOf::type; LLAMA_FN_HOST_ACC_INLINE explicit ScopedUpdate(Reference r) : value(r), ref(r) { } ScopedUpdate(const ScopedUpdate&) = delete; auto operator=(const ScopedUpdate&) -> ScopedUpdate& = delete; ScopedUpdate(ScopedUpdate&&) noexcept = default; auto operator=(ScopedUpdate&&) noexcept -> ScopedUpdate& = default; LLAMA_FN_HOST_ACC_INLINE auto get() -> value_type& { return value; } LLAMA_FN_HOST_ACC_INLINE auto get() const -> const value_type& { return value; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE operator const value_type&() const { return value; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE operator value_type&() { return value; } LLAMA_FN_HOST_ACC_INLINE auto operator=(value_type v) -> ScopedUpdate& { value = v; return *this; } LLAMA_FN_HOST_ACC_INLINE ~ScopedUpdate() { ref = value; } private: value_type value; Reference ref; }; namespace internal { template struct ReferenceTo { using type = T&; }; template struct ReferenceTo && !isOne>> { using type = T; }; #ifdef __cpp_lib_concepts template #else template #endif struct ReferenceTo>> { using type = T; }; } // namespace internal LLAMA_EXPORT template ScopedUpdate(T) -> ScopedUpdate>::type>; } // namespace llama LLAMA_EXPORT template struct std::tuple_size> // NOLINT(cert-dcl58-cpp) : boost::mp11::mp_size::AccessibleRecordDim> { }; LLAMA_EXPORT template struct std::tuple_element> // NOLINT(cert-dcl58-cpp) { using type = decltype(std::declval>().template get()); }; LLAMA_EXPORT template struct std::tuple_element> // NOLINT(cert-dcl58-cpp) { using type = decltype(std::declval>().template get()); }; LLAMA_EXPORT template struct std::hash> // NOLINT(cert-dcl58-cpp) { LLAMA_FN_HOST_ACC_INLINE auto operator()(const llama::RecordRef& rr) const -> std::size_t { std::size_t acc = 0; llama::forEachLeaf( rr, [&](auto&& ref) LLAMA_LAMBDA_INLINE { boost::hash_combine(acc, llama::decayCopy(ref)); }); return acc; } }; #if CAN_USE_RANGES LLAMA_EXPORT template< typename ViewA, typename BoundA, bool OwnA, typename ViewB, typename BoundB, bool OwnB, template class TQual, template class UQual> struct std:: // NOLINTNEXTLINE(cert-dcl58-cpp) basic_common_reference, llama::RecordRef, TQual, UQual> { using type = std::enable_if_t< std::is_same_v< typename llama::RecordRef::AccessibleRecordDim, typename llama::RecordRef::AccessibleRecordDim>, llama::One>; }; #endif // == // == ./include/llama/RecordRef.hpp == // ============================================================================ // #include "View.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included // #include // amalgamate: file already included // #include // amalgamate: file already included namespace llama { // TODO(bgruber): expose blob allocator /// An equivalent of std::vector backed by a \ref View. Elements are never value initialized though. No strong /// exception guarantee. /// WARNING: This class is experimental. /// @tparam Mapping The mapping to be used for the underlying view. Needs to have 1 array dimension. 
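    /// Example (a sketch; `Particle` stands for an assumed record dimension):
    /// \code{.cpp}
    /// using Mapping = llama::mapping::AoS<llama::ArrayExtentsDynamic<std::size_t, 1>, Particle>;
    /// llama::Vector<Mapping> particles(100); // 100 records, copied from a default llama::One<Particle>
    /// particles.push_back(llama::One<Particle>{}); // grows the underlying view as needed
    /// \endcode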
LLAMA_EXPORT template struct Vector { static_assert(Mapping::ArrayExtents::rank == 1, "llama::Vector only supports 1D mappings"); using ViewType = decltype(allocViewUninitialized()); using RecordDim = typename Mapping::RecordDim; using iterator = decltype(std::declval().begin()); using value_type = typename iterator::value_type; using size_type = typename Mapping::ArrayExtents::value_type; Vector() = default; template> LLAMA_FN_HOST_ACC_INLINE explicit Vector(size_type count, const RecordRef& value = {}) { reserve(count); for(size_type i = 0; i < count; i++) push_back(value); } template LLAMA_FN_HOST_ACC_INLINE Vector(Iterator first, Iterator last) { if constexpr(std::is_same_v< typename std::iterator_traits::iterator_category, std::random_access_iterator_tag>) reserve(std::distance(first, last)); for(; first != last; ++first) push_back(*first); } Vector(const Vector& other) = default; LLAMA_FN_HOST_ACC_INLINE Vector(Vector&& other) noexcept { swap(other); } auto operator=(const Vector& other) -> Vector& = default; LLAMA_FN_HOST_ACC_INLINE auto operator=(Vector&& other) noexcept -> Vector& { swap(other); return *this; } ~Vector() = default; // TODO(bgruber): assign private: LLAMA_FN_HOST_ACC_INLINE void outOfRange([[maybe_unused]] size_type i) const { #if __CUDA_ARCH__ assert(false && "Index out of range"); #else throw std::out_of_range{"Index " + std::to_string(i) + "out of range [0:" + std::to_string(m_size) + "["}; #endif } public: LLAMA_FN_HOST_ACC_INLINE auto at(size_type i) -> decltype(auto) { if(i >= m_size) outOfRange(i); return m_view(i); } LLAMA_FN_HOST_ACC_INLINE auto at(size_type i) const -> decltype(auto) { if(i >= m_size) outOfRange(i); return m_view(i); } LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type i) -> decltype(auto) { return m_view(i); } LLAMA_FN_HOST_ACC_INLINE auto operator[](size_type i) const -> decltype(auto) { return m_view(i); } LLAMA_FN_HOST_ACC_INLINE auto front() -> decltype(auto) { return m_view(0); } LLAMA_FN_HOST_ACC_INLINE auto front() const -> decltype(auto) { return m_view(0); } LLAMA_FN_HOST_ACC_INLINE auto back() -> decltype(auto) { return m_view(m_size - 1); } LLAMA_FN_HOST_ACC_INLINE auto back() const -> decltype(auto) { return m_view(m_size - 1); } LLAMA_FN_HOST_ACC_INLINE auto begin() -> decltype(auto) { return m_view.begin(); } LLAMA_FN_HOST_ACC_INLINE auto begin() const -> decltype(auto) { return m_view.begin(); } LLAMA_FN_HOST_ACC_INLINE auto cbegin() -> decltype(auto) { return std::as_const(m_view).begin(); } LLAMA_FN_HOST_ACC_INLINE auto cbegin() const -> decltype(auto) { return m_view.begin(); } LLAMA_FN_HOST_ACC_INLINE auto end() -> decltype(auto) { return m_view.begin() + m_size; } LLAMA_FN_HOST_ACC_INLINE auto end() const -> decltype(auto) { return m_view.begin() + m_size; } LLAMA_FN_HOST_ACC_INLINE auto cend() -> decltype(auto) { return std::as_const(m_view).begin() + m_size; } LLAMA_FN_HOST_ACC_INLINE auto cend() const -> decltype(auto) { return m_view.begin() + m_size; } LLAMA_FN_HOST_ACC_INLINE auto empty() const -> bool { return m_size == 0; } LLAMA_FN_HOST_ACC_INLINE auto size() const -> size_type { return m_size; } LLAMA_FN_HOST_ACC_INLINE void reserve(size_type cap) { if(cap > capacity()) changeCapacity(cap); } LLAMA_FN_HOST_ACC_INLINE auto capacity() const -> size_type { return m_view.extents()[0]; } // NOLINTNEXTLINE(readability-identifier-naming) LLAMA_FN_HOST_ACC_INLINE void shrink_to_fit() { changeCapacity(m_size); } LLAMA_FN_HOST_ACC_INLINE void clear() { m_size = 0; } template LLAMA_FN_HOST_ACC_INLINE auto 
insert(iterator pos, T&& t) -> iterator { const auto i = pos - begin(); reserve(m_size + 1); // might invalidate pos pos = begin() + i; std::copy_backward(pos, end(), end() + 1); m_view[i] = std::forward(t); m_size++; return pos; } // TODO(bgruber): more insert overloads // TODO(bgruber): emplace LLAMA_FN_HOST_ACC_INLINE auto erase(iterator pos) -> iterator { std::copy(pos + 1, end(), pos); m_size--; return pos; } // TODO(bgruber): more erase overloads // TODO(bgruber): T here is probably a RecordRef. We could also allow any struct that is storable to the // view via RecordRef::store(). template // NOLINTNEXTLINE(readability-identifier-naming) LLAMA_FN_HOST_ACC_INLINE void push_back(T&& t) { if(const auto cap = capacity(); m_size == cap) reserve(std::max(cap + cap / 2, m_size + 1)); m_view[m_size++] = std::forward(t); } // TODO(bgruber): emplace_back // NOLINTNEXTLINE(readability-identifier-naming) LLAMA_FN_HOST_ACC_INLINE void pop_back() { m_size--; } template> LLAMA_FN_HOST_ACC_INLINE void resize(size_type count, const RecordRef& value = {}) { reserve(count); for(size_type i = m_size; i < count; i++) m_view[i] = value; m_size = count; } LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const Vector& a, const Vector& b) -> bool { if(a.m_size != b.m_size) return false; return std::equal(a.begin(), a.end(), b.begin()); } LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const Vector& a, const Vector& b) -> bool { return !(a == b); } LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const Vector& a, const Vector& b) -> bool { return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end()); } LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const Vector& a, const Vector& b) -> bool { return !(b < a); } LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const Vector& a, const Vector& b) -> bool { return b < a; } LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const Vector& a, const Vector& b) -> bool { return !(a < b); } LLAMA_FN_HOST_ACC_INLINE friend void swap(Vector& a, Vector& b) noexcept { a.swap(b); } private: LLAMA_FN_HOST_ACC_INLINE void changeCapacity(size_type cap) { auto newView = allocViewUninitialized(Mapping{typename Mapping::ArrayExtents{cap}}); auto b = begin(); std::copy(begin(), b + std::min(m_size, cap), newView.begin()); using std::swap; swap(m_view, newView); // depends on move semantic of View } LLAMA_FN_HOST_ACC_INLINE void swap(Vector& other) noexcept { using std::swap; swap(m_view, other.m_view); // depends on move semantic of View swap(m_size, other.m_size); } ViewType m_view = {}; size_type m_size = 0; }; } // namespace llama // == // == ./include/llama/Vector.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/Byteswap.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "../Core.hpp" // amalgamate: file already inlined // #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "Common.hpp" // amalgamate: file already inlined // ============================================================================ // == ./include/llama/mapping/Projection.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "../View.hpp" // amalgamate: file already inlined // #include "Common.hpp" // amalgamate: file already 
inlined namespace llama::mapping { namespace internal { template struct UnaryFunctionTraits { static_assert(sizeof(F) == 0, "F is not an unary function"); }; template struct UnaryFunctionTraits { using ArgumentType = Arg; using ReturnType = Ret; }; template auto projectionOrVoidImpl() { if constexpr(mp_map_contains::value) return mp_identity>>{}; else if constexpr(mp_map_contains::value) return mp_identity>>{}; else return mp_identity{}; } template using ProjectionOrVoid = typename decltype(projectionOrVoidImpl())::type; template struct MakeReplacerProj { template static auto replacedTypeProj() { using Projection = ProjectionOrVoid; if constexpr(std::is_void_v) return mp_identity{}; else { using LoadFunc = UnaryFunctionTraits; using StoreFunc = UnaryFunctionTraits; static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); return mp_identity{}; } } template using fn = typename decltype(replacedTypeProj())::type; }; template using ReplaceTypesByProjectionResults = TransformLeavesWithCoord::template fn>; template // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) struct ProjectionReference : ProxyRefOpMixin< ProjectionReference, decltype(Projection::load(std::declval()))> { private: Reference storageRef; public: using value_type = decltype(Projection::load(std::declval())); LLAMA_FN_HOST_ACC_INLINE constexpr explicit ProjectionReference(Reference storageRef) : storageRef{storageRef} { } ProjectionReference(const ProjectionReference&) = default; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(const ProjectionReference& other) -> ProjectionReference& { *this = static_cast(other); return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE constexpr operator value_type() const { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return Projection::load(storageRef); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(value_type v) -> ProjectionReference& { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING storageRef = Projection::store(v); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING return *this; } }; } // namespace internal /// Mapping that projects types in the record domain to different types. Projections are executed during load and /// store. /// @tparam TProjectionMap A type list of binary type lists (a map) specifing a projection (map value) for a type /// or the type at a \ref RecordCoord (map key). 
    /// A projection is a type with two functions:
    /// struct Proj {
    ///     static auto load(auto&& fromMem);
    ///     static auto store(auto&& toMem);
    /// };
    LLAMA_EXPORT
    template<
        typename TArrayExtents,
        typename TRecordDim,
        template typename InnerMapping,
        typename TProjectionMap>
    struct Projection : private InnerMapping>
    {
        using Inner = InnerMapping>;
        using ProjectionMap = TProjectionMap;
        using ArrayExtents = typename Inner::ArrayExtents;
        using RecordDim = TRecordDim; // hide Inner::RecordDim
        using Inner::blobCount;
        using Inner::blobSize;
        using Inner::extents;
        using Inner::Inner;

    protected:
        using ArrayIndex = typename ArrayExtents::Index;

    public:
        template LLAMA_FN_HOST_ACC_INLINE static constexpr auto isComputed(RecordCoord) -> bool
        {
            return !std::is_void_v<internal::ProjectionOrVoid>>;
        }

        template
        LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(ArrayIndex ai, RecordCoord rc, BlobArray& blobs) const
        {
            static_assert(isComputed(rc));
            using RecordDimType = GetType>;
            using Reference = decltype(mapToMemory(static_cast(*this), ai, rc, blobs));
            using Projection = internal::ProjectionOrVoid, RecordDimType>;
            static_assert(!std::is_void_v);
            Reference r = mapToMemory(static_cast(*this), ai, rc, blobs);
            LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
            return internal::ProjectionReference{r};
            LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
        }

        template
        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord rc = {}) const
            -> NrAndOffset
        {
            static_assert(!isComputed(rc));
            return Inner::blobNrAndOffset(ai, rc);
        }
    };

    /// Binds parameters to a \ref Projection mapping except for array and record dimension, producing a quoted
    /// meta function accepting the latter two. Useful to prepare this mapping for a meta mapping.
    LLAMA_EXPORT
    template typename InnerMapping, typename ProjectionMap>
    struct BindProjection
    {
        template using fn = Projection;
    };

    LLAMA_EXPORT
    template inline constexpr bool isProjection = false;

    LLAMA_EXPORT
    template<
        typename TArrayExtents,
        typename TRecordDim,
        template typename InnerMapping,
        typename ReplacementMap>
    inline constexpr bool isProjection> = true;
} // namespace llama::mapping
// ==
// == ./include/llama/mapping/Projection.hpp ==
// ============================================================================

namespace llama::mapping
{
    namespace internal
    {
        // TODO(bgruber): replace by std::byteswap in C++23
        template<typename T>
        LLAMA_FN_HOST_ACC_INLINE auto byteswap(T t) -> T
        {
            if constexpr(sizeof(T) == 1)
                return t;
            else
            {
                llama::Array<std::byte, sizeof(T)> arr{};
                std::memcpy(&arr, &t, sizeof(T));
                for(std::size_t i = 0; i < sizeof(T) / 2; i++)
                {
                    const auto a = arr[i];
                    const auto b = arr[sizeof(T) - 1 - i];
                    arr[i] = b;
                    arr[sizeof(T) - 1 - i] = a;
                }
                std::memcpy(&t, &arr, sizeof(T));
                return t;
            }
        }

        template<typename T>
        struct ByteswapProjection
        {
            LLAMA_FN_HOST_ACC_INLINE static auto load(T v) -> T
            {
                return byteswap(v);
            }

            LLAMA_FN_HOST_ACC_INLINE static auto store(T v) -> T
            {
                return byteswap(v);
            }
        };

        template using MakeByteswapProjectionPair = mp_list>;

        template using MakeByteswapProjectionMap = mp_transform>>;
    } // namespace internal

    /// Mapping that swaps the byte order of all values when loading/storing.
    LLAMA_EXPORT
    template typename InnerMapping>
    struct Byteswap : Projection>
    {
    private:
        using Base = Projection>;

    public:
        using Base::Base;
    };

    /// Binds parameters to a \ref Byteswap mapping except for array and record dimension, producing a quoted
    /// meta function accepting the latter two. Useful to prepare this mapping for a meta mapping.
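    /// Example (a sketch; `RecordDim` stands for an assumed record dimension and BindAoS for the corresponding
    /// binder of the AoS mapping from this library): \code{.cpp}
    /// using BoundByteswap = llama::mapping::BindByteswap<llama::mapping::BindAoS<>::fn>;
    /// using Mapping = BoundByteswap::fn<llama::ArrayExtentsDynamic<std::size_t, 1>, RecordDim>;
    /// \endcode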
    LLAMA_EXPORT
    template typename InnerMapping>
    struct BindByteswap
    {
        template using fn = Byteswap;
    };

    LLAMA_EXPORT
    template inline constexpr bool isByteswap = false;

    LLAMA_EXPORT
    template typename InnerMapping>
    inline constexpr bool isByteswap> = true;
} // namespace llama::mapping
// ==
// == ./include/llama/mapping/Byteswap.hpp ==
// ============================================================================

// ============================================================================
// == ./include/llama/mapping/Heatmap.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//

#pragma once
// #include "../View.hpp" // amalgamate: file already inlined
// #include "Common.hpp" // amalgamate: file already inlined
// #include // amalgamate: file already included
// #include // amalgamate: file already included

#include

// #include // amalgamate: file already included

#if __has_include()
#    include
#endif

namespace llama::mapping
{
    /// Forwards all calls to the inner mapping. Counts all accesses made to blocks inside the blobs, allowing one to
    /// extract a heatmap.
    /// @tparam Mapping The type of the inner mapping.
    /// @tparam Granularity The granularity in bytes at which to count accesses. A value of 1 counts every byte
    /// individually. A value of e.g. 64 counts accesses per 64-byte block.
    /// @tparam TCountType Data type used to count the number of accesses. Atomic increments must be supported for this
    /// type.
    LLAMA_EXPORT
    template<
        typename Mapping,
        typename Mapping::ArrayExtents::value_type Granularity = 1,
        typename TCountType = std::size_t>
    struct Heatmap : private Mapping
    {
        static_assert(!hasAnyComputedField<Mapping>, "Heatmaps for computed mappings are not implemented.");

    private:
        using size_type = typename Mapping::ArrayExtents::value_type;
        using ArrayIndex = typename Mapping::ArrayExtents::Index;

    public:
        using Inner = Mapping;
        inline static constexpr std::size_t granularity = Granularity;
        using CountType = TCountType;
        using ArrayExtents = typename Mapping::ArrayExtents;
        using RecordDim = typename Mapping::RecordDim;

        // We duplicate every blob of the inner mapping with a shadow blob, where we count the accesses
        inline static constexpr std::size_t blobCount = Mapping::blobCount * 2;

        constexpr Heatmap() = default;

        LLAMA_FN_HOST_ACC_INLINE explicit Heatmap(Mapping mapping) : Mapping(std::move(mapping))
        {
        }

        template<typename... Args>
        LLAMA_FN_HOST_ACC_INLINE explicit Heatmap(Args&&... innerArgs) : Mapping(std::forward<Args>(innerArgs)...)
{ } #if defined(__cpp_lib_concepts) && defined(__NVCOMPILER) // nvc++ fails to find extents() from the base class when trying to satisfy the Mapping concept LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> typename Mapping::ArrayExtents { return static_cast(*this).extents(); } #else using Mapping::extents; #endif LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type blobIndex) const -> size_type { if(blobIndex < size_type{Mapping::blobCount}) return Mapping::blobSize(blobIndex); return blockHitsSize(blobIndex - size_type{Mapping::blobCount}) * sizeof(CountType); } template static constexpr auto isComputed(RecordCoord) { return true; } template LLAMA_FN_HOST_ACC_INLINE auto compute(ArrayIndex ai, RecordCoord rc, Blobs& blobs) const -> decltype(auto) { static_assert( !std::is_const_v, "Cannot access (even just reading) data through Heatmap from const blobs/view, since we need to write " "the access counts"); const auto [nr, offset] = Mapping::blobNrAndOffset(ai, rc); using Type = GetType>; auto* hits = blockHitsPtr(nr, blobs); for(size_type i = 0; i < divCeil(size_type{sizeof(Type)}, Granularity); i++) internal::atomicInc(hits[offset / Granularity + i]); LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return reinterpret_cast, Type>&>( blobs[nr][offset]); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } // Returns the size of the block hits buffer for blob forBlobI in block counts. LLAMA_FN_HOST_ACC_INLINE auto blockHitsSize(size_type forBlobI) const -> size_type { assert(forBlobI < Mapping::blobCount); return divCeil(Mapping::blobSize(forBlobI), Granularity); } LLAMA_SUPPRESS_HOST_DEVICE_WARNING template LLAMA_FN_HOST_ACC_INLINE auto blockHitsPtr(size_type forBlobI, Blobs& blobs) const -> CopyConst* { return reinterpret_cast*>(&blobs[size_type{Mapping::blobCount} + forBlobI][0]); } #ifdef __cpp_lib_span template auto blockHits(size_type forBlobI, Blobs& blobs) const -> std::span> { return {blockHitsPtr(forBlobI, blobs), blockHitsSize(forBlobI)}; } #endif private: static auto trimBlobRight(const CountType* bh, std::size_t size) { while(size > 0 && bh[size - 1] == 0) size--; return size; } public: /// Writes a data file suitable for gnuplot containing the heatmap data. You can use the script provided by /// \ref gnuplotScript to plot this data file. /// @param blobs The blobs of the view containing this mapping /// @param os The stream to write the data to. Should be some form of std::ostream. template void writeGnuplotDataFileAscii( const Blobs& blobs, OStream&& os, bool trimEnd = true, std::size_t wrapAfterBlocks = 64) const { for(std::size_t i = 0; i < Mapping::blobCount; i++) { auto* bh = blockHitsPtr(i, blobs); auto size = blockHitsSize(i); if(trimEnd) size = trimBlobRight(bh, size); for(size_type j = 0; j < size; j++) { if(j > 0) os << (j % wrapAfterBlocks == 0 ? 
'\n' : ' ');
                    os << bh[j];
                }
                for(size_type j = size; j < roundUpToMultiple(size, wrapAfterBlocks); j++)
                    os << " 0";
                os << '\n';
            }
        }

        template<typename Blobs, typename OStream>
        void writeGnuplotDataFileBinary(
            const Blobs& blobs,
            OStream&& os,
            bool trimEnd = true,
            std::size_t afterBlobRoundUpTo = 64) const
        {
            for(std::size_t i = 0; i < Mapping::blobCount; i++)
            {
                auto* bh = blockHitsPtr(i, blobs);
                auto size = blockHitsSize(i);
                if(trimEnd)
                    size = trimBlobRight(bh, size);
                os.write(reinterpret_cast<const char*>(bh), size * sizeof(CountType));

                // round up before starting next blob
                CountType zero = 0;
                for(size_type j = size; j < roundUpToMultiple(size, afterBlobRoundUpTo); j++)
                    os.write(reinterpret_cast<const char*>(&zero), sizeof(CountType));
            }
        }

        /// An example script for plotting the ASCII heatmap data using gnuplot.
        /// \hideinitializer
        static constexpr std::string_view gnuplotScriptAscii = R"(#!/bin/bash gnuplot -p < inline constexpr bool isHeatmap = false;

    LLAMA_EXPORT
    template inline constexpr bool isHeatmap> = true;
} // namespace llama::mapping
// ==
// == ./include/llama/mapping/Heatmap.hpp ==
// ============================================================================

// ============================================================================
// == ./include/llama/mapping/Split.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//

#pragma once
// #include "../View.hpp" // amalgamate: file already inlined
// #include "Common.hpp" // amalgamate: file already inlined

namespace llama::mapping
{
    namespace internal
    {
        template auto partitionRecordDim(Record, RecordCoord)
        {
            using Rec = Record;
            if constexpr(sizeof...(Coords) == 0)
            {
                using Part1 = Record>;
                using Part2 = mp_erase_c;
                return mp_list{};
            }
            else
            {
                using FieldTag = GetTag>;
                using FieldType = GetType>;
                using InnerPartition = decltype(partitionRecordDim(FieldType{}, RecordCoord{}));
                using Part1 = Record>>;
                using Part2 = mp_replace_at_c>>;
                return mp_list{};
            }
        }

        template struct PartitionFoldOpImpl
        {
            using Part1Before = mp_first;
            using Part2Before = mp_second;
            using R = decltype(partitionRecordDim(Part2Before{}, GetCoordFromTags{}));
            using Part1After = mp_first;
            using Part2After = mp_second;
            using type = mp_list, Part2After>;
        };

        template using PartitionFoldOp = typename PartitionFoldOpImpl::type;

        template auto partitionRecordDim(Record, mp_list)
        {
            static_assert((isRecordCoord && ...));
            using Initial = mp_list, Record>; // initially, nothing selected for mapping 1
            return mp_fold, RCs>...>, Initial, PartitionFoldOp>{};
        }

        template inline constexpr bool isSelected = recordCoordCommonPrefixIsSame;

        template inline constexpr bool isSelected> = (isSelected || ...);

        template struct ReplaceTagListsByCoords;

        template struct ReplaceTagListsByCoords>
        {
            using type = RecordCoord;
        };

        template struct ReplaceTagListsByCoords>
        {
            static auto f()
            {
                if constexpr(((mp_is_list::value || isRecordCoord) && ...))
                    // Args is a pack of tag lists/record coords
                    return mp_list...>{};
                else
                    // Args is a single tag list
                    return GetCoordFromTags{};
            }

            using type = decltype(f());
        };
    } // namespace internal

    /// Mapping which splits off a part of the record dimension and maps it differently than the rest.
    /// \tparam TSelectorForMapping1 Selects a part of the record dimension to be mapped by MappingTemplate1. Can be a
    /// \ref RecordCoord, a type list of RecordCoords, a type list of tags (selecting one field), or a type list of
    /// type list of tags (selecting one field per sub list).
/// \tparam MappingTemplate1 The mapping used for the selected part of the record dimension. /// \tparam MappingTemplate2 The mapping used for the not selected part of the record dimension. /// \tparam SeparateBlobs If true, both pieces of the record dimension are mapped to separate blobs. LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, typename TSelectorForMapping1, template typename MappingTemplate1, template typename MappingTemplate2, bool SeparateBlobs = false> struct Split { static_assert(isRecord, "Cannot split a scalar record dim"); using ArrayExtents = TArrayExtents; using ArrayIndex = typename ArrayExtents::Index; using RecordDim = TRecordDim; using SelectorForMapping1 = TSelectorForMapping1; using NormalizedSelectorForMapping1 = typename internal::ReplaceTagListsByCoords::type; using RecordDimPartitions = decltype(internal::partitionRecordDim(RecordDim{}, NormalizedSelectorForMapping1{})); using RecordDim1 = mp_first; using RecordDim2 = mp_second; using Mapping1 = MappingTemplate1; using Mapping2 = MappingTemplate2; static constexpr std::size_t blobCount = SeparateBlobs ? Mapping1::blobCount + Mapping2::blobCount : 1; static_assert(SeparateBlobs || Mapping1::blobCount == 1); static_assert(SeparateBlobs || Mapping2::blobCount == 1); private: using size_type = typename ArrayExtents::value_type; static constexpr size_type m1bc = static_cast(Mapping1::blobCount); public: constexpr Split() = default; LLAMA_FN_HOST_ACC_INLINE constexpr explicit Split(ArrayExtents extents) : mapping1(extents), mapping2(extents) { } LLAMA_FN_HOST_ACC_INLINE constexpr Split(Mapping1 mapping1, Mapping2 mapping2) : mapping1(std::move(mapping1)) , mapping2(std::move(mapping2)) { } template LLAMA_FN_HOST_ACC_INLINE constexpr Split(std::tuple mappingArgs1, std::tuple mappingArgs2) : mapping1(std::make_from_tuple(mappingArgs1)) , mapping2(std::make_from_tuple(mappingArgs2)) { } LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents { return mapping1.extents(); } LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize([[maybe_unused]] size_type i) const -> size_type { if constexpr(SeparateBlobs) { if(i < m1bc) return mapping1.blobSize(i); return mapping2.blobSize(i - m1bc); } else return mapping1.blobSize(0) + mapping2.blobSize(0); } template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord = {}) const -> NrAndOffset { using Tags = GetTags>; if constexpr(internal::isSelected, NormalizedSelectorForMapping1>) return mapping1.blobNrAndOffset(ai, GetCoordFromTags{}); else { auto nrAndOffset = mapping2.blobNrAndOffset(ai, GetCoordFromTags{}); if constexpr(SeparateBlobs) nrAndOffset.nr += m1bc; else { for(size_type i = 0; i < m1bc; i++) nrAndOffset.offset += mapping1.blobSize(i); } return nrAndOffset; } } template static constexpr auto isComputed(RecordCoord) -> bool { using Tags = GetTags>; if constexpr(internal::isSelected, NormalizedSelectorForMapping1>) return llama::isComputed>; else return llama::isComputed>; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(ArrayIndex ai, RecordCoord, Blobs& blobs) const { using Tags = GetTags>; if constexpr(internal::isSelected, NormalizedSelectorForMapping1>) return mapping1.compute(ai, GetCoordFromTags{}, blobs); else { // only pass on blobs for mapping 2, so it can index starting from 0 auto* blobs2 = &blobs[0] + m1bc; return mapping2.compute(ai, GetCoordFromTags{}, blobs2); } } Mapping1 mapping1; Mapping2 mapping2; }; /// Binds parameters to a \ref Split mapping except for array and record dimension, 
    /// producing a quoted meta function accepting the latter two. Useful to prepare this mapping for a meta mapping.
    LLAMA_EXPORT
    template<
        typename SelectorForMapping1,
        template typename MappingTemplate1,
        template typename MappingTemplate2,
        bool SeparateBlobs = false>
    struct BindSplit
    {
        template using fn = Split;
    };

    LLAMA_EXPORT
    template inline constexpr bool isSplit = false;

    LLAMA_EXPORT
    template<
        typename ArrayExtents,
        typename RecordDim,
        typename SelectorForMapping1,
        template typename MappingTemplate1,
        template typename MappingTemplate2,
        bool SeparateBlobs>
    inline constexpr bool isSplit> = true;
} // namespace llama::mapping
// ==
// == ./include/llama/mapping/Split.hpp ==
// ============================================================================

// ============================================================================
// == ./include/llama/mapping/Null.hpp ==
// ==
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//

#pragma once
// #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined

namespace llama::mapping
{
    namespace internal
    {
        template<typename T>
        struct NullReference : ProxyRefOpMixin<NullReference<T>, T>
        {
            using value_type = T;

            // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
            LLAMA_FN_HOST_ACC_INLINE constexpr operator T() const
            {
                return T{}; // this might not be the best design decision
            }

            LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(T) -> NullReference&
            {
                return *this;
            }
        };
    } // namespace internal

    /// The Null mapping maps all elements to nothing. Writing data through a reference obtained from the Null mapping
    /// discards the value. Reading through such a reference returns a default-constructed object.
    LLAMA_EXPORT
    template<typename TArrayExtents, typename TRecordDim>
    struct Null : MappingBase<TArrayExtents, TRecordDim>
    {
    private:
        using Base = MappingBase<TArrayExtents, TRecordDim>;
        using size_type = typename TArrayExtents::value_type;

    public:
        static constexpr std::size_t blobCount = 0;

        using Base::Base;

        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type /*blobIndex*/) const -> size_type
        {
            return 0;
        }

        template static constexpr auto isComputed(RecordCoord)
        {
            return true;
        }

        template
        LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(typename Base::ArrayIndex, RecordCoord, Blobs&) const
        {
            using FieldType = GetType>;
            return internal::NullReference<FieldType>{};
        }
    };

    LLAMA_EXPORT
    template inline constexpr bool isNull = false;

    LLAMA_EXPORT
    template inline constexpr bool isNull> = true;
} // namespace llama::mapping
// ==
// == ./include/llama/mapping/Null.hpp ==
// ============================================================================

// ============================================================================
// == ./include/llama/mapping/BitPackedInt.hpp ==
// ==
// Copyright 2023 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
//

#pragma once
// #include "../Core.hpp" // amalgamate: file already inlined
// #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined
// #include "Common.hpp" // amalgamate: file already inlined
// #include // amalgamate: file already included
// #include // amalgamate: file already included

namespace llama::mapping
{
    LLAMA_EXPORT
    enum class SignBit
    {
        Keep,
        Discard
    };

    namespace internal
    {
        template<typename Integral>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto makeMask(Integral bits) -> Integral
        {
            return bits >= sizeof(Integral) * CHAR_BIT ?
~Integral{0} : (Integral{1} << bits) - 1u; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto bitunpack( const StoredIntegral* ptr, StoredIntegral bitOffset, StoredIntegral bitCount) -> Integral { constexpr auto bitsPerIntegral = static_cast(sizeof(Integral) * CHAR_BIT); constexpr auto bitsPerStoredIntegral = static_cast(sizeof(StoredIntegral) * CHAR_BIT); static_assert(bitsPerIntegral <= bitsPerStoredIntegral); assert(bitCount > 0 && bitCount <= bitsPerStoredIntegral); #ifdef __clang__ // this is necessary to silence the clang static analyzer __builtin_assume(bitCount > 0 && bitCount <= bitsPerStoredIntegral); #endif const auto* p = ptr + bitOffset / bitsPerStoredIntegral; const auto innerBitOffset = bitOffset % bitsPerStoredIntegral; // assert(p < endPtr); auto v = p[0] >> innerBitOffset; const auto innerBitEndOffset = innerBitOffset + bitCount; if(innerBitEndOffset <= bitsPerStoredIntegral) { const auto mask = makeMask(bitCount); v &= mask; } else { const auto excessBits = innerBitEndOffset - bitsPerStoredIntegral; const auto bitsLoaded = bitsPerStoredIntegral - innerBitOffset; const auto mask = makeMask(excessBits); // assert(p + 1 < endPtr); v |= (p[1] & mask) << bitsLoaded; } if constexpr(std::is_signed_v && KeepSignBit) { // perform sign extension if((v & (StoredIntegral{1} << (bitCount - 1))) && bitCount < bitsPerStoredIntegral) v |= ~StoredIntegral{0} << bitCount; } return static_cast(v); } template LLAMA_FN_HOST_ACC_INLINE constexpr void bitpack( StoredIntegral* ptr, StoredIntegral bitOffset, StoredIntegral bitCount, Integral value) { constexpr auto bitsPerIntegral = static_cast(sizeof(Integral) * CHAR_BIT); constexpr auto bitsPerStoredIntegral = static_cast(sizeof(StoredIntegral) * CHAR_BIT); static_assert(bitsPerIntegral <= bitsPerStoredIntegral); assert(bitCount > 0 && bitCount <= bitsPerStoredIntegral); #ifdef __clang__ // this is necessary to silence the clang static analyzer __builtin_assume(bitCount > 0 && bitCount <= bitsPerStoredIntegral); #endif // NOLINTNEXTLINE(bugprone-signed-char-misuse,cert-str34-c) const auto unsignedValue = static_cast(value); const auto mask = makeMask(bitCount); StoredIntegral valueBits; if constexpr(std::is_signed_v && KeepSignBit) { const auto magnitudeMask = makeMask(bitCount - 1); const auto isSigned = value < 0; valueBits = (StoredIntegral{isSigned} << (bitCount - 1)) | (unsignedValue & magnitudeMask); } else { valueBits = unsignedValue & mask; } auto* p = ptr + bitOffset / bitsPerStoredIntegral; const auto innerBitOffset = bitOffset % bitsPerStoredIntegral; { const auto clearMask = ~(mask << innerBitOffset); // assert(p < endPtr); auto mem = p[0] & clearMask; // clear previous bits mem |= valueBits << innerBitOffset; // write new bits p[0] = mem; } const auto innerBitEndOffset = innerBitOffset + bitCount; if(innerBitEndOffset > bitsPerStoredIntegral) { const auto excessBits = innerBitEndOffset - bitsPerStoredIntegral; const auto bitsWritten = bitsPerStoredIntegral - innerBitOffset; const auto clearMask = ~makeMask(excessBits); // assert(p + 1 < endPtr); auto mem = p[1] & clearMask; // clear previous bits mem |= valueBits >> bitsWritten; // write new bits p[1] = mem; } } template LLAMA_FN_HOST_ACC_INLINE constexpr auto bitunpack1(const StoredIntegral* ptr, StoredIntegral bitOffset) -> Integral { constexpr auto bitsPerStoredIntegral = static_cast(sizeof(StoredIntegral) * CHAR_BIT); const auto bit = (ptr[bitOffset / bitsPerStoredIntegral] >> (bitOffset % bitsPerStoredIntegral)) & StoredIntegral{1}; return static_cast(bit); } template 
LLAMA_FN_HOST_ACC_INLINE constexpr void bitpack1(StoredIntegral* ptr, StoredIntegral bitOffset, Integral value) { constexpr auto bitsPerStoredIntegral = static_cast(sizeof(StoredIntegral) * CHAR_BIT); const auto bitOff = bitOffset % bitsPerStoredIntegral; auto& dst = ptr[bitOffset / bitsPerStoredIntegral]; dst &= ~(StoredIntegral{1} << bitOff); // clear bit const auto bit = (static_cast(value) & StoredIntegral{1}); dst |= (bit << bitOff); // set bit } /// A proxy type representing a reference to a reduced precision integral value, stored in a buffer at a /// specified bit offset. /// @tparam Integral Integral data type which can be loaded and store through this reference. /// @tparam StoredIntegralCV Integral type used for storing the bits with CV qualifiers. /// @tparam SizeType Type used to store sizes and offsets. template // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) struct BitPackedIntRef : private VHBits , ProxyRefOpMixin, Integral> { private: using StoredIntegral = std::remove_cv_t; StoredIntegralCV* ptr; SizeType bitOffset; public: using value_type = Integral; LLAMA_FN_HOST_ACC_INLINE constexpr BitPackedIntRef( StoredIntegralCV* ptr, SizeType bitOffset, VHBits vhBits) : VHBits{vhBits} , ptr{ptr} , bitOffset{bitOffset} { } BitPackedIntRef(const BitPackedIntRef&) = default; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(const BitPackedIntRef& other) -> BitPackedIntRef& { *this = static_cast(other); return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE constexpr operator Integral() const { // fast path for single bits without sign handling if constexpr(std::is_empty_v) { if constexpr(VHBits::value() == 1 && (std::is_unsigned_v || SignBit == SignBit::Discard)) { return bitunpack1(ptr, static_cast(bitOffset)); } } return bitunpack( ptr, static_cast(bitOffset), static_cast(VHBits::value())); } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(Integral value) -> BitPackedIntRef& { // fast path for single bits without sign handling if constexpr(std::is_empty_v) { if constexpr(VHBits::value() == 1 && (std::is_unsigned_v || SignBit == SignBit::Discard)) { bitpack1(ptr, static_cast(bitOffset), value); } } bitpack( ptr, static_cast(bitOffset), static_cast(VHBits::value()), value); return *this; } }; template using HasLargerSize = mp_bool; template using LargestIntegral = mp_max_element, HasLargerSize>; template using StoredUnsignedFor = std:: conditional_t<(sizeof(LargestIntegral) > sizeof(std::uint32_t)), std::uint64_t, std::uint32_t>; template< typename TArrayExtents, typename TRecordDim, typename Bits, SignBit SignBit, typename TLinearizeArrayIndexFunctor, typename TStoredIntegral> struct BitPackedIntCommon : MappingBase , protected llama::internal::BoxedValue { using LinearizeArrayIndexFunctor = TLinearizeArrayIndexFunctor; using StoredIntegral = TStoredIntegral; static_assert(std::is_integral_v); static_assert(std::is_unsigned_v); // We could allow more integer types as storage type, but that needs to be thought through carefully static_assert( std::is_same_v || std::is_same_v); protected: using Base = MappingBase; using VHBits = llama::internal::BoxedValue; using size_type = typename TArrayExtents::value_type; template using IsAllowedFieldType = mp_or, std::is_enum>; static_assert( mp_all_of, IsAllowedFieldType>::value, "All record dimension field types must be integral"); template using 
IsFieldTypeSmallerOrEqualStorageIntegral = mp_bool; static_assert( mp_all_of, IsFieldTypeSmallerOrEqualStorageIntegral>::value, "The integral type used for storage must be at least as big as the type of the values to retrieve"); public: LLAMA_FN_HOST_ACC_INLINE constexpr auto bits() const -> size_type { return static_cast(VHBits::value()); } template, int> = 0> LLAMA_FN_HOST_ACC_INLINE constexpr explicit BitPackedIntCommon( TArrayExtents extents = {}, Bits bits = {}, TRecordDim = {}) : Base(extents) , VHBits{bits} { static_assert(VHBits::value() > 0); mp_for_each_inline>>( [&](auto t) { using FieldType = typename decltype(t)::type; static_assert( static_cast(VHBits::value()) <= sizeof(FieldType) * CHAR_BIT, "Storage bits must not be greater than bits of field type"); static_assert( VHBits::value() >= 2 || std::is_unsigned_v || SignBit == llama::mapping::SignBit::Discard, "When keeping the sign bit, Bits must be at least 2 with signed integers in the record " "dimension"); }); } template, int> = 0> LLAMA_FN_HOST_ACC_INLINE constexpr explicit BitPackedIntCommon( TArrayExtents extents, Bits bits, TRecordDim = {}) : Base(extents) , VHBits{bits} { #ifdef __CUDA_ARCH__ assert(VHBits::value() > 0); #else if(VHBits::value() <= 0) throw std::invalid_argument("BitPackedInt* Bits must not be zero"); #endif mp_for_each_inline>>( [&](auto t) { using FieldType [[maybe_unused]] = typename decltype(t)::type; #ifdef __CUDA_ARCH__ assert(VHBits::value() <= sizeof(FieldType) * CHAR_BIT); #else if(static_cast(VHBits::value()) > sizeof(FieldType) * CHAR_BIT) throw std::invalid_argument( "BitPackedInt* Bits must not be larger than any field type in the record dimension"); if(!(VHBits::value() >= 2 || std::is_unsigned_v || SignBit == llama::mapping::SignBit::Discard)) throw std::invalid_argument("When keeping the sign bit, Bits must be at least 2 with " "signed integers in the record " "dimension"); #endif }); } template static constexpr auto isComputed(RecordCoord) { return true; } }; } // namespace internal /// Struct of array mapping using bit packing to reduce size/precision of integral data types. If your record /// dimension contains non-integral types, split them off using the \ref Split mapping first. /// \tparam Bits If Bits is llama::Constant, the compile-time N specifies the number of bits to use. If Bits is /// an integral type T, the number of bits is specified at runtime, passed to the constructor and stored as type T. /// Must not be zero and must not be bigger than the bits of TStoredIntegral. /// @tparam SignBit When set to SignBit::Discard, discards the sign bit when storing signed integers. All /// numbers will be read back positive. /// \tparam TLinearizeArrayIndexFunctor Defines how the array dimensions should be mapped into linear numbers and /// how big the linear domain gets. /// \tparam TStoredIntegral Integral type used as storage of reduced precision integers. Must be std::uint32_t or /// std::uint64_t. 
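    /// Example (a sketch; `Ints` stands for an assumed record dimension containing only integral fields):
    /// \code{.cpp}
    /// // pack each value into 7 bits, fixed at compile time via llama::Constant:
    /// using Mapping = llama::mapping::
    ///     BitPackedIntSoA<llama::ArrayExtentsDynamic<std::size_t, 1>, Ints, llama::Constant<7>>;
    /// auto view = llama::allocView(Mapping{{1024}});
    /// \endcode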
LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, typename Bits = typename TArrayExtents::value_type, SignBit SignBit = SignBit::Keep, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, typename TStoredIntegral = internal::StoredUnsignedFor> struct BitPackedIntSoA : internal::BitPackedIntCommon< TArrayExtents, TRecordDim, Bits, SignBit, TLinearizeArrayIndexFunctor, TStoredIntegral> { private: using Base = internal:: BitPackedIntCommon; public: using Base::Base; using typename Base::size_type; using VHBits = typename Base::VHBits; // use plain using declaration with nvcc >= 11.8 static constexpr std::size_t blobCount = mp_size>::value; LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type /*blobIndex*/) const -> size_type { constexpr auto bitsPerStoredIntegral = static_cast(sizeof(TStoredIntegral) * CHAR_BIT); const auto bitsNeeded = TLinearizeArrayIndexFunctor{}.size(Base::extents()) * VHBits::value(); return roundUpToMultiple(bitsNeeded, bitsPerStoredIntegral) / CHAR_BIT; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto compute( typename Base::ArrayIndex ai, RecordCoord, Blobs& blobs) const { constexpr auto blob = flatRecordCoord>; const auto bitOffset = TLinearizeArrayIndexFunctor{}(ai, Base::extents()) * VHBits::value(); using QualifiedStoredIntegral = CopyConst; using DstType = GetType>; LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return internal::BitPackedIntRef{ reinterpret_cast(&blobs[blob][0]), bitOffset, static_cast(*this)}; LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } }; /// Binds parameters to a \ref BitPackedIntSoA mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to to prepare this mapping for a meta mapping. LLAMA_EXPORT template< typename Bits = void, SignBit SignBit = SignBit::Keep, typename LinearizeArrayIndexFunctor = mapping::LinearizeArrayIndexRight, typename StoredIntegral = void> struct BindBitPackedIntSoA { template using fn = BitPackedIntSoA< ArrayExtents, RecordDim, std::conditional_t, Bits, typename ArrayExtents::value_type>, SignBit, LinearizeArrayIndexFunctor, std::conditional_t< !std::is_void_v, StoredIntegral, internal::StoredUnsignedFor>>; }; LLAMA_EXPORT template inline constexpr bool isBitPackedIntSoA = false; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, typename Bits, SignBit SignBit, typename LinearizeArrayIndexFunctor, typename StoredIntegral> inline constexpr bool isBitPackedIntSoA< BitPackedIntSoA> = true; /// Array of struct mapping using bit packing to reduce size/precision of integral data types. If your record /// dimension contains non-integral types, split them off using the \ref Split mapping first. /// \tparam Bits If Bits is llama::Constant, the compile-time N specifies the number of bits to use. If Bits is /// an integral type T, the number of bits is specified at runtime, passed to the constructor and stored as type T. /// Must not be zero and must not be bigger than the bits of TStoredIntegral. /// @tparam SignBit When set to SignBit::Discard, discards the sign bit when storing signed integers. All /// numbers will be read back positive. /// \tparam TLinearizeArrayIndexFunctor Defines how the array dimensions should be mapped into linear numbers and /// how big the linear domain gets. /// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref // PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and // \ref PermuteFieldsMinimizePadding. 
/// \tparam TStoredIntegral Integral type used as storage of reduced precision integers. Must be std::uint32_t or /// std::uint64_t. LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, typename Bits = typename TArrayExtents::value_type, SignBit SignBit = SignBit::Keep, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder, typename TStoredIntegral = internal::StoredUnsignedFor> struct BitPackedIntAoS : internal::BitPackedIntCommon< TArrayExtents, TRecordDim, Bits, SignBit, TLinearizeArrayIndexFunctor, TStoredIntegral> { private: using Base = internal:: BitPackedIntCommon; public: using Base::Base; using typename Base::size_type; using VHBits = typename Base::VHBits; // use plain using declaration with nvcc >= 11.8 using Permuter = PermuteFields; static constexpr std::size_t blobCount = 1; LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type /*blobIndex*/) const -> size_type { constexpr auto bitsPerStoredIntegral = static_cast(sizeof(TStoredIntegral) * CHAR_BIT); const auto bitsNeeded = TLinearizeArrayIndexFunctor{}.size(Base::extents()) * static_cast(VHBits::value()) * static_cast(flatFieldCount); return roundUpToMultiple(bitsNeeded, bitsPerStoredIntegral) / CHAR_BIT; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto compute( typename Base::ArrayIndex ai, RecordCoord, Blobs& blobs) const { constexpr auto flatFieldIndex = static_cast( Permuter::template permute>>); const auto bitOffset = ((TLinearizeArrayIndexFunctor{}(ai, Base::extents()) * static_cast(flatFieldCount)) + flatFieldIndex) * static_cast(VHBits::value()); using QualifiedStoredIntegral = CopyConst; using DstType = GetType>; LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return internal::BitPackedIntRef{ reinterpret_cast(&blobs[0][0]), bitOffset, static_cast(*this)}; LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } }; /// Binds parameters to a \ref BitPackedIntAoS mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to to prepare this mapping for a meta mapping. 
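    /// Example (a sketch; the record dimension `RecordDim` and the choice to bit-pack only its first field
    /// are assumptions): \code{.cpp}
    /// using Mapping = llama::mapping::Split<
    ///     llama::ArrayExtentsDynamic<std::size_t, 1>,
    ///     RecordDim,
    ///     llama::RecordCoord<0>, // bit-pack the first field with 10 bits
    ///     llama::mapping::BindBitPackedIntAoS<llama::Constant<10>>::fn,
    ///     llama::mapping::BindAoS<>::fn, // map the remaining fields as ordinary AoS
    ///     true>;
    /// \endcode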
LLAMA_EXPORT template< typename Bits = void, SignBit SignBit = SignBit::Keep, typename LinearizeArrayIndexFunctor = mapping::LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder, typename StoredIntegral = void> struct BindBitPackedIntAoS { template using fn = BitPackedIntAoS< ArrayExtents, RecordDim, std::conditional_t, Bits, typename ArrayExtents::value_type>, SignBit, LinearizeArrayIndexFunctor, PermuteFields, std::conditional_t< !std::is_void_v, StoredIntegral, internal::StoredUnsignedFor>>; }; LLAMA_EXPORT template inline constexpr bool isBitPackedIntAoS = false; template< typename ArrayExtents, typename RecordDim, typename Bits, SignBit SignBit, typename LinearizeArrayIndexFunctor, template typename PermuteFields, typename StoredIntegral> inline constexpr bool isBitPackedIntAoS> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/BitPackedInt.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/Bytesplit.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "Common.hpp" // amalgamate: file already inlined namespace llama::mapping { namespace internal { template using ReplaceByByteArray = std::byte[sizeof(T)]; template using SplitBytes = TransformLeaves; } // namespace internal /// Meta mapping splitting each field in the record dimension into an array of bytes and mapping the resulting /// record dimension using a further mapping. LLAMA_EXPORT template typename InnerMapping> struct Bytesplit : private InnerMapping> { using Inner = InnerMapping>; using ArrayExtents = typename Inner::ArrayExtents; using RecordDim = TRecordDim; // hide Inner::RecordDim using Inner::blobCount; using Inner::blobSize; using Inner::extents; private: using ArrayIndex = typename TArrayExtents::Index; public: LLAMA_FN_HOST_ACC_INLINE constexpr explicit Bytesplit(TArrayExtents extents, TRecordDim = {}) : Inner(extents) { } template LLAMA_FN_HOST_ACC_INLINE constexpr explicit Bytesplit(std::tuple innerMappingArgs, TRecordDim = {}) : Inner(std::make_from_tuple(innerMappingArgs)) { } template static constexpr auto isComputed(RecordCoord) { return true; } template // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) struct Reference : ProxyRefOpMixin, GetType> { private: const Inner& inner; ArrayIndex ai; BlobArray& blobs; public: using value_type = GetType; LLAMA_FN_HOST_ACC_INLINE constexpr Reference(const Inner& innerMapping, ArrayIndex ai, BlobArray& blobs) : inner(innerMapping) , ai(ai) , blobs(blobs) { } Reference(const Reference&) = default; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(const Reference& other) -> Reference& { *this = static_cast(other); return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE constexpr operator value_type() const { #ifdef _MSC_VER // MSVC workaround. 
Without this, MSVC deduces the last template parameter of mapToMemory wrongly BlobArray& blobs = this->blobs; #endif value_type v; auto* p = reinterpret_cast(&v); mp_for_each_inline>( [&](auto ic) LLAMA_LAMBDA_INLINE { constexpr auto i = decltype(ic)::value; auto&& ref = mapToMemory(inner, ai, Cat>{}, blobs); p[i] = ref; }); return v; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(value_type v) -> Reference& { #ifdef _MSC_VER // MSVC workaround. Without this, MSVC deduces the last template parameter of mapToMemory wrongly BlobArray& blobs = this->blobs; #endif auto* p = reinterpret_cast(&v); mp_for_each_inline>( [&](auto ic) LLAMA_LAMBDA_INLINE { constexpr auto i = decltype(ic)::value; auto&& ref = mapToMemory(inner, ai, Cat>{}, blobs); ref = p[i]; }); return *this; } }; template LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(ArrayIndex ai, RecordCoord, BlobArray& blobs) const { return Reference, BlobArray>{*this, ai, blobs}; } }; /// Binds parameters to a \ref Bytesplit mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to prepare this mapping for a meta mapping. LLAMA_EXPORT template typename InnerMapping> struct BindBytesplit { template using fn = Bytesplit; }; LLAMA_EXPORT template inline constexpr bool isBytesplit = false; LLAMA_EXPORT template typename InnerMapping> inline constexpr bool isBytesplit> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/Bytesplit.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/PermuteArrayIndex.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Common.hpp" // amalgamate: file already inlined namespace llama::mapping { /// Meta mapping permuting the array indices before forwarding to another mapping. The array extents are not /// changed. /// @tparam Permutation The pack of integrals describing the permutation of the array indices. The inner mapping /// will be called with an ArrayIndex{ai[Permutation]...}. LLAMA_EXPORT template struct PermuteArrayIndex : Mapping { private: using size_type = typename Mapping::ArrayExtents::value_type; using ArrayIndex = typename Mapping::ArrayExtents::Index; public: using Inner = Mapping; constexpr PermuteArrayIndex() = default; LLAMA_FN_HOST_ACC_INLINE explicit PermuteArrayIndex(Mapping mapping) : Mapping(std::move(mapping)) { } template LLAMA_FN_HOST_ACC_INLINE explicit PermuteArrayIndex(Args&&... innerArgs) : Mapping(std::forward(innerArgs)...)
{ } static_assert( sizeof...(Permutation) == ArrayIndex::rank, "The number of integral arguments to PermuteArrayIndex must be the same as ArrayExtents::rank"); template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord rc = {}) const -> NrAndOffset { return Inner::blobNrAndOffset(ArrayIndex{ai[Permutation]...}, rc); } template LLAMA_FN_HOST_ACC_INLINE auto compute(ArrayIndex ai, RecordCoord rc, Blobs& blobs) const -> decltype(auto) { return Inner::compute(ArrayIndex{ai[Permutation]...}, rc, blobs); } }; LLAMA_EXPORT template PermuteArrayIndex(Mapping) -> PermuteArrayIndex; LLAMA_EXPORT template inline constexpr bool isPermuteArrayIndex = false; LLAMA_EXPORT template inline constexpr bool isPermuteArrayIndex> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/PermuteArrayIndex.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/AoS.hpp == // == // Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Common.hpp" // amalgamate: file already inlined namespace llama::mapping { /// Array of struct mapping. Used to create a \ref View via \ref allocView. /// \tparam TFieldAlignment If Align, padding bytes are inserted to guarantee that struct members are properly /// aligned. If Pack, struct members are tightly packed. /// \tparam TLinearizeArrayIndexFunctor Defines how the array dimensions should be mapped into linear numbers and /// how big the linear domain gets. /// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref /// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and /// \ref PermuteFieldsMinimizePadding. LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, FieldAlignment TFieldAlignment = FieldAlignment::Align, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder> struct AoS : MappingBase { private: using Base = MappingBase; using size_type = typename Base::size_type; public: inline static constexpr FieldAlignment fieldAlignment = TFieldAlignment; using LinearizeArrayIndexFunctor = TLinearizeArrayIndexFunctor; using Permuter = PermuteFields>; inline static constexpr std::size_t blobCount = 1; using Base::Base; LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type) const -> size_type { return LinearizeArrayIndexFunctor{}.size(Base::extents()) * flatSizeOf; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset( typename Base::ArrayIndex ai, RecordCoord = {}) const -> NrAndOffset { constexpr std::size_t flatFieldIndex = #if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6 *& // mess with nvcc compiler state to workaround bug #endif Permuter::template permute>>; const auto offset = LinearizeArrayIndexFunctor{}(ai, Base::extents()) * static_cast( flatSizeOf) + static_cast(flatOffsetOf< typename Permuter::FlatRecordDim, flatFieldIndex, fieldAlignment == FieldAlignment::Align>); return {size_type{0}, offset}; } }; // we can drop this when inherited ctors also inherit deduction guides LLAMA_EXPORT template AoS(TArrayExtents, TRecordDim) -> AoS; /// Array of struct mapping preserving the alignment of the field types by inserting padding. 
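/// Example (editor's sketch; the Particle record and its tags are hypothetical and not part of this header):
/// \code{.cpp}
/// struct X{};
/// struct Mass{};
/// using Particle = llama::Record<llama::Field<X, float>, llama::Field<Mass, double>>;
/// using Mapping = llama::mapping::AlignedAoS<llama::ArrayExtentsDynamic<int, 1>, Particle>;
/// auto view = llama::allocView(Mapping{{100}});
/// view(23)(Mass{}) = 1.0; // one struct per array element, fields padded to their natural alignment
/// \endcode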
/// \see AoS LLAMA_EXPORT template using AlignedAoS = AoS; /// Array of struct mapping preserving the alignment of the field types by inserting padding and permuting the /// field order to minimize this padding. \see AoS LLAMA_EXPORT template using MinAlignedAoS = AoS< ArrayExtents, RecordDim, FieldAlignment::Align, LinearizeArrayIndexFunctor, PermuteFieldsMinimizePadding>; /// Array of struct mapping packing the field types tightly, violating the types' alignment requirements. /// \see AoS LLAMA_EXPORT template using PackedAoS = AoS; /// Binds parameters to an \ref AoS mapping except for array and record dimension, producing a quoted meta /// function accepting the latter two. Useful to prepare this mapping for a meta mapping. LLAMA_EXPORT template< FieldAlignment Alignment = FieldAlignment::Align, typename LinearizeArrayIndexFunctor = LinearizeArrayIndexRight> struct BindAoS { template using fn = AoS; }; LLAMA_EXPORT template inline constexpr bool isAoS = false; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, FieldAlignment FieldAlignment, typename LinearizeArrayIndexFunctor, template typename PermuteFields> inline constexpr bool isAoS> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/AoS.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/FieldAccessCount.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "../StructName.hpp" // amalgamate: file already inlined // #include "Common.hpp" // amalgamate: file already inlined #include #include #include namespace llama::mapping { LLAMA_EXPORT template struct AccessCounts { union { CountType memLocsComputed; CountType reads; }; CountType writes; }; namespace internal { template struct FieldAccessCountReference : ProxyRefOpMixin, Value> { using value_type = Value; template LLAMA_FN_HOST_ACC_INLINE constexpr FieldAccessCountReference(RefFwd&& r, AccessCounts* hits) : r(std::forward(r)) , hits(hits) { static_assert(std::is_same_v, std::remove_reference_t>); } FieldAccessCountReference(const FieldAccessCountReference&) = default; FieldAccessCountReference(FieldAccessCountReference&&) noexcept = default; auto operator=(FieldAccessCountReference&& ref) noexcept -> FieldAccessCountReference& = default; ~FieldAccessCountReference() = default; LLAMA_FN_HOST_ACC_INLINE auto operator=(const FieldAccessCountReference& ref) -> FieldAccessCountReference& { if(&ref != this) { internal::atomicInc(hits->writes); r = static_cast(ref); } return *this; } LLAMA_FN_HOST_ACC_INLINE auto operator=(value_type value) -> FieldAccessCountReference& { internal::atomicInc(hits->writes); r = value; return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE operator value_type() const { internal::atomicInc(hits->reads); return static_cast(r); } private: Ref r; AccessCounts* hits; }; } // namespace internal /// Forwards all calls to the inner mapping. Counts all accesses made through this mapping and allows printing a /// summary. /// @tparam Mapping The type of the inner mapping. /// @tparam TCountType The type used for counting the number of accesses.
/// @tparam MyCodeHandlesProxyReferences If false, FieldAccessCount will avoid proxy references but can then only /// count the number of address computations LLAMA_EXPORT template struct FieldAccessCount : Mapping { private: using size_type = typename Mapping::ArrayExtents::value_type; public: using RecordDim = typename Mapping::RecordDim; using CountType = TCountType; inline static constexpr bool myCodeHandlesProxyReferences = MyCodeHandlesProxyReferences; struct FieldHitsArray : Array, flatFieldCount> { LLAMA_FN_HOST_ACC_INLINE auto total() const -> AccessCounts { AccessCounts total{}; for(const auto& ac : *this) { if constexpr(MyCodeHandlesProxyReferences) { total.reads += ac.reads; total.writes += ac.writes; } else total.memLocsComputed += ac.memLocsComputed; } return total; } struct TotalBytes { CountType totalRead; CountType totalWritten; }; /// When MyCodeHandlesProxyReferences is true, return a pair of the total read and written bytes. If false, /// returns the total bytes of accessed data as a single value. LLAMA_FN_HOST_ACC_INLINE auto totalBytes() const { CountType r = 0; CountType w = 0; // NOLINT(misc-const-correctness) forEachLeafCoord( [&](auto rc) LLAMA_LAMBDA_INLINE { const size_type i = flatRecordCoord; const auto fieldSize = sizeof(GetType); if constexpr(MyCodeHandlesProxyReferences) { r += (*this)[i].reads * fieldSize; w += (*this)[i].writes * fieldSize; } else r += (*this)[i].memLocsComputed * fieldSize; }); if constexpr(MyCodeHandlesProxyReferences) return TotalBytes{r, w}; else return r; } }; inline static constexpr auto blobCount = Mapping::blobCount + 1; constexpr FieldAccessCount() = default; LLAMA_FN_HOST_ACC_INLINE explicit FieldAccessCount(Mapping mapping) : Mapping(std::move(mapping)) { } template LLAMA_FN_HOST_ACC_INLINE explicit FieldAccessCount(Args&&... innerArgs) : Mapping(std::forward(innerArgs)...) 
{ } LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type blobIndex) const -> size_type { if(blobIndex < size_type{Mapping::blobCount}) return inner().blobSize(blobIndex); return sizeof(FieldHitsArray); } template static constexpr auto isComputed(RecordCoord) { return true; } template LLAMA_FN_HOST_ACC_INLINE auto compute( typename Mapping::ArrayExtents::Index ai, RecordCoord rc, Blobs& blobs) const -> decltype(auto) { static_assert( !std::is_const_v, "Cannot access (even just reading) data through FieldAccessCount from const blobs/view, since we need " "to write " "the access counts"); auto& hits = fieldHits(blobs)[+flatRecordCoord>]; decltype(auto) ref = mapToMemory(inner(), ai, rc, blobs); // T& or proxy reference (value) if constexpr(MyCodeHandlesProxyReferences) { using Value = GetType; using Ref = decltype(ref); return internal::FieldAccessCountReference{std::forward(ref), &hits}; } else { internal::atomicInc(hits.memLocsComputed); return ref; } } LLAMA_SUPPRESS_HOST_DEVICE_WARNING template LLAMA_FN_HOST_ACC_INLINE auto fieldHits(const Blobs& blobs) const -> const FieldHitsArray& { return reinterpret_cast(*&blobs[blobCount - 1][0]); } LLAMA_SUPPRESS_HOST_DEVICE_WARNING template LLAMA_FN_HOST_ACC_INLINE auto fieldHits(Blobs& blobs) const -> FieldHitsArray& { return reinterpret_cast(*&blobs[blobCount - 1][0]); } template LLAMA_FN_HOST_ACC_INLINE void printFieldHits(const Blobs& blobs) const { printFieldHits(fieldHits(blobs)); } LLAMA_FN_HOST_ACC_INLINE void printFieldHits(const FieldHitsArray& hits) const { #ifdef __CUDA_ARCH__ printFieldHitsDevice(hits); #else printFieldHitsHost(hits); #endif } private: static constexpr auto columnWidth = 10; static constexpr auto sizeColumnWidth = 5; void printFieldHitsHost(const FieldHitsArray& hits) const { if constexpr(MyCodeHandlesProxyReferences) std::cout << std::left << std::setw(columnWidth) << "Field" << ' ' << std::right << std::setw(sizeColumnWidth) << "Size" << std::right << std::setw(columnWidth) << "Reads" << ' ' << std::right << std::setw(columnWidth) << "Writes" << '\n'; else std::cout << std::left << std::setw(columnWidth) << "Field" << ' ' << std::right << std::setw(sizeColumnWidth) << "Size" << std::right << std::setw(columnWidth) << "Mlocs cmp" << '\n'; forEachLeafCoord( [&](auto rc) { const size_type i = flatRecordCoord; const auto fieldSize = sizeof(GetType); if constexpr(MyCodeHandlesProxyReferences) std::cout << std::left << std::setw(columnWidth) << prettyRecordCoord(rc) << ' ' << std::right << std::setw(sizeColumnWidth) << fieldSize << std::right << std::setw(columnWidth) << hits[i].reads << ' ' << std::right << std::setw(columnWidth) << hits[i].writes << '\n'; else std::cout << std::left << std::setw(columnWidth) << prettyRecordCoord(rc) << ' ' << std::right << std::setw(sizeColumnWidth) << fieldSize << std::right << std::setw(columnWidth) << hits[i].memLocsComputed << '\n'; }); const auto total = hits.totalBytes(); if constexpr(MyCodeHandlesProxyReferences) { const auto [rsize, runit] = prettySize(total.totalRead); const auto [wsize, wunit] = prettySize(total.totalWritten); std::cout << std::left << std::setw(columnWidth) << "Total" << ' ' << std::right << std::setw(sizeColumnWidth) << ' ' << std::right << std::setw(columnWidth) << rsize << runit << ' ' << std::right << std::setw(columnWidth - 2) << wsize << wunit << '\n'; } else { const auto [size, unit] = prettySize(total); std::cout << std::left << std::setw(columnWidth) << "Total" << ' ' << std::right << std::setw(sizeColumnWidth) << ' ' << std::right << 
std::setw(columnWidth) << size << unit << '\n'; } std::cout << std::internal; } LLAMA_ACC void printFieldHitsDevice(const FieldHitsArray& hits) const { if constexpr(MyCodeHandlesProxyReferences) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) printf( "%*s %*s %*s %*s\n", columnWidth, "Field", sizeColumnWidth, "Size", columnWidth, "Reads", columnWidth, "Writes"); } else { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) printf("%*s %*s %*s\n", columnWidth, "Field", sizeColumnWidth, "Size", columnWidth, "Mlocs cmp"); } forEachLeafCoord( [&](auto rc) { const size_type i = flatRecordCoord; const auto fieldSize = sizeof(GetType); constexpr auto fieldName = prettyRecordCoord(rc); char fieldNameZT[fieldName.size() + 1]{}; // nvcc does not handle the %*.*s parameter correctly llama::internal::constexprCopy(fieldName.begin(), fieldName.end(), fieldNameZT); if constexpr(MyCodeHandlesProxyReferences) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) printf( "%*.s %*lu %*lu %*lu\n", columnWidth, fieldNameZT, sizeColumnWidth, fieldSize, columnWidth, static_cast(hits[i].reads), columnWidth, static_cast(hits[i].writes)); } else { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) printf( "%*.s %*lu %*lu\n", columnWidth, fieldNameZT, sizeColumnWidth, fieldSize, columnWidth, static_cast(hits[i].memLocsComputed)); } }); const auto total = hits.totalBytes(); if constexpr(MyCodeHandlesProxyReferences) { const auto [rsize, runit] = prettySize(total.totalRead); const auto [wsize, wunit] = prettySize(total.totalWritten); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) printf( "%*s %*s %*f%s %*f%s\n", columnWidth, "Total", sizeColumnWidth, "", columnWidth, rsize, runit, columnWidth - 2, wsize, wunit); } else { const auto [size, unit] = prettySize(total); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg) printf("%*s %*s %*f%s\n", columnWidth, "Total", sizeColumnWidth, "", columnWidth, size, unit); } } LLAMA_FN_HOST_ACC_INLINE auto inner() const -> const Mapping& { return static_cast(*this); } }; LLAMA_EXPORT template inline constexpr bool isFieldAccessCount = false; LLAMA_EXPORT template inline constexpr bool isFieldAccessCount> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/FieldAccessCount.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/ChangeType.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "Common.hpp" // amalgamate: file already inlined // #include "Projection.hpp" // amalgamate: file already inlined namespace llama::mapping { namespace internal { template struct ChangeTypeProjection { LLAMA_FN_HOST_ACC_INLINE static auto load(StoredT v) -> UserT { return static_cast(v); // we could allow stronger casts here } LLAMA_FN_HOST_ACC_INLINE static auto store(UserT v) -> StoredT { return static_cast(v); // we could allow stronger casts here } }; template struct MakeProjectionPair { template static auto recordDimType() { if constexpr(isRecordCoord) return mp_identity>{}; else return mp_identity{}; } template, typename StoredT = mp_second> using fn = mp_list())::type, StoredT>>; }; template using MakeProjectionMap = mp_transform::template fn, ReplacementMap>; } // namespace internal /// Mapping that 
changes the type in the record domain for a different one in storage. Conversions happen during /// load and store. /// @tparam ReplacementMap A type list of binary type lists (a map) specifying which type, or the type at a \ref /// RecordCoord, (the map key) is to be replaced by which other type (the mapped value). LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, template typename InnerMapping, typename ReplacementMap> struct ChangeType : Projection> { private: using Base = Projection< ArrayExtents, RecordDim, InnerMapping, internal::MakeProjectionMap>; public: using Base::Base; }; /// Binds parameters to a \ref ChangeType mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to prepare this mapping for a meta mapping. LLAMA_EXPORT template typename InnerMapping, typename ReplacementMap> struct BindChangeType { template using fn = ChangeType; }; LLAMA_EXPORT template inline constexpr bool isChangeType = false; LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, template typename InnerMapping, typename ReplacementMap> inline constexpr bool isChangeType> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/ChangeType.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/mapping/BitPackedFloat.hpp == // == // Copyright 2023 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "../ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "BitPackedInt.hpp" // amalgamate: file already inlined // #include "Common.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included // #include // amalgamate: file already included #include // #include // amalgamate: file already included // #include // amalgamate: file already included // #include // amalgamate: file already included namespace llama::mapping { namespace internal { template<typename T> struct FloatBitTraits; template<> struct FloatBitTraits<float> { inline static constexpr unsigned mantissa = 23; inline static constexpr unsigned exponent = 8; }; template<> struct FloatBitTraits<double> { inline static constexpr unsigned mantissa = 52; inline static constexpr unsigned exponent = 11; }; template LLAMA_FN_HOST_ACC_INLINE auto repackFloat( Integral inFloat, unsigned inMantissaBits, unsigned inExponentBits, unsigned outMantissaBits, unsigned outExponentBits) -> Integral { const Integral inMantissaMask = (Integral{1} << inMantissaBits) - 1u; const Integral inExponentMask = (Integral{1} << inExponentBits) - 1u; Integral inMantissa = inFloat & inMantissaMask; const Integral inExponent = (inFloat >> inMantissaBits) & inExponentMask; const Integral inSign = inFloat >> inExponentBits >> inMantissaBits; const Integral outExponentMask = (Integral{1} << outExponentBits) - 1u; Integral outExponent; if(inExponent == inExponentMask) [[LLAMA_UNLIKELY]] outExponent = outExponentMask; // propagate +/- inf/nan else if(inExponent == 0) [[LLAMA_UNLIKELY]] outExponent = 0; // propagate -/+ zero else { const int outExponentMax = 1 << (outExponentBits - 1); // NOLINT(hicpp-signed-bitwise) const int outExponentMin = -outExponentMax + 1; const int outExponentBias = outExponentMax - 1; const int inExponentBias = (1 << (inExponentBits - 1)) - 1; // NOLINT(hicpp-signed-bitwise) const int exponent = static_cast(inExponent) - inExponentBias; const auto clampedExponent =
std::clamp(exponent, outExponentMin, outExponentMax); if(clampedExponent == outExponentMin || clampedExponent == outExponentMax) inMantissa = 0; // when exponent changed, let value become inf and not nan outExponent = clampedExponent + outExponentBias; } assert(outExponent < (1u << outExponentBits)); const Integral packedMantissa = inMantissaBits > outMantissaBits ? inMantissa >> (inMantissaBits - outMantissaBits) : inMantissa << (outMantissaBits - inMantissaBits); const Integral packedExponent = outExponent << outMantissaBits; const Integral packedSign = inSign << outExponentBits << outMantissaBits; const auto outFloat = static_cast(packedMantissa | packedExponent | packedSign); return outFloat; } // TODO(bgruber): Boost.Hana generalizes these sorts of computations on mixed constants and values template LLAMA_FN_HOST_ACC_INLINE auto integBits(E e, M m) { return llama::internal::BoxedValue{e.value() + m.value() + 1}; } template LLAMA_FN_HOST_ACC_INLINE auto integBits( llama::internal::BoxedValue>, llama::internal::BoxedValue>) { return llama::internal::BoxedValue>{}; } /// A proxy type representing a reference to a reduced precision floating-point value, stored in a buffer at a /// specified bit offset. /// @tparam Float Floating-point data type which can be loaded and stored through this reference. /// @tparam StoredIntegralCV Integral type used for storing the bits with CV qualifiers. /// @tparam SizeType Type used to store sizes and offsets. template // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) struct LLAMA_DECLSPEC_EMPTY_BASES BitPackedFloatRef : private VHExp , private VHMan , ProxyRefOpMixin, Float> { private: static_assert( std::is_same_v || std::is_same_v, "Types other than float or double are not implemented yet"); static_assert( std::numeric_limits::is_iec559, "Only IEEE754/IEC559 floating point formats are implemented"); using FloatBits = std::conditional_t, std::uint32_t, std::uint64_t>; BitPackedIntRef< FloatBits, StoredIntegralCV, decltype(integBits(std::declval(), std::declval())), SizeType, SignBit::Discard> intref; public: using value_type = Float; LLAMA_FN_HOST_ACC_INLINE constexpr BitPackedFloatRef( StoredIntegralCV* p, SizeType bitOffset, VHExp vhExp, VHMan vhMan) : VHExp{vhExp} , VHMan{vhMan} , intref{ p, bitOffset, integBits(vhExp, vhMan), } { } BitPackedFloatRef(const BitPackedFloatRef&) = default; // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(const BitPackedFloatRef& other) -> BitPackedFloatRef& { *this = static_cast(other); return *this; } // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) LLAMA_FN_HOST_ACC_INLINE constexpr operator Float() const { using Bits = FloatBitTraits; const FloatBits packedFloat = intref; const FloatBits unpackedFloat = repackFloat(packedFloat, VHMan::value(), VHExp::value(), Bits::mantissa, Bits::exponent); Float f; std::memcpy(&f, &unpackedFloat, sizeof(Float)); return f; } LLAMA_FN_HOST_ACC_INLINE constexpr auto operator=(Float f) -> BitPackedFloatRef& { using Bits = FloatBitTraits; FloatBits unpackedFloat = 0; std::memcpy(&unpackedFloat, &f, sizeof(Float)); const FloatBits packedFloat = repackFloat(unpackedFloat, Bits::mantissa, Bits::exponent, VHMan::value(), VHExp::value()); intref = packedFloat; return *this; } }; template using StoredIntegralFor = std::conditional_t, double>::value, std::uint64_t, std::uint32_t>; } // namespace internal // TODO(bgruber): I would like to allow zero mantissa
bits, which would then no longer support INF. Likewise, // support to skip the sign bit would also be great. /// Struct of array mapping using bit packing to reduce size/precision of floating-point data types. The bit layout /// is [1 sign bit, exponentBits bits from the exponent, mantissaBits bits from the mantissa]+ and tries to follow /// IEEE 754. Infinity and NAN are supported. If the packed exponent bits are not big enough to hold a number, it /// will be set to infinity (preserving the sign). If your record dimension contains non-floating-point types, /// split them off using the \ref Split mapping first. /// \tparam ExponentBits If ExponentBits is llama::Constant, the compile-time N specifies the number of bits to /// use to store the exponent. If ExponentBits is llama::Value, the number of bits is specified at runtime, /// passed to the constructor and stored as type T. Must not be zero. /// \tparam MantissaBits Like ExponentBits but for the mantissa bits. Must not be zero (otherwise values turn INF). /// \tparam TLinearizeArrayIndexFunctor Defines how the array dimensions should be mapped into linear numbers and /// how big the linear domain gets. /// \tparam TStoredIntegral Integral type used as storage of reduced precision floating-point values. LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, typename ExponentBits = typename TArrayExtents::value_type, typename MantissaBits = ExponentBits, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, typename TStoredIntegral = internal::StoredIntegralFor> struct LLAMA_DECLSPEC_EMPTY_BASES BitPackedFloatSoA : MappingBase , llama::internal::BoxedValue , llama::internal::BoxedValue { private: using Base = MappingBase; using VHExp = llama::internal::BoxedValue; using VHMan = llama::internal::BoxedValue; using size_type = typename TArrayExtents::value_type; public: using LinearizeArrayIndexFunctor = TLinearizeArrayIndexFunctor; using StoredIntegral = TStoredIntegral; static constexpr std::size_t blobCount = mp_size>::value; LLAMA_FN_HOST_ACC_INLINE constexpr auto exponentBits() const -> size_type { return static_cast(VHExp::value()); } LLAMA_FN_HOST_ACC_INLINE constexpr auto mantissaBits() const -> size_type { return static_cast(VHMan::value()); } LLAMA_FN_HOST_ACC_INLINE constexpr explicit BitPackedFloatSoA( TArrayExtents extents = {}, ExponentBits exponentBits = {}, MantissaBits mantissaBits = {}, TRecordDim = {}) : Base(extents) , VHExp{exponentBits} , VHMan{mantissaBits} { assert(this->exponentBits() > 0); } LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type /*blobIndex*/) const -> size_type { constexpr auto bitsPerStoredIntegral = static_cast(sizeof(StoredIntegral) * CHAR_BIT); const auto bitsNeeded = LinearizeArrayIndexFunctor{}.size(Base::extents()) * (exponentBits() + mantissaBits() + 1); return roundUpToMultiple(bitsNeeded, bitsPerStoredIntegral) / CHAR_BIT; } template static constexpr auto isComputed(RecordCoord) { return true; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto compute( typename Base::ArrayIndex ai, RecordCoord, Blobs& blobs) const { constexpr auto blob = llama::flatRecordCoord>; const auto bitOffset = LinearizeArrayIndexFunctor{}(ai, Base::extents()) * (exponentBits() + mantissaBits() + 1); using QualifiedStoredIntegral = CopyConst; using DstType = GetType>; LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return internal::BitPackedFloatRef{ reinterpret_cast(&blobs[blob][0]), bitOffset, static_cast(*this), static_cast(*this)}; LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } }; /// Binds 
parameters to a \ref BitPackedFloatSoA mapping except for array and record dimension, producing a quoted /// meta function accepting the latter two. Useful to prepare this mapping for a meta mapping. LLAMA_EXPORT template< typename ExponentBits = unsigned, typename MantissaBits = ExponentBits, typename LinearizeArrayIndexFunctor = LinearizeArrayIndexRight, typename StoredIntegral = void> struct BindBitPackedFloatSoA { template using fn = BitPackedFloatSoA< ArrayExtents, RecordDim, ExponentBits, MantissaBits, LinearizeArrayIndexFunctor, std::conditional_t< !std::is_void_v, StoredIntegral, internal::StoredIntegralFor>>; }; LLAMA_EXPORT template inline constexpr bool isBitPackedFloatSoA = false; LLAMA_EXPORT template inline constexpr bool isBitPackedFloatSoA> = true; LLAMA_EXPORT template< typename TArrayExtents, typename TRecordDim, typename ExponentBits = typename TArrayExtents::value_type, typename MantissaBits = ExponentBits, typename TLinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder, typename TStoredIntegral = internal::StoredIntegralFor> struct LLAMA_DECLSPEC_EMPTY_BASES BitPackedFloatAoS : MappingBase , llama::internal::BoxedValue , llama::internal::BoxedValue { private: using Base = MappingBase; using VHExp = llama::internal::BoxedValue; using VHMan = llama::internal::BoxedValue; using size_type = typename TArrayExtents::value_type; public: using LinearizeArrayIndexFunctor = TLinearizeArrayIndexFunctor; using StoredIntegral = TStoredIntegral; using Permuter = PermuteFields>; static constexpr std::size_t blobCount = 1; LLAMA_FN_HOST_ACC_INLINE constexpr auto exponentBits() const -> size_type { return static_cast(VHExp::value()); } LLAMA_FN_HOST_ACC_INLINE constexpr auto mantissaBits() const -> size_type { return static_cast(VHMan::value()); } LLAMA_FN_HOST_ACC_INLINE constexpr explicit BitPackedFloatAoS( TArrayExtents extents = {}, ExponentBits exponentBits = {}, MantissaBits mantissaBits = {}, TRecordDim = {}) : Base(extents) , VHExp{exponentBits} , VHMan{mantissaBits} { assert(this->exponentBits() > 0); } LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type /*blobIndex*/) const -> size_type { constexpr auto bitsPerStoredIntegral = static_cast(sizeof(StoredIntegral) * CHAR_BIT); const auto bitsNeeded = TLinearizeArrayIndexFunctor{}.size(Base::extents()) * static_cast(exponentBits() + mantissaBits() + 1) * static_cast(flatFieldCount); return roundUpToMultiple(bitsNeeded, bitsPerStoredIntegral) / CHAR_BIT; } template static constexpr auto isComputed(RecordCoord) { return true; } template LLAMA_FN_HOST_ACC_INLINE constexpr auto compute( typename Base::ArrayIndex ai, RecordCoord, Blobs& blobs) const { constexpr auto flatFieldIndex = static_cast( Permuter::template permute>>); const auto bitOffset = ((TLinearizeArrayIndexFunctor{}(ai, Base::extents()) * static_cast(flatFieldCount)) + flatFieldIndex) * static_cast(exponentBits() + mantissaBits() + 1); using QualifiedStoredIntegral = CopyConst; using DstType = GetType>; LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING return internal::BitPackedFloatRef{ reinterpret_cast(&blobs[0][0]), bitOffset, static_cast(*this), static_cast(*this)}; LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } };
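// Editor's sketch of using the bit-packed float mappings (not part of the original header; the tag, record
// dimension and extents are hypothetical):
//
//     struct Height{};
//     using RecordDim = llama::Record<llama::Field<Height, float>>;
//     // 5 exponent and 10 mantissa bits, i.e. a half-precision-like 16-bit layout per value:
//     using Mapping = llama::mapping::BitPackedFloatSoA<
//         llama::ArrayExtentsDynamic<std::size_t, 1>, RecordDim, llama::Constant<5>, llama::Constant<10>>;
//     auto view = llama::allocView(Mapping{{1024}});
//     view(0)(Height{}) = 1.8f; // stored with reduced precision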
LLAMA_EXPORT template< typename ExponentBits = unsigned, typename MantissaBits = ExponentBits, typename LinearizeArrayIndexFunctor = LinearizeArrayIndexRight, template typename PermuteFields = PermuteFieldsInOrder, typename StoredIntegral = void> struct BindBitPackedFloatAoS { template using fn = BitPackedFloatAoS< ArrayExtents, RecordDim, ExponentBits, MantissaBits, LinearizeArrayIndexFunctor, PermuteFields, std::conditional_t< !std::is_void_v, StoredIntegral, internal::StoredIntegralFor>>; }; LLAMA_EXPORT template inline constexpr bool isBitPackedFloatAoS = false; LLAMA_EXPORT template< typename ArrayExtents, typename RecordDim, typename ExponentBits, typename MantissaBits, typename LinearizeArrayIndexFunctor, template typename PermuteFields, typename StoredIntegral> inline constexpr bool isBitPackedFloatAoS> = true; } // namespace llama::mapping // == // == ./include/llama/mapping/BitPackedFloat.hpp == // ============================================================================ // ============================================================================ // == ./include/llama/llama.hpp == // == // Copyright 2018 Alexander Matthes, Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once /// \mainpage LLAMA API documentation /// /// LLAMA is a C++17 template header-only library for the abstraction of memory access patterns. It distinguishes /// between the view of the algorithm on the memory and the real layout in the background. This enables performance /// portability for multicore, manycore and GPU applications with the very same code. /// /// In contrast to many other solutions LLAMA can define nested data structures of arbitrary depth and is not limited /// to struct of array and array of struct data layouts. It is also capable of explicitly defining padding, /// blocking, striding and any other run-time or compile-time access pattern simultaneously. /// /// To achieve this goal LLAMA is split into mostly independent, orthogonal parts completely written in modern C++17 /// to run on as many architectures and with as many compilers as possible while still supporting extensions needed /// e.g. to run on GPU or other many-core hardware. /// /// This page documents the API of LLAMA. The user documentation and an overview of the concepts and ideas can be /// found here: https://llama-doc.rtfd.io /// /// LLAMA is licensed under the MPL-2.0. // NOLINTNEXTLINE(modernize-macro-to-enum) #define LLAMA_VERSION_MAJOR 0 // NOLINTNEXTLINE(modernize-macro-to-enum) #define LLAMA_VERSION_MINOR 6 // NOLINTNEXTLINE(modernize-macro-to-enum) #define LLAMA_VERSION_PATCH 0 // suppress warnings on missing return statements. we get a lot of these because nvcc/nvc++ have some troubles with if // constexpr.
#ifdef __NVCC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress 940 # else # pragma diag_suppress 940 # endif #endif #ifdef __NVCOMPILER # pragma push # pragma diag_suppress 941 #endif // #include "ArrayExtents.hpp" // amalgamate: file already inlined // #include "ArrayIndexRange.hpp" // amalgamate: file already inlined // #include "BlobAllocators.hpp" // amalgamate: file already inlined // #include "Copy.hpp" // amalgamate: file already inlined // #include "Core.hpp" // amalgamate: file already inlined // #include "DumpMapping.hpp" // amalgamate: file already inlined // #include "Meta.hpp" // amalgamate: file already inlined // #include "ProxyRefOpMixin.hpp" // amalgamate: file already inlined // #include "RecordRef.hpp" // amalgamate: file already inlined // ============================================================================ // == ./include/llama/Simd.hpp == // == // Copyright 2022 Bernhard Manfred Gruber // SPDX-License-Identifier: MPL-2.0 // #pragma once // #include "Core.hpp" // amalgamate: file already inlined // #include "RecordRef.hpp" // amalgamate: file already inlined // #include "macros.hpp" // amalgamate: file already inlined // #include "mapping/AoS.hpp" // amalgamate: file already inlined // #include "mapping/AoSoA.hpp" // amalgamate: file already inlined // #include "mapping/SoA.hpp" // amalgamate: file already inlined // #include // amalgamate: file already included namespace llama { /// Traits of a specific Simd implementation. Please specialize this template for the SIMD types you are going to /// use in your program. /// Each specialization SimdTraits must provide: /// * an alias `value_type` to indicate the element type of the Simd. /// * a `static constexpr size_t lanes` variable holding the number of SIMD lanes of the Simd. /// * a `static auto loadUnaligned(const value_type* mem) -> Simd` function, loading a Simd from the given memory /// address. /// * a `static void storeUnaligned(Simd simd, value_type* mem)` function, storing the given Simd to a given /// memory address. /// * a `static auto gather(const value_type* mem, std::array indices) -> Simd` function, gathering /// values into a Simd from the memory addresses identified by mem + indices * sizeof(value_type). /// * a `static void scatter(Simd simd, value_type* mem, std::array indices)` function, scattering the /// values from a Simd to the memory addresses identified by mem + indices * sizeof(value_type). LLAMA_EXPORT template struct SimdTraits { static_assert(sizeof(Simd) == 0, "Please specialize SimdTraits for the type Simd"); }; LLAMA_EXPORT template struct SimdTraits>> { using value_type = T; inline static constexpr std::size_t lanes = 1; static LLAMA_FN_HOST_ACC_INLINE auto loadUnaligned(const T* mem) -> T { return *mem; } static LLAMA_FN_HOST_ACC_INLINE void storeUnaligned(T t, T* mem) { *mem = t; } static LLAMA_FN_HOST_ACC_INLINE auto gather(const value_type* mem, std::array indices) -> T { return mem[indices[0]]; } static LLAMA_FN_HOST_ACC_INLINE void scatter(T t, value_type* mem, std::array indices) { mem[indices[0]] = t; } }; /// The number of SIMD lanes the given SIMD vector or \ref Simd has. If Simd is not a structural \ref Simd /// or \ref SimdN, this is a shortcut for SimdTraits::lanes. LLAMA_EXPORT template inline constexpr auto simdLanes = SimdTraits::lanes;
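// Editor's sketch of a user-provided SimdTraits specialization (not part of the original header), assuming
// std::experimental::fixed_size_simd from the Parallelism TS v2 (<experimental/simd>); gather/scatter are
// omitted here but are required by the trait documentation above:
//
//     namespace stdx = std::experimental;
//     namespace llama
//     {
//         template<typename T, int N>
//         struct SimdTraits<stdx::fixed_size_simd<T, N>>
//         {
//             using value_type = T;
//             inline static constexpr std::size_t lanes = N;
//             static auto loadUnaligned(const T* mem) -> stdx::fixed_size_simd<T, N>
//             {
//                 return {mem, stdx::element_aligned};
//             }
//             static void storeUnaligned(stdx::fixed_size_simd<T, N> simd, T* mem)
//             {
//                 simd.copy_to(mem, stdx::element_aligned);
//             }
//         };
//     } // namespace llama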
/// Chooses the number of SIMD lanes for the given record dimension by mapping each field type to a SIMD type and /// then reducing their sizes. /// @tparam MakeSimd Type function creating a SIMD type given a field type from the record dimension. /// @param reduce Binary reduction function to reduce the SIMD lanes. LLAMA_EXPORT template typename MakeSimd, typename BinaryReductionFunction> LLAMA_CONSTEVAL auto chooseSimdLanes(BinaryReductionFunction reduce) -> std::size_t { using FRD = FlatRecordDim; std::size_t lanes = simdLanes>>; mp_for_each_inline>>( [&](auto* t) { using T = std::remove_reference_t; lanes = reduce(lanes, simdLanes>); }); assert(lanes > 0); return lanes; } /// Determines the number of SIMD lanes suitable to process all types occurring in the given record dimension. The /// algorithm ensures that even SIMD vectors for the smallest field type are filled completely and may thus require /// multiple SIMD vectors for some field types. /// @tparam RecordDim The record dimension to simdize /// @tparam MakeSimd Type function creating a SIMD type given a field type from the record dimension. /// @hideinitializer LLAMA_EXPORT template typename MakeSimd> inline constexpr std::size_t simdLanesWithFullVectorsFor = chooseSimdLanes([](auto a, auto b) { return std::max(a, b); }); /// Determines the number of SIMD lanes suitable to process all types occurring in the given record dimension. The /// algorithm ensures that the smallest number of SIMD registers is needed and may thus only partially fill /// registers for some data types. /// @tparam RecordDim The record dimension to simdize /// @tparam MakeSimd Type function creating a SIMD type given a field type from the record dimension. /// @hideinitializer LLAMA_EXPORT template typename MakeSimd> inline constexpr std::size_t simdLanesWithLeastRegistersFor = chooseSimdLanes([](auto a, auto b) { return std::min(a, b); }); namespace internal { template typename MakeSizedSimd> struct BindMakeSizedSimd { template using fn = MakeSizedSimd; }; template< typename RecordDim, std::size_t N, template typename MakeSizedSimd> struct SimdizeNImpl { using type = TransformLeaves::template fn>; }; template typename MakeSizedSimd> struct SimdizeNImpl { using type = RecordDim; }; } // namespace internal /// Transforms the given record dimension into a SIMD version of it. Each leaf field type will be replaced by a /// sized SIMD vector with length N, as determined by MakeSizedSimd. If N is 1, SimdizeN is an alias for /// T. LLAMA_EXPORT template typename MakeSizedSimd> using SimdizeN = typename internal::SimdizeNImpl::type; /// Transforms the given record dimension into a SIMD version of it. Each leaf field type will be replaced by a /// SIMD vector, as determined by MakeSimd. LLAMA_EXPORT template typename MakeSimd> using Simdize = TransformLeaves; /// Creates a SIMD version of the given type. If T is a record dimension, creates a \ref One where each field is a /// SIMD type of the original field type. The SIMD vectors have length N. If N is 1, an ordinary \ref One of the /// record dimension T is created. If T is not a record dimension, a SIMD vector with value T and length N is /// created. If N is 1 (and T is not a record dimension), then T is produced. LLAMA_EXPORT template typename MakeSizedSimd> using SimdN = typename std::conditional_t< isRecordDim, std::conditional_t>, mp_identity>>>, std::conditional_t, mp_identity>>>::type; /// Creates a SIMD version of the given type. If T is a record dimension, creates a \ref One where each field is a /// SIMD type of the original field type.
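/// Example (editor's sketch; Particle is a hypothetical record dimension and MakeSimd a hypothetical type
/// function, e.g. backed by std::experimental::simd from the Parallelism TS v2): \code{.cpp}
/// template<typename T>
/// using MakeSimd = std::experimental::native_simd<T>;
/// using ParticleSimd = llama::Simd<Particle, MakeSimd>; // a \ref One holding one native-width SIMD vector per field
/// \endcode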
LLAMA_EXPORT template typename MakeSimd> using Simd = typename std:: conditional_t, mp_identity>>, mp_identity>>::type; namespace internal { template struct SizeEqualTo { template using fn = std::bool_constant == S>; }; } // namespace internal /// Specialization for Simd. Only works if all SIMD types in the fields of the record dimension have the /// same size. LLAMA_EXPORT template inline constexpr std::size_t simdLanes>> = [] { using FRD = FlatRecordDim; using FirstFieldType = mp_first; static_assert(mp_all_of_q>>::value); return simdLanes; }(); namespace internal { template inline constexpr auto aosStridedIndices = []() { auto stride = flatSizeOf< typename AoSMapping::Permuter::FlatRecordDim, AoSMapping::fieldAlignment == llama::mapping::FieldAlignment::Align> / sizeof(ElementType); std::array indices{}; for(int i = 0; i < static_cast(Lanes); i++) indices[i] = i * stride; return indices; }(); template LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC) { using FieldType = GetType; using ElementSimd = std::decay_t; using Traits = SimdTraits; auto loadElementWise = [&] { auto b = ArrayIndexIterator{srcRef.view.extents(), srcRef.arrayIndex()}; for(std::size_t i = 0; i < Traits::lanes; i++) reinterpret_cast(&dstSimd(dstRC))[i] = srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, srcRC)); }; // TODO(bgruber): can we generalize the logic whether we can load a dstSimd from that mapping? using Mapping = typename T::View::Mapping; if constexpr(mapping::isSoA) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC)); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else if constexpr(mapping::isAoSoA) { // TODO(bgruber): this check is too strict if(T::View::Mapping::ArrayExtents::rank == 1 && srcRef.arrayIndex()[0] % Traits::lanes == 0 && T::View::Mapping::lanes >= Traits::lanes) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC)); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else loadElementWise(); } else if constexpr(mapping::isAoS) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING dstSimd(dstRC) = Traits::gather(&srcRef(srcRC), aosStridedIndices); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else loadElementWise(); } template LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC) { using T = std::remove_reference_t; using FieldType = GetType; using ElementSimd = std::decay_t; using Traits = SimdTraits; auto storeElementWise = [&] { // TODO(bgruber): how does this generalize conceptually to 2D and higher dimensions? in which // direction should we collect SIMD values? auto b = ArrayIndexIterator{dstRef.view.extents(), dstRef.arrayIndex()}; for(std::size_t i = 0; i < Traits::lanes; i++) dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, dstRC)) = reinterpret_cast(&srcSimd(srcRC))[i]; }; // TODO(bgruber): can we generalize the logic whether we can store a srcSimd to that mapping? 
using Mapping = typename std::remove_reference_t::View::Mapping; if constexpr(mapping::isSoA) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC)); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else if constexpr(mapping::isAoSoA) { // TODO(bgruber): this check is too strict if(T::View::Mapping::ArrayExtents::rank == 1 && dstRef.arrayIndex()[0] % Traits::lanes == 0 && T::View::Mapping::lanes >= Traits::lanes) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC)); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else storeElementWise(); } else if constexpr(mapping::isAoS) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING Traits::scatter(srcSimd(srcRC), &dstRef(dstRC), aosStridedIndices); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else storeElementWise(); } } // namespace internal /// Loads SIMD vectors of data starting from the given record reference to dstSimd. Only field tags occurring in /// RecordRef are loaded. If Simd contains multiple fields of SIMD types, a SIMD vector will be fetched for each of /// the fields. The number of elements fetched per SIMD vector depends on the SIMD width of the vector. Simd is /// allowed to have different vector lengths per element. LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE void loadSimd(const T& srcRef, Simd& dstSimd) { // structured dstSimd type and record reference if constexpr(isRecordRef && isRecordRef) { if constexpr(simdLanes == simdLanes) // fast path mainly for scalar SimdN dstSimd = srcRef; else { using SrcARD = typename T::AccessibleRecordDim; using DstArd = typename Simd::AccessibleRecordDim; if constexpr(std::is_same_v) { forEachLeafCoord([&](auto rc) LLAMA_LAMBDA_INLINE { internal::loadSimdFromField(srcRef, dstSimd, rc, rc); }); } else { forEachLeafCoord( [&](auto srcRC) LLAMA_LAMBDA_INLINE { using SrcInnerCoord = decltype(srcRC); forEachLeafCoord( [&](auto dstRC) LLAMA_LAMBDA_INLINE { using DstInnerCoord = decltype(dstRC); if constexpr(hasSameTags) { internal::loadSimdFromField(srcRef, dstSimd, srcRC, dstRC); } }); }); } } } // unstructured dstSimd and reference type else if constexpr(!isRecordRef && !isRecordRef) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING dstSimd = SimdTraits::loadUnaligned(&srcRef); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else { // TODO(bgruber): when could we get here? Is this always an error? static_assert(sizeof(Simd) == 0, "Invalid combination of Simd type and reference type"); } } /// Stores SIMD vectors of element data from the given srcSimd into memory starting at the provided record /// reference. Only field tags occurring in RecordRef are stored. If Simd contains multiple fields of SIMD types, a /// SIMD vector will be stored for each of the fields. The number of elements stored per SIMD vector depends on the /// SIMD width of the vector. Simd is allowed to have different vector lengths per element. 
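/// Example load/modify/store round trip (editor's sketch; view, Particle, Mass and MakeSizedSimd are
/// hypothetical): \code{.cpp}
/// constexpr std::size_t n = 8;
/// llama::SimdN<Particle, n, MakeSizedSimd> simd;
/// llama::loadSimd(view(i), simd);  // gather n consecutive records into SIMD vectors
/// simd(Mass{}) *= 2.0;             // SIMD arithmetic on a whole lane group per field
/// llama::storeSimd(simd, view(i)); // scatter the n records back
/// \endcode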
LLAMA_EXPORT template LLAMA_FN_HOST_ACC_INLINE void storeSimd(const Simd& srcSimd, TFwd&& dstRef) { using T = std::decay_t; // structured Simd type and record reference if constexpr(isRecordRef && isRecordRef) { if constexpr(simdLanes == simdLanes) // fast path mainly for scalar SimdN dstRef = srcSimd; else { using SrcARD = typename Simd::AccessibleRecordDim; using DstArd = typename T::AccessibleRecordDim; if constexpr(std::is_same_v) { forEachLeafCoord([&](auto rc) LLAMA_LAMBDA_INLINE { internal::storeSimdToField(srcSimd, dstRef, rc, rc); }); } else { forEachLeafCoord( [&](auto srcRC) LLAMA_LAMBDA_INLINE { using SrcInnerCoord = decltype(srcRC); forEachLeafCoord( [&](auto dstRC) LLAMA_LAMBDA_INLINE { using DstInnerCoord = decltype(dstRC); if constexpr(hasSameTags) { internal::storeSimdToField(srcSimd, dstRef, srcRC, dstRC); } }); }); } } } // unstructured srcSimd and reference type else if constexpr(!isRecordRef && !isRecordRef) { LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING SimdTraits::storeUnaligned(srcSimd, &dstRef); LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING } else { // TODO(bgruber): when could we get here? Is this always an error? static_assert(sizeof(Simd) == 0, "Invalid combination of Simd type and reference type"); } } LLAMA_EXPORT template< std::size_t N, template typename MakeSizedSimd, typename View, typename UnarySimdFunction> void simdForEachN(View& view, UnarySimdFunction f) { using IndexType = typename View::Mapping::ArrayExtents::value_type; const auto total = product(view.mapping().extents()); auto it = view.begin(); IndexType i = 0; // simd loop while(i + IndexType{N} <= total) { SimdN simd; loadSimd(*it, simd); if constexpr(std::is_void_v) f(simd); else storeSimd(f(simd), *it); i += IndexType{N}; it += IndexType{N}; } // tail while(i < total) { auto scalar = One{*it}; if constexpr(std::is_void_v) f(scalar); else *it = f(scalar); ++i; ++it; } } LLAMA_EXPORT template< template typename MakeSimd, template typename MakeSizedSimd, typename View, typename UnarySimdFunction> void simdForEach(View& view, UnarySimdFunction f) { constexpr auto n = llama::simdLanesWithFullVectorsFor; simdForEachN(view, f); } } // namespace llama // == // == ./include/llama/Simd.hpp == // ============================================================================ // #include "StructName.hpp" // amalgamate: file already inlined // #include "Tuple.hpp" // amalgamate: file already inlined // #include "Vector.hpp" // amalgamate: file already inlined // #include "View.hpp" // amalgamate: file already inlined // #include "macros.hpp" // amalgamate: file already inlined // #include "mapping/AoS.hpp" // amalgamate: file already inlined // #include "mapping/AoSoA.hpp" // amalgamate: file already inlined // #include "mapping/BitPackedFloat.hpp" // amalgamate: file already inlined // #include "mapping/BitPackedInt.hpp" // amalgamate: file already inlined // #include "mapping/Bytesplit.hpp" // amalgamate: file already inlined // #include "mapping/Byteswap.hpp" // amalgamate: file already inlined // #include "mapping/ChangeType.hpp" // amalgamate: file already inlined // #include "mapping/FieldAccessCount.hpp" // amalgamate: file already inlined // #include "mapping/Heatmap.hpp" // amalgamate: file already inlined // #include "mapping/Null.hpp" // amalgamate: file already inlined // #include "mapping/One.hpp" // amalgamate: file already inlined // #include "mapping/PermuteArrayIndex.hpp" // amalgamate: file already inlined // #include "mapping/Projection.hpp" // amalgamate: file already inlined // #include 
"mapping/SoA.hpp" // amalgamate: file already inlined // #include "mapping/Split.hpp" // amalgamate: file already inlined #if defined(__NVCC__) # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_default 940 # else # pragma diag_default 940 # endif #endif #ifdef __NVCOMPILER # pragma push # pragma diag_default 941 #endif // == // == ./include/llama/llama.hpp == // ============================================================================