# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

import os

import spack.platforms.cray
from spack.package import *


class Aluminum(CachedCMakePackage, CudaPackage, ROCmPackage):
    """Aluminum provides a generic interface to high-performance
    communication libraries, with a focus on allreduce algorithms.
    Blocking and non-blocking algorithms and GPU-aware algorithms are
    supported. Aluminum also contains custom implementations of select
    algorithms to optimize for certain situations."""

    homepage = "https://github.com/LLNL/Aluminum"
    url = "https://github.com/LLNL/Aluminum/archive/v1.0.0.tar.gz"
    git = "https://github.com/LLNL/Aluminum.git"
    tags = ["ecp", "radiuss"]

    maintainers("benson31", "bvanessen")

    license("Apache-2.0")

    version("master", branch="master")
    version("1.4.1", sha256="d130a67fef1cb7a9cb3bbec1d0de426f020fe68c9df6e172c83ba42281cd90e3")
    version("1.4.0", sha256="ac54de058f38cead895ec8163f7b1fa7674e4dc5aacba683a660a61babbfe0c6")
    version("1.3.1", sha256="28ce0af6c6f29f97b7f19c5e45184bd2f8a0b1428f1e898b027d96d47cb74b0b")
    version("1.3.0", sha256="d0442efbebfdfb89eec793ae65eceb8f1ba65afa9f2e48df009f81985a4c27e3")
    version("1.2.3", sha256="9b214bdf30f9b7e8e017f83e6615db6be2631f5be3dd186205dbe3aa62f4018a")

    # Library capabilities
    variant(
        "cuda_rma",
        default=False,
        when="+cuda",
        description="Builds with support for CUDA intra-node "
        "Put/Get and IPC RMA functionality",
    )
    variant(
        "ht",
        default=False,
        description="Builds with support for host-enabled MPI"
        " communication of accelerator data",
    )
    variant("nccl", default=False, description="Builds with support for NCCL communication lib")
    variant("shared", default=True, description="Build Aluminum as a shared library")

    # Debugging features
    variant("hang_check", default=False, description="Enable hang checking")
    variant("trace", default=False, description="Enable runtime tracing")

    # Profiler support
    variant("nvtx", default=False, when="+cuda", description="Enable profiling via nvprof/NVTX")
    variant(
        "roctracer", default=False, when="+rocm", description="Enable profiling via rocprof/roctx"
    )

    # Advanced options
    variant("mpi_serialize", default=False, description="Serialize MPI operations")
    variant("stream_mem_ops", default=False, description="Enable stream memory operations")
    variant(
        "thread_multiple",
        default=False,
        description="Allow multiple threads to call Aluminum concurrently",
    )

    # Benchmark/testing support
    variant(
        "benchmarks",
        default=False,
        description="Build the Aluminum benchmarking drivers "
        "(warning: may significantly increase build time!)",
    )
    variant(
        "tests",
        default=False,
        description="Build the Aluminum test drivers "
        "(warning: may moderately increase build time!)",
    )

    # FIXME: Do we want to expose tuning parameters to the Spack
    # recipe? Some are numeric values, some are on/off switches.
    conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA")
    conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive")

    depends_on("mpi")
    depends_on("cmake@3.21.0:", type="build", when="@1.0.1:")
    depends_on("hwloc@1.11:")

    with when("+cuda"):
        depends_on("cub", when="^cuda@:10")
        depends_on("hwloc +cuda +nvml")
        with when("+nccl"):
            depends_on("nccl@2.7.0-0:")
            for arch in CudaPackage.cuda_arch_values:
                depends_on(
                    "nccl +cuda cuda_arch={0}".format(arch),
                    when="+cuda cuda_arch={0}".format(arch),
                )
            if spack.platforms.cray.slingshot_network():
                depends_on("aws-ofi-nccl")  # Note: NOT a CudaPackage

    with when("+rocm"):
        for val in ROCmPackage.amdgpu_targets:
            depends_on(
                "hipcub +rocm amdgpu_target={0}".format(val), when="amdgpu_target={0}".format(val)
            )
            depends_on(
                "hwloc@2.3.0: +rocm amdgpu_target={0}".format(val),
                when="amdgpu_target={0}".format(val),
            )
            # RCCL is *NOT* implemented as a ROCmPackage
            depends_on(
                "rccl amdgpu_target={0}".format(val), when="+nccl amdgpu_target={0}".format(val)
            )
            depends_on(
                "roctracer-dev +rocm amdgpu_target={0}".format(val),
                when="+roctracer amdgpu_target={0}".format(val),
            )
        if spack.platforms.cray.slingshot_network():
            depends_on("aws-ofi-rccl", when="+nccl")

    def cmake_args(self):
        # All configuration is passed through the CMake initial cache file
        # generated by CachedCMakePackage, so no extra -D arguments are needed.
        args = []
        return args

    def get_cuda_flags(self):
        # Collect extra nvcc flags: allow unsupported host compilers when the
        # CUDA spec requests it, and forward clang gcc-toolchain flags.
        spec = self.spec
        args = []
        if spec.satisfies("^cuda+allow-unsupported-compilers"):
            args.append("-allow-unsupported-compiler")

        if spec.satisfies("%clang"):
            for flag in spec.compiler_flags["cxxflags"]:
                if "gcc-toolchain" in flag:
                    args.append("-Xcompiler={0}".format(flag))
        return args

    def std_initconfig_entries(self):
        entries = super(Aluminum, self).std_initconfig_entries()

        # CMAKE_PREFIX_PATH, in CMake types, is a "STRING", not a "PATH". :/
        entries = [x for x in entries if "CMAKE_PREFIX_PATH" not in x]
        cmake_prefix_path = os.environ["CMAKE_PREFIX_PATH"].replace(":", ";")
        entries.append(cmake_cache_string("CMAKE_PREFIX_PATH", cmake_prefix_path))
        return entries

    def initconfig_compiler_entries(self):
        spec = self.spec
        entries = super(Aluminum, self).initconfig_compiler_entries()

        # FIXME: Enforce this better in the actual CMake.
        entries.append(cmake_cache_string("CMAKE_CXX_STANDARD", "17"))
        entries.append(cmake_cache_option("BUILD_SHARED_LIBS", "+shared" in spec))
        entries.append(cmake_cache_option("CMAKE_EXPORT_COMPILE_COMMANDS", True))
        entries.append(cmake_cache_option("MPI_ASSUME_NO_BUILTIN_MPI", True))
        return entries

    def initconfig_hardware_entries(self):
        spec = self.spec
        entries = super(Aluminum, self).initconfig_hardware_entries()

        entries.append(cmake_cache_option("ALUMINUM_ENABLE_CUDA", "+cuda" in spec))
        if spec.satisfies("+cuda"):
            entries.append(cmake_cache_string("CMAKE_CUDA_STANDARD", "17"))
            if not spec.satisfies("cuda_arch=none"):
                archs = spec.variants["cuda_arch"].value
                arch_str = ";".join(archs)
                entries.append(cmake_cache_string("CMAKE_CUDA_ARCHITECTURES", arch_str))

            # FIXME: Should this use the "cuda_flags" function of the
            # CudaPackage class or something? There might be other
            # flags in play, and we need to be sure to get them all.
            cuda_flags = self.get_cuda_flags()
            if len(cuda_flags) > 0:
                entries.append(cmake_cache_string("CMAKE_CUDA_FLAGS", " ".join(cuda_flags)))

        entries.append(cmake_cache_option("ALUMINUM_ENABLE_ROCM", "+rocm" in spec))
        if spec.satisfies("+rocm"):
            entries.append(cmake_cache_string("CMAKE_HIP_STANDARD", "17"))
            if not spec.satisfies("amdgpu_target=none"):
                archs = self.spec.variants["amdgpu_target"].value
                arch_str = ";".join(archs)
                entries.append(cmake_cache_string("CMAKE_HIP_ARCHITECTURES", arch_str))
                entries.append(cmake_cache_string("AMDGPU_TARGETS", arch_str))
                entries.append(cmake_cache_string("GPU_TARGETS", arch_str))
            entries.append(cmake_cache_path("HIP_ROOT_DIR", spec["hip"].prefix))

        return entries

    def initconfig_package_entries(self):
        spec = self.spec
        entries = super(Aluminum, self).initconfig_package_entries()

        # Library capabilities
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_MPI_CUDA", "+cuda_rma" in spec))
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_MPI_CUDA_RMA", "+cuda_rma" in spec))
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_HOST_TRANSFER", "+ht" in spec))
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_NCCL", "+nccl" in spec))

        # Debugging features
        entries.append(cmake_cache_option("ALUMINUM_DEBUG_HANG_CHECK", "+hang_check" in spec))
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_TRACE", "+trace" in spec))

        # Profiler support
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_NVPROF", "+nvtx" in spec))
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_ROCTRACER", "+roctracer" in spec))

        # Advanced options
        entries.append(cmake_cache_option("ALUMINUM_MPI_SERIALIZE", "+mpi_serialize" in spec))
        entries.append(
            cmake_cache_option("ALUMINUM_ENABLE_STREAM_MEM_OPS", "+stream_mem_ops" in spec)
        )
        entries.append(
            cmake_cache_option("ALUMINUM_ENABLE_THREAD_MULTIPLE", "+thread_multiple" in spec)
        )

        # Benchmark/testing support
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_BENCHMARKS", "+benchmarks" in spec))
        entries.append(cmake_cache_option("ALUMINUM_ENABLE_TESTS", "+tests" in spec))

        return entries
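

# Illustrative usage (not part of the recipe): the variants defined above are
# selected on the spec when concretizing or installing, for example:
#
#   spack install aluminum +cuda +nccl cuda_arch=80
#   spack install aluminum +rocm +nccl amdgpu_target=gfx90a
#
# The cuda_arch and amdgpu_target values here are placeholders; substitute the
# values that match the target hardware.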