# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(nvfuser)

enable_language(CUDA)
cmake_policy(SET CMP0063 NEW) # make symbol visibility always apply

# ALWAYS: (default) print both Installing and Up-to-date messages
# LAZY: print Installing but not Up-to-date messages
# NEVER: print neither
set(CMAKE_INSTALL_MESSAGE LAZY)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Source-tree locations. Everything is anchored at this project's own root
# (PROJECT_SOURCE_DIR), not at the root of whichever build includes it.
set(NVFUSER_ROOT ${PROJECT_SOURCE_DIR})
set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(NVFUSER_PYTHON_DIR "${NVFUSER_ROOT}/python")
set(NVFUSER_PYTHON_BINDINGS "${NVFUSER_ROOT}/python/python_frontend")
set(NVFUSER_PYTHON_COMMON "${NVFUSER_ROOT}/python/python_common")
set(NVFUSER_PYTHON_DIRECT_BINDINGS "${NVFUSER_ROOT}/python/python_direct")
set(NVFUSER_CUTLASS "${NVFUSER_ROOT}/cutlass")
set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

# When ON, UCC and UCX are located with find_package() further down in this
# file and linked into the codegen library.
option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)

# When ON, every target in this directory is compiled with the
# NVFUSER_EXPLICIT_ERROR_CHECK preprocessor definition.
option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
if(NVFUSER_EXPLICIT_ERROR_CHECK)
  add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
endif()

# When ON, AddressSanitizer flags are added to the codegen targets.
option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

# NVFUSER_DISTRIBUTED defaults to ON but is only user-selectable while
# USE_DISTRIBUTED is true; otherwise it is forced to OFF.
include(CMakeDependentOption)
cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF)
if(NVFUSER_DISTRIBUTED)
  add_compile_definitions(NVFUSER_DISTRIBUTED)
endif()
message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}")

# We try to update which C++ standard we use together in lockstep across all
# built libraries, and these variables control which that is. Generally we are
# on C++20, but we still support a version of CUDA (11) that does not recognize
# C++20 and so we drop back to 17 there. Also, we allow all of these to be
# overridden by the user.
# Note we do not use a global set_property on e.g. CXX_STANDARD. CMake globals
# are footguns that should generally be avoided, because they are difficult to
# target where and *only* where they are needed. See e.g.:
#   https://cliutils.gitlab.io/modern-cmake/chapters/intro/dodonot.html
#
# There is no "C20": CMake's C_STANDARD property only accepts 90, 99, 11 (and
# 17/23 starting with CMake 3.21), and rejects anything else as soon as a
# target actually compiles a C source. 11 is the newest value that is valid
# with the minimum CMake version required above.
set(NVFUSER_C_STANDARD 11 CACHE STRING "C standard to use for C code")
set(NVFUSER_CPP_STANDARD 20 CACHE STRING "C++ standard to use for C++ code")
set(NVFUSER_CUDA_STANDARD 17 CACHE STRING "C++ standard to use for CUDA code")

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
  # TODO: gcc 11.4 has reached end of life according to https://gcc.gnu.org/,
  # so the minimum below should arguably be bumped to 12.x. However, gcc 11.4
  # is well tested and stable, so we defer rejecting it until we actually hit
  # a gcc 11.4 bug.
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.4)
    message(FATAL_ERROR "GCC < 11.4 has compiler bugs and can not compile nvFuser.")
  endif()
endif()

string(APPEND CMAKE_CXX_FLAGS " -Wno-psabi")

find_package(Torch REQUIRED)
find_package(Python REQUIRED Development.Module Interpreter)
find_package(pybind11 REQUIRED)
find_package(CUDAToolkit REQUIRED)

# find_package(Python) sets Python_EXECUTABLE, but the PyTorch side refers to
# the interpreter as PYTHON_EXECUTABLE, so mirror it under that name.
set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})

# Appending Torch's CXX flags is necessary since
# https://github.com/pytorch/pytorch/issues/98093
string(APPEND CMAKE_CXX_FLAGS " ${TORCH_CXX_FLAGS}")

include(cmake/FlatBuffers.cmake)
include(cmake/Dependencies.cmake)

# set CUDA_ARCH for cu tests.
if(TORCH_CUDA_ARCH_LIST)
  set(ARCH_FLAGS)
  cuda_select_nvcc_arch_flags(ARCH_FLAGS ${TORCH_CUDA_ARCH_LIST})
  list(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
endif()

add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/lib/dynamic_type)

# ------------------------------
# build nvf_cutlass library
# ------------------------------
# CUTLASS_STATUS records the outcome of this section:
#   "N/A"      - BUILD_CUTLASS was not requested
#   "DISABLED" - requested, but the CUDA compiler is older than 12.8
#   "ENABLED"  - nvf_cutlass is built
set(CUTLASS_STATUS "N/A")
if(BUILD_CUTLASS)
  enable_language(CUDA)
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
    message(WARNING "Skip building CUTLASS because of incompatible CUDA ${CMAKE_CUDA_COMPILER_VERSION}")
    set(CUTLASS_STATUS "DISABLED")
  else()
    add_compile_definitions(NVFUSER_ENABLE_CUTLASS)
    set(CUTLASS_STATUS "ENABLED")

    find_package(CUDAToolkit REQUIRED)

    # Fetch the CUTLASS sources at a pinned commit; only their headers are
    # used (see the include directories added to nvf_cutlass below).
    include(FetchContent)
    FetchContent_Declare(
      repo-cutlass
      GIT_REPOSITORY https://github.com/NVIDIA/cutlass
      GIT_TAG f115c3f85467d5d9619119d1dbeb9c03c3d73864
      GIT_SHALLOW OFF
    )
    FetchContent_Populate(repo-cutlass)

    # NPROC feeds nvcc's --threads option below.
    include(ProcessorCount)
    ProcessorCount(NPROC)

    set(NVF_CUTLASS_CUDA_FLAGS
      "-DCUTE_USE_PACKED_TUPLE=1"
      "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1"
      "-DCUTLASS_VERSIONS_GENERATED"
      "-DCUTLASS_TEST_LEVEL=0"
      "-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1"
      "-DCUTLASS_DEBUG_TRACE_LEVEL=0"
      "--expt-relaxed-constexpr"
      "--expt-extended-lambda"
      "--threads=${NPROC}"
      # -------------------------------------------
      # Flags forwarded to the host compiler
      # -------------------------------------------
      # Note: -Wconversion turns a warning *on*; only the
      # -Wno-deprecated-declarations entry below suppresses one.
      "-Xcompiler=-Wconversion"
      "-Xcompiler=-fno-strict-aliasing"
      # CUDA 13 has deprecated old vector types such as ulong4:
      # https://developer.nvidia.com/blog/whats-new-and-important-in-cuda-toolkit-13-0
      "-Xcompiler=-Wno-deprecated-declarations"
    )

    set(NVFUSER_CUTLASS_SRCS)
    list(APPEND NVFUSER_CUTLASS_SRCS
      ${NVFUSER_CUTLASS}/group_mm.cu
      ${NVFUSER_CUTLASS}/nvfp4_scaled_mm.cu
      ${NVFUSER_CUTLASS}/nvfp4_scaled_mm_blockscale.cu
      ${NVFUSER_CUTLASS}/nvfp4_scaled_group_mm.cu
      ${NVFUSER_CUTLASS}/nvf_cutlass.cpp
      ${NVFUSER_CUTLASS}/cutlass_utils.cpp
    )

    add_library(nvf_cutlass SHARED ${NVFUSER_CUTLASS_SRCS})
    target_include_directories(nvf_cutlass PRIVATE ${repo-cutlass_SOURCE_DIR}/include)
    target_include_directories(nvf_cutlass PRIVATE ${repo-cutlass_SOURCE_DIR}/tools/util/include)
    # NOTE(review): "$<$:...>" is not a well-formed generator expression (its
    # condition is empty). Presumably a condition restricting these flags to
    # CUDA sources was lost - confirm against version control.
    target_compile_options(nvf_cutlass PRIVATE $<$:${NVF_CUTLASS_CUDA_FLAGS}>)

    if(NOT MSVC)
      set(NVF_LIB_SUFFIX ".so")
    else()
      set(NVF_LIB_SUFFIX ".pyd")
    endif()

    # NOTE(review): the bare "$" entries are neither paths nor generator
    # expressions. Presumably build/install interface wrappers were lost -
    # confirm against version control.
    target_include_directories(nvf_cutlass PUBLIC
      "$"
      "$"
      "$"
    )
    target_link_libraries(nvf_cutlass PRIVATE "${TORCH_LIBRARIES}" c10)
    set_target_properties(nvf_cutlass PROPERTIES
      C_STANDARD ${NVFUSER_C_STANDARD}
      CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
      CXX_STANDARD ${NVFUSER_CPP_STANDARD}
      CXX_STANDARD_REQUIRED ON
      CXX_VISIBILITY_PRESET hidden
      INSTALL_RPATH
      "$ORIGIN/../../nvidia/cuda_runtime/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../../torch/lib"
      POSITION_INDEPENDENT_CODE Yes
      VISIBILITY_INLINES_HIDDEN Yes
      CUDA_ARCHITECTURES "100a"
    )
    install(TARGETS nvf_cutlass EXPORT NvfuserTargets DESTINATION lib)
  endif()
endif()

# ------------------------------
# build nvfuser_codegen library
# ------------------------------

# nvfuser codegen sources
set(NVFUSER_SRCS)
list(APPEND NVFUSER_SRCS
  ${NVFUSER_SRCS_DIR}/alias_analysis.cpp
  ${NVFUSER_SRCS_DIR}/codegen.cpp
  ${NVFUSER_SRCS_DIR}/compute_at.cpp
  ${NVFUSER_SRCS_DIR}/compute_at_map.cpp
  ${NVFUSER_SRCS_DIR}/contiguity.cpp
  ${NVFUSER_SRCS_DIR}/cutlass/codegen.cpp
  ${NVFUSER_SRCS_DIR}/cutlass/gemm.cpp
  ${NVFUSER_SRCS_DIR}/debug.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/bank_conflict.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/circular_buffer.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/device_version.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/divisible_split.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/fused_reduction.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/fusion_info.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/index_compute.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/non_divisible_split.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/padded_parallel_dimensions.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/predicate_elimination.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/sync_information.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/tensor_init_val.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/tensor_memory.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/tensor_producer_aliases.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/thread_predicate.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/tma.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/analysis/trivial_broadcast.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/lower2device.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/alias_memory.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/allocation.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/circular_buffer.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/expr_sort.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/fusion_simplifier.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/grid_serialization.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/index.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/inline_ptx.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/inplace_alias.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/insert_syncs.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/instrument.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/loop_rotation.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/loops.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/magic_zero.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/predicate.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/replace_size.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/rng.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/scalar_hoist.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/unroll.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/vectorize_welford.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/pass/warp_reduce.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/utils.cpp
  ${NVFUSER_SRCS_DIR}/device_lower/validation.cpp
  ${NVFUSER_SRCS_DIR}/dispatch.cpp
  ${NVFUSER_SRCS_DIR}/driver_api.cpp
  ${NVFUSER_SRCS_DIR}/dynamic_transform.cpp
  ${NVFUSER_SRCS_DIR}/evaluator_common.cpp
  ${NVFUSER_SRCS_DIR}/exceptions.cpp
  ${NVFUSER_SRCS_DIR}/expr_evaluator.cpp
  ${NVFUSER_SRCS_DIR}/expr_simplifier.cpp
  ${NVFUSER_SRCS_DIR}/fusion.cpp
  ${NVFUSER_SRCS_DIR}/fusion_guard.cpp
  ${NVFUSER_SRCS_DIR}/fusion_segmenter.cpp
  ${NVFUSER_SRCS_DIR}/global_allocator.cpp
  ${NVFUSER_SRCS_DIR}/grouped_reduction.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/container.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/evaluator.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/host_ir.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/lower.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/lower_to_communication.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/lowering.cpp
  ${NVFUSER_SRCS_DIR}/id_model/circular_buffer_indexing.cpp
  ${NVFUSER_SRCS_DIR}/id_model/contiguity.cpp
  ${NVFUSER_SRCS_DIR}/id_model/id_model.cpp
  ${NVFUSER_SRCS_DIR}/id_model/id_model_index_compute.cpp
  ${NVFUSER_SRCS_DIR}/id_model/indexing.cpp
  ${NVFUSER_SRCS_DIR}/id_model/indexing_traversal.cpp
  ${NVFUSER_SRCS_DIR}/id_model/loop_promotion.cpp
  ${NVFUSER_SRCS_DIR}/id_model/predicate_indexing.cpp
  ${NVFUSER_SRCS_DIR}/id_model/schedule.cpp
  ${NVFUSER_SRCS_DIR}/id_model/to_string.cpp
  ${NVFUSER_SRCS_DIR}/id_model/transform_replay.cpp
  ${NVFUSER_SRCS_DIR}/id_model/validation_utils.cpp
  ${NVFUSER_SRCS_DIR}/index_compute.cpp
  ${NVFUSER_SRCS_DIR}/instrumentation.cpp
  ${NVFUSER_SRCS_DIR}/interval_analysis.cpp
  ${NVFUSER_SRCS_DIR}/ir/allocation_utils.cpp
  ${NVFUSER_SRCS_DIR}/ir/base_nodes.cpp
  ${NVFUSER_SRCS_DIR}/ir/builder.cpp
  ${NVFUSER_SRCS_DIR}/ir/cloner.cpp
  ${NVFUSER_SRCS_DIR}/ir/container.cpp
  ${NVFUSER_SRCS_DIR}/ir/graphviz.cpp
  ${NVFUSER_SRCS_DIR}/ir/iostream.cpp
  ${NVFUSER_SRCS_DIR}/ir/nodes.cpp
  ${NVFUSER_SRCS_DIR}/ir/printer.cpp
  ${NVFUSER_SRCS_DIR}/ir/utils.cpp
  ${NVFUSER_SRCS_DIR}/iter_visitor.cpp
  ${NVFUSER_SRCS_DIR}/kernel.cpp
  ${NVFUSER_SRCS_DIR}/kernel_db/kernel_db.cpp
  ${NVFUSER_SRCS_DIR}/kernel_db/utils.cpp
  ${NVFUSER_SRCS_DIR}/kernel_ir.cpp
  ${NVFUSER_SRCS_DIR}/kernel_ir_dispatch.cpp
  ${NVFUSER_SRCS_DIR}/logical_domain_map.cpp
  ${NVFUSER_SRCS_DIR}/mma_type.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/communication.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/communicator.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/cuda_p2p.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/ipc_handle.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/device_mesh.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/executor.cpp
  ${NVFUSER_SRCS_DIR}/multidevice/utils.cpp
  ${NVFUSER_SRCS_DIR}/mutator.cpp
  ${NVFUSER_SRCS_DIR}/ops/alias.cpp
  ${NVFUSER_SRCS_DIR}/ops/arith.cpp
  ${NVFUSER_SRCS_DIR}/ops/composite.cpp
  ${NVFUSER_SRCS_DIR}/ops/indexing.cpp
  ${NVFUSER_SRCS_DIR}/ops/normalization.cpp
  ${NVFUSER_SRCS_DIR}/ops/utils.cpp
  ${NVFUSER_SRCS_DIR}/options.cpp
  ${NVFUSER_SRCS_DIR}/parallel_dimension_map.cpp
  ${NVFUSER_SRCS_DIR}/parallel_type_bitmap.cpp
  ${NVFUSER_SRCS_DIR}/polymorphic_value.cpp
  ${NVFUSER_SRCS_DIR}/predicate_compute.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/add_axioms.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/allocation_order_inference.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/consecutive_cast.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/exact_mapped_extent_substitution.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/decompose_reshardings.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/finalize_multidevice_domains.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/mark_aliases_prepare.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/move_gather.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/move_pad.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/move_repeat_forward.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/move_split_cat.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/pre_segmenter.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/propagate_shardings.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/remove_bcast_squeeze.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/pass/convert_op_to_communication.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/pass/stream_parallel_type.cpp
  ${NVFUSER_SRCS_DIR}/host_ir/pass/insert_deallocations.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp
  ${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp
  ${NVFUSER_SRCS_DIR}/rng.cpp
  ${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
  ${NVFUSER_SRCS_DIR}/runtime/communication_executor.cpp
  ${NVFUSER_SRCS_DIR}/runtime/compiled_kernel.cpp
  ${NVFUSER_SRCS_DIR}/runtime/cutlass_executor.cpp
  ${NVFUSER_SRCS_DIR}/runtime/cutlass_compiled_kernel.cpp
  ${NVFUSER_SRCS_DIR}/runtime/executor.cpp
  ${NVFUSER_SRCS_DIR}/runtime/executor_dispatch.cpp
  ${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
  ${NVFUSER_SRCS_DIR}/runtime/executor_params.cpp
  ${NVFUSER_SRCS_DIR}/runtime/executor_utils.cpp
  ${NVFUSER_SRCS_DIR}/runtime/fusion_cache_utils.cpp
  ${NVFUSER_SRCS_DIR}/runtime/fusion_executor_cache.cpp
  ${NVFUSER_SRCS_DIR}/runtime/fusion_kernel_runtime.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/cache_policy_refiner.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/cutlass.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/heuristic.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/greedy.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/mark_aliases.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/matmul.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/matmul_ampere-.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/matmul_hopper+.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/matmul_heuristic_plugin.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/matmul_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/no_op.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/communication.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer_tma_ws.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer_multi_wave.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_outer.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/normalization_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/pointwise.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/pointwise_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/reduction.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/reduction_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/registry.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/registry_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/resize.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/runtime_info.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/scheduler_types.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/tools/domain_map.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/tools/inlining.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/tools/maxinfo_propagator.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/tools/resize_utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/tools/static_repeat.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
  ${NVFUSER_SRCS_DIR}/scheduler/expr_eval_sched.cpp
  ${NVFUSER_SRCS_DIR}/serde/polymorphic_value.cpp
  ${NVFUSER_SRCS_DIR}/serde/utils.cpp
  ${NVFUSER_SRCS_DIR}/statement_guard.cpp
  ${NVFUSER_SRCS_DIR}/swizzle.cpp
  ${NVFUSER_SRCS_DIR}/sys_utils.cpp
  ${NVFUSER_SRCS_DIR}/tensor_metadata.cpp
  ${NVFUSER_SRCS_DIR}/tensor_view.cpp
  ${NVFUSER_SRCS_DIR}/tma.cpp
  ${NVFUSER_SRCS_DIR}/transform_iter.cpp
  ${NVFUSER_SRCS_DIR}/transform_replay.cpp
  ${NVFUSER_SRCS_DIR}/transform_rfactor.cpp
  ${NVFUSER_SRCS_DIR}/transform_view.cpp
  ${NVFUSER_SRCS_DIR}/type.cpp
  ${NVFUSER_SRCS_DIR}/type_promotion.cpp
  ${NVFUSER_SRCS_DIR}/utils.cpp
  ${NVFUSER_SRCS_DIR}/val_graph.cpp
  ${NVFUSER_SRCS_DIR}/val_graph_visitor.cpp
  ${NVFUSER_SRCS_DIR}/validator_utils.cpp
)

# NVFUSER_HOST_IR_JIT defaults to ON but is only user-selectable while
# USE_HOST_IR_JIT is true; otherwise it is forced to OFF.
cmake_dependent_option(NVFUSER_HOST_IR_JIT "Build nvFuser with LLVM" ON "USE_HOST_IR_JIT" OFF)
message(STATUS "Setting NVFUSER_HOST_IR_JIT=${NVFUSER_HOST_IR_JIT}")

if(NVFUSER_HOST_IR_JIT)
  add_compile_definitions(NVFUSER_HOST_IR_JIT)

  # Add LLVM JIT related dependencies
  find_package(LLVM 18.1 REQUIRED CONFIG)
  llvm_map_components_to_libnames(LLVM_LIBS
    support
    core
    orcjit
    executionengine
    irreader
    nativecodegen
    Target
    Analysis
    JITLink
    Demangle
  )

  # Interface-only target bundling LLVM's include directories, definitions
  # and libraries so that consumers pick them all up with a single link.
  add_library(LLVM_JIT INTERFACE)
  target_include_directories(LLVM_JIT INTERFACE ${LLVM_INCLUDE_DIRS})
  target_compile_definitions(LLVM_JIT INTERFACE ${LLVM_DEFINITIONS})
  target_link_libraries(LLVM_JIT INTERFACE ${LLVM_LIBS})

  # Add LLVM JIT related sources
  list(APPEND NVFUSER_SRCS
    ${NVFUSER_SRCS_DIR}/host_ir/jit.cpp
  )
endif()

# We don't link CUPTI for MSVC
if(NOT MSVC)
  list(APPEND NVFUSER_SRCS
    ${NVFUSER_SRCS_DIR}/fusion_profiler.cpp
  )
endif()

if(BUILD_PYTHON)
  list(APPEND NVFUSER_SRCS
    ${NVFUSER_PYTHON_BINDINGS}/fusion_cache.cpp
    ${NVFUSER_PYTHON_BINDINGS}/fusion_definition.cpp
    ${NVFUSER_PYTHON_BINDINGS}/fusion_state.cpp
    ${NVFUSER_PYTHON_BINDINGS}/segmentation.cpp
    ${NVFUSER_PYTHON_BINDINGS}/translation.cpp
    ${NVFUSER_PYTHON_BINDINGS}/translation_utils.cpp
    ${NVFUSER_SRCS_DIR}/serde/fusion_record.cpp
    ${NVFUSER_PYTHON_COMMON}/distributed_tensor.cpp
    ${NVFUSER_PYTHON_COMMON}/python_utils.cpp
    ${NVFUSER_PYTHON_COMMON}/translation_names.cpp
  )
endif()

# We create both static and shared libraries.
#
# Shared libraries are what ships, but a large advantage of static libraries is
# that symbols are all visible. This allows us to test internal components
# inside our test or benchmark binaries, even if we do not want said components
# to be visible to the outside. If we used only shared libraries, then any API
# we invoked from test binaries would need to be marked as public, even if we
# did not want to expose it to users.
#
# Note technically we create an "OBJECT" library instead of a "STATIC" library.
# This is just a CMake quirk; an OBJECT library is a better way to implement a
# "private" (not installed) static library.
add_library(codegen_internal OBJECT ${NVFUSER_SRCS})

if(NOT MSVC)
  # Warning policy shared by every non-MSVC compiler.
  target_compile_options(codegen_internal PRIVATE
    -Wall -Wno-unused-function
    -Werror
  )
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    # These warnings are not treated as errors because of gcc 12.2 used in
    # the manylinux image. Consider enabling them when we upgrade.
    # Link to the discussion:
    # https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
    target_compile_options(codegen_internal PRIVATE
      -Wno-error=restrict
      -Wno-error=stringop-overflow
      -Wno-error=maybe-uninitialized
    )
  endif()
endif()

target_compile_definitions(codegen_internal PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
target_include_directories(codegen_internal PUBLIC ${NVFUSER_PYTHON_DIR})
target_include_directories(codegen_internal PUBLIC ${NVFUSER_PYTHON_COMMON})
# Use the project-anchored third_party path (NVFUSER_THIRD_PARTY_DIR is
# derived from PROJECT_SOURCE_DIR) rather than CMAKE_SOURCE_DIR, which points
# at the enclosing project's root when nvFuser is built as a subproject.
target_include_directories(codegen_internal SYSTEM
  PUBLIC ${NVFUSER_THIRD_PARTY_DIR}/flatbuffers/include
  PRIVATE
  ${CUDA_INCLUDE_DIRS}
)
# NOTE(review): the bare "$" entries are neither paths nor generator
# expressions. Presumably build/install interface wrappers were lost -
# confirm against version control.
target_include_directories(codegen_internal PUBLIC
  "$"
  "$"
)
set_target_properties(codegen_internal PROPERTIES
  C_STANDARD ${NVFUSER_C_STANDARD}
  CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
  CXX_STANDARD ${NVFUSER_CPP_STANDARD}
  CXX_STANDARD_REQUIRED ON
  CXX_VISIBILITY_PRESET hidden
  # this is to find pip installed nvrtc.so
  INSTALL_RPATH
  "$ORIGIN/../../nvidia/cuda_runtime/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../../torch/lib"
  POSITION_INDEPENDENT_CODE Yes
  VISIBILITY_INLINES_HIDDEN Yes
)

# Ensure we don't link against libcuda; we'll dlopen it ourselves.
list(FILTER TORCH_LIBRARIES EXCLUDE REGEX "libcuda\.so") target_link_libraries(codegen_internal PUBLIC dynamic_type CUDA::cupti ${TORCH_LIBRARIES} dl ) if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) target_link_libraries(codegen_internal PUBLIC nvf_cutlass) target_compile_definitions(codegen_internal PRIVATE "-DNVFUSER_CUTLASS_KERNEL_ENABLED") endif() if(NVFUSER_HOST_IR_JIT) target_link_libraries(codegen_internal PUBLIC LLVM_JIT) endif() add_library(nvfuser_codegen SHARED $) if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) target_link_libraries(nvfuser_codegen PUBLIC nvf_cutlass) endif() # Conditionally link CUTLASS using generator expression to avoid export issues target_link_libraries(nvfuser_codegen PRIVATE $<$,$>:nvf_cutlass> ) if(NVFUSER_BUILD_WITH_ASAN) target_compile_options(codegen_internal PRIVATE -fsanitize=address) target_link_options(codegen_internal PUBLIC -fsanitize=address) target_link_options(nvfuser_codegen PUBLIC -fsanitize=address) endif() target_include_directories(nvfuser_codegen PUBLIC # Core nvfuser "$" # Python-specific "$" "$" # External dependencies "$" "$" ) target_include_directories(nvfuser_codegen SYSTEM PUBLIC "$" "$" ) target_link_libraries(nvfuser_codegen PUBLIC ${TORCH_LIBRARIES} PRIVATE dynamic_type flatbuffers ${CUDA_NVRTC_LIB} CUDA::cupti dl $<$:LLVM_JIT> ) set_target_properties(nvfuser_codegen PROPERTIES C_STANDARD ${NVFUSER_C_STANDARD} CUDA_STANDARD ${NVFUSER_CUDA_STANDARD} CXX_STANDARD ${NVFUSER_CPP_STANDARD} CXX_STANDARD_REQUIRED ON CXX_VISIBILITY_PRESET hidden INSTALL_RPATH "$ORIGIN/../../nvidia/cuda_runtime/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../../torch/lib:$ORIGIN" POSITION_INDEPENDENT_CODE Yes VISIBILITY_INLINES_HIDDEN Yes ) # Add dead code elimination flags to reduce file size if(NOT MSVC) target_link_options(nvfuser_codegen PRIVATE "-Wl,--gc-sections" "-Wl,--as-needed" $<$:-s> ) 
target_compile_options(nvfuser_codegen PRIVATE "-ffunction-sections" "-fdata-sections" ) endif() install(TARGETS nvfuser_codegen EXPORT NvfuserTargets DESTINATION lib) # We are keeping fusion_cache_generated.h for the submodule build because flatc is unavailable. add_custom_command( OUTPUT ${NVFUSER_ROOT}/csrc/serde/fusion_cache_generated.h DEPENDS ${NVFUSER_ROOT}/csrc/serde/fusion_cache.fbs DEPENDS flatc COMMAND ${CMAKE_CURRENT_BINARY_DIR}/third_party/flatbuffers/flatc --scoped-enums -o ${NVFUSER_ROOT}/csrc/serde/ -c -b ${NVFUSER_ROOT}/csrc/serde/fusion_cache.fbs COMMENT "Generating fusion_cache_generated header from fusion_cache.fbs" VERBATIM ) add_custom_target(build_flatbuffer_config ALL DEPENDS ${NVFUSER_ROOT}/csrc/serde/fusion_cache_generated.h) if(NVFUSER_STANDALONE_BUILD_WITH_UCC) # User may need to set env vars UCC_DIR, UCX_DIR, UCC_HOME, UCX_HOME for CMake's Find_UCC to work. find_package(UCC REQUIRED) find_package(UCX REQUIRED) add_library(__nvfuser_ucc INTERFACE) set_target_properties(__nvfuser_ucc PROPERTIES C_STANDARD ${NVFUSER_C_STANDARD} CUDA_STANDARD ${NVFUSER_CUDA_STANDARD} CXX_STANDARD ${NVFUSER_CPP_STANDARD} CXX_STANDARD_REQUIRED ON CXX_VISIBILITY_PRESET hidden POSITION_INDEPENDENT_CODE Yes VISIBILITY_INLINES_HIDDEN Yes ) target_link_libraries(__nvfuser_ucc INTERFACE ucx::ucs ucx::ucp ucc::ucc) target_include_directories(__nvfuser_ucc INTERFACE ${UCC_INCLUDE_DIRS}) target_link_libraries(codegen_internal PRIVATE __nvfuser_ucc) target_compile_definitions(codegen_internal PRIVATE NVFUSER_BUILD_WITH_UCC) endif() add_dependencies(codegen_internal flatc build_flatbuffer_config) # installing nvfuser headers install(DIRECTORY "${NVFUSER_SRCS_DIR}/" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvfuser" FILES_MATCHING PATTERN "*.h" PATTERN "csrc/C++20/compare" PATTERN "csrc/C++23/utility" PATTERN "csrc/struct.inl") # TODO guard including flatbuffers headers # installing flatbuffers headers install(DIRECTORY "${NVFUSER_THIRD_PARTY_DIR}/flatbuffers/include/" 
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvfuser") # installing dynamic_type headers install(DIRECTORY "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvfuser") if(BUILD_PYTHON) # ----------------------------- # build nvfuser python library # ----------------------------- # nvfuser python API sources set(NVFUSER_PYTHON_SRCS) list(APPEND NVFUSER_PYTHON_SRCS ${NVFUSER_PYTHON_BINDINGS}/multidevice_bindings.cpp ${NVFUSER_PYTHON_BINDINGS}/python_bindings.cpp ${NVFUSER_PYTHON_BINDINGS}/python_bindings_extension.cpp ${NVFUSER_PYTHON_BINDINGS}/schedule_bindings.cpp ) add_library(nvf_py_internal OBJECT ${NVFUSER_PYTHON_SRCS}) target_include_directories(nvf_py_internal PUBLIC ${NVFUSER_PYTHON_DIR}) target_include_directories(nvf_py_internal PUBLIC ${NVFUSER_PYTHON_COMMON}) target_include_directories(nvf_py_internal PUBLIC ${NVFUSER_CUTLASS}) target_include_directories(nvf_py_internal SYSTEM INTERFACE ${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include ) # setup python API version add_custom_command( OUTPUT ${NVFUSER_PYTHON_DIR}/nvfuser/version.py COMMAND "${PYTHON_EXECUTABLE}" -c \"from pathlib import Path\; Path('${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py') .touch() \" COMMAND "${PYTHON_EXECUTABLE}" ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py nvfuser DEPENDS ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py DEPENDS ${NVFUSER_PYTHON_DIR}/version.txt WORKING_DIRECTORY ${NVFUSER_PYTHON_DIR}/tools/ ) add_custom_target( gen_nvfuser_version ALL DEPENDS ${NVFUSER_PYTHON_DIR}/nvfuser/version.py ) add_dependencies(nvf_py_internal gen_nvfuser_version) target_compile_definitions(nvf_py_internal PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB" "-DC10_BUILD_MAIN_LIB=1" EXTENSION_NAME=_C ) add_library(nvfuser MODULE $) target_compile_definitions(nvfuser PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB" "-DC10_BUILD_MAIN_LIB=1" EXTENSION_NAME=_C ) if(NOT MSVC) target_compile_options(nvf_py_internal PRIVATE -Wall -Wno-unused-function) 
target_compile_options(nvf_py_internal PRIVATE -Werror) # Add function/data sections for dead code elimination target_compile_options(nvf_py_internal PRIVATE "-ffunction-sections" "-fdata-sections" ) set(NVF_LIB_SUFFIX ".so") else() set(NVF_LIB_SUFFIX ".pyd") endif() set_target_properties(nvfuser PROPERTIES C_STANDARD ${NVFUSER_C_STANDARD} CUDA_STANDARD ${NVFUSER_CUDA_STANDARD} CXX_STANDARD ${NVFUSER_CPP_STANDARD} CXX_STANDARD_REQUIRED ON CXX_VISIBILITY_PRESET hidden INSTALL_RPATH "$ORIGIN/lib:$ORIGIN/../nvfuser_common/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../torch/lib" POSITION_INDEPENDENT_CODE Yes SUFFIX ${NVF_LIB_SUFFIX} VISIBILITY_INLINES_HIDDEN Yes ) set_target_properties(nvf_py_internal PROPERTIES C_STANDARD ${NVFUSER_C_STANDARD} CUDA_STANDARD ${NVFUSER_CUDA_STANDARD} CXX_STANDARD ${NVFUSER_CPP_STANDARD} CXX_STANDARD_REQUIRED ON CXX_VISIBILITY_PRESET hidden INSTALL_RPATH "$ORIGIN/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../torch/lib" POSITION_INDEPENDENT_CODE Yes VISIBILITY_INLINES_HIDDEN Yes ) if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) target_link_libraries(nvf_py_internal PRIVATE nvf_cutlass) endif() if (NOT MSVC) target_link_libraries(nvf_py_internal PRIVATE CUDA::cupti) endif() target_link_libraries(nvf_py_internal PRIVATE nvfuser_codegen "${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so" pybind11::pybind11 pybind11::headers ) target_link_libraries(nvfuser PRIVATE nvf_py_internal Python::Module ) # Add dead code elimination flags to reduce file size if(NOT MSVC) target_link_options(nvfuser PRIVATE "-Wl,--gc-sections" "-Wl,--as-needed" $<$:-s> ) endif() set_target_properties(nvfuser PROPERTIES INSTALL_RPATH "$ORIGIN:$ORIGIN/lib:$ORIGIN/../build:$ORIGIN/../nvfuser_common/lib" ) install(TARGETS nvfuser DESTINATION lib) # ------------------------------------------------ # 
build nvfuser next python library # ------------------------------------------------ # nvfuser next bindings API sources set(NVFUSER_PYTHON_DIRECT_SRCS) list(APPEND NVFUSER_PYTHON_DIRECT_SRCS ${NVFUSER_PYTHON_DIRECT_BINDINGS}/extension.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/bindings.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/enum.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/heuristic_params.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/ir.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/multidevice.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/ops.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/cutlass.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/runtime.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/direct_utils.cpp ${NVFUSER_PYTHON_DIRECT_BINDINGS}/python_translate.cpp ) add_library(nvf_py_direct_internal OBJECT ${NVFUSER_PYTHON_DIRECT_SRCS}) # setup python API version add_custom_command( OUTPUT ${NVFUSER_PYTHON_DIR}/nvfuser_direct/version.py COMMAND "${PYTHON_EXECUTABLE}" -c \"from pathlib import Path\; Path('${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py') .touch() \" COMMAND "${PYTHON_EXECUTABLE}" ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py nvfuser_direct DEPENDS ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py DEPENDS ${NVFUSER_PYTHON_DIR}/version.txt WORKING_DIRECTORY ${NVFUSER_PYTHON_DIR}/tools/ ) add_custom_target( gen_nvfuser_direct_version ALL DEPENDS ${NVFUSER_PYTHON_DIR}/nvfuser_direct/version.py ) add_dependencies(nvf_py_direct_internal gen_nvfuser_direct_version) # NOTE: For any future extension, change PYTHON_DIRECT_EXTENSION to another # name other than EXTENSION_NAME. 
# --- nvfuser_direct Python extension -----------------------------------------
# NOTE: the commands down to the first unindented endif() below belong to a
# conditional that is opened earlier in this file.
  target_compile_definitions(nvf_py_direct_internal PRIVATE
    "-DTORCH_CUDA_BUILD_MAIN_LIB"
    "-DC10_BUILD_MAIN_LIB=1"
    PYTHON_DIRECT_EXTENSION=_C_DIRECT
  )

  # The loadable Python module is assembled from the object files of
  # nvf_py_direct_internal (presumably an OBJECT library defined above --
  # verify against its add_library() call).
  add_library(nvfuser_direct MODULE $<TARGET_OBJECTS:nvf_py_direct_internal>)
  target_compile_definitions(nvfuser_direct PRIVATE
    "-DTORCH_CUDA_BUILD_MAIN_LIB"
    "-DC10_BUILD_MAIN_LIB=1"
    PYTHON_DIRECT_EXTENSION=_C_DIRECT
  )

  if(NOT MSVC)
    target_compile_options(nvf_py_direct_internal PRIVATE
      -Wall -Wno-unused-function)
    target_compile_options(nvf_py_direct_internal PRIVATE -Werror)
    # Add function/data sections for dead code elimination
    target_compile_options(nvf_py_direct_internal PRIVATE
      "-ffunction-sections"
      "-fdata-sections"
    )
    set(NVF_LIB_SUFFIX ".so")
  else()
    set(NVF_LIB_SUFFIX ".pyd")
  endif()

  set_target_properties(nvf_py_direct_internal PROPERTIES
    C_STANDARD ${NVFUSER_C_STANDARD}
    CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
    CXX_STANDARD ${NVFUSER_CPP_STANDARD}
    CXX_STANDARD_REQUIRED ON
    CXX_VISIBILITY_PRESET hidden
    INSTALL_RPATH
      "$ORIGIN/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../torch/lib"
    POSITION_INDEPENDENT_CODE Yes
    VISIBILITY_INLINES_HIDDEN Yes
  )
  set_target_properties(nvfuser_direct PROPERTIES
    C_STANDARD ${NVFUSER_C_STANDARD}
    CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
    CXX_STANDARD ${NVFUSER_CPP_STANDARD}
    CXX_STANDARD_REQUIRED ON
    CXX_VISIBILITY_PRESET hidden
    INSTALL_RPATH
      "$ORIGIN/lib:$ORIGIN/../nvfuser_common/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../torch/lib"
    POSITION_INDEPENDENT_CODE Yes
    SUFFIX ${NVF_LIB_SUFFIX}
    VISIBILITY_INLINES_HIDDEN Yes
  )

  target_include_directories(nvf_py_direct_internal PUBLIC
    ${NVFUSER_PYTHON_DIRECT_BINDINGS})
  target_include_directories(nvf_py_direct_internal PUBLIC
    ${NVFUSER_PYTHON_COMMON})

  # nvf_cutlass is only linked when CUTLASS is enabled and the CUDA compiler is
  # at least 12.8.
  if(BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
    target_link_libraries(nvf_py_direct_internal PRIVATE
      nvfuser_codegen
      nvf_cutlass
      "${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so"
      pybind11::pybind11
      pybind11::headers
    )
  else()
    target_link_libraries(nvf_py_direct_internal PRIVATE
      nvfuser_codegen
      "${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so"
      pybind11::pybind11
      pybind11::headers
    )
  endif()

  target_link_libraries(nvfuser_direct PRIVATE
    nvf_py_direct_internal
    Python::Module
  )

  # Add dead code elimination flags to reduce file size
  if(NOT MSVC)
    target_link_options(nvfuser_direct PRIVATE
      "-Wl,--gc-sections"
      "-Wl,--as-needed"
      # Strip symbols in Release builds only.
      $<$<CONFIG:Release>:-s>
    )
  endif()

  # NOTE: this assignment replaces the INSTALL_RPATH value given to
  # nvfuser_direct in the property list above; the last one set wins.
  set_target_properties(nvfuser_direct PROPERTIES
    INSTALL_RPATH
      "$ORIGIN:$ORIGIN/../build:$ORIGIN/../nvfuser_common/lib"
  )
  install(TARGETS nvfuser_direct DESTINATION lib)
endif()

# --- C++ test sources ---------------------------------------------------------
# Sources compiled into the monolithic test_nvfuser binary.
set(JIT_TEST_SRCS)
list(APPEND JIT_TEST_SRCS
  ${NVFUSER_ROOT}/tests/cpp/kernel_db/test_nvfuser_kernel_db_open.cpp
  ${NVFUSER_ROOT}/tests/cpp/kernel_db/test_nvfuser_kernel_db_query.cpp
  ${NVFUSER_ROOT}/tests/cpp/kernel_db/test_nvfuser_kernel_db_write.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_abstract_tensor.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_alias.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_alias_analysis.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_allocation_domain.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_allocation_order_inference.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_bfs.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_ca_root_domain_map.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_circular_buffering.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_circular_buffering_ping_pong.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_combined_inner_outer_reduction.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_compute_at_map.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_compute_with.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_contiguity_id_model.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_driver_api.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_dynamic_transform.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_embedding_node.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_evaluator.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_exceptions.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_expr_simplifier.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_expr_sort.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_gather.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_gpu1.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_gpu2.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_gpu3.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_index_put.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_index_select.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_interval_analysis.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_iostream.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_loop_rotation.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_low_precision_recipe.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_math_opt.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_mbarrier.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_memory.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_move_pad.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_move_repeat_forward.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_move_split_cat.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_mutator.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_no_op.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_outer_reduction.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_overlap.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_persistent_buffer.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_pointwise.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_polymorphic_value.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_predicate_elimination.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_preseg_passes.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_reduction.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_reduction_pointwise.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_remove_bcast_squeeze.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_remove_trivial_ops.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_replay.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_resharding.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_resize.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_rope.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_runtime.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_scalar_hoisting.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_scatter.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_sdpa_node.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_segmentation.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_select.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_serial_gridreduce.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_sharding.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_smem_reuse.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_statement_guard.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_stream.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_swizzle.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_tensor_factories.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_tmem.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_transpose.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_unary.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_utils.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_vectorization.cpp
  ${NVFUSER_ROOT}/tests/cpp/test_welford.cpp
)

# --- shared libraries holding hand-written CUDA kernels used by the tests -----
# NOTE(review): these libraries set CXX_STANDARD, but their kernels are .cu
# sources, whose language level is governed by CUDA_STANDARD -- confirm that
# CXX_STANDARD is the property that is intended here.
if(BUILD_TEST)
  set(RNG_TEST_KERNELS "rng_test_kernels")
  add_library(${RNG_TEST_KERNELS} SHARED
    ${NVFUSER_ROOT}/tests/cpp/rng_kernels.cu)
  # CUDA 11 does not support C++20, so hard code C++17 here
  set_property(TARGET ${RNG_TEST_KERNELS} PROPERTY CXX_STANDARD 17)
  target_link_libraries(${RNG_TEST_KERNELS} PRIVATE torch ${TORCH_LIBRARIES})
  target_include_directories(${RNG_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}")

  set(ARGSORT_TEST_KERNELS "argsort_test_kernels")
  add_library(${ARGSORT_TEST_KERNELS} SHARED
    ${NVFUSER_ROOT}/tests/cpp/argsort_test_kernels.cu)
  # CUDA 11 does not support C++20, so hard code C++17 here
  set_property(TARGET ${ARGSORT_TEST_KERNELS} PROPERTY CXX_STANDARD 17)
  target_link_libraries(${ARGSORT_TEST_KERNELS} PRIVATE
    torch ${TORCH_LIBRARIES})
  target_include_directories(${ARGSORT_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}")
  target_include_directories(${ARGSORT_TEST_KERNELS} SYSTEM PRIVATE
    ${NVFUSER_ROOT}/third_party/googletest/googletest/include
    ${NVFUSER_ROOT}/third_party/googletest/googlemock/include
  )

  set(TOPK_TEST_KERNELS "topk_test_kernels")
  add_library(${TOPK_TEST_KERNELS} SHARED
    ${NVFUSER_ROOT}/tests/cpp/topk_test_kernels.cu
    ${NVFUSER_ROOT}/tests/cpp/topk_test_helper.cpp
  )
  # CUDA 11 does not support C++20, so hard code C++17 here
  set_property(TARGET ${TOPK_TEST_KERNELS} PROPERTY CXX_STANDARD 17)
  target_link_libraries(${TOPK_TEST_KERNELS} PRIVATE torch ${TORCH_LIBRARIES})
  target_include_directories(${TOPK_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}")
  # NVFUSER_SRCS_DIR is rooted at this project's source dir, so the path stays
  # valid when nvFuser is configured as a subproject of a larger build.
  target_include_directories(${TOPK_TEST_KERNELS} PRIVATE
    ${NVFUSER_SRCS_DIR}
  )
  target_include_directories(${TOPK_TEST_KERNELS} SYSTEM PRIVATE
    ${NVFUSER_ROOT}/third_party/googletest/googletest/include
    ${NVFUSER_ROOT}/third_party/googletest/googlemock/include
  )

  set(SCAN_TEST_KERNELS "scan_test_kernels")
  add_library(${SCAN_TEST_KERNELS} SHARED
    ${NVFUSER_ROOT}/tests/cpp/scan_test_kernels.cu
    ${NVFUSER_ROOT}/tests/cpp/scan_test_helper.cpp
  )
  # CUDA 11 does not support C++20, so hard code C++17 here
  set_property(TARGET ${SCAN_TEST_KERNELS} PROPERTY CXX_STANDARD 17)
  target_link_libraries(${SCAN_TEST_KERNELS} PRIVATE torch ${TORCH_LIBRARIES})
  target_include_directories(${SCAN_TEST_KERNELS} PRIVATE "${NVFUSER_ROOT}")
  target_include_directories(${SCAN_TEST_KERNELS} PRIVATE
    ${NVFUSER_SRCS_DIR}
  )
  target_include_directories(${SCAN_TEST_KERNELS} SYSTEM PRIVATE
    ${NVFUSER_ROOT}/third_party/googletest/googletest/include
    ${NVFUSER_ROOT}/third_party/googletest/googlemock/include
  )
endif()

# add_test_without_main(<TEST_NAME> <TEST_SRC> <ADDITIONAL_LINK>)
#
# Creates the test executable <TEST_NAME>.
#   TEST_NAME        name of the executable target to create
#   TEST_SRC         list of source files (pass a quoted list); the shared
#                    tests/cpp/utils.cpp and validator.cpp are appended
#   ADDITIONAL_LINK  list of extra libraries to link (may be "")
# The target is compiled with USE_GTEST defined, at NVFUSER_CPP_STANDARD, and
# with -Wall -Werror on non-MSVC compilers. No gtest main() is linked; the
# sources must supply one.
function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
  list(APPEND TEST_SRC
    ${NVFUSER_ROOT}/tests/cpp/utils.cpp
    ${NVFUSER_ROOT}/tests/cpp/validator.cpp
  )
  add_executable(${TEST_NAME} ${TEST_SRC})
  set_property(TARGET ${TEST_NAME}
    PROPERTY CXX_STANDARD ${NVFUSER_CPP_STANDARD})
  target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)
  target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
  target_include_directories(${TEST_NAME} SYSTEM PRIVATE
    ${NVFUSER_ROOT}/third_party/googletest/googletest/include
    ${NVFUSER_ROOT}/third_party/googletest/googlemock/include
  )
  # Project-relative (not CMAKE_SOURCE_DIR) so subproject builds resolve it.
  target_include_directories(${TEST_NAME} PRIVATE
    ${NVFUSER_SRCS_DIR}
  )
  target_link_libraries(${TEST_NAME} PRIVATE
    codegen_internal
    ${ADDITIONAL_LINK}
    dynamic_type
    GTest::gtest
    GTest::gmock
    flatbuffers
    ${TORCH_LIBRARIES}
  )
  if(NOT MSVC)
    target_compile_options(${TEST_NAME} PRIVATE
      -Wall -Wno-unused-function
      -Werror
    )
  endif()
endfunction()

# add_test(<TEST_NAME> <TEST_SRC> <ADDITIONAL_LINK>)
#
# Same as add_test_without_main(), but additionally links GTest::gtest_main so
# the sources do not need their own main().
# NOTE: this helper has the same name as CMake's built-in add_test() command
# and therefore overrides it for the rest of the configure run. It is kept
# under this name because the call sites below depend on it.
function(add_test TEST_NAME TEST_SRC ADDITIONAL_LINK)
  list(APPEND ADDITIONAL_LINK "GTest::gtest_main")
  add_test_without_main("${TEST_NAME}" "${TEST_SRC}" "${ADDITIONAL_LINK}")
endfunction()

# --- test binaries ------------------------------------------------------------
# Every binary created here is appended to TEST_BINARIES.
if(BUILD_TEST)
  set(TEST_BINARIES)

  add_test(test_nvfuser "${JIT_TEST_SRCS}" "")
  list(APPEND TEST_BINARIES test_nvfuser)

  add_test(test_rng ${NVFUSER_ROOT}/tests/cpp/test_rng.cpp ${RNG_TEST_KERNELS})
  list(APPEND TEST_BINARIES test_rng)

  set(ARGSORT_TEST_SRCS)
  list(APPEND ARGSORT_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_argsort.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_argsort_device_func.cpp
  )
  add_test(test_argsort "${ARGSORT_TEST_SRCS}" ${ARGSORT_TEST_KERNELS})
  list(APPEND TEST_BINARIES test_argsort)

  set(SCAN_TEST_SRCS)
  list(APPEND SCAN_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_scan.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_scan_device_func.cpp
  )
  add_test(test_scan "${SCAN_TEST_SRCS}" ${SCAN_TEST_KERNELS})
  list(APPEND TEST_BINARIES test_scan)

  set(TOPK_TEST_SRCS)
  list(APPEND TOPK_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_topk.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_topk_device_func.cpp
  )
  add_test(test_topk "${TOPK_TEST_SRCS}" ${TOPK_TEST_KERNELS})
  list(APPEND TEST_BINARIES test_topk)

  set(MOE_TEST_SRCS)
  list(APPEND MOE_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_moe.cpp
  )
  add_test(test_moe "${MOE_TEST_SRCS}" "")
  list(APPEND TEST_BINARIES test_moe)

  # The multidevice binaries are built without GTest::gtest_main.
  set(MULTIDEVICE_TEST_SRCS)
  list(APPEND MULTIDEVICE_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/multidevice.cpp
    ${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir_overlap.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_communications.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_communicator.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_matmul.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_pipeline.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_sharding.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_stream_parallel_type.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_transformer.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_ipc.cpp
  )
  add_test_without_main(test_multidevice "${MULTIDEVICE_TEST_SRCS}" "")
  list(APPEND TEST_BINARIES test_multidevice)

  set(MULTIDEVICE_TUTORIAL_SRCS)
  list(APPEND MULTIDEVICE_TUTORIAL_SRCS
    ${NVFUSER_ROOT}/tests/cpp/multidevice.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_tutorial.cpp
  )
  add_test_without_main(tutorial_multidevice "${MULTIDEVICE_TUTORIAL_SRCS}" "")
  list(APPEND TEST_BINARIES tutorial_multidevice)

  add_test(test_reshape "${NVFUSER_ROOT}/tests/cpp/test_reshape.cpp" "")
  list(APPEND TEST_BINARIES test_reshape)

  add_test(test_layout_op ${NVFUSER_ROOT}/tests/cpp/test_layout_op.cpp "")
  list(APPEND TEST_BINARIES test_layout_op)

  set(MATMUL_TEST_SRCS)
  list(APPEND MATMUL_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_cutlass_scheduler.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_translate_mma.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_matmul.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_matmul_aten_evaluation.cpp
    # ${NVFUSER_ROOT}/tests/cpp/test_matmul_sass.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_matmul_scheduler.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_mma.cpp
  )
  add_test(test_matmul "${MATMUL_TEST_SRCS}" "")
  list(APPEND TEST_BINARIES test_matmul)

  add_test(test_greedy "${NVFUSER_ROOT}/tests/cpp/test_greedy.cpp" "")
  list(APPEND TEST_BINARIES test_greedy)

  add_test(test_external_src
    "${NVFUSER_ROOT}/tests/cpp/test_external_src.cpp" "")
  list(APPEND TEST_BINARIES test_external_src)

  set(TUTORIAL_SRCS)
  list(APPEND TUTORIAL_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp
    ${NVFUSER_ROOT}/tests/cpp/tutorial_tmem.cpp
    ${NVFUSER_ROOT}/tests/cpp/tutorial_ldmatrix_stmatrix.cpp)
  add_test(test_tutorial "${TUTORIAL_SRCS}" "")
  list(APPEND TEST_BINARIES test_tutorial)

  set(HOSTIR_TEST_SRCS)
# (still inside the if(BUILD_TEST) block opened above) host IR tests
  list(APPEND HOSTIR_TEST_SRCS
    ${NVFUSER_ROOT}/tests/cpp/test_host_ir_evaluator.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_host_ir_stream_lowering.cpp
    ${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
  )
  add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
  list(APPEND TEST_BINARIES test_host_ir)

  # The host IR JIT test additionally links LLVM_JIT and is only built when
  # NVFUSER_HOST_IR_JIT is enabled.
  if(NVFUSER_HOST_IR_JIT)
    set(LLVM_COMPILE_TEST_SRCS)
    list(APPEND LLVM_COMPILE_TEST_SRCS
      ${NVFUSER_ROOT}/tests/cpp/test_host_ir_jit.cpp
    )
    add_test(test_host_ir_jit "${LLVM_COMPILE_TEST_SRCS}" "")
    target_link_libraries(test_host_ir_jit PUBLIC LLVM_JIT)
    list(APPEND TEST_BINARIES test_host_ir_jit)
  endif()

  if(BUILD_PYTHON)
    set(PY_FRONTEND_TEST_SRCS)
    list(APPEND PY_FRONTEND_TEST_SRCS
      ${NVFUSER_PYTHON_DIR}/tests/python_frontend/test_nvfuser_fusion_cache.cpp
      ${NVFUSER_PYTHON_DIR}/tests/python_frontend/test_nvfuser_fusion_definition.cpp
      ${NVFUSER_PYTHON_DIR}/tests/python_frontend/test_nvfuser_fusion_record.cpp
    )
    add_test(test_python_frontend "${PY_FRONTEND_TEST_SRCS}" "")
    list(APPEND TEST_BINARIES test_python_frontend)
  endif()

  # We don't link CUPTI for MSVC
  if(NOT MSVC)
    add_test(test_profiler
      "${NVFUSER_ROOT}/tests/cpp/test_fusion_profiler.cpp" "")
    list(APPEND TEST_BINARIES test_profiler)
  endif()

  # Umbrella target that depends on every test binary registered above.
  add_custom_target(tests DEPENDS ${TEST_BINARIES})
endif()

# -- build benchmark
if(BUILD_NVFUSER_BENCHMARK)
  # nvfuser benchmark sources
  set(BENCHMARK_SRCS)
  list(APPEND BENCHMARK_SRCS
    ${NVFUSER_ROOT}/benchmarks/cpp/batch_norm_channels_first.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/batch_norm_channels_first_backward.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/batch_norm_channels_last.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/batch_norm_channels_last_backward.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/bert.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/broadcast.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/gelu_backward.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/gelu_backward_reduction.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/heuristic_cache.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/heuristic_lookup.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/indexselect.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/instance_norm.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/layer_norm.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/layer_norm_backward.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/layer_norm_fused.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/lstm_cell.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/main.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/many_pointwise_ops.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/matmul.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/reduction.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/rms_norm.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/rms_norm_backward.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/scale_bias_relu.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/shape_inference.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/softmax.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/softmax_backward.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/softmax_dropout.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/timm.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/transpose.cpp
    ${NVFUSER_ROOT}/benchmarks/cpp/utils.cpp
    ${NVFUSER_ROOT}/tests/cpp/utils.cpp
  )

  add_executable(nvfuser_bench ${BENCHMARK_SRCS})
  set_target_properties(nvfuser_bench PROPERTIES
    C_STANDARD ${NVFUSER_C_STANDARD}
    CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
    CXX_STANDARD ${NVFUSER_CPP_STANDARD}
    CXX_STANDARD_REQUIRED ON
    CXX_VISIBILITY_PRESET hidden
    POSITION_INDEPENDENT_CODE Yes
    VISIBILITY_INLINES_HIDDEN Yes
  )
  # NVFUSER_THIRD_PARTY_DIR is rooted at this project's source dir, so the
  # paths stay valid when nvFuser is configured as a subproject.
  target_include_directories(nvfuser_bench SYSTEM PRIVATE
    ${NVFUSER_THIRD_PARTY_DIR}/benchmark/include
    ${NVFUSER_THIRD_PARTY_DIR}/flatbuffers/include
    ${NVFUSER_THIRD_PARTY_DIR}/googletest/googletest/include
  )
  target_include_directories(nvfuser_bench PUBLIC ${NVFUSER_ROOT})
  target_link_libraries(nvfuser_bench PRIVATE
    GTest::gtest
    benchmark::benchmark
    codegen_internal
  )
  add_dependencies(nvfuser_bench flatc build_flatbuffer_config)
  if(NOT MSVC)
    target_compile_options(nvfuser_bench PRIVATE
      -Wall -Wno-unused-function
      -Werror -Wno-deprecated-copy
    )
  endif()

  # multidevice transformer benchmark
  if(NVFUSER_DISTRIBUTED)
    set(MULTIDEVICE_BENCHMARK_SRCS)
    list(APPEND MULTIDEVICE_BENCHMARK_SRCS
      ${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp
      ${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp
      ${NVFUSER_ROOT}/tests/cpp/utils.cpp
    )

    add_executable(nvfuser_multidevice_bench ${MULTIDEVICE_BENCHMARK_SRCS})
    set_target_properties(nvfuser_multidevice_bench PROPERTIES
      C_STANDARD ${NVFUSER_C_STANDARD}
      CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
      CXX_STANDARD ${NVFUSER_CPP_STANDARD}
      CXX_STANDARD_REQUIRED ON
      CXX_VISIBILITY_PRESET hidden
      POSITION_INDEPENDENT_CODE Yes
      VISIBILITY_INLINES_HIDDEN Yes
    )
    target_include_directories(nvfuser_multidevice_bench SYSTEM PRIVATE
      ${NVFUSER_THIRD_PARTY_DIR}/benchmark/include
      ${NVFUSER_THIRD_PARTY_DIR}/flatbuffers/include
      ${NVFUSER_THIRD_PARTY_DIR}/googletest/googletest/include
    )
    target_include_directories(nvfuser_multidevice_bench PUBLIC ${NVFUSER_ROOT})
    target_link_libraries(nvfuser_multidevice_bench PRIVATE
      GTest::gtest
      benchmark::benchmark
      codegen_internal
    )
    add_dependencies(nvfuser_multidevice_bench flatc build_flatbuffer_config)
    if(NOT MSVC)
      # Apply the warning flags to the multidevice benchmark itself; this
      # previously named nvfuser_bench, leaving this target unchecked.
      target_compile_options(nvfuser_multidevice_bench PRIVATE
        -Wall -Wno-unused-function
        -Werror -Wno-deprecated-copy
      )
    endif()
  endif()
endif()

# --- generate runtime files

# nvfuser runtime files
set(NVFUSER_RUNTIME_FILES)
list(APPEND NVFUSER_RUNTIME_FILES
  ${NVFUSER_ROOT}/runtime/argsort.cu
  ${NVFUSER_ROOT}/runtime/array.cu
  ${NVFUSER_ROOT}/runtime/basic_type_traits.cu
  ${NVFUSER_ROOT}/runtime/bf16_support.cu
  ${NVFUSER_ROOT}/runtime/bit.cu
  ${NVFUSER_ROOT}/runtime/block_reduction.cu
  ${NVFUSER_ROOT}/runtime/block_sync_atomic.cu
  ${NVFUSER_ROOT}/runtime/block_sync_default.cu
  ${NVFUSER_ROOT}/runtime/block_welford_outer.cu
  ${NVFUSER_ROOT}/runtime/block_layout.cu
  ${NVFUSER_ROOT}/runtime/broadcast.cu
  ${NVFUSER_ROOT}/runtime/casts.cu
  ${NVFUSER_ROOT}/runtime/cluster.cu
  ${NVFUSER_ROOT}/runtime/complex_number.cu
  ${NVFUSER_ROOT}/runtime/cub_utils.cu
  ${NVFUSER_ROOT}/runtime/fp16_support.cu
  ${NVFUSER_ROOT}/runtime/fp8_support.cu
  ${NVFUSER_ROOT}/runtime/fp4_support.cu
  ${NVFUSER_ROOT}/runtime/fused_reduction.cu
  ${NVFUSER_ROOT}/runtime/fused_welford_helper.cu
  ${NVFUSER_ROOT}/runtime/fused_welford_impl.cu
  ${NVFUSER_ROOT}/runtime/fused_welford_impl_outer.cu
  ${NVFUSER_ROOT}/runtime/grid_broadcast.cu
  ${NVFUSER_ROOT}/runtime/grid_reduction.cu
  ${NVFUSER_ROOT}/runtime/grid_sync.cu
  ${NVFUSER_ROOT}/runtime/helpers.cu
  ${NVFUSER_ROOT}/runtime/index_utils.cu
  ${NVFUSER_ROOT}/runtime/mbarrier.cu
  ${NVFUSER_ROOT}/runtime/memory.cu
  ${NVFUSER_ROOT}/runtime/random_numbers.cu
  ${NVFUSER_ROOT}/runtime/tensor_memory.cu
  ${NVFUSER_ROOT}/runtime/tensor.cu
  ${NVFUSER_ROOT}/runtime/topk.cu
  ${NVFUSER_ROOT}/runtime/scan.cu
  ${NVFUSER_ROOT}/runtime/tuple.cu
  ${NVFUSER_ROOT}/runtime/type_traits.cu
  ${NVFUSER_ROOT}/runtime/warp.cu
  ${NVFUSER_ROOT}/runtime/welford.cu
)

file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources")

# "stringify" NVFUSER runtime sources
# (generate C++ header files embedding the original input as a string literal)
set(NVFUSER_STRINGIFY_TOOL "${NVFUSER_ROOT}/tools/stringify_file.py")
foreach(src ${NVFUSER_RUNTIME_FILES})
  get_filename_component(filename ${src} NAME_WE)
  set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h")

  # Build-time rule: regenerate the header whenever the runtime source or the
  # stringify tool changes. VERBATIM keeps argument escaping identical across
  # generators and platforms.
  add_custom_command(
    COMMENT "Stringify NVFUSER runtime source file ${src}"
    OUTPUT ${dst}
    DEPENDS ${src} "${NVFUSER_STRINGIFY_TOOL}"
    COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}
    VERBATIM
  )
  add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst})
  add_dependencies(codegen_internal nvfuser_rt_${filename})

  # Configure-time generation: skip it when the existing header is already
  # newer than its source, which avoids unnecessary rebuilds. IS_NEWER_THAN is
  # also true when ${dst} does not exist yet, so a missing header is always
  # generated.
  if("${src}" IS_NEWER_THAN "${dst}")
    # also generate the resource headers during the configuration step
    # (so tools like clang-tidy can run w/o requiring a real build)
    execute_process(
      COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})
  endif()
endforeach()

target_include_directories(codegen_internal PRIVATE
  "${CMAKE_BINARY_DIR}/include")

# -- install nvfuser cmake config files and symlink to build binaries
install(EXPORT NvfuserTargets
  FILE NvfuserConfig.cmake
  DESTINATION share/cmake/nvfuser)
file(CREATE_LINK "${CMAKE_BINARY_DIR}" "${NVFUSER_ROOT}/bin" SYMBOLIC)

# These symbolic links help IDEs like Cursor resolve symbols in nvfuser and
# nvfuser_direct.
file(CREATE_LINK
  "${NVFUSER_ROOT}/python/nvfuser" "${NVFUSER_ROOT}/nvfuser" SYMBOLIC)
file(CREATE_LINK
  "${NVFUSER_ROOT}/python/nvfuser_direct" "${NVFUSER_ROOT}/nvfuser_direct" SYMBOLIC)

# --- configuration summary printed at the end of the configure step ----------
message(STATUS "")
message(STATUS "******** Nvfuser configuration summary ********")
message(STATUS " BUILD_CUTLASS: ${CUTLASS_STATUS}")
message(STATUS " UCC_FOUND: ${UCC_FOUND}")
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_HOST_IR_JIT : ${NVFUSER_HOST_IR_JIT}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")

# The UCC/UCX locations are read from the environment at configure time.
if(NVFUSER_STANDALONE_BUILD_WITH_UCC)
  message(STATUS " UCC_HOME: $ENV{UCC_HOME}")
  message(STATUS " UCC_DIR : $ENV{UCC_DIR}")
  message(STATUS " UCX_HOME: $ENV{UCX_HOME}")
  message(STATUS " UCX_DIR : $ENV{UCX_DIR}")
endif()
message(STATUS "******** End of Nvfuser configuration summary ********")