From 55d63b1211af9bd3a71d88f82f3bd5804d83b2cd Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 22 Aug 2024 08:28:52 -0400 Subject: [PATCH] [Bugfix] Don't build machete on cuda <12.0 (#7757) --- CMakeLists.txt | 64 +++++++++++--------- csrc/quantization/machete/machete_pytorch.cu | 12 ++++ 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 217dc70c..ab91b864 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,9 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) +# Suppress potential warnings about unused manually-specified variables +set(ignoreMe "${VLLM_PYTHON_PATH}") + # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -228,35 +231,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # - # For the Machete kernels we automatically generate sources for various - # preselected input type pairs and schedules. - # Generate sources: - execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH - ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py - RESULT_VARIABLE machete_generation_result - OUTPUT_VARIABLE machete_generation_output - OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log - ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log - ) + # Machete kernels - if (NOT machete_generation_result EQUAL 0) - message(FATAL_ERROR "Machete generation failed." 
- " Result: \"${machete_generation_result}\"" - "\nCheck the log for details: " - "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") - else() - message(STATUS "Machete generation completed successfully.") - endif() - - # Add machete generated sources - file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") - list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) - message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}") - - # See comment above for scaled_mm_c3x (same if condition) + # The machete kernels only work on hopper and require CUDA 12.0 or later. if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) + # + # For the Machete kernels we automatically generate sources for various + # preselected input type pairs and schedules. + # Generate sources: + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py + RESULT_VARIABLE machete_generation_result + OUTPUT_VARIABLE machete_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ) + + if (NOT machete_generation_result EQUAL 0) + message(FATAL_ERROR "Machete generation failed." 
+ " Result: \"${machete_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") + else() + message(STATUS "Machete generation completed successfully.") + endif() + + # Add machete generated sources + file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") + list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) + message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}") + set_source_files_properties( ${MACHETE_GEN_SOURCES} PROPERTIES @@ -264,7 +270,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "-gencode arch=compute_90a,code=sm_90a") endif() - # Add pytorch binding + # Add pytorch binding for machete (add even on CUDA < 12.0 so that we can + # raise an error informing the user that this was built with an incompatible + # CUDA version) list(APPEND VLLM_EXT_SRC csrc/quantization/machete/machete_pytorch.cu) endif() diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index ef36a490..a78cccb2 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -37,9 +37,13 @@ static auto scalar_type_dispatch(ScalarType const& type, Fn fn) { // std::vector supported_schedules(ScalarTypeTorchPtr const& btype) { +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 return scalar_type_dispatch(*btype, [&](auto BType) { return GemmDispatcher::supported_schedules(); }); +#else + TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); +#endif } torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, @@ -50,6 +54,7 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, c10::optional const& C, c10::optional alpha, c10::optional beta, c10::optional schedule) { +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 auto args = PyTorchArguments{.A = A, .B = B, .scales = scales, @@ -67,13 +72,20 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, return
GemmDispatcher::dispatch(args); }); }); +#else + TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); +#endif } torch::Tensor prepack_B(torch::Tensor const& B, ScalarTypeTorchPtr const& btype) { +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 return scalar_type_dispatch(*btype, [&](auto BType) { return PrepackBDispatcher::dispatch(B); }); +#else + TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); +#endif } }; // namespace machete