From 55d63b1211af9bd3a71d88f82f3bd5804d83b2cd Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 22 Aug 2024 08:28:52 -0400 Subject: [PATCH] [Bugfix] Don't build machete on cuda <12.0 (#7757) --- CMakeLists.txt | 64 +++++++++++--------- csrc/quantization/machete/machete_pytorch.cu | 12 ++++ 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 217dc70c..ab91b864 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,9 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) +# Suppress potential warnings about unused manually-specified variables +set(ignoreMe "${VLLM_PYTHON_PATH}") + # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -228,35 +231,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # - # For the Machete kernels we automatically generate sources for various - # preselected input type pairs and schedules. - # Generate sources: - execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH - ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py - RESULT_VARIABLE machete_generation_result - OUTPUT_VARIABLE machete_generation_output - OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log - ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log - ) + # Machete kernels - if (NOT machete_generation_result EQUAL 0) - message(FATAL_ERROR "Machete generation failed." 
- " Result: \"${machete_generation_result}\"" - "\nCheck the log for details: " - "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") - else() - message(STATUS "Machete generation completed successfully.") - endif() - - # Add machete generated sources - file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") - list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) - message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}") - - # See comment above for scaled_mm_c3x (same if condition) + # The machete kernels only work on hopper and require CUDA 12.0 or later. if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) + # + # For the Machete kernels we automatically generate sources for various + # preselected input type pairs and schedules. + # Generate sources: + execute_process( + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py + RESULT_VARIABLE machete_generation_result + OUTPUT_VARIABLE machete_generation_output + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log + ) + + if (NOT machete_generation_result EQUAL 0) + message(FATAL_ERROR "Machete generation failed." 
+ " Result: \"${machete_generation_result}\"" + "\nCheck the log for details: " + "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") + else() + message(STATUS "Machete generation completed successfully.") + endif() + + # Add machete generated sources + file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") + list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) + message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}") + set_source_files_properties( ${MACHETE_GEN_SOURCES} PROPERTIES @@ -264,7 +270,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "-gencode arch=compute_90a,code=sm_90a") endif() - # Add pytorch binding + # Add pytorch binding for machete (add even on CUDA < 12.0 so that we can + # raise an error informing the user that this was built with an incompatible + # CUDA version) list(APPEND VLLM_EXT_SRC csrc/quantization/machete/machete_pytorch.cu) endif() diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index ef36a490..a78cccb2 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -37,9 +37,13 @@ static auto scalar_type_dispatch(ScalarType const& type, Fn fn) { // std::vector supported_schedules(ScalarTypeTorchPtr const& btype) { +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 return scalar_type_dispatch(*btype, [&](auto BType) { return GemmDispatcher::supported_schedules(); }); +#else + TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); +#endif } torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, @@ -50,6 +54,7 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, c10::optional const& C, c10::optional alpha, c10::optional beta, c10::optional schedule) { +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 auto args = PyTorchArguments{.A = A, .B = B, .scales = scales, @@ -67,13 +72,20 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, return
GemmDispatcher::dispatch(args); }); }); +#else + TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); +#endif } torch::Tensor prepack_B(torch::Tensor const& B, ScalarTypeTorchPtr const& btype) { +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 return scalar_type_dispatch(*btype, [&](auto BType) { return PrepackBDispatcher::dispatch(B); }); +#else + TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); +#endif } }; // namespace machete