[Bugfix] Don't build machete on cuda <12.0 (#7757)
This commit is contained in:
parent
4f419c00a6
commit
55d63b1211
@ -10,6 +10,9 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
|||||||
|
|
||||||
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||||
|
|
||||||
|
# Suppress potential warnings about unused manually-specified variables
|
||||||
|
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported python versions. These versions will be searched in order, the
|
# Supported python versions. These versions will be searched in order, the
|
||||||
# first match will be selected. These should be kept in sync with setup.py.
|
# first match will be selected. These should be kept in sync with setup.py.
|
||||||
@ -228,35 +231,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# For the Machete kernels we automatically generate sources for various
|
# Machete kernels
|
||||||
# preselected input type pairs and schedules.
|
|
||||||
# Generate sources:
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
|
||||||
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
|
||||||
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
|
|
||||||
RESULT_VARIABLE machete_generation_result
|
|
||||||
OUTPUT_VARIABLE machete_generation_output
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT machete_generation_result EQUAL 0)
|
# The machete kernels only work on hopper and require CUDA 12.0 or later.
|
||||||
message(FATAL_ERROR "Machete generation failed."
|
|
||||||
" Result: \"${machete_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
|
|
||||||
else()
|
|
||||||
message(STATUS "Machete generation completed successfully.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Add machete generated sources
|
|
||||||
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
|
|
||||||
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
|
|
||||||
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
|
|
||||||
|
|
||||||
# See comment above for scaled_mm_c3x (same if condition)
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
||||||
|
#
|
||||||
|
# For the Machete kernels we automatically generate sources for various
|
||||||
|
# preselected input type pairs and schedules.
|
||||||
|
# Generate sources:
|
||||||
|
execute_process(
|
||||||
|
COMMAND ${CMAKE_COMMAND} -E env
|
||||||
|
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
|
||||||
|
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
|
||||||
|
RESULT_VARIABLE machete_generation_result
|
||||||
|
OUTPUT_VARIABLE machete_generation_output
|
||||||
|
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
||||||
|
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
|
||||||
|
)
|
||||||
|
|
||||||
|
if (NOT machete_generation_result EQUAL 0)
|
||||||
|
message(FATAL_ERROR "Machete generation failed."
|
||||||
|
" Result: \"${machete_generation_result}\""
|
||||||
|
"\nCheck the log for details: "
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
|
||||||
|
else()
|
||||||
|
message(STATUS "Machete generation completed successfully.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Add machete generated sources
|
||||||
|
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
|
||||||
|
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
|
||||||
|
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
|
||||||
|
|
||||||
set_source_files_properties(
|
set_source_files_properties(
|
||||||
${MACHETE_GEN_SOURCES}
|
${MACHETE_GEN_SOURCES}
|
||||||
PROPERTIES
|
PROPERTIES
|
||||||
@ -264,7 +270,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"-gencode arch=compute_90a,code=sm_90a")
|
"-gencode arch=compute_90a,code=sm_90a")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Add pytorch binding
|
# Add pytorch binding for machete (added even on CUDA < 12.0 so that we can
|
||||||
|
# raise an error telling the user that this was built with an incompatible
|
||||||
|
# CUDA version)
|
||||||
list(APPEND VLLM_EXT_SRC
|
list(APPEND VLLM_EXT_SRC
|
||||||
csrc/quantization/machete/machete_pytorch.cu)
|
csrc/quantization/machete/machete_pytorch.cu)
|
||||||
endif()
|
endif()
|
||||||
|
@ -37,9 +37,13 @@ static auto scalar_type_dispatch(ScalarType const& type, Fn fn) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
std::vector<std::string> supported_schedules(ScalarTypeTorchPtr const& btype) {
|
std::vector<std::string> supported_schedules(ScalarTypeTorchPtr const& btype) {
|
||||||
|
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12
|
||||||
return scalar_type_dispatch(*btype, [&](auto BType) {
|
return scalar_type_dispatch(*btype, [&](auto BType) {
|
||||||
return GemmDispatcher<half_t, decltype(BType)>::supported_schedules();
|
return GemmDispatcher<half_t, decltype(BType)>::supported_schedules();
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
TORCH_CHECK(false, "Machete requires CUDA 12.0 or later");
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
|
torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
|
||||||
@ -50,6 +54,7 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
|
|||||||
c10::optional<torch::Tensor> const& C,
|
c10::optional<torch::Tensor> const& C,
|
||||||
c10::optional<double> alpha, c10::optional<double> beta,
|
c10::optional<double> alpha, c10::optional<double> beta,
|
||||||
c10::optional<std::string> schedule) {
|
c10::optional<std::string> schedule) {
|
||||||
|
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12
|
||||||
auto args = PyTorchArguments{.A = A,
|
auto args = PyTorchArguments{.A = A,
|
||||||
.B = B,
|
.B = B,
|
||||||
.scales = scales,
|
.scales = scales,
|
||||||
@ -67,13 +72,20 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
|
|||||||
return GemmDispatcher<ComputeType, decltype(BType)>::dispatch(args);
|
return GemmDispatcher<ComputeType, decltype(BType)>::dispatch(args);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
TORCH_CHECK(false, "Machete requires CUDA 12.0 or later");
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
torch::Tensor prepack_B(torch::Tensor const& B,
|
torch::Tensor prepack_B(torch::Tensor const& B,
|
||||||
ScalarTypeTorchPtr const& btype) {
|
ScalarTypeTorchPtr const& btype) {
|
||||||
|
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12
|
||||||
return scalar_type_dispatch(*btype, [&](auto BType) {
|
return scalar_type_dispatch(*btype, [&](auto BType) {
|
||||||
return PrepackBDispatcher<half_t, decltype(BType), half_t>::dispatch(B);
|
return PrepackBDispatcher<half_t, decltype(BType), half_t>::dispatch(B);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
TORCH_CHECK(false, "Machete requires CUDA 12.0 or later");
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
}; // namespace machete
|
}; // namespace machete
|
||||||
|
Loading…
x
Reference in New Issue
Block a user