vllm/csrc/quantization/machete/machete_pytorch.cu

#include "machete_mm_launcher.cuh"
#include "machete_prepack_launcher.cuh"
#include "core/scalar_type.hpp"

#include "core/registration.h"

namespace machete {

// NOTE(review): presumably pulls vllm::ScalarType (core/scalar_type.hpp) into
// scope for the unqualified `ScalarType` uses below -- confirm.
using namespace vllm;

// Returns whatever list of schedule names supported_schedules_dispatch reports
// for the given type combination.
//
// b_type_id is an integer id that is turned back into a ScalarType with
// ScalarType::from_id; all other arguments are forwarded to the dispatcher
// unchanged. Takes no tensor arguments (see the CatchAll registration at the
// bottom of this namespace).
std::vector<std::string> supported_schedules(
    at::ScalarType a_type, int64_t b_type_id,
    c10::optional<at::ScalarType> maybe_group_scales_type,
    c10::optional<at::ScalarType> maybe_group_zeros_type,
    c10::optional<at::ScalarType> maybe_channel_scales_type,
    c10::optional<at::ScalarType> maybe_token_scales_type,
    c10::optional<at::ScalarType> maybe_out_type) {
  ScalarType const b_type = ScalarType::from_id(b_type_id);
  return supported_schedules_dispatch({
      .a_type = a_type,
      .b_type = b_type,
      .maybe_group_scales_type = maybe_group_scales_type,
      .maybe_group_zeros_type = maybe_group_zeros_type,
      .maybe_channel_scales_type = maybe_channel_scales_type,
      .maybe_token_scales_type = maybe_token_scales_type,
      .maybe_out_type = maybe_out_type,
  });
}

// Entry point bound to the "machete_mm" op: packs its arguments into the
// argument struct expected by mm_dispatch and returns the tensor it produces.
//
// b_type_id is decoded with ScalarType::from_id; every optional argument is
// passed through as-is, so handling of absent values is left to mm_dispatch.
torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,
                 int64_t b_type_id,
                 c10::optional<at::ScalarType> const& maybe_out_type,
                 c10::optional<torch::Tensor> const& maybe_group_scales,
                 c10::optional<torch::Tensor> const& maybe_group_zeros,
                 c10::optional<int64_t> maybe_group_size,
                 c10::optional<torch::Tensor> const& maybe_channel_scales,
                 c10::optional<torch::Tensor> const& maybe_token_scales,
                 c10::optional<std::string> maybe_schedule) {
  ScalarType const b_type = ScalarType::from_id(b_type_id);
  return mm_dispatch({.A = A,
                      .B = B,
                      .b_type = b_type,
                      .maybe_out_type = maybe_out_type,
                      .maybe_group_scales = maybe_group_scales,
                      .maybe_group_zeros = maybe_group_zeros,
                      .maybe_group_size = maybe_group_size,
                      .maybe_channel_scales = maybe_channel_scales,
                      .maybe_token_scales = maybe_token_scales,
                      .maybe_schedule = maybe_schedule});
}

// Entry point bound to the "machete_prepack_B" op: forwards B together with
// the activation type, the decoded B type and the optional group-scales type
// to prepack_B_dispatch and returns the resulting tensor.
torch::Tensor prepack_B(
    torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,
    c10::optional<at::ScalarType> const& maybe_group_scales_type) {
  ScalarType const b_type = ScalarType::from_id(b_type_id);
  return prepack_B_dispatch(
      {.B = B,
       .a_type = a_type,
       .b_type = b_type,
       .maybe_group_scales_type = maybe_group_scales_type});
}

// Register the tensor-taking ops under the CUDA dispatch key.
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("machete_prepack_B", &prepack_B);
  m.impl("machete_mm", &mm);
}

// use CatchAll since supported_schedules has no tensor arguments
//
// The _EXPAND variant is used here, as for the CUDA registration above, so
// that TORCH_EXTENSION_NAME is passed through the same expanding wrapper for
// both registrations instead of relying on TORCH_LIBRARY_IMPL's internals.
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
  m.impl("machete_supported_schedules", &supported_schedules);
}

}  // namespace machete
[Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 09:09:33 -04:00			`#include "machete_mm_launcher.cuh"`
			`#include "machete_prepack_launcher.cuh"`
			`#include "core/scalar_type.hpp"`

[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845) 2024-10-03 22:55:25 -04:00			`#include "core/registration.h"`

[Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 09:09:33 -04:00			`namespace machete {`

			`using namespace vllm;`

[Kernel] Initial Machete W4A8 support + Refactors (#9855) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2024-11-18 14:59:29 -05:00			`std::vector<std::string> supported_schedules(`
			`at::ScalarType a_type, int64_t b_type_id,`
			`c10::optional<at::ScalarType> maybe_group_scales_type,`
			`c10::optional<at::ScalarType> maybe_group_zeros_type,`
			`c10::optional<at::ScalarType> maybe_channel_scales_type,`
			`c10::optional<at::ScalarType> maybe_token_scales_type,`
			`c10::optional<at::ScalarType> maybe_out_type) {`
			`ScalarType const b_type = ScalarType::from_id(b_type_id);`
			`return supported_schedules_dispatch({`
			`.a_type = a_type,`
			`.b_type = b_type,`
			`.maybe_group_scales_type = maybe_group_scales_type,`
			`.maybe_group_zeros_type = maybe_group_zeros_type,`
			`.maybe_channel_scales_type = maybe_channel_scales_type,`
			`.maybe_token_scales_type = maybe_token_scales_type,`
			`.maybe_out_type = maybe_out_type,`
[Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 09:09:33 -04:00			`});`
			`}`

[Kernel] Initial Machete W4A8 support + Refactors (#9855) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2024-11-18 14:59:29 -05:00			`torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B,`
			`int64_t b_type_id,`
			`c10::optional<at::ScalarType> const& maybe_out_type,`
			`c10::optional<torch::Tensor> const& maybe_group_scales,`
			`c10::optional<torch::Tensor> const& maybe_group_zeros,`
			`c10::optional<int64_t> maybe_group_size,`
			`c10::optional<torch::Tensor> const& maybe_channel_scales,`
			`c10::optional<torch::Tensor> const& maybe_token_scales,`
			`c10::optional<std::string> maybe_schedule) {`
			`ScalarType const b_type = ScalarType::from_id(b_type_id);`
			`return mm_dispatch({.A = A,`
			`.B = B,`
			`.b_type = b_type,`
			`.maybe_out_type = maybe_out_type,`
			`.maybe_group_scales = maybe_group_scales,`
			`.maybe_group_zeros = maybe_group_zeros,`
			`.maybe_group_size = maybe_group_size,`
			`.maybe_channel_scales = maybe_channel_scales,`
			`.maybe_token_scales = maybe_token_scales,`
			`.maybe_schedule = maybe_schedule});`
[Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 09:09:33 -04:00			`}`

[Kernel] Initial Machete W4A8 support + Refactors (#9855) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2024-11-18 14:59:29 -05:00			`torch::Tensor prepack_B(`
			`torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id,`
			`c10::optional<at::ScalarType> const& maybe_group_scales_type) {`
			`ScalarType const b_type = ScalarType::from_id(b_type_id);`
			`return prepack_B_dispatch(`
			`{.B = B,`
			`.a_type = a_type,`
			`.b_type = b_type,`
			`.maybe_group_scales_type = maybe_group_scales_type});`
[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845) 2024-10-03 22:55:25 -04:00			`}`

			`TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {`
			`m.impl("machete_prepack_B", &prepack_B);`
[Kernel] Initial Machete W4A8 support + Refactors (#9855) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> 2024-11-18 14:59:29 -05:00			`m.impl("machete_mm", &mm);`
[Bugfix] Fix Machete unittests failing with `NotImplementedError` (#9218) 2024-10-10 13:39:56 -04:00			`}`

			`// use CatchAll since supported_schedules has no tensor arguments`
			`TORCH_LIBRARY_IMPL(TORCH_EXTENSION_NAME, CatchAll, m) {`
[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845) 2024-10-03 22:55:25 -04:00			`m.impl("machete_supported_schedules", &supported_schedules);`
[Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 09:09:33 -04:00			`}`

			`}; // namespace machete`