// vllm/csrc/moe/torch_bindings.cpp

#include "core/registration.h"
#include "moe_ops.h"

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Declare the schema of the `topk_softmax` op, which applies a top-k
  // softmax to the gating outputs. The schema lists four tensor arguments
  // (the first three carry the `!` annotation) and returns nothing (`-> ()`).
  // NOTE(review): `!` presumably marks tensors written in place by the
  // kernel -- confirm against the PyTorch function-schema documentation.
  m.def(
      "topk_softmax("
      "Tensor! topk_weights, "
      "Tensor! topk_indices, "
      "Tensor! token_expert_indices, "
      "Tensor gating_output"
      ") -> ()");

  // Register `topk_softmax` as the implementation used for the CUDA
  // dispatch key.
  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
}

// Invoke the project's extension-registration macro for this library.
// NOTE(review): the macro is presumably provided by "core/registration.h"
// and emits the Python module entry point for TORCH_EXTENSION_NAME --
// its definition is not visible here, so confirm before relying on this.
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
[Misc] Disambiguate quantized types via a new ScalarType (#6396) 2024-08-02 16:51:58 -04:00			`#include "core/registration.h"`
[Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00			`#include "moe_ops.h"`

			`TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {`
			`// Apply topk softmax to the gating outputs.`
			`m.def(`
			`"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "`
			`"token_expert_indices, Tensor gating_output) -> ()");`
			`m.impl("topk_softmax", torch::kCUDA, &topk_softmax);`
			`}`

			`REGISTER_EXTENSION(TORCH_EXTENSION_NAME)`