From 2344192a55022d84ba3bb8d33b1ec38724f54fed Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 13 Feb 2025 18:43:37 -0500
Subject: [PATCH] Optimize moe_align_block_size for deepseek_v3 (#12850)

Signed-off-by: mgoin
---
 csrc/moe/moe_align_sum_kernels.cu             | 50 +++++++++++++------
 .../layers/fused_moe/fused_moe.py             |  3 +-
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index c072744f..d7be7694 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -198,26 +198,27 @@ __global__ void moe_align_block_size_global_mem_kernel(
 }
 
 // taken from
-// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
+// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
 template <typename scalar_t>
 __global__ void sgl_moe_align_block_size_kernel(
     scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
     int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
     int32_t block_size, size_t numel, int32_t* cumsum) {
   __shared__ int32_t shared_counts[32][8];
-  __shared__ int32_t local_offsets[256];
 
   const int warp_id = threadIdx.x / 32;
-  const int lane_id = threadIdx.x % 32;
   const int experts_per_warp = 8;
   const int my_expert_start = warp_id * experts_per_warp;
 
+  // Initialize shared_counts for this warp's experts
   for (int i = 0; i < experts_per_warp; ++i) {
     if (my_expert_start + i < num_experts) {
       shared_counts[warp_id][i] = 0;
     }
   }
 
+  __syncthreads();
+
   const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
   const size_t start_idx = threadIdx.x * tokens_per_thread;
 
@@ -230,6 +231,7 @@ __global__ void sgl_moe_align_block_size_kernel(
 
   __syncthreads();
 
+  // Single thread computes cumulative sum and total tokens
   if (threadIdx.x == 0) {
     cumsum[0] = 0;
     for (int i = 1; i <= num_experts; ++i) {
@@ -246,19 +248,28 @@ __global__ void sgl_moe_align_block_size_kernel(
 
   __syncthreads();
 
+  // Assign expert IDs to blocks
   if (threadIdx.x < num_experts) {
     for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
          i += block_size) {
       expert_ids[i / block_size] = threadIdx.x;
     }
-    local_offsets[threadIdx.x] = cumsum[threadIdx.x];
   }
+}
 
-  __syncthreads();
+// taken from
+// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
+template <typename scalar_t>
+__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
+                                          int32_t* sorted_token_ids,
+                                          int32_t* cumsum_buffer,
+                                          size_t numel) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.x;
 
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+  for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i];
-    int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1);
+    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
     sorted_token_ids[rank_post_pad] = i;
   }
 }
@@ -377,23 +388,34 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                               torch::Tensor experts_ids,
                               torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  TORCH_CHECK(num_experts == 256,
+              "sgl_moe_align_block_size kernel only supports deepseek v3.");
+
   VLLM_DISPATCH_INTEGRAL_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
-        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-        // tensors
+        // calc needed amount of shared mem for `cumsum` tensors
         auto options_int =
             torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
-        // torch::Tensor token_cnts_buffer =
-        //     torch::empty({(num_experts + 1) * num_experts}, options_int);
         torch::Tensor cumsum_buffer =
-            torch::empty({num_experts + 1}, options_int);
+            torch::zeros({num_experts + 1}, options_int);
 
-        auto kernel = vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
-        kernel<<<1, 1024, 0, stream>>>(
+        auto align_kernel =
+            vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
+        align_kernel<<<1, 1024, 0, stream>>>(
             topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
             experts_ids.data_ptr<int32_t>(),
             num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
             topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
+
+        const int block_threads = 256;
+        const int num_blocks =
+            (topk_ids.numel() + block_threads - 1) / block_threads;
+        const int max_blocks = 65535;
+        const int actual_blocks = std::min(num_blocks, max_blocks);
+        auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
+        sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
       });
 }
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index f14200e0..d0b6249e 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -596,7 +596,7 @@ def moe_align_block_size(
                                       dtype=torch.int32,
                                       device=topk_ids.device)
     if num_experts >= 224:
-        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON:
+        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON or num_experts != 256:
             moe_align_block_size_triton(
                 topk_ids,
                 num_experts,
@@ -606,6 +606,7 @@ def moe_align_block_size(
                 num_tokens_post_pad,
             )
         else:
+            # Currently requires num_experts=256
             ops.sgl_moe_align_block_size(
                 topk_ids,
                 num_experts,
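
Note (reviewer illustration, not part of the commit): the optimization splits the former single 1024-thread launch into two kernels. sgl_moe_align_block_size_kernel still runs as one block to build the per-expert histogram in shared memory and write the block-padded cumulative sum into cumsum_buffer, while the new sgl_moe_token_sort_kernel scatters token indices across the whole grid with a grid-stride loop and atomicAdd on that buffer, so the scatter phase is no longer capped at 1024 threads. The pure-Python sketch below models the contract the two kernels implement together; the helper name moe_align_block_size_ref is hypothetical, and it sizes its outputs exactly to the padded total, whereas vLLM allocates worst-case buffers up front.

import torch


def moe_align_block_size_ref(topk_ids: torch.Tensor, num_experts: int,
                             block_size: int):
    flat = topk_ids.flatten().tolist()
    numel = len(flat)

    # Phase 1 (align kernel): per-expert histogram, then a cumulative sum
    # over per-expert counts rounded up to a multiple of block_size.
    counts = [0] * num_experts
    for e in flat:
        counts[e] += 1
    cumsum = [0]
    for e in range(num_experts):
        padded = -(-counts[e] // block_size) * block_size  # ceil to block
        cumsum.append(cumsum[-1] + padded)
    total_tokens_post_pad = cumsum[-1]

    # expert_ids labels each block_size-wide block with its expert.
    expert_ids = [0] * (total_tokens_post_pad // block_size)
    for e in range(num_experts):
        for blk in range(cumsum[e] // block_size, cumsum[e + 1] // block_size):
            expert_ids[blk] = e

    # Phase 2 (sort kernel): scatter token indices into each expert's slice.
    # vLLM pre-fills sorted_token_ids with numel, so pad slots are ignorable;
    # on the GPU this scatter is the grid-stride atomicAdd on cumsum_buffer,
    # which is why that buffer must already hold the cumsum when it launches.
    sorted_token_ids = [numel] * total_tokens_post_pad
    offsets = list(cumsum[:num_experts])
    for i, e in enumerate(flat):
        sorted_token_ids[offsets[e]] = i
        offsets[e] += 1

    return (torch.tensor(sorted_token_ids, dtype=torch.int32),
            torch.tensor(expert_ids, dtype=torch.int32),
            torch.tensor([total_tokens_post_pad], dtype=torch.int32))

For example, moe_align_block_size_ref(torch.tensor([1, 0, 1, 1]), 2, 2) gives sorted_token_ids = [1, 4, 0, 2, 3, 4], expert_ids = [0, 1, 1], and total = 6. Token order inside an expert's slice is just one valid ordering; the GPU's atomic scatter guarantees none.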