[Bugfix] fix use_atomic_add support of marlin kernel when using v1 engine (#15946)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
parent 13affc432d
commit 2fa66ef713
@@ -1785,7 +1785,7 @@ __global__ void Marlin(
         <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(            \
             A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
             num_groups, prob_m, prob_n, prob_k, lda, locks,           \
-            use_atomic_add, use_fp32_reduce);                         \
+            part_use_atomic_add, use_fp32_reduce);                    \
     }                                                                 \
   }
@@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }
 
+  // atomic add reduce has better performance only when m * n is small
+  bool part_use_atomic_add =
+      use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
+
   if (false) {
   }
   GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)
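For reference, the new kernel-side condition depends only on prob_m and prob_n, which are known when marlin_mm launches the kernel. A minimal sketch of the arithmetic in plain Python (illustrative names, not vLLM code): for m <= 64 the bound reduces to n <= 2048, the same bound the host-side check keeps after this change.

# Sketch of the kernel-side heuristic added above (illustrative, not vLLM code).
def div_ceil(a: int, b: int) -> int:
    return -(-a // b)

def part_use_atomic_add(use_atomic_add: bool, m: int, n: int) -> bool:
    # Atomic-add reduce only pays off while ceil(m / 64) * n stays small;
    # for m <= 64 this is simply n <= 2048.
    return use_atomic_add and div_ceil(m, 64) * n <= 2048

assert part_use_atomic_add(True, 16, 1024)       # 1 * 1024 <= 2048 -> use it
assert not part_use_atomic_add(True, 256, 1024)  # 4 * 1024 >  2048 -> fall back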
@@ -305,7 +305,7 @@ def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
 
     # the performance of atomicAdd is better than global reduce
     # only when m*n is small and k is large
-    return max(m, 64) * n < 64 * 2048 and k >= 2048
+    return n < 2048 and k >= 2048
 
 
 def apply_gptq_marlin_linear(
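Taken together, the retained host-side check (n and k only) and the new kernel-side check (the m-dependent part) roughly reproduce the old single condition, while the Python heuristic no longer depends on the batch size m; that m-independence is presumably what makes it safe under the v1 engine, though the diff itself only shows the m term being dropped. A small comparison sketch in plain Python (illustrative names, not vLLM code):

# Old single-site decision vs. the new host/kernel split (illustrative only).
def div_ceil(a: int, b: int) -> int:
    return -(-a // b)

def old_decision(m: int, n: int, k: int) -> bool:
    return max(m, 64) * n < 64 * 2048 and k >= 2048

def new_decision(m: int, n: int, k: int) -> bool:
    host_side = n < 2048 and k >= 2048           # should_use_atomic_add_reduce
    kernel_side = div_ceil(m, 64) * n <= 2048    # part_use_atomic_add in marlin_mm
    return host_side and kernel_side

# The two paths agree on typical shapes (boundary cases may differ by one
# 64-row block because of the ceiling and the <= vs < comparison).
for m, n, k in [(16, 1024, 4096), (64, 1024, 4096), (256, 1024, 4096),
                (16, 4096, 4096), (16, 1024, 1024)]:
    assert old_decision(m, n, k) == new_decision(m, n, k)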