From 8c0d15d5c5658b74a70694124af2ac250fdc4e23 Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Fri, 14 Mar 2025 21:40:09 -0700
Subject: [PATCH] [Misc][Easy] Annotate unused vars in the csrc files (#14798)

Signed-off-by: Lu Fang
---
 csrc/prepare_inputs/advance_step.cu       |  2 +-
 csrc/quantization/fp8/amd/quant_utils.cuh |  2 +-
 csrc/quantization/gptq/q_gemm.cu          | 16 ++++++++--------
 csrc/rocm/attention.cu                    |  7 ++++---
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu
index c3902f4c..fea4bc2c 100644
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@@ -274,7 +274,7 @@ void advance_step_flashinfer(
   cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
   cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
 
-  int block_tables_stride = block_tables.stride(0);
+  [[maybe_unused]] int block_tables_stride = block_tables.stride(0);
   TORCH_CHECK((blocks * threads > num_queries),
               "multi-step: not enough threads to map to num_queries = ",
               num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh
index feda497d..c4ed1b47 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -446,7 +446,7 @@ scaled_vec_conversion(const uint8_t& a, float scale) {
 template <>
 __inline__ __device__ uint32_t
 scaled_vec_conversion(const uint16_t& a, float scale) {
-  __half2_raw h2r =
+  [[maybe_unused]] __half2_raw h2r =
       __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
   union {
     __half2_raw h2r;
diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu
index 785f1a09..538cb584 100644
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
   int offset_m = blockIdx.y * m_count;
   int offset_k = blockIdx.z * BLOCK_KN_SIZE;
 
-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
   int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
 
   int n = offset_n + t * 4;
@@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
   int offset_m = blockIdx.y * m_count;
   int offset_k = blockIdx.z * BLOCK_KN_SIZE;
 
-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
   int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
 
   int n = offset_n + t * 4;
@@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
   int offset_m = blockIdx.y * m_count;
   int offset_k = blockIdx.z * BLOCK_KN_SIZE;
 
-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
   int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
 
   int n = offset_n + t * 4;
@@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
   int offset_m = blockIdx.y * m_count;
   int offset_k = blockIdx.z * BLOCK_KN_SIZE;
 
-  int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
-  int end_m = min(offset_m + m_count, size_m);
+  [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
+  [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
   int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
 
   int n = offset_n + t * 4;
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 86029da1..90f0b54d 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
   constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);
 
-  __shared__ float shared_qk_max[NWARPS][16 + 1];
-  __shared__ float shared_exp_sum[NWARPS][16 + 1];
+  [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
+  [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
 
   // shared_logits is used for multiple purposes
   __shared__ _B16x4 shared_logits[NWARPS][4][16][4];
 
@@ -426,7 +426,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
     const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
     const int klocal_token_idx =
        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    [[maybe_unused]] const int kglobal_token_idx =
+        partition_start_token_idx + klocal_token_idx;
     const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
     const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
 
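Context for the change: [[maybe_unused]] is the standard C++17 attribute for marking declarations that are deliberately left unreferenced, so builds with -Wunused-variable (and -Werror) stay clean without deleting the declarations. Below is a minimal, hypothetical host-side sketch, not part of the patch; the file name, function, and values are made up, and the variable names only loosely echo the GPTQ kernels touched above.

// maybe_unused_sketch.cpp -- illustrative only; compile with e.g.
//   g++ -std=c++17 -Wall -Wextra maybe_unused_sketch.cpp
#include <algorithm>
#include <cstdio>

int clamp_k(int offset_k, int block_kn_size, int size_k) {
  // Without the attribute, -Wunused-variable flags end_n and end_m because the
  // function never reads them; [[maybe_unused]] records that leaving them
  // unreferenced is intentional and silences the warning.
  [[maybe_unused]] int end_n = std::min(offset_k + block_kn_size * 4, size_k);
  [[maybe_unused]] int end_m = std::min(offset_k + block_kn_size, size_k);

  int end_k = std::min(offset_k + block_kn_size, size_k);
  return end_k;
}

int main() {
  std::printf("end_k = %d\n", clamp_k(128, 32, 150));  // prints "end_k = 150"
  return 0;
}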