diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 432bf5ed..7b32df90 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,18 +3,18 @@ default_stages: - manual # Run in CI repos: - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.43.0 hooks: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.5 + rev: v0.9.3 hooks: - id: ruff args: [--output-format, github] - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.0 hooks: - id: codespell exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' @@ -23,7 +23,7 @@ repos: hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.5 + rev: v19.1.7 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' @@ -35,7 +35,7 @@ repos: - id: pymarkdown files: docs/.* - repo: https://github.com/rhysd/actionlint - rev: v1.7.6 + rev: v1.7.7 hooks: - id: actionlint - repo: local diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 63d2c3f7..8b321283 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -926,8 +926,8 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2..b9df4ed1 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b..47ecf109 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 04ef842f..7c33fea9 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index c03fef88..4db8f5dc 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index 103a6444..048a3f73 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index b26505f7..49eee412 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 94777906..ffa9d446 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] - const int max_num_partitions){UNREACHABLE_CODE} + const int max_num_partitions) { + UNREACHABLE_CODE +} #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support diff --git a/setup.py b/setup.py index ee193e46..59ece870 100755 --- a/setup.py +++ b/setup.py @@ -417,7 +417,7 @@ def get_rocm_version(): if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)) == 0): - return "%d.%d.%d" % (major.value, minor.value, patch.value) + return f"{major.value}.{minor.value}.{patch.value}" return None except Exception: return None diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index a16cc458..f28fdf3f 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) ] B_tiles = [[ - B[j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) ] for j in range(n_tiles)] C_tiles = [ C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) @@ -157,9 +159,9 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, - SEEDS)) +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): torch.manual_seed(seed) @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): assert torch.allclose(scale, ref_scale) -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, - SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + 
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 -@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, - BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + "M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.manual_seed(seed) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 718730bb..4d689030 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, device: %s" % (my_rank, device)) + print(f"My rank: {my_rank}, device: {device}") # insert tokens = torch.tensor([1, 2, 3]).to(device) @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, Test run passed!" % (my_rank)) + print(f"My rank: {my_rank}, Test run passed!") def stress_test(my_rank, buf, device): @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(k, k_) assert torch.allclose(v, v_) assert torch.allclose(h, h_) - print('Rank %d done' % my_rank) + print(f"Rank {my_rank} done") torch.distributed.barrier() if my_rank == 0: @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("My rank: %d, Passed stress test!" % (my_rank)) + print(f"My rank: {my_rank}, Passed stress test!") if __name__ == "__main__": @@ -122,7 +122,7 @@ if __name__ == "__main__": rank=my_rank, ) - print("initialized! My rank is %d" % my_rank) + print(f"initialized! 
My rank is {my_rank}") config = KVTransferConfig( kv_connector='PyNcclConnector', diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index ebdd129d..570aa386 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm" - ) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") def test_qwen2vl_lora(qwen2vl_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 14d9a739..d5f0d632 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -521,12 +521,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) # - image embeddings # - video # - custom inputs -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=False, + )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=False, + )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=False, + )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=False, + )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], video_assets: _VideoAssets): @@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - 
fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=False, + )) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -627,12 +632,13 @@ def test_custom_inputs_models( #### Tests filtering for things running each test as a new process -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, @@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=True, + )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab9..8103e530 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -135,10 +135,10 @@ def 
_dump_outputs_w_logprobs( outputs: OutputsLogprobs, filename: "StrPath", ) -> None: - json_data = [(tokens, text, - [{k: asdict(v) - for k, v in token_logprobs.items()} - for token_logprobs in (logprobs or [])]) + json_data = [(tokens, text, [{ + k: asdict(v) + for k, v in token_logprobs.items() + } for token_logprobs in (logprobs or [])]) for tokens, text, logprobs in outputs] with open(filename, "w") as f: @@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, - [{int(k): Logprob(**v) - for k, v in token_logprobs.items()} - for token_logprobs in logprobs]) - for tokens, text, logprobs in json_data] + return [(tokens, text, [{ + int(k): Logprob(**v) + for k, v in token_logprobs.items() + } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] @large_gpu_test(min_gb=80) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index bf0d454a..1072697e 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="2of4 Sparse is not yet supported on this GPU type." - ) +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="2of4 Sparse is not yet supported on this GPU type.") @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 397fa2cc..dcb1b27b 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( """ batch_size = last_accepted_indices.shape[0] - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( + accepted = (torch.arange(k).expand(batch_size, k) + <= last_accepted_indices.unsqueeze(-1).broadcast_to( batch_size, k)) # Sprinkle accepted values after the contiguous initial accepted values. # This replicates the behavior of rejection sampling, which may "accept" # a token that cannot be accepted because of causality. 
- sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) + sprinkle_candidates = (torch.arange(k).expand( + batch_size, + k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + + 1) sprinkle = torch.rand(batch_size, k) > 0.5 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] return accepted @@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( distance_wrt_reference) expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * expected_improvement_multiplier) diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 51ad2adc..9dc19f5f 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' 'parallelism)'.format(length, total_cpu_time, total_cpu_time * 1.0 / length)) - print(' %d build steps completed, average of %1.2f/s' % - (len(entries), len(entries) / (length))) + print(' {} build steps completed, average of {:1.2f}/s'.format( + len(entries), + len(entries) / (length))) def main(): diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 440bc520..85c1121e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -820,8 +820,8 @@ def scaled_int8_quant( if scale is not None: # static-per-tensor quantization. assert symmetric == ( - azp is - None), "azp must only be provided for asymmetric quantization." + azp + is None), "azp must only be provided for asymmetric quantization." 
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) return output, scale, azp diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e2f2b66d..ec3c8459 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0": float("-inf")) if SLIDING_WINDOW > 0: qk = tl.where( - offs_m[:, None] - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) + offs_m[:, None] - (start_n + offs_n[None, :]) + < SLIDING_WINDOW, qk, -10000) # -- compute m_ij, p, l_ij m_ij = tl.max(qk, 1) @@ -324,10 +324,10 @@ if triton.__version__ >= "2.1.0": (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) + q = tl.load(Q + off_q, + mask=offs_m[:, None] + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) # # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") @@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0": # -- compute qk ---- k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0": # update acc v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) p = p.to(v.dtype) @@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0": k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0": v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) p = p.to(v.dtype) diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index f9421111..ef04603f 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -627,8 +627,8 @@ def attn_fwd( causal_start_idx, dtype=tl.int32) mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] >= - out_mask_boundary[None, :]) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 81ea6eef..1376274d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,6 +1,6 @@ import os from contextlib import contextmanager -from functools import lru_cache +from functools import cache from typing import Generator, Optional, Type import torch @@ -100,7 +100,7 @@ def get_attn_backend( ) -@lru_cache(maxsize=None) +@cache def _cached_get_attn_backend( 
head_size: int, dtype: torch.dtype, diff --git a/vllm/config.py b/vllm/config.py index 7ab632d7..d7c9311a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -67,7 +67,8 @@ _RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { task: runner - for runner, tasks in _RUNNER_TASKS.items() for task in tasks + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks } HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], @@ -1976,8 +1977,8 @@ class SpeculativeConfig: "typical_acceptance_sampler.") if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index c03b5932..115f663e 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, - RefCount] = {index: 0 - for index in deduped} + self._refcounts: Dict[BlockId, RefCount] = { + index: 0 + for index in deduped + } def incr(self, block_id: BlockId) -> RefCount: assert block_id in self._refcounts diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 62a5f0bd..2d6a132e 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -136,8 +136,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager): device=Device.GPU) # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): + if (self.num_total_gpu_blocks - num_required_blocks + < self.watermark_blocks): return AllocStatus.NEVER if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: return AllocStatus.OK diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b1630b34..2bb96148 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -988,8 +988,8 @@ class Scheduler: waiting_queue.popleft() continue - if (budget.num_batched_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (budget.num_batched_tokens + >= self.scheduler_config.max_num_batched_tokens): # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. @@ -1096,8 +1096,8 @@ class Scheduler: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1189,8 +1189,8 @@ class Scheduler: curr_loras, enable_chunking=True) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1358,8 +1358,8 @@ class Scheduler: # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. 
- if (token_chunk_size + num_computed_tokens < - seqs[0].data.get_len()): + if (token_chunk_size + num_computed_tokens + < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by @@ -1625,10 +1625,9 @@ class Scheduler: if self.scheduler_config.delay_factor > 0 and self.waiting: earliest_arrival_time = min( [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ( - (now - earliest_arrival_time) > - (self.scheduler_config.delay_factor * self.last_prompt_latency) - or not self.running) + passed_delay = ((now - earliest_arrival_time) + > (self.scheduler_config.delay_factor * + self.last_prompt_latency) or not self.running) else: passed_delay = True return passed_delay diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 4ced991f..268edc09 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -352,8 +352,8 @@ class MessageQueue: sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 @@ -410,8 +410,8 @@ class MessageQueue: sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ffdf8b0f..7fe9b68d 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1014,8 +1014,8 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: return if all([ - vllm_config.kv_transfer_config.need_kv_parallel_group, - _KV_TRANSFER is None + vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER + is None ]): _KV_TRANSFER = kv_transfer.KVTransferAgent( rank=get_world_group().rank, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index beedf5d1..723d6e90 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -3,7 +3,7 @@ import codecs import json from abc import ABC, abstractmethod from collections import defaultdict, deque -from functools import lru_cache, partial +from functools import cache, lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypeVar, Union, cast) @@ -377,7 +377,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return self._model_config.allowed_local_media_path @staticmethod - @lru_cache(maxsize=None) + @cache def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return 
tokenizer.decode(token_index) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2c9c20ca..b0179f78 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -522,11 +522,10 @@ class OpenAIServingCompletion(OpenAIServing): out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token( - top_lp[1], - top_lp[0], - tokenizer, - return_as_token_id=self.return_tokens_as_token_ids): + self._get_decoded_token(top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 94db8f37..93e357e8 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -62,8 +62,8 @@ class Granite20bFCToolParser(ToolParser): start_of_json = match.end() # end_index == the start of the next function call # (if exists) - next_function_call_start = (matches[i + 1].start() - if i + 1 < len(matches) else None) + next_function_call_start = (matches[i + 1].start() if i + + 1 < len(matches) else None) raw_function_calls.append( dec.raw_decode( diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e6f26d2b..cdd439d0 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -220,8 +220,10 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ].copy_(embeddings_tensor, non_blocking=True) + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part @@ -1024,8 +1026,10 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ] = embeddings_tensor def _get_logits( self, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index b77b6b3d..2e04cb90 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -75,8 +75,9 @@ class LoRAModel(AdapterModel): # Scaling factor for long context lora model. None if it is not # fine tuned for the long context. 
self.scaling_factor = scaling_factor - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" + assert ( + lora_model_id + > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank self.loras: Dict[str, LoRALayerWeights] = loras diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 8af44b70..48fa5cd6 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -136,9 +136,8 @@ def _sgmv_expand_kernel( c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + offset_cn[None, :] * output_d1_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < - (cur_slice_start + curr_N)) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (cur_slice_start + curr_N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 3d2ebe82..9bb35e8f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -114,8 +114,8 @@ def _sgmv_shrink_kernel( slice_id * output_d0_stride) c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] + < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index b04612a9..915bdc47 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -73,12 +73,12 @@ class MPLinearKernel(ABC): torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # w_q - torch.Tensor, # w_s - Optional[torch.Tensor], # w_zp, - Optional[torch.Tensor] # w_gidx - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + Optional[torch.Tensor], # w_zp, + Optional[torch.Tensor] # w_gidx + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 75cf91f1..c4a83b4f 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -48,13 +48,13 @@ class ScaledMMLinearKernel(ABC): raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # weight - torch.Tensor, # weight_scale - Optional[torch.Tensor], # input_scale, - Optional[torch.Tensor], # input_zp - Optional[torch.Tensor], # azp_adj - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # weight + torch.Tensor, # weight_scale + Optional[torch.Tensor], # input_scale, + Optional[torch.Tensor], # input_zp + Optional[torch.Tensor], # azp_adj + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git 
a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b6882cc7..43b19970 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -72,9 +72,10 @@ def block_quant_to_tensor_quant( x_dq_block = x_q_block.to(torch.float32) x_dq_block_tiles = [[ - x_dq_block[j * block_n:min((j + 1) * block_n, n), - i * block_k:min((i + 1) * block_k, k), ] - for i in range(k_tiles) + x_dq_block[ + j * block_n:min((j + 1) * block_n, n), + i * block_k:min((i + 1) * block_k, k), + ] for i in range(k_tiles) ] for j in range(n_tiles)] for i in range(k_tiles): diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 7cdce67c..99778041 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -73,8 +73,8 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 - unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( - torch.float8_e4m3fn).min) + unfused_module_in_checkpoint = (weight_scale[-1] + > torch.finfo(torch.float8_e4m3fn).min) # If unfused checkpoint, need requanize with the single scale. if unfused_module_in_checkpoint: diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c2d12c46..8dc26309 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -716,9 +716,10 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, - List[int]] = {t: [] - for t in SamplingType} + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + t: [] + for t in SamplingType + } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 3eb5c39c..f230efac 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert (self.padded_org_vocab_start_index + <= self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index + <= self.padded_added_vocab_end_index) assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert (self.added_vocab_start_index + <= self.padded_added_vocab_start_index) assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -141,8 +141,8 @@ def get_masked_input_and_mask( added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise 
ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( input_ < added_vocab_end_index) added_offset = added_vocab_start_index - ( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 527b4307..712266ee 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader): # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight shard_pos = quant_param_name.find(shard_name) - can_correct_rename = (shard_pos > 0) and ( - quant_param_name[shard_pos - 1] == ".") + can_correct_rename = (shard_pos + > 0) and (quant_param_name[shard_pos - 1] + == ".") # If the quant_param_name is packed, it won't occur in the # param_dict before renaming. new_quant_param_name = quant_param_name.replace( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index e359aef9..9266ca75 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -298,8 +298,8 @@ class TensorizerAgent: to allow for adapter added tokens.""" for child in self.model.modules(): if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] < - child.num_embeddings_per_partition): + and child.weight.shape[0] + < child.num_embeddings_per_partition): new_weight = torch.empty(child.num_embeddings_per_partition, child.embedding_dim, dtype=child.weight.dtype, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6de0c866..b23aba82 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Gemma model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cache from typing import Iterable, List, Optional, Set, Tuple, Union import torch @@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter, logger = init_logger(__name__) -@lru_cache(maxsize=None) +@cache def _get_gemma_act_fn( hidden_act: Optional[str], hidden_activation: Optional[str], diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b518a0a6..cdf9414d 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w1.weight" % e) + f".block_sparse_moe.experts.{e}.w1.weight") w3_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w3.weight" % e) + f".block_sparse_moe.experts.{e}.w3.weight") w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights assert w3_name not in new_weights @@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', - ".block_sparse_moe.experts.%d.w2.weight" % e) + f".block_sparse_moe.experts.{e}.w2.weight") w2_param = p[e] assert w2_name not in new_weights new_weights[w2_name] = w2_param diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 61baa8e5..e15ac84a 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): # For 1) text-only prefill and decode, 2) image-present decode. if image_inputs is None: full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( - input_ids.device) + attn_metadata.encoder_seq_lens_tensor + != 0).reshape(-1, 1).to(input_ids.device) skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 # For image-present prefill. 
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index d49da5f2..f1d796ca 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module): if self.tie_weights: assert ( - self.n_predict > - 1), "You cannot tie weights between stages when only 1 exists" + self.n_predict > 1 + ), "You cannot tie weights between stages when only 1 exists" embedding = VocabParallelEmbedding( config.vocab_size, self.inner_dim, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 881c09ea..6367b770 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): # compute mask for sparsity mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) @@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8d2719ca..8d71b190 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,8 @@ class _ModelRegistry: ModelRegistry = _ModelRegistry({ - model_arch: _LazyRegisteredModel( + model_arch: + _LazyRegisteredModel( module_name=f"vllm.model_executor.models.{mod_relname}", class_name=cls_name, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d577e545..605a0ecf 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder): return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, - info=UltravoxProcessingInfo, - dummy_inputs=UltravoxDummyInputsBuilder - ) +@MULTIMODAL_REGISTRY.register_processor( + UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 43b3c973..01a232fd 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): device: torch.device, ) -> IntermediateTensors: return IntermediateTensors({ - key: torch.zeros((batch_size, hidden_size), - dtype=dtype, - device=device) + key: + torch.zeros((batch_size, hidden_size), dtype=dtype, device=device) for key in keys }) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84e..61e8881b 100644 --- 
a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -166,7 +166,8 @@ class SamplingMetadata: pin_memory=pin_memory, ) categorized_sample_indices = { - t: async_tensor_h2d( + t: + async_tensor_h2d( seq_ids, dtype=torch.int, target_device=device, @@ -198,8 +199,12 @@ def _prepare_seq_groups( device: str, generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, - List[int]], int, ]: +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], + int, +]: """Prepare sequence groups and indices for sampling. Args: diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index ead3dab0..23a7126f 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -38,8 +38,8 @@ class NeuronPlatform(Platform): if parallel_config.world_size > 1: parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config is - None), "LoRA is not supported for Neuron backend." + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9d711b0d..20063a5b 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -121,8 +121,8 @@ class ScalarType: min_raw = max_raw | sign_bit_double return struct.unpack('!d', struct.pack('!Q', min_raw))[0] else: - assert (not self.is_signed() or - self.size_bits <= 64), "Cannot represent min as a int64_t" + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" if self.is_signed(): return -(1 << (self.size_bits - 1)) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8e9802c7..af1c4dfc 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. 
- return (execute_model_req.running_queue_size >= - self.disable_by_batch_size) + return (execute_model_req.running_queue_size + >= self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 5a7999a2..6bf7587c 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer): sampler_transposed=transposed, ) - proposals = SpeculativeProposals( - proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output is None) + proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + no_proposals=maybe_sampler_output + is None) return proposals def _split_by_proposal_len( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da870665..c88820ab 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -40,13 +40,15 @@ def get_sampled_token_logprobs( """ num_steps, batch_size, vocab_size = logprob_tensor.shape - selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, ] + selected_logprobs = logprob_tensor[ + torch.arange(num_steps).unsqueeze(1), + torch.arange(batch_size), + sampled_token_ids, + ] expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor > - expanded_selected_logprobs).sum(-1).add_(1) + sampled_token_ids_ranks = (logprob_tensor + > expanded_selected_logprobs).sum(-1).add_(1) return sampled_token_ids_ranks, selected_logprobs diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 93fec667..1edf3632 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len( + self.rope_scaling) != 2: raise ValueError( "`rope_scaling` must be a dictionary with two fields, " f"`type` and `factor`, got {self.rope_scaling}") diff --git a/vllm/utils.py b/vllm/utils.py index 17bffd28..15481fb0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import OrderedDict, UserDict, defaultdict from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field -from functools import lru_cache, partial, wraps +from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, @@ -352,7 +352,7 @@ class PyObjectCache: self._index = 0 -@lru_cache(maxsize=None) +@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops @@ -697,7 +697,7 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache(maxsize=None) +@cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -886,7 +886,7 
@@ def init_cached_hf_modules() -> None: init_hf_modules() -@lru_cache(maxsize=None) +@cache def find_library(lib_name: str) -> str: """ Find the library file in the system. @@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module -@lru_cache(maxsize=None) +@cache def get_vllm_optional_dependencies(): metadata = importlib.metadata.metadata("vllm") requirements = metadata.get_all("Requires-Dist", []) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index de7fb1a6..7a88cc94 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -247,8 +247,8 @@ class Scheduler: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens + < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 500bc356..902800e0 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -311,8 +311,8 @@ class RequestStats: return [] latency_s_lst = [] for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] >= - self.output_token_ts_s_lst[i - 1]) + assert (self.output_token_ts_s_lst[i] + >= self.output_token_ts_s_lst[i - 1]) latency_s = (self.output_token_ts_s_lst[i] - self.output_token_ts_s_lst[i - 1]) latency_s_lst.append(latency_s) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d7e3007..a00c00c3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -205,7 +205,7 @@ class GPUModelRunner: def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. - # Keep the states of the pre-empted requests. + # Keep the states of the preempted requests. 
for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 3c570212..aaf9cb40 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -173,13 +173,13 @@ class HPUWorker(LocalOrDistributedWorkerBase): cpu_fallback_ctx as cpu_fallback_local_metric: output = LocalOrDistributedWorkerBase.execute_model( self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: + if (log_graph_compilation and gc_local_metric.stats()[0][1] + > 0) or log_graph_compilation_all: msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " f"{gc_local_metric.stats()}, {input_stats}") logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] + > 0) or log_cpu_fallbacks_all: msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " f"{cpu_fallback_local_metric.stats()}, {input_stats}") logger.warning(msg) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a3f648f4..87495182 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -316,8 +316,8 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) num_tokens = batch_size * seq_len - if (num_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (num_tokens + >= self.scheduler_config.max_num_batched_tokens): break seq_len = seq_len * 2 end = time.time()
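(For reference, not part of the patch: most of the Python churn above is one mechanical reformat from the yapf bump, in which long comparisons are wrapped with the binary operator leading the continuation line, the layout PEP 8 recommends, instead of trailing the previous line. A standalone before/after sketch with made-up values, not code taken from vLLM:)

num_batched_tokens = 96            # illustrative values only
max_num_batched_tokens = 128

# Wrapping produced by the older formatter: the operator ends the first line.
over_budget_old = (num_batched_tokens >=
                   max_num_batched_tokens)

# Wrapping applied throughout this patch: the operator starts the
# continuation line.
over_budget_new = (num_batched_tokens
                   >= max_num_batched_tokens)

assert over_budget_old == over_budget_new  # purely cosmetic; same expression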