Update pre-commit hooks (#12475)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Authored by Harry Mellor on 2025-01-28 00:23:08 +00:00; committed by GitHub
parent 6116ca8cd7
commit 823ab79633
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
64 changed files with 322 additions and 288 deletions

View File

@@ -3,18 +3,18 @@ default_stages:
 - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
-rev: v0.32.0
+rev: v0.43.0
 hooks:
 - id: yapf
 args: [--in-place, --verbose]
 additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.6.5
+rev: v0.9.3
 hooks:
 - id: ruff
 args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
-rev: v2.3.0
+rev: v2.4.0
 hooks:
 - id: codespell
 exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
@@ -23,7 +23,7 @@ repos:
 hooks:
 - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-rev: v18.1.5
+rev: v19.1.7
 hooks:
 - id: clang-format
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
 - id: pymarkdown
 files: docs/.*
 - repo: https://github.com/rhysd/actionlint
-rev: v1.7.6
+rev: v1.7.7
 hooks:
 - id: actionlint
 - repo: local

View File

@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
 )
 # Traffic
-result_json["request_rate"] = (
-args.request_rate if args.request_rate < float("inf") else "inf")
+result_json["request_rate"] = (args.request_rate if args.request_rate
+< float("inf") else "inf")
 result_json["burstiness"] = args.burstiness
 result_json["max_concurrency"] = args.max_concurrency

View File

@@ -38,9 +38,13 @@ struct Signal {
 alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };
-struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
+struct __align__(16) RankData {
+const void* __restrict__ ptrs[8];
+};
-struct __align__(16) RankSignals { Signal* signals[8]; };
+struct __align__(16) RankSignals {
+Signal* signals[8];
+};
 // like std::array, but aligned
 template <typename T, int sz>

View File

@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 const int SUB = 0x64006400;
 const int MUL = 0x2c002c00;

View File

@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 q >>= 4;
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 typename ScalarType<nv_bfloat16>::FragB frag_b;
 static constexpr uint32_t MUL = 0x3F803F80;
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 const int SUB = 0x64006400;
 const int MUL = 0x2c002c00;
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 q >>= 4;
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 typename ScalarType<nv_bfloat16>::FragB frag_b;
 static constexpr uint32_t MUL = 0x3F803F80;

View File

@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;

View File

@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
 static constexpr uint32_t HI = 0x00f000f0;
 static constexpr uint32_t EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 static constexpr uint32_t SUB = 0x64086408;

View File

@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;

View File

@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
 // max_num_partitions, head_size]
 const int* __restrict__ context_lens, // [num_seqs]
-const int max_num_partitions){UNREACHABLE_CODE}
+const int max_num_partitions) {
+UNREACHABLE_CODE
+}
 #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

View File

@@ -417,7 +417,7 @@ def get_rocm_version():
 if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
 ctypes.byref(patch)) == 0):
-return "%d.%d.%d" % (major.value, minor.value, patch.value)
+return f"{major.value}.{minor.value}.{patch.value}"
 return None
 except Exception:
 return None
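The change above swaps printf-style `%` formatting for an f-string; both spellings produce the same string. A minimal, self-contained sketch (the version numbers here are made up for illustration):

    major, minor, patch = 6, 2, 4  # hypothetical version components
    old_style = "%d.%d.%d" % (major, minor, patch)
    new_style = f"{major}.{minor}.{patch}"
    assert old_style == new_style == "6.2.4"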

View File

@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
 A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
 ]
 B_tiles = [[
-B[j * block_n:min((j + 1) * block_n, N),
-i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
+B[
+j * block_n:min((j + 1) * block_n, N),
+i * block_k:min((i + 1) * block_k, K),
+] for i in range(k_tiles)
 ] for j in range(n_tiles)]
 C_tiles = [
 C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
@@ -157,9 +159,9 @@ def setup_cuda():
 torch.set_default_device("cuda")
-@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
-itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
-SEEDS))
+@pytest.mark.parametrize(
+"num_tokens,d,dtype,group_size,seed",
+itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
 @torch.inference_mode()
 def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
 torch.manual_seed(seed)
@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
 assert torch.allclose(scale, ref_scale)
-@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
-itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
-SEEDS))
+@pytest.mark.parametrize(
+"M,N,K,block_size,out_dtype,seed",
+itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 torch.manual_seed(seed)
@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 assert rel_diff < 0.001
-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
-itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
-BLOCK_SIZE, DTYPES, SEEDS))
+@pytest.mark.parametrize(
+"M,N,K,E,topk,block_size,dtype,seed",
+itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
+SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
 torch.manual_seed(seed)
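The decorator rewrites above only change how the `pytest.mark.parametrize` calls are wrapped; the cartesian-product parametrization itself is untouched. A self-contained sketch of the same pattern, with toy values rather than the test suite's real constants:

    import itertools

    import pytest

    NUM_TOKENS = [1, 7]      # toy values, not the real test constants
    GROUP_SIZE = [64, 128]

    @pytest.mark.parametrize(
        "num_tokens,group_size",
        itertools.product(NUM_TOKENS, GROUP_SIZE))
    def test_product_parametrization(num_tokens, group_size):
        # pytest generates one test case per (num_tokens, group_size) pair.
        assert num_tokens > 0 and group_size > 0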

View File

@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
 assert buffer.buffer_size == 0
 assert len(buffer.buffer) == 0
-print("My rank: %d, device: %s" % (my_rank, device))
+print(f"My rank: {my_rank}, device: {device}")
 # insert
 tokens = torch.tensor([1, 2, 3]).to(device)
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
 assert buffer.buffer_size == 0
 assert len(buffer.buffer) == 0
-print("My rank: %d, Test run passed!" % (my_rank))
+print(f"My rank: {my_rank}, Test run passed!")
 def stress_test(my_rank, buf, device):
@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
 assert torch.allclose(k, k_)
 assert torch.allclose(v, v_)
 assert torch.allclose(h, h_)
-print('Rank %d done' % my_rank)
+print(f"Rank {my_rank} done")
 torch.distributed.barrier()
 if my_rank == 0:
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
 else:
 torch.distributed.send(torch.tensor([n]), 0)
-print("My rank: %d, Passed stress test!" % (my_rank))
+print(f"My rank: {my_rank}, Passed stress test!")
 if __name__ == "__main__":
@@ -122,7 +122,7 @@ if __name__ == "__main__":
 rank=my_rank,
 )
-print("initialized! My rank is %d" % my_rank)
+print(f"initialized! My rank is {my_rank}")
 config = KVTransferConfig(
 kv_connector='PyNcclConnector',

View File

@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
 return generated_texts
-@pytest.mark.xfail(current_platform.is_rocm(),
-reason="Qwen2-VL dependency xformers incompatible with ROCm"
-)
+@pytest.mark.xfail(
+current_platform.is_rocm(),
+reason="Qwen2-VL dependency xformers incompatible with ROCm")
 def test_qwen2vl_lora(qwen2vl_lora_files):
 llm = vllm.LLM(
 MODEL_PATH,

View File

@@ -521,12 +521,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
 # - image embeddings
 # - video
 # - custom inputs
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.IMAGE,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.IMAGE,
+fork_new_process_for_each_test=False,
+))
 def test_single_image_models(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
@@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.MULTI_IMAGE,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.MULTI_IMAGE,
+fork_new_process_for_each_test=False,
+))
 def test_multi_image_models(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
@@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.EMBEDDING,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.EMBEDDING,
+fork_new_process_for_each_test=False,
+))
 def test_image_embedding_models(model_type: str,
 test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
@@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.VIDEO,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.VIDEO,
+fork_new_process_for_each_test=False,
+))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
 video_assets: _VideoAssets):
@@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.CUSTOM_INPUTS,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.CUSTOM_INPUTS,
+fork_new_process_for_each_test=False,
+))
 def test_custom_inputs_models(
 model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -627,12 +632,13 @@ def test_custom_inputs_models(
 #### Tests filtering for things running each test as a new process
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.IMAGE,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.IMAGE,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.MULTI_IMAGE,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.MULTI_IMAGE,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.EMBEDDING,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.EMBEDDING,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_image_embedding_models_heavy(model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.VIDEO,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.VIDEO,
+fork_new_process_for_each_test=True,
+))
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
 vllm_runner: Type[VllmRunner],
@@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
 )
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.CUSTOM_INPUTS,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.CUSTOM_INPUTS,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_custom_inputs_models_heavy(
 model_type: str,

View File

@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
 outputs: OutputsLogprobs,
 filename: "StrPath",
 ) -> None:
-json_data = [(tokens, text,
-[{k: asdict(v)
-for k, v in token_logprobs.items()}
-for token_logprobs in (logprobs or [])])
+json_data = [(tokens, text, [{
+k: asdict(v)
+for k, v in token_logprobs.items()
+} for token_logprobs in (logprobs or [])])
 for tokens, text, logprobs in outputs]
 with open(filename, "w") as f:
@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
 with open(filename, "rb") as f:
 json_data = json.load(f)
-return [(tokens, text,
-[{int(k): Logprob(**v)
-for k, v in token_logprobs.items()}
-for token_logprobs in logprobs])
-for tokens, text, logprobs in json_data]
+return [(tokens, text, [{
+int(k): Logprob(**v)
+for k, v in token_logprobs.items()
+} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
 @large_gpu_test(min_gb=80)

View File

@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-reason="2of4 Sparse is not yet supported on this GPU type."
-)
+@pytest.mark.skipif(
+not sparse_cutlass_supported(),
+reason="2of4 Sparse is not yet supported on this GPU type.")
 @pytest.mark.parametrize(
 "args_2of4",
 [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])

View File

@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
 """
 batch_size = last_accepted_indices.shape[0]
-accepted = (torch.arange(k).expand(batch_size, k) <=
-last_accepted_indices.unsqueeze(-1).broadcast_to(
+accepted = (torch.arange(k).expand(batch_size, k)
+<= last_accepted_indices.unsqueeze(-1).broadcast_to(
 batch_size, k))
 # Sprinkle accepted values after the contiguous initial accepted values.
 # This replicates the behavior of rejection sampling, which may "accept"
 # a token that cannot be accepted because of causality.
-sprinkle_candidates = (
-torch.arange(k).expand(batch_size, k) >
-last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
+sprinkle_candidates = (torch.arange(k).expand(
+batch_size,
+k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
+1)
 sprinkle = torch.rand(batch_size, k) > 0.5
 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
 return accepted
@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
 distance_wrt_reference)
 expected_improvement_multiplier = 20
-assert (relative_change_in_distance_wrt_target >
-relative_change_in_distance_wrt_reference *
+assert (relative_change_in_distance_wrt_target
+> relative_change_in_distance_wrt_reference *
 expected_improvement_multiplier)
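The two hunks above show the wrapping change that recurs throughout this commit: long comparisons are now broken before the operator, so the operator leads the continuation line instead of dangling at the end of the previous one. A minimal before/after sketch with hypothetical values:

    observed_improvement = 4.0          # hypothetical values for illustration
    reference_improvement = 0.1
    expected_multiplier = 20

    # old wrapping: operator trails the first line
    accept_old = (observed_improvement >
                  reference_improvement * expected_multiplier)

    # new wrapping: operator starts the continuation line
    accept_new = (observed_improvement
                  > reference_improvement * expected_multiplier)

    assert accept_old == accept_new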

View File

@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
 print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
 'parallelism)'.format(length, total_cpu_time,
 total_cpu_time * 1.0 / length))
-print(' %d build steps completed, average of %1.2f/s' %
-(len(entries), len(entries) / (length)))
+print(' {} build steps completed, average of {:1.2f}/s'.format(
+len(entries),
+len(entries) / (length)))
 def main():

View File

@@ -820,8 +820,8 @@ def scaled_int8_quant(
 if scale is not None:
 # static-per-tensor quantization.
 assert symmetric == (
-azp is
-None), "azp must only be provided for asymmetric quantization."
+azp
+is None), "azp must only be provided for asymmetric quantization."
 torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
 return output, scale, azp

View File

@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0":
 float("-inf"))
 if SLIDING_WINDOW > 0:
 qk = tl.where(
-offs_m[:, None] -
-(start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
+offs_m[:, None] - (start_n + offs_n[None, :])
+< SLIDING_WINDOW, qk, -10000)
 # -- compute m_ij, p, l_ij
 m_ij = tl.max(qk, 1)
@@ -324,10 +324,10 @@ if triton.__version__ >= "2.1.0":
 (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
 cur_head * stride_qh + offs_d[None, :] * stride_qd)
-q = tl.load(
-Q + off_q,
-mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+q = tl.load(Q + off_q,
+mask=offs_m[:, None]
+< cur_batch_seq_len - cur_batch_ctx_len,
 other=0.0)
 # # initialize pointer to m and l
 m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0":
 # -- compute qk ----
 k = tl.load(k_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_kbs,
-mask=(start_n + offs_n[None, :]) <
-cur_batch_seq_len - cur_batch_ctx_len,
+mask=(start_n + offs_n[None, :])
+< cur_batch_seq_len - cur_batch_ctx_len,
 other=0.0)
 qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0":
 # update acc
 v = tl.load(v_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_vbs,
-mask=(start_n + offs_n[:, None]) <
-cur_batch_seq_len - cur_batch_ctx_len,
+mask=(start_n + offs_n[:, None])
+< cur_batch_seq_len - cur_batch_ctx_len,
 other=0.0)
 p = p.to(v.dtype)
@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0":
 k = tl.load(k_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_kbs,
 mask=dim_mask[:, None] &
-((start_n + offs_n[None, :]) <
-cur_batch_seq_len - cur_batch_ctx_len),
+((start_n + offs_n[None, :])
+< cur_batch_seq_len - cur_batch_ctx_len),
 other=0.0)
 qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0":
 v = tl.load(v_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_vbs,
 mask=dim_mask[None, :] &
-((start_n + offs_n[:, None]) <
-cur_batch_seq_len - cur_batch_ctx_len),
+((start_n + offs_n[:, None])
+< cur_batch_seq_len - cur_batch_ctx_len),
 other=0.0)
 p = p.to(v.dtype)

View File

@@ -627,8 +627,8 @@ def attn_fwd(
 causal_start_idx,
 dtype=tl.int32)
 mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
-out_ptrs_mask = (mask_m_offsets[:, None] >=
-out_mask_boundary[None, :])
+out_ptrs_mask = (mask_m_offsets[:, None]
+>= out_mask_boundary[None, :])
 z = 0.0
 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
 # write back LSE

View File

@@ -1,6 +1,6 @@
 import os
 from contextlib import contextmanager
-from functools import lru_cache
+from functools import cache
 from typing import Generator, Optional, Type
 import torch
@@ -100,7 +100,7 @@ def get_attn_backend(
 )
-@lru_cache(maxsize=None)
+@cache
 def _cached_get_attn_backend(
 head_size: int,
 dtype: torch.dtype,
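`functools.cache`, used above in place of `lru_cache(maxsize=None)`, is simply the unbounded spelling of the same memoization decorator (available since Python 3.9), so the behavior of the cached backend lookup is unchanged. A small sketch of the equivalence:

    from functools import cache, lru_cache

    @cache                    # equivalent to @lru_cache(maxsize=None)
    def fib(n: int) -> int:
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    @lru_cache(maxsize=None)  # the older spelling replaced in this commit
    def fib_old(n: int) -> int:
        return n if n < 2 else fib_old(n - 1) + fib_old(n - 2)

    assert fib(30) == fib_old(30) == 832040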

View File

@@ -67,7 +67,8 @@ _RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = {
 _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = {
 task: runner
-for runner, tasks in _RUNNER_TASKS.items() for task in tasks
+for runner, tasks in _RUNNER_TASKS.items()
+for task in tasks
 }
 HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig],
@@ -1976,8 +1977,8 @@ class SpeculativeConfig:
 "typical_acceptance_sampler.")
 if (self.draft_token_acceptance_method != 'rejection_sampler'
-and self.draft_token_acceptance_method !=
-'typical_acceptance_sampler'):
+and self.draft_token_acceptance_method
+!= 'typical_acceptance_sampler'):
 raise ValueError(
 "Expected draft_token_acceptance_method to be either "
 "rejection_sampler or typical_acceptance_sampler. Instead it "

View File

@@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol):
 def __init__(self, all_block_indices: Iterable[BlockId]):
 deduped = set(all_block_indices)
-self._refcounts: Dict[BlockId,
-RefCount] = {index: 0
-for index in deduped}
+self._refcounts: Dict[BlockId, RefCount] = {
+index: 0
+for index in deduped
+}
 def incr(self, block_id: BlockId) -> RefCount:
 assert block_id in self._refcounts

View File

@@ -136,8 +136,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
 device=Device.GPU)
 # Use watermark to avoid frequent cache eviction.
-if (self.num_total_gpu_blocks - num_required_blocks <
-self.watermark_blocks):
+if (self.num_total_gpu_blocks - num_required_blocks
+< self.watermark_blocks):
 return AllocStatus.NEVER
 if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
 return AllocStatus.OK

View File

@@ -988,8 +988,8 @@ class Scheduler:
 waiting_queue.popleft()
 continue
-if (budget.num_batched_tokens >=
-self.scheduler_config.max_num_batched_tokens):
+if (budget.num_batched_tokens
+>= self.scheduler_config.max_num_batched_tokens):
 # We've reached the budget limit - since there might be
 # continuous prefills in the running queue, we should break
 # to avoid scheduling any new prefills.
@@ -1096,8 +1096,8 @@ class Scheduler:
 running_scheduled.swapped_out) == 0:
 swapped_in = self._schedule_swapped(budget, curr_loras)
-assert (budget.num_batched_tokens <=
-self.scheduler_config.max_num_batched_tokens)
+assert (budget.num_batched_tokens
+<= self.scheduler_config.max_num_batched_tokens)
 assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
 # Update waiting requests.
@@ -1189,8 +1189,8 @@ class Scheduler:
 curr_loras,
 enable_chunking=True)
-assert (budget.num_batched_tokens <=
-self.scheduler_config.max_num_batched_tokens)
+assert (budget.num_batched_tokens
+<= self.scheduler_config.max_num_batched_tokens)
 assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
 # Update waiting requests.
@@ -1358,8 +1358,8 @@ class Scheduler:
 # NOTE: We use get_len instead of get_prompt_len because when
 # a sequence is preempted, prefill includes previous generated
 # output tokens.
-if (token_chunk_size + num_computed_tokens <
-seqs[0].data.get_len()):
+if (token_chunk_size + num_computed_tokens
+< seqs[0].data.get_len()):
 do_sample = False
 # It assumes the scheduled_seq_groups is ordered by
@@ -1625,10 +1625,9 @@ class Scheduler:
 if self.scheduler_config.delay_factor > 0 and self.waiting:
 earliest_arrival_time = min(
 [e.metrics.arrival_time for e in self.waiting])
-passed_delay = (
-(now - earliest_arrival_time) >
-(self.scheduler_config.delay_factor * self.last_prompt_latency)
-or not self.running)
+passed_delay = ((now - earliest_arrival_time)
+> (self.scheduler_config.delay_factor *
+self.last_prompt_latency) or not self.running)
 else:
 passed_delay = True
 return passed_delay

View File

@@ -352,8 +352,8 @@ class MessageQueue:
 sched_yield()
 # if we wait for a long time, log a message
-if (time.monotonic() - start_time >
-VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
+if (time.monotonic() - start_time
+> VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
 logger.debug("No available block found in %s second. ",
 VLLM_RINGBUFFER_WARNING_INTERVAL)
 n_warning += 1
@@ -410,8 +410,8 @@ class MessageQueue:
 sched_yield()
 # if we wait for a long time, log a message
-if (time.monotonic() - start_time >
-VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
+if (time.monotonic() - start_time
+> VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
 logger.debug("No available block found in %s second. ",
 VLLM_RINGBUFFER_WARNING_INTERVAL)
 n_warning += 1

View File

@@ -1014,8 +1014,8 @@ def initialize_model_parallel(
 backend = backend or torch.distributed.get_backend(
 get_world_group().device_group)
-if (world_size !=
-tensor_model_parallel_size * pipeline_model_parallel_size):
+if (world_size
+!= tensor_model_parallel_size * pipeline_model_parallel_size):
 raise RuntimeError(
 f"world_size ({world_size}) is not equal to "
 f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
@@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
 return
 if all([
-vllm_config.kv_transfer_config.need_kv_parallel_group,
-_KV_TRANSFER is None
+vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER
+is None
 ]):
 _KV_TRANSFER = kv_transfer.KVTransferAgent(
 rank=get_world_group().rank,

View File

@@ -3,7 +3,7 @@ import codecs
 import json
 from abc import ABC, abstractmethod
 from collections import defaultdict, deque
-from functools import lru_cache, partial
+from functools import cache, lru_cache, partial
 from pathlib import Path
 from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
 Literal, Optional, Tuple, TypeVar, Union, cast)
@@ -377,7 +377,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 return self._model_config.allowed_local_media_path
 @staticmethod
-@lru_cache(maxsize=None)
+@cache
 def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
 return tokenizer.decode(token_index)

View File

@@ -522,11 +522,10 @@ class OpenAIServingCompletion(OpenAIServing):
 out_top_logprobs.append({
 # Convert float("-inf") to the
 # JSON-serializable float that OpenAI uses
-self._get_decoded_token(
-top_lp[1],
-top_lp[0],
-tokenizer,
-return_as_token_id=self.return_tokens_as_token_ids):
+self._get_decoded_token(top_lp[1],
+top_lp[0],
+tokenizer,
+return_as_token_id=self.return_tokens_as_token_ids):
 max(top_lp[1].logprob, -9999.0)
 for i, top_lp in enumerate(step_top_logprobs.items())
 if num_output_top_logprobs >= i

View File

@@ -62,8 +62,8 @@ class Granite20bFCToolParser(ToolParser):
 start_of_json = match.end()
 # end_index == the start of the next function call
 # (if exists)
-next_function_call_start = (matches[i + 1].start()
-if i + 1 < len(matches) else None)
+next_function_call_start = (matches[i + 1].start() if i +
+1 < len(matches) else None)
 raw_function_calls.append(
 dec.raw_decode(

View File

@@ -220,8 +220,10 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
 lora_b.T, non_blocking=True)
 if embeddings_tensor is not None:
 self.embeddings_tensors[
-index, :embeddings_tensor.shape[0], :embeddings_tensor.
-shape[1], ].copy_(embeddings_tensor, non_blocking=True)
+index,
+:embeddings_tensor.shape[0],
+:embeddings_tensor.shape[1],
+].copy_(embeddings_tensor, non_blocking=True)
 if self.embeddings_slice is not None:
 # TODO(yard1): Optimize this copy, we don't need to copy
 # everything, just the modified part
@@ -1024,8 +1026,10 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
 lora_b.T, non_blocking=True)
 if embeddings_tensor is not None:
 self.embeddings_tensors[
-index, :embeddings_tensor.shape[0], :embeddings_tensor.
-shape[1], ] = embeddings_tensor
+index,
+:embeddings_tensor.shape[0],
+:embeddings_tensor.shape[1],
+] = embeddings_tensor
 def _get_logits(
 self,

View File

@@ -75,8 +75,9 @@ class LoRAModel(AdapterModel):
 # Scaling factor for long context lora model. None if it is not
 # fine tuned for the long context.
 self.scaling_factor = scaling_factor
-assert (lora_model_id >
-0), f"a valid lora id should be greater than 0, got {self.id}"
+assert (
+lora_model_id
+> 0), f"a valid lora id should be greater than 0, got {self.id}"
 self.rank = rank
 self.loras: Dict[str, LoRALayerWeights] = loras

View File

@@ -136,9 +136,8 @@ def _sgmv_expand_kernel(
 c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride +
 offset_cn[None, :] * output_d1_stride)
 M = tl.load(seq_lens + cur_batch)
-c_mask = (offset_cm[:, None] <
-(cur_seq_start + M)) & (offset_cn[None, :] <
-(cur_slice_start + curr_N))
+c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (
+offset_cn[None, :] < (cur_slice_start + curr_N))
 if ADD_INPUTS:
 tiled_out = tl.load(c_ptr, mask=c_mask)
 tiled_c += tiled_out

View File

@@ -114,8 +114,8 @@ def _sgmv_shrink_kernel(
 slice_id * output_d0_stride)
 c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[
 None, :] * output_d2_stride
-c_mask = (offset_cm[:, None] <
-(cur_seq_start + M)) & (offset_cn[None, :] < N)
+c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :]
+< N)
 accumulator *= scaling
 # handles write-back with reduction-splitting
 if SPLIT_K == 1:

View File

@@ -73,12 +73,12 @@ class MPLinearKernel(ABC):
 torch.nn.Parameter(new_param.data, requires_grad=False))
 def _get_weight_params(
-self, layer: torch.nn.Module
-) -> Tuple[torch.Tensor, # w_q
+self, layer: torch.nn.Module) -> Tuple[
+torch.Tensor, # w_q
 torch.Tensor, # w_s
 Optional[torch.Tensor], # w_zp,
 Optional[torch.Tensor] # w_gidx
 ]:
 return (
 getattr(layer, self.w_q_name),
 getattr(layer, self.w_s_name),

View File

@@ -48,13 +48,13 @@ class ScaledMMLinearKernel(ABC):
 raise NotImplementedError
 def _get_weight_params(
-self, layer: torch.nn.Module
-) -> Tuple[torch.Tensor, # weight
+self, layer: torch.nn.Module) -> Tuple[
+torch.Tensor, # weight
 torch.Tensor, # weight_scale
 Optional[torch.Tensor], # input_scale,
 Optional[torch.Tensor], # input_zp
 Optional[torch.Tensor], # azp_adj
 ]:
 return (
 getattr(layer, self.w_q_name),
 getattr(layer, self.w_s_name),

View File

@@ -72,9 +72,10 @@ def block_quant_to_tensor_quant(
 x_dq_block = x_q_block.to(torch.float32)
 x_dq_block_tiles = [[
-x_dq_block[j * block_n:min((j + 1) * block_n, n),
-i * block_k:min((i + 1) * block_k, k), ]
-for i in range(k_tiles)
+x_dq_block[
+j * block_n:min((j + 1) * block_n, n),
+i * block_k:min((i + 1) * block_k, k),
+] for i in range(k_tiles)
 ] for j in range(n_tiles)]
 for i in range(k_tiles):

View File

@@ -73,8 +73,8 @@ def requantize_with_max_scale(
 # from disk in this case. Skip requantization in this case (since)
 # we already are quantized with the single scale.
 # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
-unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
-torch.float8_e4m3fn).min)
+unfused_module_in_checkpoint = (weight_scale[-1]
+> torch.finfo(torch.float8_e4m3fn).min)
 # If unfused checkpoint, need requanize with the single scale.
 if unfused_module_in_checkpoint:

View File

@@ -716,9 +716,10 @@ def _sample_with_torch(
 tensors required for Pythonization
 '''
-categorized_seq_group_ids: Dict[SamplingType,
-List[int]] = {t: []
-for t in SamplingType}
+categorized_seq_group_ids: Dict[SamplingType, List[int]] = {
+t: []
+for t in SamplingType
+}
 categorized_sample_indices = sampling_metadata.categorized_sample_indices
 for i, seq_group in enumerate(sampling_metadata.seq_groups):
 sampling_params = seq_group.sampling_params

View File

@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
 def __post_init__(self):
 # sanity checks
-assert (self.padded_org_vocab_start_index <=
-self.padded_org_vocab_end_index)
-assert (self.padded_added_vocab_start_index <=
-self.padded_added_vocab_end_index)
+assert (self.padded_org_vocab_start_index
+<= self.padded_org_vocab_end_index)
+assert (self.padded_added_vocab_start_index
+<= self.padded_added_vocab_end_index)
 assert self.org_vocab_start_index <= self.org_vocab_end_index
 assert self.added_vocab_start_index <= self.added_vocab_end_index
 assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-assert (self.added_vocab_start_index <=
-self.padded_added_vocab_start_index)
+assert (self.added_vocab_start_index
+<= self.padded_added_vocab_start_index)
 assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
 assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
 added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
 # torch.compile will fuse all of the pointwise ops below
 # into a single kernel, making it very fast
-org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
-org_vocab_end_index)
+org_vocab_mask = (input_ >= org_vocab_start_index) & (
+input_ < org_vocab_end_index)
 added_vocab_mask = (input_ >= added_vocab_start_index) & (
 input_ < added_vocab_end_index)
 added_offset = added_vocab_start_index - (

View File

@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 # from being incorrectly identified as being present in
 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
 shard_pos = quant_param_name.find(shard_name)
-can_correct_rename = (shard_pos > 0) and (
-quant_param_name[shard_pos - 1] == ".")
+can_correct_rename = (shard_pos
+> 0) and (quant_param_name[shard_pos - 1]
+== ".")
 # If the quant_param_name is packed, it won't occur in the
 # param_dict before renaming.
 new_quant_param_name = quant_param_name.replace(

View File

@@ -298,8 +298,8 @@ class TensorizerAgent:
 to allow for adapter added tokens."""
 for child in self.model.modules():
 if (isinstance(child, VocabParallelEmbedding)
-and child.weight.shape[0] <
-child.num_embeddings_per_partition):
+and child.weight.shape[0]
+< child.num_embeddings_per_partition):
 new_weight = torch.empty(child.num_embeddings_per_partition,
 child.embedding_dim,
 dtype=child.weight.dtype,

View File

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cache
 from typing import Iterable, List, Optional, Set, Tuple, Union
 import torch
@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
 logger = init_logger(__name__)
-@lru_cache(maxsize=None)
+@cache
 def _get_gemma_act_fn(
 hidden_act: Optional[str],
 hidden_activation: Optional[str],

View File

@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 for e in range(p.size(0)):
 w1_name = n.replace(
 '.block_sparse_moe.input_linear.weight',
-".block_sparse_moe.experts.%d.w1.weight" % e)
+f".block_sparse_moe.experts.{e}.w1.weight")
 w3_name = n.replace(
 '.block_sparse_moe.input_linear.weight',
-".block_sparse_moe.experts.%d.w3.weight" % e)
+f".block_sparse_moe.experts.{e}.w3.weight")
 w1_param, w3_param = p[e].chunk(2, dim=0)
 assert w1_name not in new_weights
 assert w3_name not in new_weights
@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 for e in range(p.size(0)):
 w2_name = n.replace(
 '.block_sparse_moe.output_linear.weight',
-".block_sparse_moe.experts.%d.w2.weight" % e)
+f".block_sparse_moe.experts.{e}.w2.weight")
 w2_param = p[e]
 assert w2_name not in new_weights
 new_weights[w2_name] = w2_param

View File

@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
 # For 1) text-only prefill and decode, 2) image-present decode.
 if image_inputs is None:
 full_text_row_masked_out_mask = (
-attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
-input_ids.device)
+attn_metadata.encoder_seq_lens_tensor
+!= 0).reshape(-1, 1).to(input_ids.device)
 skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
 # For image-present prefill.

View File

@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
 if self.tie_weights:
 assert (
-self.n_predict >
-1), "You cannot tie weights between stages when only 1 exists"
+self.n_predict > 1
+), "You cannot tie weights between stages when only 1 exists"
 embedding = VocabParallelEmbedding(
 config.vocab_size,
 self.inner_dim,

View File

@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
 # compute mask for sparsity
 mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
 factor = scores.abs().clamp(min=mask_logits_threshold)
-mask_logits_threshold = (
-(mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+mask_logits_threshold = ((mask_logits_threshold - scores) /
+factor) > (2 * jitter_eps)
 # apply mask
 masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
 mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
 keepdim=True)
 factor = scores.abs().clamp(min=mask_logits_threshold)
-mask_logits_threshold = (
-(mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+mask_logits_threshold = ((mask_logits_threshold - scores) /
+factor) > (2 * jitter_eps)
 # apply mask
 masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,

View File

@@ -462,7 +462,8 @@ class _ModelRegistry:
 ModelRegistry = _ModelRegistry({
-model_arch: _LazyRegisteredModel(
+model_arch:
+_LazyRegisteredModel(
 module_name=f"vllm.model_executor.models.{mod_relname}",
 class_name=cls_name,
 )

View File

@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
 return hidden_states
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
-info=UltravoxProcessingInfo,
-dummy_inputs=UltravoxDummyInputsBuilder
-)
+@MULTIMODAL_REGISTRY.register_processor(
+UltravoxMultiModalProcessor,
+info=UltravoxProcessingInfo,
+dummy_inputs=UltravoxDummyInputsBuilder)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 hf_to_vllm_mapper = WeightsMapper(

View File

@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
 device: torch.device,
 ) -> IntermediateTensors:
 return IntermediateTensors({
-key: torch.zeros((batch_size, hidden_size),
-dtype=dtype,
-device=device)
+key:
+torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
 for key in keys
 })

View File

@@ -166,7 +166,8 @@ class SamplingMetadata:
             pin_memory=pin_memory,
         )
         categorized_sample_indices = {
-            t: async_tensor_h2d(
+            t:
+            async_tensor_h2d(
                 seq_ids,
                 dtype=torch.int,
                 target_device=device,
@@ -198,8 +199,12 @@ def _prepare_seq_groups(
     device: str,
     generators: Optional[Dict[str, torch.Generator]] = None,
     cache: Optional[SamplingMetadataCache] = None,
-) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
-                                                        List[int]], int, ]:
+) -> Tuple[
+        List[SequenceGroupToSample],
+        List[int],
+        Dict[SamplingType, List[int]],
+        int,
+]:
     """Prepare sequence groups and indices for sampling.

     Args:

View File

@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
         if parallel_config.world_size > 1:
             parallel_config.distributed_executor_backend = "uni"
-        assert (vllm_config.lora_config is
-                None), "LoRA is not supported for Neuron backend."
+        assert (vllm_config.lora_config
+                is None), "LoRA is not supported for Neuron backend."
         assert (not vllm_config.speculative_config
                 ), "Speculative decoding not yet supported for Neuron backend."

View File

@@ -121,8 +121,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
-                    self.size_bits <= 64), "Cannot represent min as a int64_t"
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"

             if self.is_signed():
                 return -(1 << (self.size_bits - 1))

View File

@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             self, execute_model_req: ExecuteModelRequest) -> bool:
         # When the batch size is too large, disable speculative decoding
         # to stop trading off throughput for latency.
-        return (execute_model_req.running_queue_size >=
-                self.disable_by_batch_size)
+        return (execute_model_req.running_queue_size
+                >= self.disable_by_batch_size)

     def _maybe_disable_speculative_tokens(
             self, disable_all_speculation: bool,

View File

@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
                 sampler_transposed=transposed,
             )

-        proposals = SpeculativeProposals(
-            proposal_token_ids=proposal_tokens,
-            proposal_probs=proposal_probs,
-            proposal_lens=proposal_lens,
-            no_proposals=maybe_sampler_output is None)
+        proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
+                                         proposal_probs=proposal_probs,
+                                         proposal_lens=proposal_lens,
+                                         no_proposals=maybe_sampler_output
+                                         is None)
         return proposals

     def _split_by_proposal_len(

View File

@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
     """
     num_steps, batch_size, vocab_size = logprob_tensor.shape
-    selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
-                                       torch.arange(batch_size),
-                                       sampled_token_ids, ]
+    selected_logprobs = logprob_tensor[
+        torch.arange(num_steps).unsqueeze(1),
+        torch.arange(batch_size),
+        sampled_token_ids,
+    ]
     expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
         -1, -1, vocab_size)
-    sampled_token_ids_ranks = (logprob_tensor >
-                               expanded_selected_logprobs).sum(-1).add_(1)
+    sampled_token_ids_ranks = (logprob_tensor
+                               > expanded_selected_logprobs).sum(-1).add_(1)

     return sampled_token_ids_ranks, selected_logprobs

View File

@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
         if self.rope_scaling is None:
             return

-        if not isinstance(self.rope_scaling,
-                          dict) or len(self.rope_scaling) != 2:
+        if not isinstance(self.rope_scaling, dict) or len(
+                self.rope_scaling) != 2:
             raise ValueError(
                 "`rope_scaling` must be a dictionary with two fields, "
                 f"`type` and `factor`, got {self.rope_scaling}")

View File

@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import OrderedDict, UserDict, defaultdict
 from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
-from functools import lru_cache, partial, wraps
+from functools import cache, lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generator, Generic, Iterator, List, Literal,
                     NamedTuple, Optional, Tuple, Type, TypeVar, Union,
@@ -352,7 +352,7 @@ class PyObjectCache:
         self._index = 0


-@lru_cache(maxsize=None)
+@cache
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
     from vllm import _custom_ops as ops
@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
     return key_caches, value_caches


-@lru_cache(maxsize=None)
+@cache
 def is_pin_memory_available() -> bool:
     from vllm.platforms import current_platform
     return current_platform.is_pin_memory_available()
@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
     init_hf_modules()


-@lru_cache(maxsize=None)
+@cache
 def find_library(lib_name: str) -> str:
     """
     Find the library file in the system.
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     return module


-@lru_cache(maxsize=None)
+@cache
 def get_vllm_optional_dependencies():
     metadata = importlib.metadata.metadata("vllm")
     requirements = metadata.get_all("Requires-Dist", [])
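Aside on the @cache hunks above: functools.cache (Python 3.9+) is documented as returning the same wrapper as lru_cache(maxsize=None), so the decorated helpers keep their unbounded memoization. A minimal sketch of the equivalence, using hypothetical functions rather than vLLM code:

from functools import cache, lru_cache


@cache
def square_cached(x: int) -> int:
    # Unbounded memoization via functools.cache.
    return x * x


@lru_cache(maxsize=None)
def square_lru(x: int) -> int:
    # Same behavior spelled with lru_cache.
    return x * x


assert square_cached(4) == square_lru(4) == 16
# Both expose the same cache statistics interface with no size bound.
assert square_cached.cache_info().maxsize is None
assert square_lru.cache_info().maxsize is None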

View File

@@ -247,8 +247,8 @@ class Scheduler:
             token_budget -= num_new_tokens
             request.status = RequestStatus.RUNNING
             request.num_computed_tokens = num_computed_tokens
-            has_partial_request = (num_computed_tokens + num_new_tokens <
-                                   request.num_tokens)
+            has_partial_request = (num_computed_tokens + num_new_tokens
+                                   < request.num_tokens)

             # Encoder-related.
             if encoder_inputs_to_schedule:

View File

@@ -311,8 +311,8 @@ class RequestStats:
             return []
         latency_s_lst = []
         for i in range(1, len(self.output_token_ts_s_lst)):
-            assert (self.output_token_ts_s_lst[i] >=
-                    self.output_token_ts_s_lst[i - 1])
+            assert (self.output_token_ts_s_lst[i]
+                    >= self.output_token_ts_s_lst[i - 1])
             latency_s = (self.output_token_ts_s_lst[i] -
                          self.output_token_ts_s_lst[i - 1])
             latency_s_lst.append(latency_s)

View File

@@ -205,7 +205,7 @@ class GPUModelRunner:
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove stopped requests from the cached states.
-        # Keep the states of the pre-empted requests.
+        # Keep the states of the preempted requests.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
             self.encoder_cache.pop(req_id, None)

View File

@@ -173,13 +173,13 @@ class HPUWorker(LocalOrDistributedWorkerBase):
                 cpu_fallback_ctx as cpu_fallback_local_metric:
             output = LocalOrDistributedWorkerBase.execute_model(
                 self, execute_model_req)
-            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
-                ) or log_graph_compilation_all:
+            if (log_graph_compilation and gc_local_metric.stats()[0][1]
+                    > 0) or log_graph_compilation_all:
                 msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
                        f"{gc_local_metric.stats()}, {input_stats}")
                 logger.warning(msg)
-            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
-                0) or log_cpu_fallbacks_all:
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1]
+                    > 0) or log_cpu_fallbacks_all:
                 msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
                        f"{cpu_fallback_local_metric.stats()}, {input_stats}")
                 logger.warning(msg)

View File

@@ -316,8 +316,8 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
                 logger.info("batch_size: %d, seq_len: %d", batch_size,
                             seq_len)
                 num_tokens = batch_size * seq_len
-                if (num_tokens >=
-                        self.scheduler_config.max_num_batched_tokens):
+                if (num_tokens
+                        >= self.scheduler_config.max_num_batched_tokens):
                     break
                 seq_len = seq_len * 2
             end = time.time()