# SPDX-License-Identifier: Apache-2.0
|
||
|
import os
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from vllm import LLM
|
||
|
|
||
|
# Skip the entire module unless the V1 engine is enabled: this test file
# exercises scheduling behavior available only in V1 (see skip message).
if os.getenv("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)

# Small base model kept cheap for end-to-end generation tests.
MODEL = "meta-llama/Llama-3.2-1B"
# Shared prompt submitted (in triplicate) by the concurrent prefill test.
PROMPT = "Hello my name is Robert and I"
|
||
|
|
||
|
|
||
|
@pytest.fixture(scope="module")
def model() -> LLM:
    """Module-scoped LLM configured with tiny batching limits.

    The very small ``max_num_batched_tokens`` / ``long_prefill_token_threshold``
    values force prompts to be prefilled in small chunks, so multiple requests
    end up in partial-prefill states concurrently.
    """
    engine_kwargs = {
        "enforce_eager": True,
        "enable_prefix_caching": True,
        "long_prefill_token_threshold": 2,
        "max_num_batched_tokens": 6,
        "max_num_seqs": 3,
    }
    return LLM(MODEL, **engine_kwargs)
|
||
|
|
||
|
|
||
|
def test_concurrent_partial_prefill(model):
    """Submitting three identical prompts at once yields one completion each.

    With the fixture's tiny batching limits, the three requests must share
    the batch across scheduling steps; every request should still finish
    with exactly one output sequence.
    """
    results = model.generate([PROMPT] * 3)
    assert len(results) == 3
    assert all(len(result.outputs) == 1 for result in results)
|