import time
import uuid

import pytest
from transformers import AutoTokenizer

from tests.utils import fork_new_process_for_each_test
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.executor.abstract import Executor

if not current_platform.is_cuda():
    pytest.skip(reason="V1 currently only supported on CUDA.",
                allow_module_level=True)

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "Hello my name is Robert and I love quantization kernels"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids


def make_request() -> EngineCoreRequest:
    return EngineCoreRequest(
        # NOTE: request_id is expected to be a string, so convert the UUID.
        request_id=str(uuid.uuid4()),
        prompt=PROMPT,
        prompt_token_ids=PROMPT_TOKENS,
        mm_inputs=None,
        mm_hashes=None,
        mm_placeholders=None,
        sampling_params=SamplingParams(),
        eos_token_id=None,
        arrival_time=time.time(),
        lora_request=None,
    )
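# Each call builds a fresh request with a unique id; the abort tests below
# rely on ids being distinct so abort_requests() targets exactly one request.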


@fork_new_process_for_each_test
def test_engine_core(monkeypatch):

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
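        # VLLM_USE_V1 opts this process into the V1 engine. Each test runs in
        # a forked child (fork_new_process_for_each_test), so the env change
        # and CUDA state do not leak between tests.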
        """Setup the EngineCore."""
        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config()
        executor_class = Executor.get_class(vllm_config)
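        # Executor.get_class selects the executor implementation that matches
        # vllm_config (e.g. single- vs. multi-process execution).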

        engine_core = EngineCore(vllm_config=vllm_config,
                                 executor_class=executor_class)
        """Test basic request lifecycle."""

        # First request.
        engine_core.add_request(make_request())
        assert len(engine_core.scheduler.waiting) == 1
        assert len(engine_core.scheduler.running) == 0

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 1
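        # A single step() schedules what fits from the waiting queue and runs
        # one engine iteration, moving the request from waiting to running.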

        # Second request.
        engine_core.add_request(make_request())
        assert len(engine_core.scheduler.waiting) == 1
        assert len(engine_core.scheduler.running) == 1

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 2

        # Add two requests in a row.
        engine_core.add_request(make_request())
        engine_core.add_request(make_request())
        assert len(engine_core.scheduler.waiting) == 2
        assert len(engine_core.scheduler.running) == 2

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 4

        # Loop through until they are all done.
        while len(engine_core.step()) > 0:
            pass

        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 0
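        # step() returns the outputs produced in that iteration; an empty
        # result means all requests have finished and left the scheduler.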

        """Test abort cycle."""

        # Basic abort.
        req = make_request()
        request_id = req.request_id

        engine_core.add_request(req)
        assert len(engine_core.scheduler.waiting) == 1
        assert len(engine_core.scheduler.running) == 0

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 1

        engine_core.abort_requests([request_id])
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 0
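        # Aborting a running request removes it from the scheduler immediately;
        # no extra step() is needed for the queues to drain.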

        # Add, step, abort 1 of the 3.
        req0 = make_request()
        req1 = make_request()
        req2 = make_request()

        engine_core.add_request(req0)
        engine_core.add_request(req1)
        assert len(engine_core.scheduler.waiting) == 2
        assert len(engine_core.scheduler.running) == 0

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 2

        engine_core.add_request(req2)
        assert len(engine_core.scheduler.waiting) == 1
        assert len(engine_core.scheduler.running) == 2

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 3

        # Abort just one.
        engine_core.abort_requests([req1.request_id])
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 2

        _ = engine_core.step()
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 2

        # Abort the other requests at the same time.
        engine_core.abort_requests([req2.request_id, req0.request_id])
        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 0
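        # abort_requests() takes a list of ids, so several in-flight requests
        # can be cancelled in one call.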


@fork_new_process_for_each_test
def test_engine_core_advanced_sampling(monkeypatch):
    """
    A basic end-to-end test to verify that the engine functions correctly
    when additional sampling parameters, such as min_tokens and
    presence_penalty, are set.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        """Setup the EngineCore."""
        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config()
        executor_class = Executor.get_class(vllm_config)

        engine_core = EngineCore(vllm_config=vllm_config,
                                 executor_class=executor_class)
        """Test basic request lifecycle."""
        # First request.
        request: EngineCoreRequest = make_request()
        request.sampling_params = SamplingParams(
            min_tokens=4,
            presence_penalty=1.0,
            frequency_penalty=1.0,
            repetition_penalty=0.1,
            stop_token_ids=[1001, 1002],
        )
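        # These settings exercise extra sampler state: min_tokens keeps EOS and
        # stop tokens from ending generation for the first 4 tokens, the
        # penalties require tracking previously sampled token ids, and the
        # stop ids exercise the early-termination path.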
        engine_core.add_request(request)
        assert len(engine_core.scheduler.waiting) == 1
        assert len(engine_core.scheduler.running) == 0
        # Loop through until they are all done.
        while len(engine_core.step()) > 0:
            pass

        assert len(engine_core.scheduler.waiting) == 0
        assert len(engine_core.scheduler.running) == 0