import asyncio
import os
from asyncio import CancelledError
from dataclasses import dataclass
from typing import Optional

import pytest
import pytest_asyncio
import torch

from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput

from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear
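

# Minimal stand-in for vllm.outputs.RequestOutput: only the fields the
# background-loop test below inspects are modeled.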
@dataclass
class RequestOutput:
    request_id: int
    finished: bool = False
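

# Mimics just enough of the internal LLMEngine surface (step_async,
# add_request, abort_request, ...) for AsyncLLMEngine's background loop
# to drive it without loading a model.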
class MockEngine:

    def __init__(self):
        self.step_calls = 0
        self.add_request_calls = 0
        self.abort_request_calls = 0
        self.request_id = None
        # Ugly, remove dependency when possible
        self.parallel_config = ParallelConfig(1, 1, False)

    async def step_async(self, virtual_engine):
        # PP size is 1, ignore virtual engine
        self.step_calls += 1
        return [RequestOutput(
            request_id=self.request_id)] if self.request_id else []
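
    # No-op async hooks; present only so the background loop can call them.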
    async def process_model_inputs_async(self, *args, **kwargs):
        pass

    async def stop_remote_worker_execution_loop_async(self):
        pass
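
    # generate()/stop_generating() toggle a fake in-flight request so the
    # test can control whether the engine reports unfinished work.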
    def generate(self, request_id):
        self.request_id = request_id

    def stop_generating(self):
        self.request_id = None

    def add_request(self, **kwargs):
        del kwargs  # Unused
        self.add_request_calls += 1
        print(f'Request calls: {self.add_request_calls}')

    async def add_request_async(self, **kwargs):
        self.add_request_calls += 1

    def abort_request(self, request_id):
        del request_id  # Unused
        self.abort_request_calls += 1

    def has_unfinished_requests(self):
        return self.request_id is not None

    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
        return self.request_id is not None
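

# Subclass that swaps the real engine for MockEngine.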
class MockAsyncLLMEngine(AsyncLLMEngine):

    def _init_engine(self, *args, **kwargs):
        return MockEngine()
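

# The background loop should step only while requests are unfinished and
# wake promptly when a new one arrives.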
@pytest.mark.asyncio
async def test_new_requests_event():
    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0
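
    # One new request with no in-flight work: exactly one step.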
    await engine.add_request("1", "", None)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 1
    assert engine.engine.step_calls == 1
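
    # Mark the request as in-flight; the loop should now keep stepping
    # on its own until stop_generating() clears it.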
    await engine.add_request("2", "", None)
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
    await asyncio.sleep(0)
    assert engine.engine.add_request_calls == 2
    assert engine.engine.step_calls >= 2
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls >= 3
    engine.engine.stop_generating()
    await asyncio.sleep(0.001)
    old_step_calls = engine.engine.step_calls
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls == old_step_calls
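
    # After the loop goes idle, a fresh request wakes it for exactly one
    # more step.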
    await engine.add_request("3", "", None)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1

    # Allow deprecated engine_use_ray to not raise exception
    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"

    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None

    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
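

# Builds a real engine; called via run_in_executor by the fixture below
# since from_engine_args() blocks while the model loads.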
def start_engine():
    wait_for_gpu_memory_to_clear(
        devices=list(range(torch.cuda.device_count())),
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )

    return AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
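

# Module-scoped: the engine is built once and shared by the tests below.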
@pytest_asyncio.fixture(scope="module")
async def async_engine():
    # Build the engine in the default thread pool so the blocking model
    # load doesn't stall the event loop; get_running_loop() is the
    # non-deprecated way to fetch the loop from inside a coroutine.
    engine = await asyncio.get_running_loop().run_in_executor(
        executor=None, func=start_engine)
    try:
        yield engine
    finally:
        engine.shutdown_background_loop()
        del engine
        await asyncio.sleep(0.1)
        cleanup()


@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    # So we can share the async engine fixture between these tests
    return False
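

# Two generate() streams driven concurrently should both complete.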
@pytest.mark.asyncio(scope="module")
async def test_asyncio_run(async_engine):

    async def run(prompt: str):
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=32,
        )

        # Initialize so the name is bound even if the stream yields nothing.
        final_output = None
        async for output in async_engine.generate(prompt,
                                                  sampling_params,
                                                  request_id=prompt):
            final_output = output
        return final_output

    results = await asyncio.gather(
        run("test0"),
        run("test1"),
    )
    assert len(results) == 2
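

# Aborting an in-flight request should surface as CancelledError in the
# consuming stream.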
@pytest.mark.asyncio(scope="module")
async def test_cancellation(async_engine):
    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
    )

    i = 0
    with pytest.raises(CancelledError):
        async for output in async_engine.generate("test2",
                                                  sampling_params,
                                                  request_id="test2"):
            assert not output.finished
            i += 1
            if i == 5:
                await async_engine.abort("test2")

    assert i == 5
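

# A slow consumer should still receive all ten outputs, with only the
# final one marked finished.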
@pytest.mark.asyncio(scope="module")
async def test_delayed_generator(async_engine):
    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
    )

    stream = async_engine.generate("test3",
                                   sampling_params,
                                   request_id="test3")
    i = 0
    final_output: Optional[RealRequestOutput] = None
    async for output in stream:
        final_output = output
        if i == 0:
            # wait for generation to complete before consuming
            # the remaining messages
            await asyncio.sleep(1)
        if i < 9:
            assert not output.finished
        i += 1

    assert i == 10
    assert final_output is not None
    assert len(final_output.outputs[0].token_ids) == 10
    assert final_output.finished