# vllm/tests/utils.py

import os
import subprocess
import sys
import time
import warnings
from contextlib import contextmanager

import ray
import requests

from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
from vllm.utils import get_open_port

# Path to root of repository so that utilities can be imported by ray workers
VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))


@ray.remote(num_gpus=1)
class ServerRunner:
    MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 600 seconds

    def __init__(self, args):
        env = os.environ.copy()
        env["PYTHONUNBUFFERED"] = "1"
        self.proc = subprocess.Popen(
            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        self._wait_for_server()

    def ready(self):
        return True

    def _wait_for_server(self):
        # Poll the health endpoint until the server answers, the subprocess
        # exits, or the startup timeout elapses.
        start = time.time()
        while True:
            try:
                if requests.get(
                        "http://localhost:8000/health").status_code == 200:
                    break
            except Exception as err:
                if self.proc.poll() is not None:
                    raise RuntimeError("Server exited unexpectedly.") from err

                time.sleep(0.5)
                if time.time() - start > self.MAX_SERVER_START_WAIT_S:
                    raise RuntimeError(
                        "Server failed to start in time.") from err

    def __del__(self):
        if hasattr(self, "proc"):
            self.proc.terminate()
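

# Example usage of ``ServerRunner`` (a minimal sketch, not part of the
# original file; the model name below is a hypothetical placeholder).
# ``ServerRunner.remote(...)`` constructs the actor, whose ``__init__``
# blocks until the health check passes:
#
#     server = ServerRunner.remote(["--model", "facebook/opt-125m"])
#     ray.get(server.ready.remote())
#     # ... issue requests against http://localhost:8000 ...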


def init_test_distributed_environment(
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
    local_rank: int = -1,
) -> None:
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    init_distributed_environment(
        world_size=pp_size * tp_size,
        rank=rank,
        distributed_init_method=distributed_init_method,
        local_rank=local_rank)
    ensure_model_parallel_initialized(tp_size, pp_size)


def multi_process_tensor_parallel(
    tp_size: int,
    pp_size: int,
    test_target,
) -> None:
    # Using ray (rather than multiprocessing) makes failures inside the
    # workers much easier to debug.
    ray.init(runtime_env={"working_dir": VLLM_PATH})

    distributed_init_port = get_open_port()
    refs = []
    for rank in range(tp_size * pp_size):
        refs.append(
            test_target.remote(tp_size, pp_size, rank, distributed_init_port))
    ray.get(refs)

    ray.shutdown()
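

# A minimal sketch of a ``test_target`` compatible with
# ``multi_process_tensor_parallel`` (assumed usage; the worker body is
# hypothetical): each worker claims one GPU, joins the distributed
# environment via ``init_test_distributed_environment``, and then runs its
# per-rank assertions.
#
#     @ray.remote(num_gpus=1, max_calls=1)
#     def all_reduce_worker(tp_size, pp_size, rank, distributed_init_port):
#         init_test_distributed_environment(tp_size, pp_size, rank,
#                                           distributed_init_port)
#         # ... collective-communication checks for this rank ...
#
#     multi_process_tensor_parallel(tp_size=2, pp_size=1,
#                                   test_target=all_reduce_worker)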


@contextmanager
def error_on_warning():
    """
    Within the scope of this context manager, tests will fail if any warning
    is emitted.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        yield
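

# Example (a sketch): ``simplefilter("error")`` turns any warning emitted
# inside the block into an exception, so the enclosing test fails.
#
#     with error_on_warning():
#         do_something_that_must_not_warn()  # hypothetical call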