[Misc] Deprecation Warning when setting --engine-use-ray (#7424)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: youkaichao <youkaichao@126.com>
This commit is contained in:
Wallas Henrique 2024-08-14 13:44:27 -03:00 committed by GitHub
parent 67d115db08
commit 70b746efcf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 56 additions and 3 deletions

View File

@ -1,3 +1,4 @@
import os
import subprocess import subprocess
import sys import sys
import time import time
@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
"127.0.0.1", "--tokenizer-pool-size", "127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size) str(tokenizer_pool_size)
] ]
# Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
# to prevent `--engine-use-ray` from raising an exception due to its deprecation
env_vars = os.environ.copy()
env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
if engine_use_ray: if engine_use_ray:
commands.append("--engine-use-ray") commands.append("--engine-use-ray")
if worker_use_ray: if worker_use_ray:
commands.append("--worker-use-ray") commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands) uvicorn_process = subprocess.Popen(commands, env=env_vars)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()

View File

@ -1,4 +1,5 @@
import asyncio import asyncio
import os
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
@ -106,11 +107,16 @@ async def test_new_requests_event():
assert engine.engine.add_request_calls == 3 assert engine.engine.add_request_calls == 3
assert engine.engine.step_calls == old_step_calls + 1 assert engine.engine.step_calls == old_step_calls + 1
# Allow the deprecated engine_use_ray option without raising an exception
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
assert engine.get_model_config() is not None assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None assert engine.get_decoding_config() is not None
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
def test_asyncio_run(): def test_asyncio_run():
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(

View File

@ -23,7 +23,11 @@ def server():
str(chatml_jinja_path), str(chatml_jinja_path),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: # Allow `--engine-use-ray`, otherwise the launch of the server throws
# an error due to trying to use a deprecated feature
env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
with RemoteOpenAIServer(MODEL_NAME, args,
env_dict=env_dict) as remote_server:
yield remote_server yield remote_server

View File

@ -1,4 +1,5 @@
import asyncio import asyncio
import os
from itertools import cycle from itertools import cycle
from typing import Dict, List, Optional, Sequence, Tuple, Union from typing import Dict, List, Optional, Sequence, Tuple, Union
@ -56,6 +57,11 @@ class AsyncLLM:
) -> None: ) -> None:
if "disable_log_stats" not in kwargs: if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True kwargs["disable_log_stats"] = True
# Needed so that engine_use_ray works as a deprecated feature,
# otherwise the following constructor will raise an exception
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, model=model,
tokenizer=tokenizer, tokenizer=tokenizer,

View File

@ -923,7 +923,13 @@ class AsyncEngineArgs(EngineArgs):
parser.add_argument('--engine-use-ray', parser.add_argument('--engine-use-ray',
action='store_true', action='store_true',
help='Use Ray to start the LLM engine in a ' help='Use Ray to start the LLM engine in a '
'separate process as the server process.') 'separate process as the server process.'
'(DEPRECATED. This argument is deprecated '
'and will be removed in a future update. '
'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
'use it. See '
'https://github.com/vllm-project/vllm/issues/7045.'
')')
parser.add_argument('--disable-log-requests', parser.add_argument('--disable-log-requests',
action='store_true', action='store_true',
help='Disable logging requests.') help='Disable logging requests.')

View File

@ -29,6 +29,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import print_warning_once
logger = init_logger(__name__) logger = init_logger(__name__)
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@ -510,6 +511,20 @@ class AsyncLLMEngine:
self.log_requests = log_requests self.log_requests = log_requests
self.engine = self._init_engine(*args, **kwargs) self.engine = self._init_engine(*args, **kwargs)
if self.engine_use_ray:
print_warning_once(
"DEPRECATED. `--engine-use-ray` is deprecated and will "
"be removed in a future update. "
"See https://github.com/vllm-project/vllm/issues/7045.")
if envs.VLLM_ALLOW_ENGINE_USE_RAY:
print_warning_once(
"VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
else:
raise ValueError("`--engine-use-ray` is deprecated. "
"Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
"force use it")
self.background_loop: Optional[asyncio.Future] = None self.background_loop: Optional[asyncio.Future] = None
# We need to keep a reference to unshielded # We need to keep a reference to unshielded
# task as well to prevent it from being garbage # task as well to prevent it from being garbage

View File

@ -55,6 +55,7 @@ if TYPE_CHECKING:
VERBOSE: bool = False VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_TEST_FORCE_FP8_MARLIN: bool = False VLLM_TEST_FORCE_FP8_MARLIN: bool = False
VLLM_ALLOW_ENGINE_USE_RAY: bool = False
VLLM_PLUGINS: Optional[List[str]] = None VLLM_PLUGINS: Optional[List[str]] = None
@ -364,6 +365,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
(os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
("1", "true")), ("1", "true")),
# If set, allow running the engine as a separate ray actor,
# which is a deprecated feature soon to be removed.
# See https://github.com/vllm-project/vllm/issues/7045
"VLLM_ALLOW_ENGINE_USE_RAY":
lambda:
(os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
("1", "true")),
# a list of plugin names to load, separated by commas. # a list of plugin names to load, separated by commas.
# if this is not set, it means all plugins will be loaded # if this is not set, it means all plugins will be loaded
# if this is set to an empty string, no plugins will be loaded # if this is set to an empty string, no plugins will be loaded