[Bugfix] set VLLM_WORKER_MULTIPROC_METHOD=spawn for vllm.entrypoints.openai.api_server (#15700)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
parent 8427f70493
commit 5b800f0932
vllm/entrypoints/cli/main.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # The CLI entrypoint to vLLM.
-import os
 import signal
 import sys
 
@@ -9,11 +8,9 @@ import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.openai
 import vllm.entrypoints.cli.serve
 import vllm.version
-from vllm.logger import init_logger
+from vllm.entrypoints.utils import cli_env_setup
 from vllm.utils import FlexibleArgumentParser
 
-logger = init_logger(__name__)
-
 CMD_MODULES = [
     vllm.entrypoints.cli.openai,
     vllm.entrypoints.cli.serve,
@@ -30,29 +27,8 @@ def register_signal_handlers():
     signal.signal(signal.SIGTSTP, signal_handler)
 
 
-def env_setup():
-    # The safest multiprocessing method is `spawn`, as the default `fork` method
-    # is not compatible with some accelerators. The default method will be
-    # changing in future versions of Python, so we should use it explicitly when
-    # possible.
-    #
-    # We only set it here in the CLI entrypoint, because changing to `spawn`
-    # could break some existing code using vLLM as a library. `spawn` will cause
-    # unexpected behavior if the code is not protected by
-    # `if __name__ == "__main__":`.
-    #
-    # References:
-    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
-    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
-    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
-    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
-        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-
 def main():
-    env_setup()
+    cli_env_setup()
 
     parser = FlexibleArgumentParser(description="vLLM CLI")
     parser.add_argument('-v',
vllm/entrypoints/openai/api_server.py
@@ -82,7 +82,8 @@ from vllm.entrypoints.openai.serving_tokenization import (
 from vllm.entrypoints.openai.serving_transcription import (
     OpenAIServingTranscription)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
-from vllm.entrypoints.utils import load_aware_call, with_cancellation
+from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
+                                    with_cancellation)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.transformers_utils.config import (
@@ -1106,6 +1107,7 @@ if __name__ == "__main__":
     # NOTE(simon):
     # This section should be in sync with vllm/entrypoints/cli/main.py for CLI
     # entrypoints.
+    cli_env_setup()
     parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)
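Note: since cli_env_setup() only sets VLLM_WORKER_MULTIPROC_METHOD when it is not already present, a value exported before launching the server still takes precedence over the new "spawn" default. A minimal standalone sketch of that precedence (illustrative only, not part of this commit; the "fork" value is just an example):

import os

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "fork"  # simulate a value exported by the operator

# Same guard as cli_env_setup(): only fill in a default when nothing is configured.
if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

print(os.environ["VLLM_WORKER_MULTIPROC_METHOD"])  # -> "fork": the pre-set value is kept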
vllm/entrypoints/utils.py
@@ -2,11 +2,16 @@
 
 import asyncio
 import functools
+import os
 
 from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks
 
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 
 async def listen_for_disconnect(request: Request) -> None:
     """Returns if a disconnect message is received"""
@@ -108,3 +113,24 @@ def load_aware_call(func):
         return response
 
     return wrapper
+
+
+def cli_env_setup():
+    # The safest multiprocessing method is `spawn`, as the default `fork` method
+    # is not compatible with some accelerators. The default method will be
+    # changing in future versions of Python, so we should use it explicitly when
+    # possible.
+    #
+    # We only set it here in the CLI entrypoint, because changing to `spawn`
+    # could break some existing code using vLLM as a library. `spawn` will cause
+    # unexpected behavior if the code is not protected by
+    # `if __name__ == "__main__":`.
+    #
+    # References:
+    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
+    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
+    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
+    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
+        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
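The comment block above is the rationale for limiting the change to CLI entrypoints: under the spawn start method each worker re-imports the launching module, so any top-level code that is not guarded by if __name__ == "__main__": runs again in every child process. A minimal standalone sketch of that requirement (illustrative only, not part of this commit):

import multiprocessing as mp

def square(x: int) -> int:
    return x * x

if __name__ == "__main__":
    # Without this guard, each spawned child would re-execute the module's
    # top-level code when re-importing it, which is the breakage the comment
    # warns about for library users.
    ctx = mp.get_context("spawn")
    with ctx.Pool(2) as pool:
        print(pool.map(square, [1, 2, 3]))  # -> [1, 4, 9]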