[Bugfix] set VLLM_WORKER_MULTIPROC_METHOD=spawn for vllm.entrypoints.openai.api_server (#15700)

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Jinzhen Lin 2025-03-29 12:12:26 +08:00 committed by GitHub
parent 8427f70493
commit 5b800f0932
3 changed files with 31 additions and 27 deletions
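
The new `spawn` default takes effect only when vLLM is started through its entrypoints (the `vllm` CLI and `python -m vllm.entrypoints.openai.api_server`); as the comment in the `cli_env_setup()` helper added below explains, code that embeds vLLM as a library is deliberately left alone. A minimal sketch, not part of this commit, of a library user opting in to `spawn` themselves (the model name is only an example):

import os

# Set the worker start method before the engine is created. `spawn` requires
# the script itself to be protected by `if __name__ == "__main__":`.
os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

if __name__ == "__main__":
    from vllm import LLM  # imported after the env var is set

    llm = LLM(model="facebook/opt-125m")  # example model, adjust as needed
    outputs = llm.generate(["Hello, my name is"])
    print(outputs[0].outputs[0].text)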

vllm/entrypoints/cli/main.py

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # The CLI entrypoint to vLLM.
-import os
 import signal
 import sys
@@ -9,11 +8,9 @@ import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.openai
 import vllm.entrypoints.cli.serve
 import vllm.version
-from vllm.logger import init_logger
+from vllm.entrypoints.utils import cli_env_setup
 from vllm.utils import FlexibleArgumentParser
-logger = init_logger(__name__)
 CMD_MODULES = [
     vllm.entrypoints.cli.openai,
     vllm.entrypoints.cli.serve,
@@ -30,29 +27,8 @@ def register_signal_handlers():
     signal.signal(signal.SIGTSTP, signal_handler)

-def env_setup():
-    # The safest multiprocessing method is `spawn`, as the default `fork` method
-    # is not compatible with some accelerators. The default method will be
-    # changing in future versions of Python, so we should use it explicitly when
-    # possible.
-    #
-    # We only set it here in the CLI entrypoint, because changing to `spawn`
-    # could break some existing code using vLLM as a library. `spawn` will cause
-    # unexpected behavior if the code is not protected by
-    # `if __name__ == "__main__":`.
-    #
-    # References:
-    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
-    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
-    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
-    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
-        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

 def main():
-    env_setup()
+    cli_env_setup()
     parser = FlexibleArgumentParser(description="vLLM CLI")
     parser.add_argument('-v',

vllm/entrypoints/openai/api_server.py

@@ -82,7 +82,8 @@ from vllm.entrypoints.openai.serving_tokenization import (
 from vllm.entrypoints.openai.serving_transcription import (
     OpenAIServingTranscription)
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
-from vllm.entrypoints.utils import load_aware_call, with_cancellation
+from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
+                                    with_cancellation)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.transformers_utils.config import (
@@ -1106,6 +1107,7 @@ if __name__ == "__main__":
     # NOTE(simon):
     # This section should be in sync with vllm/entrypoints/cli/main.py for CLI
     # entrypoints.
+    cli_env_setup()
     parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)

vllm/entrypoints/utils.py

@@ -2,11 +2,16 @@
 import asyncio
 import functools
+import os

 from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.background import BackgroundTask, BackgroundTasks

+from vllm.logger import init_logger
+
+logger = init_logger(__name__)

 async def listen_for_disconnect(request: Request) -> None:
     """Returns if a disconnect message is received"""
@@ -108,3 +113,24 @@ def load_aware_call(func):
         return response

     return wrapper
+
+
+def cli_env_setup():
+    # The safest multiprocessing method is `spawn`, as the default `fork` method
+    # is not compatible with some accelerators. The default method will be
+    # changing in future versions of Python, so we should use it explicitly when
+    # possible.
+    #
+    # We only set it here in the CLI entrypoint, because changing to `spawn`
+    # could break some existing code using vLLM as a library. `spawn` will cause
+    # unexpected behavior if the code is not protected by
+    # `if __name__ == "__main__":`.
+    #
+    # References:
+    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
+    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
+    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
+    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
+        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
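
As a usage note on the guard requirement called out in the comment above: under the `spawn` start method each child process re-imports the parent module, so any unguarded top-level code runs again in every worker. A minimal, generic Python illustration (not vLLM code):

import multiprocessing as mp


def square(x: int) -> int:
    return x * x


if __name__ == "__main__":
    # Children started with "spawn" re-import this module; without the guard
    # above, the pool-creating code would re-run in every child and fail.
    ctx = mp.get_context("spawn")
    with ctx.Pool(processes=2) as pool:
        print(pool.map(square, range(4)))  # prints [0, 1, 4, 9]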