From 344bf7cd2d66a8b13f216f61c7a6d5d70576a498 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 3 May 2024 15:55:56 -0700 Subject: [PATCH] [Misc] add installation time env vars (#4574) --- setup.py | 33 ++++++++++++++++++-------- vllm/envs.py | 66 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 81 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 801d8d50..3768daf9 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import importlib.util import io import logging import os @@ -13,10 +14,23 @@ from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext from torch.utils.cpp_extension import CUDA_HOME + +def load_module_from_path(module_name, path): + spec = importlib.util.spec_from_file_location(module_name, path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + ROOT_DIR = os.path.dirname(__file__) logger = logging.getLogger(__name__) -# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu] -VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda") + +# cannot import envs directly because it depends on vllm, +# which is not installed yet +envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) + +VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE # vLLM only supports Linux platform assert sys.platform.startswith( @@ -60,7 +74,7 @@ class cmake_build_ext(build_ext): def compute_num_jobs(self): # `num_jobs` is either the value of the MAX_JOBS environment variable # (if defined) or the number of CPUs available. - num_jobs = os.environ.get("MAX_JOBS", None) + num_jobs = envs.MAX_JOBS if num_jobs is not None: num_jobs = int(num_jobs) logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) @@ -78,7 +92,7 @@ class cmake_build_ext(build_ext): # environment variable (if defined) or 1. # when it is set, we reduce `num_jobs` to avoid # overloading the system. - nvcc_threads = os.getenv("NVCC_THREADS", None) + nvcc_threads = envs.NVCC_THREADS if nvcc_threads is not None: nvcc_threads = int(nvcc_threads) logger.info( @@ -104,7 +118,7 @@ class cmake_build_ext(build_ext): # Select the build type. # Note: optimization level + debug info are set by the build type default_cfg = "Debug" if self.debug else "RelWithDebInfo" - cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg) + cfg = envs.CMAKE_BUILD_TYPE or default_cfg # where .so files will be written, should be the same for all extensions # that use the same CMakeLists.txt. 
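The setup.py hunks above route every build-time knob through the freshly loaded envs module. For reference, a minimal sketch (not vLLM's exact code) of how MAX_JOBS and NVCC_THREADS are meant to interact per the comments in those hunks; the integer-division step is an assumption based on the "reduce num_jobs to avoid overloading the system" comment:

import os

def compute_num_jobs():
    # MAX_JOBS wins if set; otherwise fall back to the CPU count
    num_jobs = os.environ.get("MAX_JOBS")
    num_jobs = int(num_jobs) if num_jobs else os.cpu_count()
    # each compile job may spawn NVCC_THREADS nvcc threads, so shrink
    # the parallel job count to keep the total thread count near the
    # number of CPUs (exact formula assumed for illustration)
    nvcc_threads = os.environ.get("NVCC_THREADS")
    if nvcc_threads:
        num_jobs = max(1, num_jobs // int(nvcc_threads))
    return num_jobs
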
@@ -118,7 +132,7 @@ class cmake_build_ext(build_ext): '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), ] - verbose = bool(int(os.getenv('VERBOSE', '0'))) + verbose = envs.VERBOSE if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] @@ -205,8 +219,7 @@ def _is_neuron() -> bool: subprocess.run(["neuron-ls"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): torch_neuronx_installed = False - return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON", - False) + return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON def _is_cpu() -> bool: @@ -214,7 +227,7 @@ def _is_cpu() -> bool: def _install_punica() -> bool: - return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) + return envs.VLLM_INSTALL_PUNICA_KERNELS def get_hipcc_rocm_version(): @@ -377,7 +390,7 @@ if not _is_neuron(): package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } -if os.environ.get("VLLM_USE_PRECOMPILED"): +if envs.VLLM_USE_PRECOMPILED: ext_modules = [] package_data["vllm"].append("*.so") diff --git a/vllm/envs.py b/vllm/envs.py index 2dbb57e6..91cc8f3b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -27,6 +27,14 @@ if TYPE_CHECKING: VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" + VLLM_TARGET_DEVICE: str = "cuda" + MAX_JOBS: Optional[str] = None + NVCC_THREADS: Optional[str] = None + VLLM_BUILD_WITH_NEURON: bool = False + VLLM_USE_PRECOMPILED: bool = False + VLLM_INSTALL_PUNICA_KERNELS: bool = False + CMAKE_BUILD_TYPE: Optional[str] = None + VERBOSE: bool = False # The begin-* and end* here are used by the documentation generator # to extract the used env vars. @@ -34,6 +42,56 @@ if TYPE_CHECKING: # begin-env-vars-definition environment_variables: Dict[str, Callable[[], Any]] = { + + # ================== Installation Time Env Vars ================== + + # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu] + "VLLM_TARGET_DEVICE": + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), + + # Maximum number of compilation jobs to run in parallel. + # By default this is the number of CPUs + "MAX_JOBS": + lambda: os.getenv("MAX_JOBS", None), + + # Number of threads to use for nvcc + # By default this is 1. + # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. + "NVCC_THREADS": + lambda: os.getenv("NVCC_THREADS", None), + + # If set, vllm will build with Neuron support + "VLLM_BUILD_WITH_NEURON": + lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)), + + # If set, vllm will use precompiled binaries (*.so) + "VLLM_USE_PRECOMPILED": + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")), + + # If set, vllm will install Punica kernels + "VLLM_INSTALL_PUNICA_KERNELS": + lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))), + + # CMake build type + # If not set, defaults to "Debug" or "RelWithDebInfo" + # Available options: "Debug", "Release", "RelWithDebInfo" + "CMAKE_BUILD_TYPE": + lambda: os.getenv("CMAKE_BUILD_TYPE"), + + # If set, vllm will print verbose logs during installation + "VERBOSE": + lambda: bool(int(os.getenv('VERBOSE', '0'))), + + # Root directory for VLLM configuration files + # Note that this not only affects how vllm finds its configuration files + # during runtime, but also affects how vllm installs its configuration + # files during **installation**. 
+ "VLLM_CONFIG_ROOT": + lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv( + "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"), + + # ================== Runtime Env Vars ================== + # used in distributed environment to determine the master address 'VLLM_HOST_IP': lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""), @@ -93,14 +151,6 @@ environment_variables: Dict[str, Callable[[], Any]] = { "S3_ENDPOINT_URL": lambda: os.environ.get("S3_ENDPOINT_URL", None), - # Root directory for VLLM configuration files - # Note that this not only affects how vllm finds its configuration files - # during runtime, but also affects how vllm installs its configuration - # files during **installation**. - "VLLM_CONFIG_ROOT": - lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv( - "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"), - # Usage stats collection "VLLM_USAGE_STATS_SERVER": lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),