[Misc] add installation time env vars (#4574)
This commit is contained in:
parent
ab50275111
commit
344bf7cd2d
33
setup.py
33
setup.py
@ -1,3 +1,4 @@
|
||||
import importlib.util
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
@ -13,10 +14,23 @@ from setuptools import Extension, find_packages, setup
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from torch.utils.cpp_extension import CUDA_HOME
|
||||
|
||||
|
||||
def load_module_from_path(module_name, path):
    """Load the Python source file at ``path`` as a module named ``module_name``.

    The module object is registered in ``sys.modules`` under ``module_name``
    before its code is executed, then executed and returned.

    Args:
        module_name: Name to give the module (also its ``sys.modules`` key).
        path: Filesystem path of the source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for ``path``
            (for example, the file suffix is not a recognised module type).
        Exception: Whatever the module's own top-level code raises; in that
            case the partially initialised module is removed from
            ``sys.modules`` again before the exception propagates.
    """
    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module {module_name!r} from {path!r}",
            name=module_name,
            path=str(path),
        )

    module = importlib.util.module_from_spec(spec)
    # Register before executing so code inside the module can find itself
    # via sys.modules while it is being run.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later imports.
        if sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        raise
    return module
|
||||
|
||||
|
||||
ROOT_DIR = os.path.dirname(__file__)
|
||||
logger = logging.getLogger(__name__)
|
||||
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
|
||||
VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda")
|
||||
|
||||
# cannot import envs directly because it depends on vllm,
|
||||
# which is not installed yet
|
||||
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
|
||||
|
||||
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
|
||||
|
||||
# vLLM only supports Linux platform
|
||||
assert sys.platform.startswith(
|
||||
@ -60,7 +74,7 @@ class cmake_build_ext(build_ext):
|
||||
def compute_num_jobs(self):
|
||||
# `num_jobs` is either the value of the MAX_JOBS environment variable
|
||||
# (if defined) or the number of CPUs available.
|
||||
num_jobs = os.environ.get("MAX_JOBS", None)
|
||||
num_jobs = envs.MAX_JOBS
|
||||
if num_jobs is not None:
|
||||
num_jobs = int(num_jobs)
|
||||
logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
|
||||
@ -78,7 +92,7 @@ class cmake_build_ext(build_ext):
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = os.getenv("NVCC_THREADS", None)
|
||||
nvcc_threads = envs.NVCC_THREADS
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(
|
||||
@ -104,7 +118,7 @@ class cmake_build_ext(build_ext):
|
||||
# Select the build type.
|
||||
# Note: optimization level + debug info are set by the build type
|
||||
default_cfg = "Debug" if self.debug else "RelWithDebInfo"
|
||||
cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
|
||||
cfg = envs.CMAKE_BUILD_TYPE or default_cfg
|
||||
|
||||
# where .so files will be written, should be the same for all extensions
|
||||
# that use the same CMakeLists.txt.
|
||||
@ -118,7 +132,7 @@ class cmake_build_ext(build_ext):
|
||||
'-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
|
||||
]
|
||||
|
||||
verbose = bool(int(os.getenv('VERBOSE', '0')))
|
||||
verbose = envs.VERBOSE
|
||||
if verbose:
|
||||
cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
|
||||
|
||||
@ -205,8 +219,7 @@ def _is_neuron() -> bool:
|
||||
subprocess.run(["neuron-ls"], capture_output=True, check=True)
|
||||
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
|
||||
torch_neuronx_installed = False
|
||||
return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
|
||||
False)
|
||||
return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON
|
||||
|
||||
|
||||
def _is_cpu() -> bool:
|
||||
@ -214,7 +227,7 @@ def _is_cpu() -> bool:
|
||||
|
||||
|
||||
def _install_punica() -> bool:
|
||||
return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
|
||||
return envs.VLLM_INSTALL_PUNICA_KERNELS
|
||||
|
||||
|
||||
def get_hipcc_rocm_version():
|
||||
@ -377,7 +390,7 @@ if not _is_neuron():
|
||||
package_data = {
|
||||
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
|
||||
}
|
||||
if os.environ.get("VLLM_USE_PRECOMPILED"):
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
ext_modules = []
|
||||
package_data["vllm"].append("*.so")
|
||||
|
||||
|
66
vllm/envs.py
66
vllm/envs.py
@ -27,6 +27,14 @@ if TYPE_CHECKING:
|
||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||
VLLM_USE_RAY_COMPILED_DAG: bool = False
|
||||
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
|
||||
VLLM_TARGET_DEVICE: str = "cuda"
|
||||
MAX_JOBS: Optional[str] = None
|
||||
NVCC_THREADS: Optional[str] = None
|
||||
VLLM_BUILD_WITH_NEURON: bool = False
|
||||
VLLM_USE_PRECOMPILED: bool = False
|
||||
VLLM_INSTALL_PUNICA_KERNELS: bool = False
|
||||
CMAKE_BUILD_TYPE: Optional[str] = None
|
||||
VERBOSE: bool = False
|
||||
|
||||
# The begin-* and end-* markers here are used by the documentation generator
|
||||
# to extract the used env vars.
|
||||
@ -34,6 +42,56 @@ if TYPE_CHECKING:
|
||||
# begin-env-vars-definition
|
||||
|
||||
environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
|
||||
# ================== Installation Time Env Vars ==================
|
||||
|
||||
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
|
||||
"VLLM_TARGET_DEVICE":
|
||||
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
|
||||
|
||||
# Maximum number of compilation jobs to run in parallel.
|
||||
# By default this is the number of CPUs
|
||||
"MAX_JOBS":
|
||||
lambda: os.getenv("MAX_JOBS", None),
|
||||
|
||||
# Number of threads to use for nvcc
|
||||
# By default this is 1.
|
||||
# If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
|
||||
"NVCC_THREADS":
|
||||
lambda: os.getenv("NVCC_THREADS", None),
|
||||
|
||||
# If set, vllm will build with Neuron support
|
||||
"VLLM_BUILD_WITH_NEURON":
|
||||
lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),
|
||||
|
||||
# If set, vllm will use precompiled binaries (*.so)
|
||||
"VLLM_USE_PRECOMPILED":
|
||||
lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),
|
||||
|
||||
# If set, vllm will install Punica kernels
|
||||
"VLLM_INSTALL_PUNICA_KERNELS":
|
||||
lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),
|
||||
|
||||
# CMake build type
|
||||
# If not set, defaults to "Debug" or "RelWithDebInfo"
|
||||
# Available options: "Debug", "Release", "RelWithDebInfo"
|
||||
"CMAKE_BUILD_TYPE":
|
||||
lambda: os.getenv("CMAKE_BUILD_TYPE"),
|
||||
|
||||
# If set, vllm will print verbose logs during installation
|
||||
"VERBOSE":
|
||||
lambda: bool(int(os.getenv('VERBOSE', '0'))),
|
||||
|
||||
# Root directory for VLLM configuration files
|
||||
# Note that this not only affects how vllm finds its configuration files
|
||||
# during runtime, but also affects how vllm installs its configuration
|
||||
# files during **installation**.
|
||||
"VLLM_CONFIG_ROOT":
|
||||
lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
|
||||
"XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
|
||||
|
||||
# ================== Runtime Env Vars ==================
|
||||
|
||||
# used in distributed environment to determine the master address
|
||||
'VLLM_HOST_IP':
|
||||
lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
|
||||
@ -93,14 +151,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"S3_ENDPOINT_URL":
|
||||
lambda: os.environ.get("S3_ENDPOINT_URL", None),
|
||||
|
||||
# Root directory for VLLM configuration files
|
||||
# Note that this not only affects how vllm finds its configuration files
|
||||
# during runtime, but also affects how vllm installs its configuration
|
||||
# files during **installation**.
|
||||
"VLLM_CONFIG_ROOT":
|
||||
lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
|
||||
"XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
|
||||
|
||||
# Usage stats collection
|
||||
"VLLM_USAGE_STATS_SERVER":
|
||||
lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
|
||||
|
Loading…
x
Reference in New Issue
Block a user