[Misc] add installation time env vars (#4574)

parent ab50275111
commit 344bf7cd2d

setup.py (33 lines changed)
@@ -1,3 +1,4 @@
+import importlib.util
 import io
 import logging
 import os
@@ -13,10 +14,23 @@ from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME
 
+
+def load_module_from_path(module_name, path):
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
-# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
-VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda")
+
+# cannot import envs directly because it depends on vllm,
+# which is not installed yet
+envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
+
+VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
 # vLLM only supports Linux platform
 assert sys.platform.startswith(
@@ -60,7 +74,7 @@ class cmake_build_ext(build_ext):
     def compute_num_jobs(self):
         # `num_jobs` is either the value of the MAX_JOBS environment variable
         # (if defined) or the number of CPUs available.
-        num_jobs = os.environ.get("MAX_JOBS", None)
+        num_jobs = envs.MAX_JOBS
         if num_jobs is not None:
             num_jobs = int(num_jobs)
             logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
@@ -78,7 +92,7 @@ class cmake_build_ext(build_ext):
             # environment variable (if defined) or 1.
             # when it is set, we reduce `num_jobs` to avoid
             # overloading the system.
-            nvcc_threads = os.getenv("NVCC_THREADS", None)
+            nvcc_threads = envs.NVCC_THREADS
             if nvcc_threads is not None:
                 nvcc_threads = int(nvcc_threads)
                 logger.info(
@@ -104,7 +118,7 @@ class cmake_build_ext(build_ext):
         # Select the build type.
         # Note: optimization level + debug info are set by the build type
         default_cfg = "Debug" if self.debug else "RelWithDebInfo"
-        cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
+        cfg = envs.CMAKE_BUILD_TYPE or default_cfg
 
         # where .so files will be written, should be the same for all extensions
         # that use the same CMakeLists.txt.
@@ -118,7 +132,7 @@ class cmake_build_ext(build_ext):
             '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
         ]
 
-        verbose = bool(int(os.getenv('VERBOSE', '0')))
+        verbose = envs.VERBOSE
         if verbose:
             cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
 
@@ -205,8 +219,7 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
-                                                      False)
+    return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON
 
 
 def _is_cpu() -> bool:
@@ -214,7 +227,7 @@ def _is_cpu() -> bool:
 
 
 def _install_punica() -> bool:
-    return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
+    return envs.VLLM_INSTALL_PUNICA_KERNELS
 
 
 def get_hipcc_rocm_version():
@@ -377,7 +390,7 @@ if not _is_neuron():
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
-if os.environ.get("VLLM_USE_PRECOMPILED"):
+if envs.VLLM_USE_PRECOMPILED:
     ext_modules = []
     package_data["vllm"].append("*.so")
 
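For readers unfamiliar with the pattern above: load_module_from_path lets setup.py execute vllm/envs.py straight from its file path, so the installation-time defaults live in one place even though the vllm package is not importable yet. Below is a minimal, self-contained sketch of the same pattern; the path layout is an illustrative assumption, and the final print relies on lookup machinery inside vllm/envs.py that this diff does not show.

# Sketch of the load_module_from_path pattern used in setup.py:
# load a single .py file as a module without installing its package.
import importlib.util
import os
import sys


def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module   # register before exec so re-imports resolve
    spec.loader.exec_module(module)     # runs the file's top-level code
    return module


if __name__ == "__main__":
    # Hypothetical layout mirroring the setup.py usage; adjust the path to
    # wherever vllm/envs.py lives in your checkout.
    root = os.path.dirname(__file__)
    envs = load_module_from_path("envs", os.path.join(root, "vllm", "envs.py"))
    # Expected to print "cuda" unless VLLM_TARGET_DEVICE is set, assuming
    # envs.py resolves attribute access through its environment_variables table.
    print(envs.VLLM_TARGET_DEVICE)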
vllm/envs.py (66 lines changed)
@@ -27,6 +27,14 @@ if TYPE_CHECKING:
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_TARGET_DEVICE: str = "cuda"
+    MAX_JOBS: Optional[str] = None
+    NVCC_THREADS: Optional[str] = None
+    VLLM_BUILD_WITH_NEURON: bool = False
+    VLLM_USE_PRECOMPILED: bool = False
+    VLLM_INSTALL_PUNICA_KERNELS: bool = False
+    CMAKE_BUILD_TYPE: Optional[str] = None
+    VERBOSE: bool = False
 
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
@@ -34,6 +42,56 @@ if TYPE_CHECKING:
 # begin-env-vars-definition
 
 environment_variables: Dict[str, Callable[[], Any]] = {
+
+    # ================== Installation Time Env Vars ==================
+
+    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+    "VLLM_TARGET_DEVICE":
+    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
+
+    # Maximum number of compilation jobs to run in parallel.
+    # By default this is the number of CPUs
+    "MAX_JOBS":
+    lambda: os.getenv("MAX_JOBS", None),
+
+    # Number of threads to use for nvcc
+    # By default this is 1.
+    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
+    "NVCC_THREADS":
+    lambda: os.getenv("NVCC_THREADS", None),
+
+    # If set, vllm will build with Neuron support
+    "VLLM_BUILD_WITH_NEURON":
+    lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)),
+
+    # If set, vllm will use precompiled binaries (*.so)
+    "VLLM_USE_PRECOMPILED":
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),
+
+    # If set, vllm will install Punica kernels
+    "VLLM_INSTALL_PUNICA_KERNELS":
+    lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))),
+
+    # CMake build type
+    # If not set, defaults to "Debug" or "RelWithDebInfo"
+    # Available options: "Debug", "Release", "RelWithDebInfo"
+    "CMAKE_BUILD_TYPE":
+    lambda: os.getenv("CMAKE_BUILD_TYPE"),
+
+    # If set, vllm will print verbose logs during installation
+    "VERBOSE":
+    lambda: bool(int(os.getenv('VERBOSE', '0'))),
+
+    # Root directory for VLLM configuration files
+    # Note that this not only affects how vllm finds its configuration files
+    # during runtime, but also affects how vllm installs its configuration
+    # files during **installation**.
+    "VLLM_CONFIG_ROOT":
+    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
+        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+
+    # ================== Runtime Env Vars ==================
+
     # used in distributed environment to determine the master address
     'VLLM_HOST_IP':
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
@@ -93,14 +151,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "S3_ENDPOINT_URL":
     lambda: os.environ.get("S3_ENDPOINT_URL", None),
 
-    # Root directory for VLLM configuration files
-    # Note that this not only affects how vllm finds its configuration files
-    # during runtime, but also affects how vllm installs its configuration
-    # files during **installation**.
-    "VLLM_CONFIG_ROOT":
-    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
-        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
-
     # Usage stats collection
     "VLLM_USAGE_STATS_SERVER":
     lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
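The new entries above only register name -> lambda pairs; how an access such as envs.CMAKE_BUILD_TYPE in setup.py reaches those lambdas is handled by existing lookup code in vllm/envs.py that is outside this diff. A hedged sketch of one way such a table can back lazy module attributes, assuming a PEP 562 module-level __getattr__ (an assumption, not confirmed by this diff):

# Sketch (assumption): a dict of name -> lambda backing lazy, module-level
# attribute lookup, so envs.MAX_JOBS re-reads os.environ on every access.
import os
from typing import Any, Callable, Dict

environment_variables: Dict[str, Callable[[], Any]] = {
    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),
    "VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))),
}


def __getattr__(name: str) -> Any:
    # Called for attributes not found in the module (PEP 562).
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Because each lambda re-reads os.environ when it runs, exporting values like MAX_JOBS=8 or VERBOSE=1 before running pip install -e . is picked up at the moment setup.py evaluates envs.MAX_JOBS or envs.VERBOSE during the build.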