From 344bf7cd2d66a8b13f216f61c7a6d5d70576a498 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 3 May 2024 15:55:56 -0700 Subject: [PATCH] [Misc] add installation time env vars (#4574) --- setup.py | 33 ++++++++++++++++++-------- vllm/envs.py | 66 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 81 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 801d8d50..3768daf9 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import importlib.util import io import logging import os @@ -13,10 +14,23 @@ from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext from torch.utils.cpp_extension import CUDA_HOME + +def load_module_from_path(module_name, path): + spec = importlib.util.spec_from_file_location(module_name, path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + ROOT_DIR = os.path.dirname(__file__) logger = logging.getLogger(__name__) -# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu] -VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda") + +# cannot import envs directly because it depends on vllm, +# which is not installed yet +envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) + +VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE # vLLM only supports Linux platform assert sys.platform.startswith( @@ -60,7 +74,7 @@ class cmake_build_ext(build_ext): def compute_num_jobs(self): # `num_jobs` is either the value of the MAX_JOBS environment variable # (if defined) or the number of CPUs available. - num_jobs = os.environ.get("MAX_JOBS", None) + num_jobs = envs.MAX_JOBS if num_jobs is not None: num_jobs = int(num_jobs) logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) @@ -78,7 +92,7 @@ class cmake_build_ext(build_ext): # environment variable (if defined) or 1. # when it is set, we reduce `num_jobs` to avoid # overloading the system. - nvcc_threads = os.getenv("NVCC_THREADS", None) + nvcc_threads = envs.NVCC_THREADS if nvcc_threads is not None: nvcc_threads = int(nvcc_threads) logger.info( @@ -104,7 +118,7 @@ class cmake_build_ext(build_ext): # Select the build type. # Note: optimization level + debug info are set by the build type default_cfg = "Debug" if self.debug else "RelWithDebInfo" - cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg) + cfg = envs.CMAKE_BUILD_TYPE or default_cfg # where .so files will be written, should be the same for all extensions # that use the same CMakeLists.txt. 
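The setup.py hunks above route every build-time knob through the freshly loaded envs module. For reference, a minimal sketch (not vLLM's exact code) of how MAX_JOBS and NVCC_THREADS are meant to interact per the comments in those hunks; the integer-division step is an assumption based on the "reduce num_jobs to avoid overloading the system" comment:

import os

def compute_num_jobs():
    # MAX_JOBS wins if set; otherwise fall back to the CPU count
    num_jobs = os.environ.get("MAX_JOBS")
    num_jobs = int(num_jobs) if num_jobs else os.cpu_count()
    # each compile job may spawn NVCC_THREADS nvcc threads, so shrink
    # the parallel job count to keep the total thread count near the
    # number of CPUs (exact formula assumed for illustration)
    nvcc_threads = os.environ.get("NVCC_THREADS")
    if nvcc_threads:
        num_jobs = max(1, num_jobs // int(nvcc_threads))
    return num_jobs
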
@@ -118,7 +132,7 @@ class cmake_build_ext(build_ext): '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE), ] - verbose = bool(int(os.getenv('VERBOSE', '0'))) + verbose = envs.VERBOSE if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] @@ -205,8 +219,7 @@ def _is_neuron() -> bool: subprocess.run(["neuron-ls"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): torch_neuronx_installed = False - return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON", - False) + return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON def _is_cpu() -> bool: @@ -214,7 +227,7 @@ def _is_cpu() -> bool: def _install_punica() -> bool: - return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) + return envs.VLLM_INSTALL_PUNICA_KERNELS def get_hipcc_rocm_version(): @@ -377,7 +390,7 @@ if not _is_neuron(): package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } -if os.environ.get("VLLM_USE_PRECOMPILED"): +if envs.VLLM_USE_PRECOMPILED: ext_modules = [] package_data["vllm"].append("*.so") diff --git a/vllm/envs.py b/vllm/envs.py index 2dbb57e6..91cc8f3b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -27,6 +27,14 @@ if TYPE_CHECKING: VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" + VLLM_TARGET_DEVICE: str = "cuda" + MAX_JOBS: Optional[str] = None + NVCC_THREADS: Optional[str] = None + VLLM_BUILD_WITH_NEURON: bool = False + VLLM_USE_PRECOMPILED: bool = False + VLLM_INSTALL_PUNICA_KERNELS: bool = False + CMAKE_BUILD_TYPE: Optional[str] = None + VERBOSE: bool = False # The begin-* and end* here are used by the documentation generator # to extract the used env vars. @@ -34,6 +42,56 @@ if TYPE_CHECKING: # begin-env-vars-definition environment_variables: Dict[str, Callable[[], Any]] = { + + # ================== Installation Time Env Vars ================== + + # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu] + "VLLM_TARGET_DEVICE": + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), + + # Maximum number of compilation jobs to run in parallel. + # By default this is the number of CPUs + "MAX_JOBS": + lambda: os.getenv("MAX_JOBS", None), + + # Number of threads to use for nvcc + # By default this is 1. + # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. + "NVCC_THREADS": + lambda: os.getenv("NVCC_THREADS", None), + + # If set, vllm will build with Neuron support + "VLLM_BUILD_WITH_NEURON": + lambda: bool(os.environ.get("VLLM_BUILD_WITH_NEURON", False)), + + # If set, vllm will use precompiled binaries (*.so) + "VLLM_USE_PRECOMPILED": + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")), + + # If set, vllm will install Punica kernels + "VLLM_INSTALL_PUNICA_KERNELS": + lambda: bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))), + + # CMake build type + # If not set, defaults to "Debug" or "RelWithDebInfo" + # Available options: "Debug", "Release", "RelWithDebInfo" + "CMAKE_BUILD_TYPE": + lambda: os.getenv("CMAKE_BUILD_TYPE"), + + # If set, vllm will print verbose logs during installation + "VERBOSE": + lambda: bool(int(os.getenv('VERBOSE', '0'))), + + # Root directory for VLLM configuration files + # Note that this not only affects how vllm finds its configuration files + # during runtime, but also affects how vllm installs its configuration + # files during **installation**. 
+ "VLLM_CONFIG_ROOT": + lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv( + "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"), + + # ================== Runtime Env Vars ================== + # used in distributed environment to determine the master address 'VLLM_HOST_IP': lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""), @@ -93,14 +151,6 @@ environment_variables: Dict[str, Callable[[], Any]] = { "S3_ENDPOINT_URL": lambda: os.environ.get("S3_ENDPOINT_URL", None), - # Root directory for VLLM configuration files - # Note that this not only affects how vllm finds its configuration files - # during runtime, but also affects how vllm installs its configuration - # files during **installation**. - "VLLM_CONFIG_ROOT": - lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv( - "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"), - # Usage stats collection "VLLM_USAGE_STATS_SERVER": lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),