[Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
parent db66e018ea
commit f5792c7c4a
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
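For context: 7.2 and 8.7 correspond to the Jetson Xavier and Jetson Orin families, so the arch list now covers those boards. A minimal sketch (not part of this commit, assuming torch with a visible CUDA device) for checking whether the local GPU falls inside the supported list:

    # Sketch: verify the local GPU's compute capability is in CUDA_SUPPORTED_ARCHS.
    # Not part of the vLLM source tree; for illustration only.
    import torch

    SUPPORTED = {"7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"}

    major, minor = torch.cuda.get_device_capability(0)
    arch = f"{major}.{minor}"
    print(f"compute capability {arch}:",
          "supported" if arch in SUPPORTED else "not in CUDA_SUPPORTED_ARCHS")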
@@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
        "csrc/quantization/fp8/fp8_marlin.cu"
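Treating cuda_archs_loose_intersection, for illustration only, as a plain set intersection between a kernel's supported archs and the requested ${CUDA_ARCHS} (the real CMake helper matches versions more permissively), the effect of adding 8.7 is that the Marlin kernels are now built when targeting an Orin-class device:

    # Illustration only: approximate the arch filtering as a plain intersection.
    # The actual cuda_archs_loose_intersection() CMake helper is more permissive.
    def loose_intersection(supported: str, requested: str) -> list[str]:
        return sorted(set(supported.split(";")) & set(requested.split(";")), key=float)

    print(loose_intersection("8.0;8.6;8.7;8.9;9.0", "8.7"))  # ['8.7'] -> Marlin gets built
    print(loose_intersection("8.0;8.6;8.9;9.0", "8.7"))      # []    -> previously skipped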
@@ -301,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -427,7 +427,7 @@ set_gencode_flags_for_srcs(
   CUDA_ARCHS "${CUDA_ARCHS}")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     set(MARLIN_MOE_SRC
         "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@@ -28,7 +28,15 @@ try:
     finally:
         pynvml.nvmlShutdown()
 except Exception:
-    pass
+    # CUDA is supported on Jetson, but NVML may not be.
+    import os
+
+    def cuda_is_jetson() -> bool:
+        return os.path.isfile("/etc/nv_tegra_release") \
+            or os.path.exists("/sys/class/tegra-firmware")
+
+    if cuda_is_jetson():
+        is_cuda = True
 
 is_rocm = False
 
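The Jetson check relies only on files shipped with NVIDIA's L4T/Jetson system image, so it never touches NVML or CUDA. A standalone sketch mirroring the cuda_is_jetson helper added above, runnable on a target board:

    # Standalone sketch of the same detection logic; not part of the commit.
    import os

    def cuda_is_jetson() -> bool:
        # Both paths are provided by the L4T/Jetson BSP on Tegra devices.
        return (os.path.isfile("/etc/nv_tegra_release")
                or os.path.exists("/sys/class/tegra-firmware"))

    if __name__ == "__main__":
        print("Jetson detected" if cuda_is_jetson() else "not a Jetson device")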
@@ -4,7 +4,7 @@ pynvml. However, it should not initialize cuda context.
 
 import os
 from functools import lru_cache, wraps
-from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar
+from typing import TYPE_CHECKING, Callable, List, TypeVar
 
 import pynvml
 import torch
@@ -38,67 +38,6 @@ if pynvml.__file__.endswith("__init__.py"):
 # see https://github.com/huggingface/diffusers/issues/9704 for details
 torch.backends.cuda.enable_cudnn_sdp(False)
 
-
-# NVML utils
-# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
-# all the related functions work on real physical device ids.
-# the major benefit of using NVML is that it will not initialize CUDA
-
-
-def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
-
-    @wraps(fn)
-    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-        pynvml.nvmlInit()
-        try:
-            return fn(*args, **kwargs)
-        finally:
-            pynvml.nvmlShutdown()
-
-    return wrapper
-
-
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
-
-
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_physical_device_name(device_id: int = 0) -> str:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return pynvml.nvmlDeviceGetName(handle)
-
-
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_physical_device_total_memory(device_id: int = 0) -> int:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
-
-
-@with_nvml_context
-def warn_if_different_devices():
-    device_ids: int = pynvml.nvmlDeviceGetCount()
-    if device_ids > 1:
-        device_names = [get_physical_device_name(i) for i in range(device_ids)]
-        if len(set(device_names)) > 1 and os.environ.get(
-                "CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
-            logger.warning(
-                "Detected different devices in the system: \n%s\nPlease"
-                " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
-                "avoid unexpected behavior.", "\n".join(device_names))
-
-
-try:
-    from sphinx.ext.autodoc.mock import _MockModule
-
-    if not isinstance(pynvml, _MockModule):
-        warn_if_different_devices()
-except ModuleNotFoundError:
-    warn_if_different_devices()
 
 def device_id_to_physical_device_id(device_id: int) -> int:
     if "CUDA_VISIBLE_DEVICES" in os.environ:
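The module-level helpers deleted here are not lost: the next hunk re-adds with_nvml_context and folds the get_physical_device_* queries into an NvmlCudaPlatform class. A minimal, self-contained sketch of the with_nvml_context pattern, where `device_count` is a hypothetical example function rather than anything in the commit:

    # Every NVML call runs inside an nvmlInit()/nvmlShutdown() pair.
    from functools import wraps

    import pynvml

    def with_nvml_context(fn):

        @wraps(fn)
        def wrapper(*args, **kwargs):
            pynvml.nvmlInit()           # open an NVML session
            try:
                return fn(*args, **kwargs)
            finally:
                pynvml.nvmlShutdown()   # always released, even on error

        return wrapper

    @with_nvml_context
    def device_count() -> int:
        return pynvml.nvmlDeviceGetCount()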
@@ -118,26 +57,88 @@ def device_id_to_physical_device_id(device_id: int) -> int:
         return device_id
 
 
-class CudaPlatform(Platform):
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+class CudaPlatformBase(Platform):
     _enum = PlatformEnum.CUDA
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
 
     @classmethod
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
-        physical_device_id = device_id_to_physical_device_id(device_id)
-        major, minor = get_physical_device_capability(physical_device_id)
-        return DeviceCapability(major=major, minor=minor)
+        raise NotImplementedError
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
-        return get_physical_device_name(physical_device_id)
+        raise NotImplementedError
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        physical_device_id = device_id_to_physical_device_id(device_id)
-        return get_physical_device_total_memory(physical_device_id)
+        raise NotImplementedError
+
+    @classmethod
+    def is_full_nvlink(cls, device_ids: List[int]) -> bool:
+        raise NotImplementedError
+
+    @classmethod
+    def log_warnings(cls):
+        pass
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        if parallel_config.worker_cls == "auto":
+            if scheduler_config.is_multi_step:
+                parallel_config.worker_cls = \
+                    "vllm.worker.multi_step_worker.MultiStepWorker"
+            elif vllm_config.speculative_config:
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+            else:
+                parallel_config.worker_cls = "vllm.worker.worker.Worker"
+
+
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+class NvmlCudaPlatform(CudaPlatformBase):
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_name(cls, device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        return cls._get_physical_device_name(physical_device_id)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
 
     @classmethod
     @with_nvml_context
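NvmlCudaPlatform answers device queries purely through NVML, so importing it never initializes a CUDA context. A standalone sketch of the same capability query it performs (assuming an NVML-capable driver; not part of the commit):

    # NVML-only capability query, analogous to NvmlCudaPlatform.get_device_capability.
    import pynvml

    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        print(pynvml.nvmlDeviceGetName(handle), f"compute capability {major}.{minor}")
    finally:
        pynvml.nvmlShutdown()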
@@ -153,27 +154,86 @@ class CudaPlatform(Platform):
                 if i < j:
                     try:
                         p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                            handle, peer_handle,
-                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                            handle,
+                            peer_handle,
+                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
+                        )
                         if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                             return False
                     except pynvml.NVMLError:
                         logger.exception(
-                            "NVLink detection failed. This is normal if your"
-                            " machine has no NVLink equipped.")
+                            "NVLink detection failed. This is normal if"
+                            " your machine has no NVLink equipped.")
                         return False
         return True
 
     @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        parallel_config = vllm_config.parallel_config
-        scheduler_config = vllm_config.scheduler_config
-        if parallel_config.worker_cls == "auto":
-            if scheduler_config.is_multi_step:
-                parallel_config.worker_cls = \
-                    "vllm.worker.multi_step_worker.MultiStepWorker"
-            elif vllm_config.speculative_config:
-                parallel_config.worker_cls = \
-                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
-            else:
-                parallel_config.worker_cls = "vllm.worker.worker.Worker"
+    def _get_physical_device_name(cls, device_id: int = 0) -> str:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+        return pynvml.nvmlDeviceGetName(handle)
+
+    @classmethod
+    @with_nvml_context
+    def log_warnings(cls):
+        device_ids: int = pynvml.nvmlDeviceGetCount()
+        if device_ids > 1:
+            device_names = [
+                cls._get_physical_device_name(i) for i in range(device_ids)
+            ]
+            if (len(set(device_names)) > 1
+                    and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
+                logger.warning(
+                    "Detected different devices in the system: \n%s\nPlease"
+                    " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
+                    "avoid unexpected behavior.",
+                    "\n".join(device_names),
+                )
+
+
+class NonNvmlCudaPlatform(CudaPlatformBase):
+
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.cuda.get_device_properties(device_id)
+        return device_props.total_memory
+
+    @classmethod
+    def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
+        logger.exception(
+            "NVLink detection not possible, as context support was"
+            " not found. Assuming no NVLink available.")
+        return False
+
+
+# Autodetect either NVML-enabled or non-NVML platform
+# based on whether NVML is available.
+nvml_available = False
+try:
+    try:
+        pynvml.nvmlInit()
+        nvml_available = True
+    except Exception:
+        # On Jetson, NVML is not supported.
+        nvml_available = False
+finally:
+    if nvml_available:
+        pynvml.nvmlShutdown()
+
+CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
+
+try:
+    from sphinx.ext.autodoc.mock import _MockModule
+
+    if not isinstance(pynvml, _MockModule):
+        CudaPlatform.log_warnings()
+except ModuleNotFoundError:
+    CudaPlatform.log_warnings()
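Because the NVML probe runs at import time, CudaPlatform resolves to NvmlCudaPlatform on desktop and server GPUs and to the torch-backed NonNvmlCudaPlatform on Jetson, so callers keep a single entry point. A usage sketch, assuming vllm with this change is installed and an NVIDIA GPU is visible:

    # Callers query the selected platform the same way on either backend.
    from vllm.platforms import current_platform

    cap = current_platform.get_device_capability(0)
    print(current_platform.get_device_name(0),
          f"sm_{cap.major}{cap.minor}",
          current_platform.get_device_total_memory(0) // (1024 ** 2), "MiB")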