[Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
parent db66e018ea
commit f5792c7c4a
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
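For context: 7.2 and 8.7 correspond to the Jetson Xavier and Jetson Orin families, so the arch list now covers those boards. A minimal sketch (not part of this commit, assuming torch with a visible CUDA device) for checking whether the local GPU falls inside the supported list:

    # Sketch: verify the local GPU's compute capability is in CUDA_SUPPORTED_ARCHS.
    # Not part of the vLLM source tree; for illustration only.
    import torch

    SUPPORTED = {"7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"}

    major, minor = torch.cuda.get_device_capability(0)
    arch = f"{major}.{minor}"
    print(f"compute capability {arch}:",
          "supported" if arch in SUPPORTED else "not in CUDA_SUPPORTED_ARCHS")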
@@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
        "csrc/quantization/fp8/fp8_marlin.cu"
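Treating cuda_archs_loose_intersection, for illustration only, as a plain set intersection between a kernel's supported archs and the requested ${CUDA_ARCHS} (the real CMake helper matches versions more permissively), the effect of adding 8.7 is that the Marlin kernels are now built when targeting an Orin-class device:

    # Illustration only: approximate the arch filtering as a plain intersection.
    # The actual cuda_archs_loose_intersection() CMake helper is more permissive.
    def loose_intersection(supported: str, requested: str) -> list[str]:
        return sorted(set(supported.split(";")) & set(requested.split(";")), key=float)

    print(loose_intersection("8.0;8.6;8.7;8.9;9.0", "8.7"))  # ['8.7'] -> Marlin gets built
    print(loose_intersection("8.0;8.6;8.9;9.0", "8.7"))      # []    -> previously skipped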
@@ -301,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -427,7 +427,7 @@ set_gencode_flags_for_srcs(
   CUDA_ARCHS "${CUDA_ARCHS}")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     set(MARLIN_MOE_SRC
         "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@@ -28,7 +28,15 @@ try:
     finally:
         pynvml.nvmlShutdown()
 except Exception:
-    pass
+    # CUDA is supported on Jetson, but NVML may not be.
+    import os
+
+    def cuda_is_jetson() -> bool:
+        return os.path.isfile("/etc/nv_tegra_release") \
+            or os.path.exists("/sys/class/tegra-firmware")
+
+    if cuda_is_jetson():
+        is_cuda = True
 
 is_rocm = False
 
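The Jetson check relies only on files shipped with NVIDIA's L4T/Jetson system image, so it never touches NVML or CUDA. A standalone sketch mirroring the cuda_is_jetson helper added above, runnable on a target board:

    # Standalone sketch of the same detection logic; not part of the commit.
    import os

    def cuda_is_jetson() -> bool:
        # Both paths are provided by the L4T/Jetson BSP on Tegra devices.
        return (os.path.isfile("/etc/nv_tegra_release")
                or os.path.exists("/sys/class/tegra-firmware"))

    if __name__ == "__main__":
        print("Jetson detected" if cuda_is_jetson() else "not a Jetson device")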
@@ -4,7 +4,7 @@ pynvml. However, it should not initialize cuda context.
 
 import os
 from functools import lru_cache, wraps
-from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar
+from typing import TYPE_CHECKING, Callable, List, TypeVar
 
 import pynvml
 import torch
@@ -38,67 +38,6 @@ if pynvml.__file__.endswith("__init__.py"):
 # see https://github.com/huggingface/diffusers/issues/9704 for details
 torch.backends.cuda.enable_cudnn_sdp(False)
 
-
-# NVML utils
-# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
-# all the related functions work on real physical device ids.
-# the major benefit of using NVML is that it will not initialize CUDA
-
-
-def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
-
-    @wraps(fn)
-    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-        pynvml.nvmlInit()
-        try:
-            return fn(*args, **kwargs)
-        finally:
-            pynvml.nvmlShutdown()
-
-    return wrapper
-
-
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return pynvml.nvmlDeviceGetCudaComputeCapability(handle)
-
-
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_physical_device_name(device_id: int = 0) -> str:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return pynvml.nvmlDeviceGetName(handle)
-
-
-@lru_cache(maxsize=8)
-@with_nvml_context
-def get_physical_device_total_memory(device_id: int = 0) -> int:
-    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
-    return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
-
-
-@with_nvml_context
-def warn_if_different_devices():
-    device_ids: int = pynvml.nvmlDeviceGetCount()
-    if device_ids > 1:
-        device_names = [get_physical_device_name(i) for i in range(device_ids)]
-        if len(set(device_names)) > 1 and os.environ.get(
-                "CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
-            logger.warning(
-                "Detected different devices in the system: \n%s\nPlease"
-                " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
-                "avoid unexpected behavior.", "\n".join(device_names))
-
-
-try:
-    from sphinx.ext.autodoc.mock import _MockModule
-
-    if not isinstance(pynvml, _MockModule):
-        warn_if_different_devices()
-except ModuleNotFoundError:
-    warn_if_different_devices()
 
 def device_id_to_physical_device_id(device_id: int) -> int:
     if "CUDA_VISIBLE_DEVICES" in os.environ:
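The module-level helpers deleted here are not lost: the next hunk re-adds with_nvml_context and folds the get_physical_device_* queries into an NvmlCudaPlatform class. A minimal, self-contained sketch of the with_nvml_context pattern, where `device_count` is a hypothetical example function rather than anything in the commit:

    # Every NVML call runs inside an nvmlInit()/nvmlShutdown() pair.
    from functools import wraps

    import pynvml

    def with_nvml_context(fn):

        @wraps(fn)
        def wrapper(*args, **kwargs):
            pynvml.nvmlInit()           # open an NVML session
            try:
                return fn(*args, **kwargs)
            finally:
                pynvml.nvmlShutdown()   # always released, even on error

        return wrapper

    @with_nvml_context
    def device_count() -> int:
        return pynvml.nvmlDeviceGetCount()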
@@ -118,26 +57,88 @@ def device_id_to_physical_device_id(device_id: int) -> int:
         return device_id
 
 
-class CudaPlatform(Platform):
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        pynvml.nvmlInit()
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+class CudaPlatformBase(Platform):
     _enum = PlatformEnum.CUDA
     device_type: str = "cuda"
     dispatch_key: str = "CUDA"
 
     @classmethod
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
-        physical_device_id = device_id_to_physical_device_id(device_id)
-        major, minor = get_physical_device_capability(physical_device_id)
-        return DeviceCapability(major=major, minor=minor)
+        raise NotImplementedError
 
     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
-        return get_physical_device_name(physical_device_id)
+        raise NotImplementedError
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        physical_device_id = device_id_to_physical_device_id(device_id)
-        return get_physical_device_total_memory(physical_device_id)
+        raise NotImplementedError
+
+    @classmethod
+    def is_full_nvlink(cls, device_ids: List[int]) -> bool:
+        raise NotImplementedError
+
+    @classmethod
+    def log_warnings(cls):
+        pass
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        parallel_config = vllm_config.parallel_config
+        scheduler_config = vllm_config.scheduler_config
+        if parallel_config.worker_cls == "auto":
+            if scheduler_config.is_multi_step:
+                parallel_config.worker_cls = \
+                    "vllm.worker.multi_step_worker.MultiStepWorker"
+            elif vllm_config.speculative_config:
+                parallel_config.worker_cls = \
+                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
+            else:
+                parallel_config.worker_cls = "vllm.worker.worker.Worker"
+
+
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+class NvmlCudaPlatform(CudaPlatformBase):
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_name(cls, device_id: int = 0) -> str:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        return cls._get_physical_device_name(physical_device_id)
+
+    @classmethod
+    @lru_cache(maxsize=8)
+    @with_nvml_context
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        physical_device_id = device_id_to_physical_device_id(device_id)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
+        return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
 
     @classmethod
     @with_nvml_context
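NvmlCudaPlatform answers device queries purely through NVML, so importing it never initializes a CUDA context. A standalone sketch of the same capability query it performs (assuming an NVML-capable driver; not part of the commit):

    # NVML-only capability query, analogous to NvmlCudaPlatform.get_device_capability.
    import pynvml

    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        print(pynvml.nvmlDeviceGetName(handle), f"compute capability {major}.{minor}")
    finally:
        pynvml.nvmlShutdown()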
@@ -153,27 +154,86 @@ class CudaPlatform(Platform):
                 if i < j:
                     try:
                         p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                            handle, peer_handle,
-                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                            handle,
+                            peer_handle,
+                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
+                        )
                         if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                             return False
                     except pynvml.NVMLError:
                         logger.exception(
-                            "NVLink detection failed. This is normal if your"
-                            " machine has no NVLink equipped.")
+                            "NVLink detection failed. This is normal if"
+                            " your machine has no NVLink equipped.")
                         return False
         return True
 
     @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        parallel_config = vllm_config.parallel_config
-        scheduler_config = vllm_config.scheduler_config
-        if parallel_config.worker_cls == "auto":
-            if scheduler_config.is_multi_step:
-                parallel_config.worker_cls = \
-                    "vllm.worker.multi_step_worker.MultiStepWorker"
-            elif vllm_config.speculative_config:
-                parallel_config.worker_cls = \
-                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
-            else:
-                parallel_config.worker_cls = "vllm.worker.worker.Worker"
+    def _get_physical_device_name(cls, device_id: int = 0) -> str:
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+        return pynvml.nvmlDeviceGetName(handle)
+
+    @classmethod
+    @with_nvml_context
+    def log_warnings(cls):
+        device_ids: int = pynvml.nvmlDeviceGetCount()
+        if device_ids > 1:
+            device_names = [
+                cls._get_physical_device_name(i) for i in range(device_ids)
+            ]
+            if (len(set(device_names)) > 1
+                    and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
+                logger.warning(
+                    "Detected different devices in the system: \n%s\nPlease"
+                    " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
+                    "avoid unexpected behavior.",
+                    "\n".join(device_names),
+                )
+
+
+class NonNvmlCudaPlatform(CudaPlatformBase):
+
+    @classmethod
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
+        major, minor = torch.cuda.get_device_capability(device_id)
+        return DeviceCapability(major=major, minor=minor)
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return torch.cuda.get_device_name(device_id)
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        device_props = torch.cuda.get_device_properties(device_id)
+        return device_props.total_memory
+
+    @classmethod
+    def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool:
+        logger.exception(
+            "NVLink detection not possible, as context support was"
+            " not found. Assuming no NVLink available.")
+        return False
+
+
+# Autodetect either NVML-enabled or non-NVML platform
+# based on whether NVML is available.
+nvml_available = False
+try:
+    try:
+        pynvml.nvmlInit()
+        nvml_available = True
+    except Exception:
+        # On Jetson, NVML is not supported.
+        nvml_available = False
+finally:
+    if nvml_available:
+        pynvml.nvmlShutdown()
+
+CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
+
+try:
+    from sphinx.ext.autodoc.mock import _MockModule
+
+    if not isinstance(pynvml, _MockModule):
+        CudaPlatform.log_warnings()
+except ModuleNotFoundError:
+    CudaPlatform.log_warnings()
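Because the NVML probe runs at import time, CudaPlatform resolves to NvmlCudaPlatform on desktop and server GPUs and to the torch-backed NonNvmlCudaPlatform on Jetson, so callers keep a single entry point. A usage sketch, assuming vllm with this change is installed and an NVIDIA GPU is visible:

    # Callers query the selected platform the same way on either backend.
    from vllm.platforms import current_platform

    cap = current_platform.get_device_capability(0)
    print(current_platform.get_device_name(0),
          f"sm_{cap.major}{cap.minor}",
          current_platform.get_device_total_memory(0) // (1024 ** 2), "MiB")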