commit 2de4118243
parent 239b7befdd
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
@@ -488,9 +488,9 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
     if needed_memory > available_memory:
         raise ValueError(
             f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GB KV "
+            f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GiB KV "
             f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory/1024/1024/1024:.2f} GB). Try "
+            f"memory ({available_memory/1024/1024/1024:.2f} GiB). Try "
             f"increasing `gpu_memory_utilization` or decreasing "
             f"`max_model_len` when initializing the engine.")

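Note on the unit change (not part of the commit itself): the message divides the byte counts by 1024 three times, i.e. it computes binary gibibytes, so "GiB" is the accurate label; the decimal "GB" figure for the same byte count would read higher. A minimal sketch with a made-up value:

needed_memory = 20 * 1024**3  # hypothetical KV-cache requirement in bytes
print(f"{needed_memory/1024/1024/1024:.2f} GiB")  # 20.00 GiB (binary unit, matches the divisor)
print(f"{needed_memory/1_000_000_000:.2f} GB")    # 21.47 GB (decimal unit, would mislabel the value)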
@@ -24,8 +24,8 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        LayerBlockType, LazyLoader, cdiv, check_use_alibi,
-                        is_pin_memory_available)
+                        GiB_bytes, LayerBlockType, LazyLoader, cdiv,
+                        check_use_alibi, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
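The new import brings GiB_bytes into scope while keeping the parenthesized import list alphabetically sorted. The constant's definition is not shown in this diff; presumably vllm.utils defines it as the number of bytes in one gibibyte, along the lines of:

# Assumed definition (not shown in this diff): bytes per gibibyte.
GiB_bytes = 1 << 30  # 1073741824, numerically equal to the float(2**30) used before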
@@ -1206,8 +1206,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                     self.device)
             time_after_load = time.perf_counter()
         self.model_memory_usage = m.consumed_memory
-        logger.info("Model loading took %.4f GB and %.6f seconds",
-                    self.model_memory_usage / float(2**30),
+        logger.info("Model loading took %.4f GiB and %.6f seconds",
+                    self.model_memory_usage / GiB_bytes,
                     time_after_load - time_before_load)

     def _get_prompt_logprobs_dict(
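Swapping float(2**30) for GiB_bytes is presumably behavior-preserving: Python's true division yields a float with either divisor, so only the unit label and readability change. A quick check under the assumption that GiB_bytes == 1 << 30:

GiB_bytes = 1 << 30
model_memory_usage = 15_000_000_000  # hypothetical model footprint in bytes
assert model_memory_usage / GiB_bytes == model_memory_usage / float(2**30)
print("Model loading took %.4f GiB" % (model_memory_usage / GiB_bytes))  # 13.9698 GiB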
@@ -1143,8 +1143,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             time_after_load = time.perf_counter()

         self.model_memory_usage = m.consumed_memory
-        logger.info("Model loading took %.4f GB and %.6f seconds",
-                    self.model_memory_usage / float(2**30),
+        logger.info("Model loading took %.4f GiB and %.6f seconds",
+                    self.model_memory_usage / GiB_bytes,
                     time_after_load - time_before_load)
         if self.prompt_adapter_config:
             self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
@@ -25,7 +25,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad
+from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
@@ -422,8 +422,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
             self.model = get_model(vllm_config=self.vllm_config)

         self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
-                    self.model_memory_usage / float(2**30))
+        logger.info("Loading model weights took %.4f GiB",
+                    self.model_memory_usage / GiB_bytes)

     def get_model(self) -> nn.Module:
         return self.model