[Bugfix] fix automatic prefix args and add log info (#3608)

Author: TianYu GUO, 2024-03-25 20:35:22 +08:00 (committed by GitHub)
Parent: 925f3332ca
Commit: e67c295b0c
2 changed files with 7 additions and 1 deletion

File: vllm/core/block_manager.py

@@ -9,6 +9,9 @@ from vllm.block import BlockTable, PhysicalTokenBlock
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
 from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class BlockAllocatorBase(ABC):

@@ -241,11 +244,13 @@ class BlockSpaceManager:
         self.watermark_blocks = int(watermark * num_gpu_blocks)
 
         if self.enable_caching:
+            logger.info("enable automatic prefix caching")
             self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
                                                       num_gpu_blocks)
             self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
                                                       num_cpu_blocks)
         else:
+            logger.info("disable automatic prefix caching")
             self.gpu_allocator = UncachedBlockAllocator(
                 Device.GPU, block_size, num_gpu_blocks)
             self.cpu_allocator = UncachedBlockAllocator(
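
For context, a minimal sketch of the effect of this file's change: constructing the block manager now emits one of the two new log lines. The constructor arguments below (block_size, num_gpu_blocks, num_cpu_blocks, enable_caching) are illustrative values matching the BlockSpaceManager signature as of this commit, not part of the diff:

    # Sketch only: assumes a vLLM checkout that includes this commit.
    from vllm.core.block_manager import BlockSpaceManager

    manager = BlockSpaceManager(
        block_size=16,        # tokens per KV-cache block
        num_gpu_blocks=1024,  # illustrative capacities
        num_cpu_blocks=256,
        enable_caching=True,  # logs "enable automatic prefix caching"
    )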

File: vllm/engine/arg_utils.py

@@ -337,7 +337,8 @@ class EngineArgs:
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
-                                   model_config.get_sliding_window())
+                                   model_config.get_sliding_window(),
+                                   self.enable_prefix_caching)
         parallel_config = ParallelConfig(
             self.pipeline_parallel_size, self.tensor_parallel_size,
             self.worker_use_ray, self.max_parallel_loading_workers,
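
With this second change, the user-facing flag is actually forwarded into CacheConfig; previously CacheConfig was built without it, so enable_prefix_caching appears to have been silently ignored. A hedged end-to-end usage sketch (model name and prompts are illustrative):

    from vllm import LLM, SamplingParams

    # enable_prefix_caching is passed through EngineArgs to CacheConfig
    # once this fix is in place.
    llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

    # Requests sharing a prompt prefix can now reuse cached KV blocks.
    prompts = [
        "You are a helpful assistant. Q: What is paged attention?",
        "You are a helpful assistant. Q: What is a KV cache?",
    ]
    outputs = llm.generate(prompts, SamplingParams(max_tokens=32))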