# vllm/vllm/platforms/cpu.py

# SPDX-License-Identifier: Apache-2.0

import os
from typing import TYPE_CHECKING, Optional

import psutil
import torch

from vllm.logger import init_logger

from .interface import Platform, PlatformEnum, _Backend

# Module-level logger, created once for this module.
logger = init_logger(__name__)

# VllmConfig is only needed for type annotations. At runtime the name is
# bound to None so that annotations referencing it (which are evaluated when
# the function is defined) still resolve without importing vllm.config here.
# NOTE(review): presumably this avoids an import cycle -- confirm.
if TYPE_CHECKING:
    from vllm.config import VllmConfig
else:
    VllmConfig = None


class CpuPlatform(Platform):
    """Platform implementation used when vLLM runs on the CPU.

    The class reports the CPU device identity, always selects the Torch SDPA
    attention backend, and adjusts a ``VllmConfig`` (plus a few process
    environment variables) so that it is usable on the CPU backend.
    """
    _enum = PlatformEnum.CPU
    device_name: str = "cpu"
    device_type: str = "cpu"
    dispatch_key: str = "CPU"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the device name, which is always ``"cpu"``.

        ``device_id`` is not used.
        """
        return "cpu"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Return the dotted path of the attention backend class for CPU.

        The Torch SDPA backend is returned unconditionally. If the caller
        requested a different backend, that request is logged and ignored.
        All parameters other than ``selected_backend`` are unused here.
        """
        # ``selected_backend`` may be None/falsy, meaning "no preference".
        if selected_backend and selected_backend != _Backend.TORCH_SDPA:
            logger.info("Cannot use %s backend on CPU.", selected_backend)
        logger.info("Using Torch SDPA backend.")
        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return the ``total`` field of ``psutil.virtual_memory()``.

        ``device_id`` is not used.
        """
        return psutil.virtual_memory().total

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """Return ``False``: async output is never reported as supported.

        ``enforce_eager`` is not used.
        """
        return False

    @classmethod
    def inference_mode(cls):
        """Return a ``torch.no_grad()`` context manager for inference."""
        return torch.no_grad()

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Adjust ``vllm_config`` and the process environment for CPU.

        The config is modified in place:

        * ``model_config.enforce_eager`` is forced to ``True``.
        * ``cache_config.block_size`` defaults to 16 when unset.
        * ``cache_config.cpu_kvcache_space_bytes`` is derived from the
          ``VLLM_CPU_KVCACHE_SPACE`` setting (multiplied by ``GiB_bytes``;
          a value of 0 is treated as "not set" and falls back to 4).
        * fp16 is replaced by bf16 when chunked prefill or prefix caching
          is enabled.
        * any distributed executor backend other than ``"mp"`` is replaced
          by ``"mp"``.
        * ``parallel_config.worker_cls`` is resolved when it is ``"auto"``.

        Several environment variables are then set for the CPU executor.

        Cache-related updates are skipped when ``vllm_config.cache_config``
        is ``None``.

        Raises:
            RuntimeError: if ``VLLM_CPU_KVCACHE_SPACE`` is negative.
        """
        import vllm.envs as envs
        from vllm.utils import GiB_bytes
        model_config = vllm_config.model_config
        # Reminder: Please update docs/source/features/compatibility_matrix.md
        # if the feature combo becomes valid.
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on CPU, fallback to the eager "
                "mode.")
            model_config.enforce_eager = True

        cache_config = vllm_config.cache_config

        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE

        # Reject an invalid setting up front, even without a cache config.
        if kv_cache_space < 0:
            raise RuntimeError(
                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a positive integer value.")

        # The block-size default above already tolerates a missing cache
        # config; apply the same tolerance here instead of failing with an
        # AttributeError.
        if cache_config is not None:
            if kv_cache_space == 0:
                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
                logger.warning(
                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
                    "for CPU backend is not set, using 4 by default.")
            else:
                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa

        scheduler_config = vllm_config.scheduler_config
        # A missing cache config counts as "prefix caching disabled".
        if ((scheduler_config.chunked_prefill_enabled
             or (cache_config is not None
                 and cache_config.enable_prefix_caching))
                and model_config.dtype == torch.half):
            logger.warning("Chunked-prefill on the CPU backend only does not"
                           " support fp16 for now, cast to bf16.")
            model_config.dtype = torch.bfloat16

        parallel_config = vllm_config.parallel_config
        if (parallel_config.distributed_executor_backend is not None
                and parallel_config.distributed_executor_backend != "mp"):
            logger.warning(("%s is not supported on CPU, fallback to mp "
                            "distributed executor backend."),
                           parallel_config.distributed_executor_backend)
            parallel_config.distributed_executor_backend = "mp"
        if parallel_config.worker_cls == "auto":
            if vllm_config.speculative_config:
                # With speculative decoding, the spec-decode worker factory
                # is the worker class and CPUWorker is recorded separately
                # in ``sd_worker_cls``.
                parallel_config.worker_cls = \
                    "vllm.spec_decode.spec_decode_worker.create_spec_worker"
                parallel_config.sd_worker_cls = \
                    "vllm.worker.cpu_worker.CPUWorker"
            else:
                parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"

        assert vllm_config.device_config.device_type == "cpu"

        #
        # Environment variables for CPU executor
        #

        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

        # Intel OpenMP settings, applied only when libiomp5 is preloaded.
        ld_preload_str = os.getenv("LD_PRELOAD", "")
        if "libiomp5.so" in ld_preload_str:
            # The time(milliseconds) that a thread should wait after
            # completing the execution of a parallel region, before sleeping.
            os.environ['KMP_BLOCKTIME'] = "1"
            # Prevents the CPU to run into low performance state
            os.environ['KMP_TPAUSE'] = "0"
            # Provides fine granularity parallelism
            os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
            os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
            os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"

        # To hint IPEX uses shared memory based AllReduce
        os.environ["LOCAL_WORLD_SIZE"] = str(
            parallel_config.tensor_parallel_size)

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        """Return ``False`` and log a warning on every call."""
        logger.warning("Pin memory is not supported on CPU.")
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """Return the dotted path of the CPU Punica wrapper class."""
        return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"
[Misc] Add SPDX-License-Identifier headers to python source files (#12628) - Add SPDX license headers to python source files - Check for SPDX headers using pre-commit commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745 Author: Russell Bryant <rbryant@redhat.com> Date: Fri Jan 31 14:18:24 2025 -0500 Add SPDX license headers to python source files This commit adds SPDX license headers to python source files as recommended to the project by the Linux Foundation. These headers provide a concise way that is both human and machine readable for communicating license information for each source file. It helps avoid any ambiguity about the license of the code and can also be easily used by tools to help manage license compliance. The Linux Foundation runs license scans against the codebase to help ensure we are in compliance with the licenses of the code we use, including dependencies. Having these headers in place helps that tool do its job. More information can be found on the SPDX site: - https://spdx.dev/learn/handling-license-info/ Signed-off-by: Russell Bryant <rbryant@redhat.com> commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea Author: Russell Bryant <rbryant@redhat.com> Date: Fri Jan 31 14:36:32 2025 -0500 Check for SPDX headers using pre-commit Signed-off-by: Russell Bryant <rbryant@redhat.com> --------- Signed-off-by: Russell Bryant <rbryant@redhat.com> 2025-02-02 14:58:18 -05:00			`# SPDX-License-Identifier: Apache-2.0`

[core] platform agnostic executor via collective_rpc (#11256) Signed-off-by: youkaichao <youkaichao@gmail.com> 2025-01-15 13:45:21 +08:00			`import os`
[Platform] Move `async output` check to platform (#10768) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2024-12-10 01:24:46 +08:00			`from typing import TYPE_CHECKING, Optional`
[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00
[CI/Build] Add test decorator for minimum GPU memory (#8925) 2024-09-29 10:50:51 +08:00			`import psutil`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`import torch`

[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00			`from vllm.logger import init_logger`

[Platform][Refactor] Extract func `get_default_attn_backend` to `Platform` (#10358) Signed-off-by: Mengqing Cao <cmq0113@163.com> 2024-11-19 11:22:26 +08:00			`from .interface import Platform, PlatformEnum, _Backend`

			`logger = init_logger(__name__)`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00
[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00			`if TYPE_CHECKING:`
			`from vllm.config import VllmConfig`
			`else:`
			`VllmConfig = None`

			`logger = init_logger(__name__)`

[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00
			`class CpuPlatform(Platform):`
			`_enum = PlatformEnum.CPU`
[platform] Add verify_quantization in platform. (#10757) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2024-11-29 23:22:21 +08:00			`device_name: str = "cpu"`
[Platforms] Add `device_type` in `Platform` (#10508) Signed-off-by: MengqingCao <cmq0113@163.com> 2024-11-21 12:44:20 +08:00			`device_type: str = "cpu"`
[torch.compile] support all attention backends (#10558) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-22 14:04:42 -08:00			`dispatch_key: str = "CPU"`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00
[CI/Build] Avoid CUDA initialization (#8534) 2024-09-18 18:38:11 +08:00			`@classmethod`
			`def get_device_name(cls, device_id: int = 0) -> str:`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`return "cpu"`

[Platform][Refactor] Extract func `get_default_attn_backend` to `Platform` (#10358) Signed-off-by: Mengqing Cao <cmq0113@163.com> 2024-11-19 11:22:26 +08:00			`@classmethod`
[platform] Allow platform specify attention backend (#11609) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Mengqing Cao <cmq0113@163.com> Co-authored-by: Mengqing Cao <cmq0113@163.com> 2025-01-09 21:46:50 +08:00			`def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,`
			`dtype: torch.dtype, kv_cache_dtype: Optional[str],`
[Attention] MLA decode optimizations (#12528) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: simon-mo <simon.mo@hey.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Co-authored-by: simon-mo <xmo@berkeley.edu> 2025-01-31 02:49:37 -05:00			`block_size: int, use_v1: bool,`
			`use_mla: bool) -> str:`
Check if selected backend is None in get_attn_backend_cls() (#12975) Signed-off-by: Yuan Tang <terrytangyuan@gmail.com> 2025-02-09 22:45:07 -05:00			`if selected_backend and selected_backend != _Backend.TORCH_SDPA:`
[Platform][Refactor] Extract func `get_default_attn_backend` to `Platform` (#10358) Signed-off-by: Mengqing Cao <cmq0113@163.com> 2024-11-19 11:22:26 +08:00			`logger.info("Cannot use %s backend on CPU.", selected_backend)`
[platform] Allow platform specify attention backend (#11609) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Mengqing Cao <cmq0113@163.com> Co-authored-by: Mengqing Cao <cmq0113@163.com> 2025-01-09 21:46:50 +08:00			`logger.info("Using Torch SDPA backend.")`
			`return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"`
[Platform][Refactor] Extract func `get_default_attn_backend` to `Platform` (#10358) Signed-off-by: Mengqing Cao <cmq0113@163.com> 2024-11-19 11:22:26 +08:00
[CI/Build] Add test decorator for minimum GPU memory (#8925) 2024-09-29 10:50:51 +08:00			`@classmethod`
			`def get_device_total_memory(cls, device_id: int = 0) -> int:`
			`return psutil.virtual_memory().total`

[Platform] Move `async output` check to platform (#10768) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2024-12-10 01:24:46 +08:00			`@classmethod`
			`def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:`
			`return False`

[CI/Build] Avoid CUDA initialization (#8534) 2024-09-18 18:38:11 +08:00			`@classmethod`
			`def inference_mode(cls):`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`return torch.no_grad()`
[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00
			`@classmethod`
			`def check_and_update_config(cls, vllm_config: VllmConfig) -> None:`
			`import vllm.envs as envs`
			`from vllm.utils import GiB_bytes`
			`model_config = vllm_config.model_config`
[Doc][2/N] Reorganize Models and Usage sections (#11755) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2025-01-06 21:40:31 +08:00			`# Reminder: Please update docs/source/features/compatibility_matrix.md`
[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00			`# If the feature combo become valid`
			`if not model_config.enforce_eager:`
			`logger.warning(`
			`"CUDA graph is not supported on CPU, fallback to the eager "`
			`"mode.")`
			`model_config.enforce_eager = True`

			`cache_config = vllm_config.cache_config`

[Platform] platform agnostic for EngineArgs initialization (#11225) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2024-12-17 14:11:06 +08:00			`if cache_config and cache_config.block_size is None:`
			`cache_config.block_size = 16`

[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00			`kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE`

			`if kv_cache_space >= 0:`
			`if kv_cache_space == 0:`
			`cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore`
			`logger.warning(`
			`"Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "`
			`"for CPU backend is not set, using 4 by default.")`
			`else:`
			`cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa`
			`else:`
			`raise RuntimeError(`
			`"Invalid environment variable VLLM_CPU_KVCACHE_SPACE"`
			`f" {kv_cache_space}, expect a positive integer value.")`

			`scheduler_config = vllm_config.scheduler_config`
[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (#10355) Signed-off-by: jiang1.li <jiang1.li@intel.com> 2024-11-20 18:57:39 +08:00			`if ((scheduler_config.chunked_prefill_enabled`
			`or cache_config.enable_prefix_caching)`
			`and model_config.dtype == torch.half):`
			`logger.warning("Chunked-prefill on the CPU backend only does not"`
			`" support fp16 for now, cast to bf16.")`
			`model_config.dtype = torch.bfloat16`
[platforms] refactor cpu code (#10402) Signed-off-by: youkaichao <youkaichao@gmail.com> 2024-11-16 23:14:23 -08:00
			`parallel_config = vllm_config.parallel_config`
			`if (parallel_config.distributed_executor_backend is not None`
			`and parallel_config.distributed_executor_backend != "mp"):`
			`logger.warning(("%s is not supported on CPU, fallback to mp "`
			`"distributed executor backend."),`
			`parallel_config.distributed_executor_backend)`
			`parallel_config.distributed_executor_backend = "mp"`
[platforms] absorb worker cls difference into platforms folder (#10555) Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Nick Hill <nhill@redhat.com> 2024-11-21 21:00:32 -08:00			`if parallel_config.worker_cls == "auto":`
Remove hard-dependencies of Speculative decode to CUDA workers (#10587) Signed-off-by: Chendi Xue <chendi.xue@intel.com> 2024-11-26 19:57:11 -06:00			`if vllm_config.speculative_config:`
			`parallel_config.worker_cls = \`
			`"vllm.spec_decode.spec_decode_worker.create_spec_worker"`
			`parallel_config.sd_worker_cls = \`
			`"vllm.worker.cpu_worker.CPUWorker"`
			`else:`
			`parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"`
[Refactor]A simple device-related refactor (#11163) Signed-off-by: noemotiovon <noemotiovon@gmail.com> Co-authored-by: noemotiovon <noemotiovon@gmail.com> 2024-12-13 21:39:00 +08:00
[core] platform agnostic executor via collective_rpc (#11256) Signed-off-by: youkaichao <youkaichao@gmail.com> 2025-01-15 13:45:21 +08:00			`assert vllm_config.device_config.device_type == "cpu"`

			`#`
			`# Environment variables for CPU executor`
			`#`

			`# Disable torch async compiling which won't work with daemonic processes`
			`os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"`

			`# Intel OpenMP setting`
			`ld_prealod_str = os.getenv("LD_PRELOAD", "")`
			`if "libiomp5.so" in ld_prealod_str:`
			`# The time(milliseconds) that a thread should wait after`
			`# completing the execution of a parallel region, before sleeping.`
			`os.environ['KMP_BLOCKTIME'] = "1"`
			`# Prevents the CPU to run into low performance state`
			`os.environ['KMP_TPAUSE'] = "0"`
			`# Provides fine granularity parallelism`
			`os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"`
			`os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"`
			`os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"`

			`# To hint IPEX uses shared memory based AllReduce`
			`os.environ["LOCAL_WORLD_SIZE"] = str(`
			`vllm_config.parallel_config.tensor_parallel_size)`

[Refactor]A simple device-related refactor (#11163) Signed-off-by: noemotiovon <noemotiovon@gmail.com> Co-authored-by: noemotiovon <noemotiovon@gmail.com> 2024-12-13 21:39:00 +08:00			`@classmethod`
			`def is_pin_memory_available(cls) -> bool:`
			`logger.warning("Pin memory is not supported on CPU.")`
			`return False`
[Platform] Move get_punica_wrapper() function to Platform (#11516) Signed-off-by: Shanshan Shen <467638484@qq.com> 2025-01-13 21:12:10 +08:00
			`@classmethod`
			`def get_punica_wrapper(cls) -> str:`
			`return "vllm.lora.punica_wrapper.punica_cpu.PunicaWrapperCPU"`