vllm/tests/kernels/test_attention_selector.py


# SPDX-License-Identifier: Apache-2.0
from unittest.mock import Mock, patch

import pytest
import torch

from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
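
# NOTE: get_attn_backend is called positionally throughout these tests. In
# the selector version exercised here, the signature is assumed to be
#
#     get_attn_backend(head_size, dtype, kv_cache_dtype, block_size,
#                      is_attention_free)
#
# so get_attn_backend(16, torch.float16, None, 16, False) requests a backend
# for head size 16, fp16 activations, the default KV-cache dtype, block size
# 16, and a model that does use attention.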


@pytest.fixture(autouse=True)
def clear_cache():
    """Clear lru cache to ensure each test case runs without caching."""
    _cached_get_attn_backend.cache_clear()
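
# The autouse fixture above matters because _cached_get_attn_backend is an
# lru_cache-wrapped function (cache_clear() only exists on such wrappers):
# without clearing, a backend selected in one test would be returned verbatim
# for the same argument tuple in later tests, masking the selector logic.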


@pytest.mark.parametrize(
    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, device: str, monkeypatch):
    """Test that the attention selector can be set via environment variable.

    Note that we do not test FlashAttn because it is the default backend.
    """
    override_backend_env_variable(monkeypatch, name)

    if device == "cpu":
        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
                                       False)
            assert backend.get_name() == "TORCH_SDPA"
    elif device == "hip":
        with patch("vllm.attention.selector.current_platform",
                   RocmPlatform()):
            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
                                       False)
            assert backend.get_name() == "ROCM_FLASH"
    elif device == "openvino":
        with patch("vllm.attention.selector.current_platform",
                   OpenVinoPlatform()), patch.dict('sys.modules',
                                                   {'openvino': Mock()}):
            backend = get_attn_backend(16, torch.float16, torch.float16, 16,
                                       False)
            assert backend.get_name() == "OPENVINO"
    else:
        if name in ["XFORMERS", "FLASHINFER"]:
            with patch("vllm.attention.selector.current_platform",
                       CudaPlatform()):
                backend = get_attn_backend(16, torch.float16, torch.float16,
                                           16, False)
                assert backend.get_name() == name
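
# For context: override_backend_env_variable is a thin monkeypatch wrapper
# from tests.kernels.utils. A minimal sketch of the assumed behavior, with
# VLLM_ATTENTION_BACKEND as the env var the selector is expected to read:
#
#     def override_backend_env_variable(monkeypatch, name: str) -> None:
#         monkeypatch.setenv("VLLM_ATTENTION_BACKEND", name)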


def test_flash_attn(monkeypatch):
    """Test FlashAttn validation."""
    # TODO: When testing for v1, pipe in `use_v1` as an argument to
    # get_attn_backend

    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)

    # Unsupported CUDA arch
    with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

    # Unsupported data type
    backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
    assert backend.get_name() != STR_FLASH_ATTN_VAL

    # Unsupported kv cache data type
    backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
    assert backend.get_name() != STR_FLASH_ATTN_VAL

    # Unsupported block size
    backend = get_attn_backend(16, torch.float16, None, 8, False)
    assert backend.get_name() != STR_FLASH_ATTN_VAL

    # flash-attn is not installed
    with patch.dict('sys.modules', {'vllm_flash_attn': None}):
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

    # Unsupported head size
    backend = get_attn_backend(17, torch.float16, None, 16, False)
    assert backend.get_name() != STR_FLASH_ATTN_VAL
    # Attention-free models should bypass env and use PlaceholderAttention
    backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
    assert backend.get_name() != STR_FLASH_ATTN_VAL
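
# The fallbacks above track FlashAttention's constraints as assumed by these
# tests: compute capability >= 8.0 (so the patched (7, 5), i.e. Turing, is
# rejected), fp16/bf16 activations only, no fp8 KV cache, block sizes that
# are multiples of 16, and head sizes drawn from a fixed supported list
# (hence 17 is rejected). In every case the selector is expected to degrade
# to another backend rather than raise.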


def test_invalid_env(monkeypatch):
    """Ignore the env variable if an invalid backend name is set."""
    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)

    with patch("vllm.attention.selector.current_platform", CudaPlatform()):
        # The invalid name is ignored and the default backend is selected.
        backend = get_attn_backend(32, torch.float16, None, 16, False)
        assert backend.get_name() == "FLASH_ATTN"

        # When head size == 16 (not supported by FlashAttention), the
        # selector falls back to XFORMERS. Note that both calls use block
        # size 16, so it is the head size that drives the fallback.
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() == "XFORMERS"