# vllm/tests/kernels/test_pos_encoding.py

from typing import Optional

import pytest
import torch
from allclose_default import get_default_atol, get_default_rtol
from vllm.model_executor.layers.rotary_embedding import get_rope

# Parameter grids fed to the ``pytest.mark.parametrize`` decorators below.
IS_NEOX_STYLE = [True, False]
DTYPES = [torch.float16, torch.bfloat16, torch.float32]
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
# A ``None`` entry means the rotary dim equals the head size.
ROTARY_DIMS = [None, 32]
# The sizes below are arbitrary values picked for testing.
NUM_HEADS = [7, 17]
BATCH_SIZES = [1, 5]
SEQ_LENS = [11, 8192]
SEEDS = [0]
# One device when exactly one GPU is reported, two devices otherwise.
_DEVICE_COUNT = 1 if torch.cuda.device_count() == 1 else 2
CUDA_DEVICES = [f"cuda:{idx}" for idx in range(_DEVICE_COUNT)]


@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_rotary_embedding(
    is_neox_style: bool,
    batch_size: int,
    seq_len: int,
    num_heads: int,
    head_size: int,
    rotary_dim: Optional[int],
    dtype: torch.dtype,
    seed: int,
    device: str,
    max_position: int = 8192,
    base: int = 10000,
) -> None:
    """Check that ``rope.forward`` agrees with the reference ``rope._forward``.

    A rotary embedding layer is built with ``get_rope``, random positions,
    queries and keys are generated, and the outputs of the two code paths are
    compared with dtype-dependent default tolerances.

    Args:
        is_neox_style: Forwarded to ``get_rope``.
        batch_size: Leading dimension of the generated tensors.
        seq_len: Second dimension of the generated tensors.
        num_heads: Number of heads; the last dimension of query/key is
            ``num_heads * head_size``.
        head_size: Size of each head.
        rotary_dim: Rotary dimension; ``None`` means "same as ``head_size``".
        dtype: Dtype of the layer and of the generated query/key tensors.
        seed: Seed for the CPU and (when available) CUDA random generators.
        device: Device on which the layer and inputs are created.
        max_position: Exclusive upper bound for the sampled positions; also
            forwarded to ``get_rope``.
        base: Forwarded to ``get_rope``.
    """
    # ``None`` in ROTARY_DIMS is a placeholder for the full head size.
    if rotary_dim is None:
        rotary_dim = head_size

    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # Scope the default device to this test instead of changing it globally,
    # so the setting does not leak into tests that run afterwards.
    with torch.device(device):
        rope = get_rope(head_size, rotary_dim, max_position, base,
                        is_neox_style)
        rope = rope.to(dtype=dtype)

        positions = torch.randint(0, max_position, (batch_size, seq_len))
        query = torch.randn(batch_size,
                            seq_len,
                            num_heads * head_size,
                            dtype=dtype)
        key = torch.randn_like(query)

        # NOTE(woosuk): The reference implementation should be executed first
        # because the custom kernel is in-place.
        ref_query, ref_key = rope._forward(positions, query, key)
        out_query, out_key = rope.forward(positions, query, key)

    # Compare the results.
    assert torch.allclose(out_query,
                          ref_query,
                          atol=get_default_atol(out_query),
                          rtol=get_default_rtol(out_query))
    assert torch.allclose(out_key,
                          ref_key,
                          atol=get_default_atol(out_key),
                          rtol=get_default_rtol(out_key))
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`from typing import Optional`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`import pytest`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00			`import torch`
[ROCm] Fix some kernels failed unit tests (#2498) 2024-02-05 17:25:36 -05:00			`from allclose_default import get_default_atol, get_default_rtol`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`from vllm.model_executor.layers.rotary_embedding import get_rope`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00
[BugFix] Implement RoPE for GPT-J (#941) 2023-09-06 11:54:33 +09:00			`IS_NEOX_STYLE = [True, False]`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`DTYPES = [torch.half, torch.bfloat16, torch.float]`
			`HEAD_SIZES = [64, 80, 96, 112, 128, 256]`
			`ROTARY_DIMS = [None, 32] # None means rotary dim == head size`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`NUM_HEADS = [7, 17] # Arbitrary values for testing`
			`BATCH_SIZES = [1, 5] # Arbitrary values for testing`
			`SEQ_LENS = [11, 8192] # Arbitrary values for testing`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`SEEDS = [0]`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`CUDA_DEVICES = [`
			`f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)`
			`]`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00
[BugFix] Implement RoPE for GPT-J (#941) 2023-09-06 11:54:33 +09:00			`@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`@pytest.mark.parametrize("batch_size", BATCH_SIZES)`
			`@pytest.mark.parametrize("seq_len", SEQ_LENS)`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`@pytest.mark.parametrize("num_heads", NUM_HEADS)`
			`@pytest.mark.parametrize("head_size", HEAD_SIZES)`
			`@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)`
			`@pytest.mark.parametrize("dtype", DTYPES)`
			`@pytest.mark.parametrize("seed", SEEDS)`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`@pytest.mark.parametrize("device", CUDA_DEVICES)`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00			`@torch.inference_mode()`
[BugFix] Implement RoPE for GPT-J (#941) 2023-09-06 11:54:33 +09:00			`def test_rotary_embedding(`
			`is_neox_style: bool,`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`batch_size: int,`
			`seq_len: int,`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00			`num_heads: int,`
			`head_size: int,`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`rotary_dim: Optional[int],`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00			`dtype: torch.dtype,`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`seed: int,`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`device: str,`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`max_position: int = 8192,`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00			`base: int = 10000,`
			`) -> None:`
Clean up kernel unit tests (#938) 2023-09-06 08:57:38 +09:00			`if rotary_dim is None:`
			`rotary_dim = head_size`
			`torch.random.manual_seed(seed)`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`if torch.cuda.is_available():`
			`torch.cuda.manual_seed(seed)`
			`torch.set_default_device(device)`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`if rotary_dim is None:`
			`rotary_dim = head_size`
			`rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`rope = rope.to(dtype=dtype)`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`positions = torch.randint(0, max_position, (batch_size, seq_len))`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`query = torch.randn(batch_size,`
			`seq_len,`
[Quality] Add code formatter and linter (#326) 2023-07-03 11:31:55 -07:00			`num_heads * head_size,`
Remove hardcoded `device="cuda" ` to support more devices (#2503) Co-authored-by: Jiang Li <jiang1.li@intel.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> 2024-02-02 07:46:39 +08:00			`dtype=dtype)`
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`key = torch.randn_like(query)`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00
Add PyTorch-native implementation of custom layers (#1898) 2023-12-02 21:18:40 -08:00			`# NOTE(woosuk): The reference implementation should be executed first`
			`# because the custom kernel is in-place.`
			`ref_query, ref_key = rope._forward(positions, query, key)`
			`out_query, out_key = rope.forward(positions, query, key)`
Implement custom kernel for LLaMA rotary embedding (#14) 2023-03-30 11:04:21 -07:00			`# Compare the results.`
[ROCm] Fix some kernels failed unit tests (#2498) 2024-02-05 17:25:36 -05:00			`assert torch.allclose(out_query,`
			`ref_query,`
			`atol=get_default_atol(out_query),`
			`rtol=get_default_rtol(out_query))`
			`assert torch.allclose(out_key,`
			`ref_key,`
			`atol=get_default_atol(out_key),`
			`rtol=get_default_rtol(out_key))`