"""Multi-head attention."""
from typing import Optional
import torch
import torch.nn as nn
from xformers import ops as xops
from cacheflow import attention_ops
from cacheflow import cache_ops
from cacheflow import pos_encoding_ops
from cacheflow.model_executor.input_metadata import InputMetadata
_SUPPORTED_HEAD_SIZES = [64, 80, 96, 128]
class GPTCacheFlowAttention(nn.Module):
"""GPT-style multi-head attention.
This class takes flattened 1D query, key, and value tensors as input. The
input 1D tensors can be split into three parts: the prompt tokens, the
generation tokens, and the paddings.
|<------------------------------------- num_valid_tokens ------------------------------------->|
|<--------------- num_prompt_tokens -------------->|<------- num_generation_tokens (M) ------->|
|<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|<--generation_0-->|...|<--generation_M-1-->|<--padding-->|
The prompts might have different lengths, while the generation tokens always
have length 1. The paddings are appended to make the input length a multiple
of 8, which is desirable for Tensor Cores.
The class does the following:
1. Perform multi_query_kv_attention for the prompts. This operation does
not use the KV cache.
2. Wait for the cache operations (e.g., swap, copy) to finish. The cache
operations are issued by the cache engine before executing the forward
pass of the model, and they are executed asynchronously.
3. Reshape and store the input key and value tensors in the KV cache.
4. Perform single_query_cached_kv_attention for the generation tokens.
This operation reads the previous key and value tensors from the KV
cache.
5. Output a flattened 1D tensor.
"""
def __init__(self, num_heads: int, head_size: int, scale: float) -> None:
super().__init__()
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
self.attn_op = xops.fmha.cutlass.FwOp()
if self.head_size not in _SUPPORTED_HEAD_SIZES:
raise ValueError(f"head_size ({self.head_size}) is not supported. "
f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")
def multi_query_kv_attention(
self,
output: torch.Tensor, # [num_prompt_tokens, num_heads, head_size]
query: torch.Tensor, # [num_prompt_tokens, num_heads, head_size]
key: torch.Tensor, # [num_prompt_tokens, num_heads, head_size]
value: torch.Tensor, # [num_prompt_tokens, num_heads, head_size]
attn_bias: xops.AttentionBias,
) -> torch.Tensor:
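        # attn_bias is assumed to be an xformers attention bias (e.g. a
        # block-diagonal causal mask built by InputMetadata over the
        # concatenated prompts), which is what lets a single batched
        # memory_efficient_attention_forward call cover all prompts without
        # letting them attend to one another.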
# TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
out = xops.memory_efficient_attention_forward(
query.unsqueeze(0),
key.unsqueeze(0),
value.unsqueeze(0),
attn_bias=attn_bias,
p=0.0,
scale=self.scale,
op=self.attn_op,
)
# TODO(woosuk): Unnecessary copy. Optimize.
output.copy_(out.squeeze(0))
return output
def single_query_cached_kv_attention(
self,
output: torch.Tensor, # [num_generation_tokens, num_heads, head_size]
query: torch.Tensor, # [num_generation_tokens, num_heads, head_size]
key_cache: torch.Tensor, # [num_blocks, num_heads, head_size/x, block_size, x]
value_cache: torch.Tensor, # [num_blocks, num_heads, head_size, block_size]
input_metadata: InputMetadata,
) -> None:
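        # NOTE: key_cache packs the head dimension as [head_size/x, x] so the
        # custom kernel can load keys in contiguous chunks of x elements
        # (presumably a vectorization factor tied to the cache dtype), while
        # value_cache keeps the flat [num_blocks, num_heads, head_size,
        # block_size] layout; see the reference sketch at the end of this file.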
block_size = value_cache.shape[3]
attention_ops.single_query_cached_kv_attention(
output,
query,
key_cache,
value_cache,
self.scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
)
def forward(
self,
query: torch.Tensor, # [num_tokens, num_heads * head_size]
key: torch.Tensor, # [num_tokens, num_heads * head_size]
value: torch.Tensor, # [num_tokens, num_heads * head_size]
key_cache: Optional[torch.Tensor], # [num_blocks, num_heads, head_size/x, block_size, x]
value_cache: Optional[torch.Tensor], # [num_blocks, num_heads, head_size, block_size]
input_metadata: InputMetadata,
cache_event: Optional[torch.cuda.Event],
) -> torch.Tensor: # [num_tokens, num_heads * head_size]
# NOTE: The query, key, and value tensors must be sliced from a qkv
# tensor of shape [num_tokens, 3 * num_heads * head_size].
# Reshape the query, key, and value tensors.
query = query.view(-1, self.num_heads, self.head_size)
key = key.view(-1, self.num_heads, self.head_size)
value = value.view(-1, self.num_heads, self.head_size)
# Pre-allocate the output tensor.
output = torch.empty_like(query)
# Compute the attention op for prompts.
num_prompt_tokens = input_metadata.num_prompt_tokens
if num_prompt_tokens > 0:
self.multi_query_kv_attention(
output[:num_prompt_tokens],
query[:num_prompt_tokens],
key[:num_prompt_tokens],
value[:num_prompt_tokens],
input_metadata.attn_bias,
)
# Wait until the cache op is done.
if cache_event is not None:
cache_event.wait()
# Reshape the keys and values and store them in the cache.
# When key_cache and value_cache are not provided, the new key
# and value vectors will not be cached.
num_valid_tokens = input_metadata.num_valid_tokens
if (num_valid_tokens > 0 and key_cache is not None
and value_cache is not None):
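            # slot_mapping is assumed to map each valid token to its flat slot
            # index in the paged cache (block_number * block_size +
            # block_offset), which is where its key and value vectors are
            # written.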
            # NOTE: The token-dimension stride is 3 * num_heads * head_size,
            # not num_heads * head_size, because the key and value are sliced
            # from the fused qkv tensor.
cache_ops.reshape_and_cache(
key[:num_valid_tokens],
value[:num_valid_tokens],
key_cache,
value_cache,
input_metadata.slot_mapping,
)
if input_metadata.num_generation_tokens > 0:
assert key_cache is not None and value_cache is not None, (
"key_cache and value_cache must be provided when "
"generating tokens."
)
# Compute the attention op for generation tokens.
self.single_query_cached_kv_attention(
output[num_prompt_tokens:num_valid_tokens],
query[num_prompt_tokens:num_valid_tokens],
key_cache,
value_cache,
input_metadata)
# Reshape the output tensor.
# NOTE(woosuk): The output tensor may include paddings.
        return output.view(-1, self.num_heads * self.head_size)


class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):
    """Attention with GPT-NeoX style rotary embedding."""

def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
rotary_dim: int,
max_position: int = 8192,
base: int = 10000,
) -> None:
super().__init__(num_heads, head_size, scale)
# Create the cos and sin cache.
inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2) / rotary_dim))
t = torch.arange(max_position).float()
freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
# FIXME(woosuk): This assumes that we configure the default dtype when
# initializing the model. Make it more robust.
torch_dtype = torch.get_default_dtype()
cache = cache.to(torch_dtype)
# Embedding size: [max_position, rotary_dim]
self.register_buffer("cos_sin_cache", cache, persistent=False)
def forward(
self,
positions: torch.Tensor, # [num_tokens]
query: torch.Tensor, # [num_tokens, num_heads * head_size]
key: torch.Tensor, # [num_tokens, num_heads * head_size]
value: torch.Tensor, # [num_tokens, num_heads * head_size]
key_cache: torch.Tensor, # [num_blocks, num_heads, head_size/x, block_size, x]
value_cache: torch.Tensor, # [num_blocks, num_heads, head_size, block_size]
input_metadata: InputMetadata,
cache_event: Optional[torch.cuda.Event],
) -> torch.Tensor: # [num_tokens, num_heads * head_size]
# Apply rotary embedding to the query and key before passing them
# to the attention op.
pos_encoding_ops.rotary_embedding_neox(
positions,
query,
key,
self.head_size,
self.cos_sin_cache,
)
return super().forward(
query,
key,
value,
key_cache,
value_cache,
input_metadata,
cache_event,
)
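

# The reference below is only an illustration of the paged KV-cache layout and
# the block-table indirection used by single_query_cached_kv_attention; it is
# a slow pure-PyTorch sketch, not the custom kernel. It assumes block_tables
# is an integer tensor of shape [num_generation_tokens, max_blocks_per_seq]
# and context_lens is an integer tensor of shape [num_generation_tokens].
def _reference_single_query_cached_kv_attention(
    output: torch.Tensor,       # [num_generation_tokens, num_heads, head_size]
    query: torch.Tensor,        # [num_generation_tokens, num_heads, head_size]
    key_cache: torch.Tensor,    # [num_blocks, num_heads, head_size/x, block_size, x]
    value_cache: torch.Tensor,  # [num_blocks, num_heads, head_size, block_size]
    block_tables: torch.Tensor,
    context_lens: torch.Tensor,
    scale: float,
) -> None:
    num_heads = value_cache.shape[1]
    head_size = value_cache.shape[2]
    block_size = value_cache.shape[3]
    for i in range(query.shape[0]):
        q = query[i].unsqueeze(0)  # [1, num_heads, head_size]
        context_len = int(context_lens[i])
        keys = []
        values = []
        for pos in range(context_len):
            # Translate the logical position into a (block, offset) pair.
            block_number = int(block_tables[i, pos // block_size])
            block_offset = pos % block_size
            # Undo the [head_size/x, x] packing of the key cache.
            k = key_cache[block_number, :, :, block_offset, :]
            keys.append(k.reshape(num_heads, head_size))
            values.append(value_cache[block_number, :, :, block_offset])
        keys = torch.stack(keys, dim=0)      # [context_len, num_heads, head_size]
        values = torch.stack(values, dim=0)  # [context_len, num_heads, head_size]
        # Scaled dot-product attention of the single query token over the
        # cached context, computed independently for each head.
        logits = torch.einsum('qhd,khd->hqk', q * scale, keys)
        probs = torch.softmax(logits, dim=-1)
        out = torch.einsum('hqk,khd->qhd', probs, values)
        output[i].copy_(out.squeeze(0))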