diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index ebeefdf4..7376f930 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -471,7 +471,10 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
-
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
         if logits_soft_cap is None:
             # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
             self.logits_soft_cap = 0.0
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 4ab99acb..a6a00040 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1002,6 +1002,7 @@ direct_register_custom_op(
     op_func=inplace_fused_experts,
     mutates_args=["hidden_states"],
     fake_impl=inplace_fused_experts_fake,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
 )
 
 
@@ -1060,6 +1061,7 @@ direct_register_custom_op(
     op_func=outplace_fused_experts,
     mutates_args=[],
     fake_impl=outplace_fused_experts_fake,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
 )
 
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 46f01638..1645565a 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -40,7 +40,7 @@ from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
 from types import MappingProxyType
 from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple,
-                    Optional, Type, TypeVar, Union, cast, overload)
+                    Optional, Tuple, Type, TypeVar, Union, cast, overload)
 from uuid import uuid4
 
 import cachetools
@@ -1935,12 +1935,13 @@ vllm_lib = Library("vllm", "FRAGMENT")  # noqa
 
 
 def direct_register_custom_op(
-    op_name: str,
-    op_func: Callable,
-    mutates_args: list[str],
-    fake_impl: Optional[Callable] = None,
-    target_lib: Optional[Library] = None,
-    dispatch_key: str = "CUDA",
+        op_name: str,
+        op_func: Callable,
+        mutates_args: list[str],
+        fake_impl: Optional[Callable] = None,
+        target_lib: Optional[Library] = None,
+        dispatch_key: str = "CUDA",
+        tags: Tuple[torch.Tag, ...] = (),
 ):
     """
     `torch.library.custom_op` can have significant overhead because it
@@ -1979,7 +1980,7 @@ def direct_register_custom_op(
     import torch._custom_op.impl
     schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
     my_lib = target_lib or vllm_lib
-    my_lib.define(op_name + schema_str)
+    my_lib.define(op_name + schema_str, tags=tags)
     my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
     if fake_impl is not None:
         my_lib._register_fake(op_name, fake_impl)
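
For reference, a minimal sketch of how a caller could pass the new `tags` parameter added to `direct_register_custom_op` above; the op name and function bodies here are hypothetical and not part of this diff:

# Hypothetical example: registering a demo op with the same
# needs_fixed_stride_order tag this diff applies to the fused MoE ops.
import torch

from vllm.utils import direct_register_custom_op


def scaled_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Real (CUDA-dispatched) implementation of the demo op.
    return x + 2.0 * y


def scaled_add_fake(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Fake (meta) implementation used during tracing/compilation.
    return torch.empty_like(x)


direct_register_custom_op(
    op_name="scaled_add_demo",
    op_func=scaled_add,
    mutates_args=[],
    fake_impl=scaled_add_fake,
    # The tag tells torch.compile not to reorder the strides of this op's
    # inputs, which is why it is attached to the fused MoE ops in this diff.
    tags=(torch.Tag.needs_fixed_stride_order, ),
)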