2023-12-13 16:56:05 +08:00
|
|
|
--- flash_ori.py 2023-12-13 05:43:31.530752623 +0000
|
|
|
|
+++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000
|
|
|
|
@@ -36,44 +36,44 @@
|
2023-12-08 15:16:52 +08:00
|
|
|
|
|
|
|
FLASH_VERSION = "0.0.0"
|
|
|
|
try:
|
|
|
|
- try:
|
|
|
|
- from ... import _C_flashattention # type: ignore[attr-defined]
|
|
|
|
- from ..._cpp_lib import _build_metadata
|
|
|
|
-
|
|
|
|
- if _build_metadata is not None:
|
|
|
|
- FLASH_VERSION = _build_metadata.flash_version
|
|
|
|
- except ImportError:
|
|
|
|
- import flash_attn
|
|
|
|
- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
|
|
|
|
-
|
|
|
|
- FLASH_VERSION = flash_attn.__version__
|
2023-12-13 16:56:05 +08:00
|
|
|
- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
|
|
|
|
- if (
|
|
|
|
- flash_ver_parsed != (2, 3, 6)
|
|
|
|
- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
|
|
|
|
- ):
|
|
|
|
- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
|
2023-12-08 15:16:52 +08:00
|
|
|
+ #try:
|
|
|
|
+ # from ... import _C_flashattention # type: ignore[attr-defined]
|
|
|
|
+ # from ..._cpp_lib import _build_metadata
|
|
|
|
+
|
|
|
|
+ # if _build_metadata is not None:
|
|
|
|
+ # FLASH_VERSION = _build_metadata.flash_version
|
|
|
|
+ #except ImportError:
|
|
|
|
+ import flash_attn
|
|
|
|
+ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
|
|
|
|
+
|
|
|
|
+ FLASH_VERSION = flash_attn.__version__
|
2023-12-13 16:56:05 +08:00
|
|
|
+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
|
|
|
|
+ # if (
|
|
|
|
+ # flash_ver_parsed != (2, 3, 6)
|
|
|
|
+ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
|
|
|
|
+ # ):
|
|
|
|
+ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
|
2023-12-08 15:16:52 +08:00
|
|
|
|
|
|
|
# create library so that flash-attn goes through the PyTorch Dispatcher
|
|
|
|
- _flash_lib = torch.library.Library("xformers_flash", "DEF")
|
2023-12-13 16:56:05 +08:00
|
|
|
-
|
2023-12-08 15:16:52 +08:00
|
|
|
- _flash_lib.define(
|
|
|
|
- "flash_fwd(Tensor query, Tensor key, Tensor value, "
|
2023-12-13 16:56:05 +08:00
|
|
|
- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
|
2023-12-08 15:16:52 +08:00
|
|
|
- "int max_seqlen_q, int max_seqlen_k, "
|
|
|
|
- "float p, float softmax_scale, "
|
2023-12-13 16:56:05 +08:00
|
|
|
- "bool is_causal, int window_left, "
|
|
|
|
- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
|
2023-12-08 15:16:52 +08:00
|
|
|
- )
|
2023-12-13 16:56:05 +08:00
|
|
|
+ #_flash_lib = torch.library.Library("xformers_flash", "DEF")
|
|
|
|
|
2023-12-08 15:16:52 +08:00
|
|
|
- _flash_lib.define(
|
|
|
|
- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
|
|
|
|
- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
|
|
|
|
- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
|
|
|
|
- "int max_seqlen_q, int max_seqlen_k, "
|
2023-12-13 16:56:05 +08:00
|
|
|
- "float p, float softmax_scale, bool is_causal, "
|
|
|
|
- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
|
2023-12-08 15:16:52 +08:00
|
|
|
- )
|
|
|
|
+ #_flash_lib.define(
|
|
|
|
+ # "flash_fwd(Tensor query, Tensor key, Tensor value, "
|
2023-12-13 16:56:05 +08:00
|
|
|
+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
|
2023-12-08 15:16:52 +08:00
|
|
|
+ # "int max_seqlen_q, int max_seqlen_k, "
|
|
|
|
+ # "float p, float softmax_scale, "
|
2023-12-13 16:56:05 +08:00
|
|
|
+ # "bool is_causal, int window_left, "
|
|
|
|
+ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
|
2023-12-08 15:16:52 +08:00
|
|
|
+ #)
|
|
|
|
+
|
|
|
|
+ #_flash_lib.define(
|
|
|
|
+ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
|
|
|
|
+ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
|
|
|
|
+ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
|
|
|
|
+ # "int max_seqlen_q, int max_seqlen_k, "
|
2023-12-13 16:56:05 +08:00
|
|
|
+ # "float p, float softmax_scale, bool is_causal, "
|
|
|
|
+ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
|
2023-12-08 15:16:52 +08:00
|
|
|
+ #)
|
|
|
|
|
|
|
|
def _flash_fwd(
|
|
|
|
query,
|
2023-12-13 16:56:05 +08:00
|
|
|
@@ -111,8 +111,8 @@
|
2023-12-08 15:16:52 +08:00
|
|
|
p,
|
|
|
|
softmax_scale,
|
|
|
|
is_causal,
|
2023-12-13 16:56:05 +08:00
|
|
|
- window_left, # window_size_left
|
|
|
|
- window_right, # window_size_right
|
|
|
|
+ # window_left, # window_size_left
|
|
|
|
+ # window_right, # window_size_right
|
2023-12-08 15:16:52 +08:00
|
|
|
return_softmax,
|
|
|
|
None, # rng
|
|
|
|
)
|
2023-12-13 16:56:05 +08:00
|
|
|
@@ -134,15 +134,15 @@
|
|
|
|
out,
|
|
|
|
cu_seq_lens_q,
|
|
|
|
cu_seq_lens_k,
|
|
|
|
- seqused_k,
|
|
|
|
+ # seqused_k,
|
|
|
|
max_seq_len_q,
|
|
|
|
max_seq_len_k,
|
|
|
|
p,
|
2023-12-08 15:16:52 +08:00
|
|
|
softmax_scale,
|
|
|
|
False,
|
|
|
|
is_causal,
|
2023-12-13 16:56:05 +08:00
|
|
|
- window_left,
|
|
|
|
- window_right,
|
|
|
|
+ # window_left,
|
|
|
|
+ # window_right,
|
2023-12-08 15:16:52 +08:00
|
|
|
return_softmax,
|
|
|
|
None,
|
|
|
|
)
|
2023-12-13 16:56:05 +08:00
|
|
|
@@ -184,8 +184,8 @@
|
2023-12-08 15:16:52 +08:00
|
|
|
p,
|
|
|
|
softmax_scale,
|
|
|
|
is_causal,
|
2023-12-13 16:56:05 +08:00
|
|
|
- window_left,
|
|
|
|
- window_right,
|
|
|
|
+ # window_left,
|
|
|
|
+ # window_right,
|
2023-12-08 15:16:52 +08:00
|
|
|
None,
|
|
|
|
rng_state,
|
|
|
|
)
|
2023-12-13 16:56:05 +08:00
|
|
|
@@ -208,15 +208,15 @@
|
2023-12-08 15:16:52 +08:00
|
|
|
softmax_scale,
|
|
|
|
False, # zero_tensors
|
|
|
|
is_causal,
|
2023-12-13 16:56:05 +08:00
|
|
|
- window_left,
|
|
|
|
- window_right,
|
|
|
|
+ # window_left,
|
|
|
|
+ # window_right,
|
2023-12-08 15:16:52 +08:00
|
|
|
None,
|
|
|
|
rng_state,
|
|
|
|
)
|
|
|
|
return dq, dk, dv
|
|
|
|
|
|
|
|
- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
|
|
|
|
- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
|
|
|
|
+ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
|
|
|
|
+ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
|
|
|
|
except ImportError:
|
|
|
|
pass
|
|
|
|
|
2023-12-13 16:56:05 +08:00
|
|
|
@@ -400,7 +400,7 @@
|
2023-12-08 15:16:52 +08:00
|
|
|
implementation.
|
|
|
|
"""
|
|
|
|
|
|
|
|
- OPERATOR = get_operator("xformers_flash", "flash_fwd")
|
|
|
|
+ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd")
|
|
|
|
SUPPORTED_DEVICES: Set[str] = {"cuda"}
|
|
|
|
CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
|
|
|
|
SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
|