[Misc][XPU] Upgrade to Pytorch 2.5 for xpu backend (#9823)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: yan ma <yan.ma@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
parent 4ab3256644
commit d3859f1891
@@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
-    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
     -r requirements-xpu.txt
 
+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
 COPY . .
 ARG GIT_REPO_CHECK
 RUN --mount=type=bind,source=.git,target=.git \
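Note on the hunk above (apparently the XPU Dockerfile): it drops the Intel extra index URL from the pip install, since the requirements hunk below pins the wheels by direct URL, and it builds Intel's pti-gpu SDK from a pinned commit, installs it under /usr/local, and exports that path via LD_LIBRARY_PATH. A minimal sketch of a runtime check for that last step follows; it is an illustration, not part of the change.

import os

# Illustrative check only: confirm that the path added by the ENV instruction
# above is visible to the dynamic loader inside the container.
paths = os.environ.get("LD_LIBRARY_PATH", "").split(":")
assert any(p.rstrip("/") == "/usr/local/lib" for p in paths), \
    "LD_LIBRARY_PATH does not include /usr/local/lib/"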
@@ -8,9 +8,9 @@ packaging
 setuptools-scm>=8
 wheel
 jinja2
-# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-torch == 2.3.1+cxx11.abi
-intel-extension-for-pytorch == 2.3.110+xpu
-oneccl_bind_pt == 2.3.100+xpu
+torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
 
 triton-xpu == 3.0.0b1
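This hunk updates requirements-xpu.txt (the file the Dockerfile installs from): the version-pinned packages from Intel's index are replaced with cp310 wheels pinned by direct URL, moving the stack to a PyTorch 2.5 dev build, IPEX 2.5.10, and oneCCL bindings 2.5.0. A hedged post-install sanity check is sketched below; the exact version strings depend on the wheels actually installed.

# Minimal sanity check (a sketch, not part of the change): verify that the
# pinned XPU wheels import together and that torch sees an XPU device.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401  (registers XPU ops)

print(torch.__version__)         # expected: a 2.5.0a0+git... dev build
print(ipex.__version__)          # expected: 2.5.10+git...
print(torch.xpu.is_available())  # True only on a machine with an Intel GPU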
@@ -74,20 +74,12 @@ class ipex_ops:
         assert kv_cache_dtype == "auto"
         num_heads = out.size(1)
         num_queries_per_tokens = num_heads // num_kv_heads
-        head_mapping = torch.arange(
-            0,
-            num_kv_heads,
-            device=query.device,
-            dtype=torch.int32,
-        ).view(num_kv_heads,
-               1).repeat_interleave(num_queries_per_tokens).flatten()
-        # todo: ipex will refactor namespace
-        torch.xpu.paged_attention_v1(  # type: ignore
+        ipex.llm.modules.PagedAttention.single_query_kv_attention(
             out,
             query.contiguous(),
             key_cache.view_as(value_cache),
             value_cache,
-            head_mapping,
+            num_queries_per_tokens,
             scale,
             block_tables,
             context_lens,
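Both paged-attention hunks in ipex_ops (this one and the v2 hunk that follows) delete the manual head_mapping construction: the new ipex.llm.modules.PagedAttention.single_query_kv_attention entry point only takes num_queries_per_tokens and derives the query-to-KV-head mapping itself. For reference, this standalone illustration, with assumed head counts, shows what the removed code computed.

import torch

# Assumed head counts for illustration: 32 query heads, 8 KV heads (GQA).
num_heads, num_kv_heads = 32, 8
num_queries_per_tokens = num_heads // num_kv_heads  # 4 query heads per KV head

# What the deleted code built: query head i reads KV head
# i // num_queries_per_tokens.
head_mapping = torch.arange(
    0,
    num_kv_heads,
    dtype=torch.int32,
).view(num_kv_heads, 1).repeat_interleave(num_queries_per_tokens).flatten()

print(head_mapping)  # tensor([0, 0, 0, 0, 1, 1, 1, 1, ..., 7, 7, 7, 7], dtype=torch.int32)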
@@ -124,26 +116,15 @@ class ipex_ops:
         assert kv_cache_dtype == "auto"
         num_heads = out.size(1)
         num_queries_per_tokens = num_heads // num_kv_heads
-        head_mapping = torch.arange(
-            0,
-            num_kv_heads,
-            dtype=torch.int32,
-            device=query.device,
-        ).view(num_kv_heads,
-               1).repeat_interleave(num_queries_per_tokens).flatten()
-        # todo: ipex will refactor namespace
-        torch.xpu.paged_attention_v2(  # type: ignore
+        ipex.llm.modules.PagedAttention.single_query_kv_attention(
             out,
-            exp_sum,
-            max_logits,
-            tmp_out,
             query.contiguous(),
             key_cache.view_as(value_cache),
             value_cache,
-            head_mapping,
+            num_queries_per_tokens,
+            scale,
             block_tables,
             context_lens,
-            scale,
             block_size,
             max_context_len,
             alibi_slopes,
@@ -202,6 +183,7 @@ class ipex_ops:
         is_causal: bool,
         return_softmax: bool,
         gen_: torch.Generator,
+        logits_soft_cap: float,
     ) -> None:
         ipex.llm.functional.varlen_attention(query.contiguous(),
                                              key.contiguous(),
@@ -210,7 +192,8 @@ class ipex_ops:
                                              max_seqlen_q, max_seqlen_k,
                                              pdropout, softmax_scale,
                                              zero_tensors, is_causal,
-                                             return_softmax, gen_)
+                                             return_softmax, gen_,
+                                             logits_soft_cap)
 
     @staticmethod
     def reshape_and_cache(
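The remaining hunks thread a new logits_soft_cap argument from the vLLM IPEX attention backend down to ipex.llm.functional.varlen_attention. The diff does not show the kernel's implementation; for orientation, attention-logit soft capping is conventionally the tanh-based squashing sketched below, with a cap of 0 meaning "disabled", which matches the logits_soft_cap = 0 default set in the backend hunk further down.

import torch

def soft_cap(attn_logits: torch.Tensor, cap: float) -> torch.Tensor:
    """Conventional logit soft capping (illustration only, not the IPEX kernel).

    Keeps attention logits inside (-cap, cap) while staying smooth; a cap of 0
    is treated as "no capping".
    """
    if cap == 0:
        return attn_logits
    return cap * torch.tanh(attn_logits / cap)

# e.g. soft_cap(torch.tensor([1.0, 100.0]), cap=30.0) -> tensor([~1.0, ~29.9])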
@@ -119,8 +119,6 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         if blocksparse_params is not None:
             raise ValueError(
                 "IPEX backend does not support block-sparse attention.")
-        if logits_soft_cap is not None:
-            raise ValueError("IPEX backend does not support logits_soft_cap.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -135,6 +133,9 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
         self.need_mask = (self.alibi_slopes is not None
                           or self.sliding_window is not None)
+        if logits_soft_cap is None:
+            logits_soft_cap = 0
+        self.logits_soft_cap = logits_soft_cap
 
         supported_head_sizes = PagedAttention.get_supported_head_sizes()
         if head_size not in supported_head_sizes:
@@ -239,7 +240,8 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
                     (num_tokens, self.num_heads, self.head_size),
                     dtype=query.dtype,
                     device=query.device)
-                ipex_ops.varlen_attention(query,
+                ipex_ops.varlen_attention(
+                    query,
                     key,
                     value,
                     output,
@@ -252,7 +254,9 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
                     zero_tensors=False,
                     is_causal=True,
                     return_softmax=False,
-                    gen_=None)
+                    gen_=None,
+                    logits_soft_cap=self.logits_soft_cap,
+                )
             else:
                 # prefix-enabled attention
                 raise RuntimeError(
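Once all of the above lands, a smoke test on an XPU machine might look like the sketch below. The model name, device string, and prompt are assumptions for illustration, not part of this commit.

# Hypothetical smoke test (assumes an Intel GPU is present and that this
# vLLM build accepts device="xpu"; model and prompt are arbitrary).
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", device="xpu", dtype="float16")
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)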