[Misc][XPU] Upgrade to PyTorch 2.5 for xpu backend (#9823)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: yan ma <yan.ma@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
2024-11-07 09:29:03 +08:00 · 2024-11-07 09:29:03 +08:00 · d3859f1891
commit d3859f1891
parent 4ab3256644
4 changed files with 43 additions and 46 deletions
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir \
-    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
    -r requirements-xpu.txt

+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
 COPY . .
 ARG GIT_REPO_CHECK
 RUN --mount=type=bind,source=.git,target=.git \
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -8,9 +8,9 @@ packaging
 setuptools-scm>=8
 wheel
 jinja2
-# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-torch == 2.3.1+cxx11.abi
-intel-extension-for-pytorch == 2.3.110+xpu
-oneccl_bind_pt == 2.3.100+xpu
+
+torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl

 triton-xpu == 3.0.0b1
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -74,20 +74,12 @@ class ipex_ops:
        assert kv_cache_dtype == "auto"
        num_heads = out.size(1)
        num_queries_per_tokens = num_heads // num_kv_heads
-        head_mapping = torch.arange(
-            0,
-            num_kv_heads,
-            device=query.device,
-            dtype=torch.int32,
-        ).view(num_kv_heads,
-               1).repeat_interleave(num_queries_per_tokens).flatten()
-        # todo: ipex will refactor namespace
-        torch.xpu.paged_attention_v1(  # type: ignore
+        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
            query.contiguous(),
            key_cache.view_as(value_cache),
            value_cache,
-            head_mapping,
+            num_queries_per_tokens,
            scale,
            block_tables,
            context_lens,
@@ -124,26 +116,15 @@ class ipex_ops:
        assert kv_cache_dtype == "auto"
        num_heads = out.size(1)
        num_queries_per_tokens = num_heads // num_kv_heads
-        head_mapping = torch.arange(
-            0,
-            num_kv_heads,
-            dtype=torch.int32,
-            device=query.device,
-        ).view(num_kv_heads,
-               1).repeat_interleave(num_queries_per_tokens).flatten()
-        # todo: ipex will refactor namespace
-        torch.xpu.paged_attention_v2(  # type: ignore
+        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
-            exp_sum,
-            max_logits,
-            tmp_out,
            query.contiguous(),
            key_cache.view_as(value_cache),
            value_cache,
-            head_mapping,
+            num_queries_per_tokens,
+            scale,
            block_tables,
            context_lens,
-            scale,
            block_size,
            max_context_len,
            alibi_slopes,
@@ -202,6 +183,7 @@ class ipex_ops:
        is_causal: bool,
        return_softmax: bool,
        gen_: torch.Generator,
+        logits_soft_cap: float,
    ) -> None:
        ipex.llm.functional.varlen_attention(query.contiguous(),
                                             key.contiguous(),
@@ -210,7 +192,8 @@ class ipex_ops:
                                             max_seqlen_q, max_seqlen_k,
                                             pdropout, softmax_scale,
                                             zero_tensors, is_causal,
-                                             return_softmax, gen_)
+                                             return_softmax, gen_,
+                                             logits_soft_cap)

    @staticmethod
    def reshape_and_cache(
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -119,8 +119,6 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
        if blocksparse_params is not None:
            raise ValueError(
                "IPEX backend does not support block-sparse attention.")
-        if logits_soft_cap is not None:
-            raise ValueError("IPEX backend does not support logits_soft_cap.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
@@ -135,6 +133,9 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.need_mask = (self.alibi_slopes is not None
                          or self.sliding_window is not None)
+        if logits_soft_cap is None:
+            logits_soft_cap = 0
+        self.logits_soft_cap = logits_soft_cap

        supported_head_sizes = PagedAttention.get_supported_head_sizes()
        if head_size not in supported_head_sizes:
@@ -239,20 +240,23 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
                    (num_tokens, self.num_heads, self.head_size),
                    dtype=query.dtype,
                    device=query.device)
-                ipex_ops.varlen_attention(query,
-                                          key,
-                                          value,
-                                          output,
-                                          attn_metadata.seqlen_q,
-                                          attn_metadata.seqlen_q,
-                                          attn_metadata.max_seqlen,
-                                          attn_metadata.max_seqlen,
-                                          pdropout=0.0,
-                                          softmax_scale=self.scale,
-                                          zero_tensors=False,
-                                          is_causal=True,
-                                          return_softmax=False,
-                                          gen_=None)
+                ipex_ops.varlen_attention(
+                    query,
+                    key,
+                    value,
+                    output,
+                    attn_metadata.seqlen_q,
+                    attn_metadata.seqlen_q,
+                    attn_metadata.max_seqlen,
+                    attn_metadata.max_seqlen,
+                    pdropout=0.0,
+                    softmax_scale=self.scale,
+                    zero_tensors=False,
+                    is_causal=True,
+                    return_softmax=False,
+                    gen_=None,
+                    logits_soft_cap=self.logits_soft_cap,
+                )
            else:
                # prefix-enabled attention
                raise RuntimeError(