From 943e72ca56974b4d8b5a141182e717d2abd3a819 Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Mon, 20 May 2024 13:29:28 -0500
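Subject: [PATCH] [Build/CI] Enabling AMD Entrypoints Test (#4834)

Enable the Entrypoints Test step for AMD in the Buildkite pipeline
(mirror_hardwares: [amd]) and adjust the ROCm image and tests to match:

- requirements-rocm.txt: require ray >= 2.10.0 (was == 2.9.3) and add
  pytest-asyncio.
- Dockerfile.rocm: drop the separate ray[all]==2.9.3 install, set
  RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 as a workaround for
  ray >= 2.10.0, set VLLM_NCCL_SO_PATH to /opt/rocm/lib/librccl.so, and
  copy the built _punica_C extension into vllm/.
- tests/spec_decode/e2e/conftest.py: import pynvml only when is_hip()
  is false.

Co-authored-by: Alexey Kondratiev <alexey.kondratiev@amd.com>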
---
 .buildkite/test-pipeline.yaml     | 3 ++-
 Dockerfile.rocm                   | 8 ++++++--
 requirements-rocm.txt             | 3 ++-
 tests/spec_decode/e2e/conftest.py | 8 ++++++--
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6f5c46e2..def8a460 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -60,7 +60,8 @@ steps:
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
-  #mirror_hardwares: [amd]
+  # Also run this step on AMD hardware.
+  mirror_hardwares: [amd]
   commands:
   # these tests have to be separated, because each one will allocate all posible GPU memory
   - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index eefad79e..9bfe8446 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -92,19 +92,23 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
 WORKDIR /vllm-workspace
 COPY . .
 
+#RUN python3 -m pip install pynvml # to be removed eventually
 RUN python3 -m pip install --upgrade pip numba
 
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
+# Workaround for ray >= 2.10.0, the minimum now required by requirements-rocm.txt
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+# Point VLLM_NCCL_SO_PATH at the RCCL library under /opt/rocm.
+ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
     && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
     && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cd ..
 
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
 
 CMD ["/bin/bash"]
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 903845b6..cc42839a 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -2,4 +2,5 @@
 -r requirements-common.txt
 
 # Dependencies for AMD GPUs
-ray == 2.9.3
+ray >= 2.10.0
+pytest-asyncio
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index da8b9271..7c5840ba 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -6,8 +6,12 @@ from typing import Dict, List, Optional, Tuple, Union
 import pytest
 import ray
 import torch
-from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
-                    nvmlInit)
+
+from vllm.utils import is_hip
+
+if not is_hip():  # on HIP the pynvml names below stay undefined
+    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
+                        nvmlInit)
 
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs