diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 702d4b0b..ad719808 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -4,7 +4,7 @@
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
 
 # Dependencies for NVIDIA GPUs
-ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
+ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
 torch==2.6.0
 torchaudio==2.6.0
 # These must be updated alongside torch
diff --git a/requirements/test.in b/requirements/test.in
index 5c59bbd1..3df5e32c 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
 peft
 pqdm
-ray[cgraph]>=2.43.0 # Ray Compiled Graph, required by pipeline parallelism tests
+ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
diff --git a/vllm/config.py b/vllm/config.py
index 2e9325c2..62800afc 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -313,7 +313,7 @@ class ModelConfig:
             raise ValueError(
                 "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
                 "module was not found."
-                "See https://github.com/vllm-project/vllm/blob/main/Dockerfile"
+                "See https://github.com/vllm-project/vllm/blob/main/Dockerfile "
                 "for instructions on how to install it.")
 
         # The tokenizer version is consistent with the model version by default.
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 364555b3..784ea35b 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1686,8 +1686,11 @@ class EngineArgs:
         if self.enable_lora and _warn_or_fallback("LORA"):
             return False
 
-        # PP is supported on V1, but off by default for now.
-        if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"):
+        # PP is supported on V1 with Ray distributed executor,
+        # but off for MP distributed executor for now.
+        if (self.pipeline_parallel_size > 1
+                and self.distributed_executor_backend == "mp"
+                and _warn_or_fallback("PP (MP distributed executor)")):
             return False
 
         # ngram is supported on V1, but off by default for now.
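
Side note for reviewers: the sketch below is illustrative only and not part of the diff. It uses the `packaging` library to show which Ray releases the tightened specifier admits, plus a hypothetical helper `pp_forces_v0_fallback` (not a vLLM API) that mirrors the updated `EngineArgs` condition, under which pipeline parallelism on V1 falls back only when the multiprocessing executor is selected.

```python
# Illustrative sketch only -- not part of this diff.
from typing import Optional

from packaging.specifiers import SpecifierSet

# The tightened pin from requirements/cuda.txt and requirements/test.in:
# accept 2.43.0 and newer, but skip every 2.44.x release of Ray.
ray_pin = SpecifierSet(">=2.43.0,!=2.44.*")
assert "2.43.0" in ray_pin
assert "2.44.1" not in ray_pin  # excluded by !=2.44.*
assert "2.45.0" in ray_pin      # later releases are accepted again


def pp_forces_v0_fallback(pipeline_parallel_size: int,
                          distributed_executor_backend: Optional[str]) -> bool:
    """Hypothetical mirror of the updated EngineArgs check: pipeline
    parallelism (PP > 1) on V1 triggers the fallback only when the
    multiprocessing ("mp") executor is selected; the Ray executor path
    stays on V1."""
    return (pipeline_parallel_size > 1
            and distributed_executor_backend == "mp")


assert pp_forces_v0_fallback(2, "mp")        # falls back, as before
assert not pp_forces_v0_fallback(2, "ray")   # new: Ray + PP runs on V1
assert not pp_forces_v0_fallback(1, "mp")    # no PP, no fallback
```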