diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index f629b3ca..5fc86ab0 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -39,12 +39,13 @@ Feature x Feature - :abbr:`prmpt adptr (Prompt Adapter)` - :ref:`SD ` - CUDA graph + - :abbr:`emd (Embedding Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` - :abbr:`logP (Logprobs)` - :abbr:`prmpt logP (Prompt Logprobs)` - :abbr:`async output (Async Output Processing)` - multi-step - - :abbr:`MM (Multimodal)` + - :abbr:`mm (Multimodal)` - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` @@ -64,6 +65,7 @@ Feature x Feature - - - + - * - :ref:`APC ` - ✅ - @@ -80,6 +82,7 @@ Feature x Feature - - - + - * - :ref:`LoRA ` - `✗ `__ - ✅ @@ -96,6 +99,7 @@ Feature x Feature - - - + - * - :abbr:`prmpt adptr (Prompt Adapter)` - ✅ - ✅ @@ -112,6 +116,7 @@ Feature x Feature - - - + - * - :ref:`SD ` - ✗ - ✅ @@ -128,6 +133,7 @@ Feature x Feature - - - + - * - CUDA graph - ✅ - ✅ @@ -144,6 +150,24 @@ Feature x Feature - - - + - + * - :abbr:`emd (Embedding Models)` + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - + - + - + - + - + - + - + - + - + - * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✗ - `✗ `__ @@ -151,6 +175,7 @@ Feature x Feature - ✗ - `✗ `__ - ✅ + - ✅ - - - @@ -166,7 +191,8 @@ Feature x Feature - ✅ - ✅ - ✅ - - ✅ + - ✅ + - ✗ - ✅ - - @@ -183,7 +209,8 @@ Feature x Feature - ✅ - `✗ `__ - ✅ - - ✅ + - ✗ + - ✅ - ✅ - - @@ -199,6 +226,7 @@ Feature x Feature - ✅ - ✗ - ✅ + - ✗ - ✗ - ✅ - ✅ @@ -215,6 +243,7 @@ Feature x Feature - ✅ - ✗ - ✅ + - ✗ - ✗ - ✅ - `✗ `__ @@ -224,14 +253,15 @@ Feature x Feature - - - - * - :abbr:`MM (Multimodal)` - - `✗ `__ + * - :abbr:`mm (Multimodal)` + - ✅ - `✗ `__ - `✗ `__ - ? - ? - ✅ - - ✗ + - ✅ + - ✅ - ✅ - ✅ - ✅ @@ -247,6 +277,7 @@ Feature x Feature - ✅ - `✗ `__ - ✅ + - ✗ - ✅ - ✅ - ✅ @@ -263,6 +294,7 @@ Feature x Feature - ✅ - `✗ `__ - ✅ + - ✗ - ✅ - ✅ - ✅ @@ -279,6 +311,7 @@ Feature x Feature - ? - ✅ - ✅ + - ✗ - ? - ✅ - ✅ @@ -353,6 +386,14 @@ Feature x Hardware - ✅ - ✗ - ✅ + * - :abbr:`emd (Embedding Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✅ - ✅ @@ -361,6 +402,14 @@ Feature x Hardware - ✅ - ✅ - ✗ + * - :abbr:`mm (Multimodal)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ * - :abbr:`logP (Logprobs)` - ✅ - ✅ @@ -393,14 +442,6 @@ Feature x Hardware - ✅ - `✗ `__ - ✅ - * - :abbr:`MM (Multimodal)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ * - best-of - ✅ - ✅ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a3ae1889..9288cd22 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1014,7 +1014,8 @@ class EngineArgs: use_spec_decode = self.speculative_model is not None if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora - and not self.enable_prompt_adapter): + and not self.enable_prompt_adapter + and model_config.task != "embedding"): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " @@ -1031,6 +1032,9 @@ class EngineArgs: "errors during the initial memory profiling phase, or result " "in low performance due to small KV cache space. Consider " "setting --max-model-len to a smaller value.", max_model_len) + elif self.enable_chunked_prefill and model_config.task == "embedding": + msg = "Chunked prefill is not supported for embedding models" + raise ValueError(msg) speculative_config = SpeculativeConfig.maybe_create_spec_config( target_model_config=model_config,