diff --git a/docs/source/usage/spec_decode.rst b/docs/source/usage/spec_decode.rst
index 67e8ede7..f1f1917f 100644
--- a/docs/source/usage/spec_decode.rst
+++ b/docs/source/usage/spec_decode.rst
@@ -8,6 +8,9 @@ Speculative decoding
     not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work
     to optimize it is ongoing and can be followed in `this issue. `_
 
+.. warning::
+    Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
+
 This document shows how to use `Speculative Decoding `_ with vLLM.
 Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
 
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 386877e0..b818ca92 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -247,9 +247,19 @@ def _compare_tp(
     *,
     method: Literal["generate", "encode"],
 ):
-    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
-    multi_node_only, trust_remote_code, tokenizer_mode, \
-        load_format, hf_overrides = test_options
+    (
+        tp_size,
+        pp_size,
+        eager_mode,
+        chunked_prefill,
+    ) = parallel_setup
+    (
+        multi_node_only,
+        trust_remote_code,
+        tokenizer_mode,
+        load_format,
+        hf_overrides,
+    ) = test_options
 
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 5ca26d53..0398f094 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -473,10 +473,11 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.transformer.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index bd2394e7..f9e0443b 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -400,16 +400,17 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 self.lm_head.weight = self.model.embed_tokens.weight
 
             logit_scale = getattr(config, "logit_scale", 1.0)
-
             if hasattr(config, "logits_scaling"):
                 logit_scale /= config.logits_scaling
+
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     scale=logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 31dfb235..733b1bc7 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -540,10 +540,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index c7b4c22b..34cb9981 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -435,9 +435,11 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
+
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index f58710d2..caae0b65 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -443,10 +443,11 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
             self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                     config.vocab_size,
                                                     logit_scale)
-            self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
 
+        self.sampler = get_sampler()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index ced7f538..26898021 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -54,6 +54,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     speculative_config: SpeculativeConfig = vllm_config.speculative_config
     assert speculative_config is not None
 
+    if vllm_config.parallel_config.pipeline_parallel_size > 1:
+        raise NotImplementedError("Speculative decoding is currently "
+                                  "incompatible with pipeline parallelism")
+
     draft_worker_kwargs = kwargs.copy()
 
     kwargs["model_runner_cls"] = TargetModelRunner
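
For reference, a minimal sketch (not part of the patch) of how the new guard in create_spec_worker() surfaces to users. It reuses the facebook/opt-6.7b / facebook/opt-125m pairing from the speculative decoding docs and assumes a 2-GPU setup; depending on the distributed executor, the NotImplementedError raised on the worker side may arrive wrapped in an executor-level error rather than as the bare exception.

    # Illustration only: requesting speculative decoding together with
    # pipeline parallelism should now fail fast when the spec-decode
    # worker is constructed, instead of misbehaving later.
    from vllm import LLM

    try:
        llm = LLM(
            model="facebook/opt-6.7b",
            speculative_model="facebook/opt-125m",
            num_speculative_tokens=5,
            tensor_parallel_size=1,
            pipeline_parallel_size=2,  # PP > 1 trips the guard in create_spec_worker()
        )
    except Exception as exc:
        # Caught broadly on purpose: the underlying cause is the
        # NotImplementedError from create_spec_worker(), but the executor
        # may re-raise it under a different exception type.
        print(f"Rejected as expected: {exc}")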