diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 5949858a..92587b40 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -83,6 +83,13 @@ def test_decode_streaming(tokenizer_id, truth, with_prompt,
 
     assert decoded_text == generated
 
+    decoded_text = _run_incremental_decode(
+        tokenizer, [len(tokenizer)],
+        skip_special_tokens=skip_special_tokens,
+        starting_index=starting_index)
+
+    assert decoded_text == ''
+
 
 @pytest.fixture
 def detokenizer(tokenizer_name: str) -> Detokenizer:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3a9e20f4..dec42c63 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -222,14 +222,6 @@ class LLMEngine:
         self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
             self.parallel_config.tokenizer_pool_config, **init_kwargs)
 
-        if len(self.get_tokenizer()) != self.model_config.get_vocab_size():
-            logger.warning(
-                f"The tokenizer's vocabulary size {len(self.get_tokenizer())}"
-                f" does not match the model's vocabulary size "
-                f"{self.model_config.get_vocab_size()}. This might "
-                f"cause an error in decoding. Please change config.json "
-                "to match the tokenizer's vocabulary size.")
-
     def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)
         self.cache_config.verify_with_parallel_config(self.parallel_config)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 47e0275c..9dbd1750 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -68,14 +68,6 @@ class OpenAIServing:
             tokenizer_mode=engine_model_config.tokenizer_mode,
             trust_remote_code=engine_model_config.trust_remote_code)
 
-        if len(self.tokenizer) != engine_model_config.get_vocab_size():
-            logger.warning(
-                f"The tokenizer's vocabulary size {len(self.tokenizer)}"
-                f" does not match the model's vocabulary size "
-                f"{engine_model_config.get_vocab_size()}. This might "
-                f"cause an error in decoding. Please change config.json "
-                "to match the tokenizer's vocabulary size.")
-
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index ad778d19..dad20a56 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -232,9 +232,13 @@ def detokenize_incrementally(
             all_input_ids[:-1], skip_special_tokens=skip_special_tokens)
 
-    # Put new_token_id in a list so skip_special_tokens is respected
-    new_tokens = tokenizer.convert_ids_to_tokens(
-        [new_token_id], skip_special_tokens=skip_special_tokens)
+    # If the new token id is out of bounds, return an empty string.
+    if new_token_id >= len(tokenizer):
+        new_tokens = [""]
+    else:
+        # Put new_token_id in a list so skip_special_tokens is respected
+        new_tokens = tokenizer.convert_ids_to_tokens(
+            [new_token_id], skip_special_tokens=skip_special_tokens)
     output_tokens = prev_tokens + new_tokens
 
     # If this is the first iteration, return all tokens.
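Taken together, the hunks replace the old vocab-size-mismatch warnings with actual handling: a token id at or past len(tokenizer), which can occur when the model's embedding table is padded beyond the tokenizer's vocabulary, is now detokenized as an empty string instead of failing inside convert_ids_to_tokens, and the warnings in llm_engine.py and serving_engine.py become redundant once the decode path tolerates the mismatch. A minimal standalone sketch of the same guard; the tokens_for_new_id helper and the "gpt2" tokenizer are illustrative, not part of the patch:

# Standalone sketch of the guard added to detokenize_incrementally().
# Assumes the `transformers` package; "gpt2" is just an example tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def tokens_for_new_id(new_token_id, skip_special_tokens=False):
    # Ids at or past len(tokenizer) have no vocabulary entry, so map them
    # to empty text instead of asking the tokenizer to convert them.
    if new_token_id >= len(tokenizer):
        return [""]
    # Put new_token_id in a list so skip_special_tokens is respected.
    return tokenizer.convert_ids_to_tokens(
        [new_token_id], skip_special_tokens=skip_special_tokens)

print(tokens_for_new_id(42))              # a normal in-vocab id
print(tokens_for_new_id(len(tokenizer)))  # out of bounds -> [""]

The new test exercises exactly this boundary: [len(tokenizer)] is the smallest out-of-vocabulary id, and the incremental decode is expected to yield ''.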