Fix edge case Mistral tokenizer (#10152)

This commit is contained in:
Patrick von Platen 2024-11-08 16:42:27 +01:00 committed by GitHub
parent b489fc3c91
commit 0535e5fe6c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -72,11 +72,12 @@ class MistralTokenizer:
self.instruct = tokenizer.instruct_tokenizer
tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
if isinstance(tokenizer_, Tekkenizer):
self.is_tekken = isinstance(tokenizer_, Tekkenizer)
self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
if self.is_tekken:
# Make sure special tokens will not raise
tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
elif isinstance(tokenizer_, SentencePieceTokenizer):
elif self.is_spm:
pass
else:
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
@ -218,7 +219,7 @@ class MistralTokenizer:
return encoded.tokens
def convert_tokens_to_string(self, tokens: List[str]) -> str:
if isinstance(self.tokenizer, Tekkenizer):
if self.is_tekken:
tokens = [
t for t in tokens
if t not in self.tokenizer._all_special_tokens
@ -270,21 +271,20 @@ class MistralTokenizer:
skip_special_tokens
), "skip_special_tokens=False is not supported for Mistral tokenizers."
assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)
assert self.is_tekken or self.is_spm, type(self.tokenizer)
if isinstance(self.tokenizer, Tekkenizer):
if self.is_tekken:
# skip special tokens
ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any("<EFBFBD>" in t for t in tokens):
if any("<EFBFBD>" in t for t in tokens) and self.is_tekken:
# if a decoded token contains the replacement character, then the
# token has an incomplete UTF-8 character so we must use bytes
# See: https://github.com/vllm-project/vllm/pull/8640
# https://github.com/vllm-project/vllm/pull/9625
# if underlying tokenizeir is sentencepiece, we just add "<22>"
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
return tokens