Prashant Gupta d11b46f3a5
[bugfix] fix f-string for error (#9295)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
2024-10-11 17:03:48 -07:00

230 lines
7.8 KiB
Python
Raw Blame History

import os
import re
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from huggingface_hub import HfApi, hf_hub_download
# yapf: disable
from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
# yapf: enable
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer)
from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer)
if TYPE_CHECKING:
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@dataclass
class Encoding:
input_ids: List[int]
def find_tokenizer_file(files: List[str]):
file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")
matched_files = [file for file in files if file_pattern.match(file)]
if len(matched_files) > 1:
raise OSError(f"Found {len(matched_files)} files matching the "
f"pattern: {file_pattern}. Make sure only one Mistral "
f"tokenizer is present in {files}.")
elif len(matched_files) == 0:
raise OSError(f"Found {len(matched_files)} files matching the "
f"pattern: {file_pattern}. Make sure that a Mistral "
f"tokenizer is present in {files}.")
return matched_files[0]
class MistralTokenizer:
def __init__(self, tokenizer: PublicMistralTokenizer) -> None:
self.mistral = tokenizer
self.instruct = tokenizer.instruct_tokenizer
tokenizer_ = tokenizer.instruct_tokenizer.tokenizer
if isinstance(tokenizer_, Tekkenizer):
# Make sure special tokens will not raise
tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE
self._vocab = {
token: idx
for idx, token in enumerate(tokenizer_.vocab())
}
elif isinstance(tokenizer_, SentencePieceTokenizer):
self._vocab = {
token: idx
for idx, token in enumerate(tokenizer_.vocab())
}
else:
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
self.tokenizer = tokenizer_
@classmethod
def from_pretrained(cls,
path_or_repo_id: str,
*,
revision: Optional[str] = None) -> "MistralTokenizer":
if not Path(path_or_repo_id).exists():
assert len(path_or_repo_id.split("/")) == 2, (
"You have either provided a non-existent path: "
"{path_or_repo_id} or an invalid HF Hub repo id.")
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
path_or_repo_id, revision)
elif Path(path_or_repo_id).is_dir():
tokenizer_file_name = find_tokenizer_file(
os.listdir(path_or_repo_id))
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
else:
assert Path(
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
return cls(mistral_tokenizer)
@staticmethod
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
revision: Optional[str]) -> str:
api = HfApi()
repo_info = api.model_info(tokenizer_name)
files = [s.rfilename for s in repo_info.siblings]
filename = find_tokenizer_file(files)
tokenizer_file = hf_hub_download(tokenizer_name,
filename=filename,
revision=revision)
return tokenizer_file
# the following attributes are set to fit VLLM's design
@property
def all_special_tokens_extended(self) -> List[str]:
return []
@property
def all_special_tokens(self) -> List[str]:
return []
@property
def all_special_ids(self) -> List[int]:
return []
@property
def bos_token_id(self) -> int:
return self.tokenizer.bos_id
@property
def eos_token_id(self) -> int:
return self.tokenizer.eos_id
@property
def is_fast(self) -> bool:
return True
@property
def vocab_size(self) -> int:
return len(self._vocab)
def __len__(self) -> int:
return self.vocab_size
def __call__(
self,
prompt: str,
add_special_tokens: bool = False,
truncation: bool = False,
max_length: Optional[int] = None,
):
# Mistral Tokenizers should not add special tokens
input_ids = self.encode(prompt)
if truncation:
input_ids = input_ids[:max_length]
return Encoding(input_ids=input_ids)
def get_vocab(self) -> Dict[str, int]:
return self._vocab
def get_added_vocab(self) -> Dict[str, int]:
# Mistral tokenizers have no added vocabulary
return {}
def encode(self, prompt: str) -> List[int]:
# `encode` should only be used for prompt completion
# it should never be used for chat_completion.
# For chat completion use `apply_chat_template`
return self.tokenizer.encode(prompt, bos=True, eos=False)
def apply_chat_template(self,
messages: List["ChatCompletionMessageParam"],
tools: Optional[Dict[str, Any]] = None,
**kwargs) -> List[int]:
request = ChatCompletionRequest(messages=messages,
tools=tools) # type: ignore[type-var]
encoded = self.mistral.encode_chat_completion(request)
# encode-decode to get clean prompt
return encoded.tokens
def convert_tokens_to_string(self, tokens: List[str]) -> str:
if isinstance(self.tokenizer, Tekkenizer):
tokens = [
t for t in tokens
if t not in self.tokenizer._all_special_tokens
]
if any(isinstance(t, bytes) for t in tokens):
# we need to encode and decode all tokens again
shift = self.tokenizer.num_special_tokens
byte_tokens = [
t.encode("utf-8") if not isinstance(t, bytes) else t
for t in tokens
]
ids = [
self.tokenizer._tekken_token2id_nospecial[t] + shift
for t in byte_tokens
]
decoded = self.tokenizer.decode(ids)
else:
decoded = "".join(tokens)
else:
decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type]
return decoded
def decode(self, ids: Union[List[int], int]) -> str:
if isinstance(ids, int):
ids = [ids]
return self.tokenizer.decode(ids)
def convert_ids_to_tokens(
self,
ids: List[int],
skip_special_tokens: bool = True,
) -> List[str]:
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert (
skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers."
assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer)
tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any(t.strip() == "<EFBFBD>" for t in tokens):
# if any stripped decoded token is undefined
# because it's invalid unicode then pass bytes
# See: https://github.com/vllm-project/vllm/pull/8640
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
return tokens