[Bugfix][V1] Fix allowed_token_ids for v1 Sampler (#14169)
Signed-off-by: Lu Fang <lufang@fb.com>
parent ec79b67c77
commit 8d6cd32b7b
@@ -92,10 +92,12 @@ class Processor:
             return
         if params.allowed_token_ids is None:
             return
-        if not all(0 <= tid < self.model_config.vocab_size
-                   for tid in params.allowed_token_ids):
+        if not params.allowed_token_ids:
+            raise ValueError("allowed_token_ids is not None and empty!")
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
             raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
 
     def process_inputs(
         self,
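For illustration, a standalone sketch of the validation order this hunk establishes. The helper below is hypothetical (not vLLM's API), and vocab_size stands in for model_config.get_vocab_size():

def validate_allowed_token_ids(allowed_token_ids, vocab_size):
    # None means the feature is unused; nothing to validate.
    if allowed_token_ids is None:
        return
    # An explicitly empty list would mask out every token, so reject it.
    if not allowed_token_ids:
        raise ValueError("allowed_token_ids is not None and empty!")
    # Every id must index into the vocabulary.
    if not all(0 <= tid < vocab_size for tid in allowed_token_ids):
        raise ValueError("allowed_token_ids contains out-of-vocab token id!")

validate_allowed_token_ids([1, 5, 42], vocab_size=32000)  # passes
# validate_allowed_token_ids([], vocab_size=32000)        # raises: empty list
# validate_allowed_token_ids([99999], vocab_size=32000)   # raises: out of vocab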
@@ -199,6 +199,8 @@ class InputBatch:
         self.logit_bias: list[Optional[dict[int,
                                             float]]] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token allowed,
+        # the value is False. Since we use masked_fill_ to set -inf.
         self.allowed_token_ids_mask: Optional[torch.Tensor] = None
         self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
 
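The NOTE above is the heart of the fix: torch.Tensor.masked_fill_ writes the fill value wherever the mask is True, so a mask meant to keep the allowed tokens must be True at the disallowed positions and False at the allowed ones. A minimal sketch with toy sizes and names mirroring the diff; the actual sampler call site is not part of this commit:

import torch

vocab_size = 8
allowed_token_ids = [2, 5]
logits = torch.randn(vocab_size)

# True everywhere (fill with -inf), then False at the allowed ids (keep logits).
mask = torch.ones(vocab_size, dtype=torch.bool)
mask[allowed_token_ids] = False

logits.masked_fill_(mask, float("-inf"))
# Only the allowed ids keep finite logits.
assert logits.isfinite().nonzero().flatten().tolist() == allowed_token_ids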
@@ -300,6 +302,7 @@ class InputBatch:
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                           self.vocab_size,
                                                           dtype=torch.bool,
@@ -309,8 +312,10 @@ class InputBatch:
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
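Before this hunk, the per-request row started all-False and the allowed ids were set to True, which made masked_fill_ ban exactly the tokens the user asked to keep. A before/after sketch on a toy vocab:

import torch

vocab_size, allowed = 6, [1, 4]
logits = torch.zeros(vocab_size)

# Old (buggy): allowed positions True -> they receive -inf.
buggy = torch.zeros(vocab_size, dtype=torch.bool)
buggy[allowed] = True
print(logits.masked_fill(buggy, float("-inf")))
# tensor([0., -inf, 0., 0., -inf, 0.])  -- the allowed tokens were the ones banned

# Fixed: row reset to all-True, allowed positions False -> everything else gets -inf.
fixed = torch.ones(vocab_size, dtype=torch.bool)
fixed[allowed] = False
print(logits.masked_fill(fixed, float("-inf")))
# tensor([-inf, 0., -inf, -inf, 0., -inf])  -- only the allowed tokens survive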
@@ -359,6 +364,7 @@ class InputBatch:
             self.logit_bias[req_index] = None
             self.has_allowed_token_ids.discard(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is not None:
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         return req_index
 