151 lines
4.3 KiB
Python
151 lines
4.3 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import enum
|
|
import time
|
|
from typing import Any, List, Optional, Union
|
|
|
|
import msgspec
|
|
|
|
from vllm.lora.request import LoRARequest
|
|
from vllm.multimodal import MultiModalKwargs
|
|
from vllm.multimodal.inputs import PlaceholderRange
|
|
from vllm.sampling_params import SamplingParams
|
|
from vllm.v1.metrics.stats import SchedulerStats
|
|
from vllm.v1.outputs import LogprobsLists, LogprobsTensors
|
|
|
|
# Possible values of RequestOutput.finish_reason; they therefore form part
# of the external API.
# FinishReason.__str__ looks a member up in this tuple by its integer value,
# so the position of each entry must match the corresponding member value.
FINISH_REASON_STRINGS = (
    "stop",
    "length",
    "abort",
)
|
|
|
|
|
|
class FinishReason(enum.IntEnum):
    """
    Why a request finished: stop, length, or abort.

    Members are ints rather than strings for more compact serialization;
    str() maps a member to its entry in FINISH_REASON_STRINGS.

    stop - a stop string was emitted
    length - max_tokens was consumed, or max_model_len was reached
    abort - aborted for another reason
    """

    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self) -> str:
        # The member's integer value doubles as the index into the tuple of
        # externally visible reason strings.
        return FINISH_REASON_STRINGS[int(self)]
|
|
|
|
|
|
class EngineCoreRequest(
    msgspec.Struct,
    array_like=True,  # type: ignore[call-arg]
    omit_defaults=True,  # type: ignore[call-arg]
    gc=False,  # type: ignore[call-arg]
):
    # Declared with array_like=True, so instances are encoded as arrays in
    # field-declaration order; keep the order of the fields below stable.

    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
    # but that object currently does not play well with msgspec
    # due to the circular imports and typing we have in data.py.

    request_id: str

    # NOTE(ywang96): the original text prompt is needed when a request is
    # added to the Detokenizer, but it is set to None when the request is
    # added to the EngineCoreClient.
    prompt: Optional[str]
    prompt_token_ids: List[int]

    # Multimodal data accompanying the prompt; each of these may be None.
    mm_inputs: Optional[List[Optional[MultiModalKwargs]]]
    mm_hashes: Optional[List[str]]
    mm_placeholders: Optional[List[PlaceholderRange]]

    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]
|
|
|
|
|
|
class EngineCoreEventType(enum.IntEnum):
    """The kinds of event that can be recorded for an engine core request."""

    QUEUED = 1
    SCHEDULED = 2
    PREEMPTED = 3
|
|
|
|
|
|
class EngineCoreEvent(msgspec.Struct):
    """A timestamped engine core event associated with a request.

    The timestamp is a monotonic timestamp and is used by the engine
    frontend to calculate intervals between engine core events. These
    timestamps should not be compared with timestamps from other processes.
    """

    type: EngineCoreEventType
    timestamp: float

    @classmethod
    def new_event(
        cls,
        event_type: EngineCoreEventType,
        timestamp: Optional[float] = None,
    ) -> "EngineCoreEvent":
        """Create an event of the given type.

        Args:
            event_type: The kind of event being recorded.
            timestamp: Timestamp to attach to the event. When None, the
                current value of time.monotonic() is used.

        Returns:
            The newly constructed event.
        """
        if timestamp is None:
            timestamp = time.monotonic()
        return cls(event_type, timestamp)
|
|
|
|
|
|
class EngineCoreOutput(
    msgspec.Struct,
    array_like=True,  # type: ignore[call-arg]
    omit_defaults=True,  # type: ignore[call-arg]
    gc=False,  # type: ignore[call-arg]
):
    # Declared with array_like=True, so instances are encoded as arrays in
    # field-declaration order; keep the order of the fields below stable.

    request_id: str
    new_token_ids: List[int]

    # Log-probability payloads; both default to None.
    new_logprobs: Optional[LogprobsLists] = None
    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None

    # finish_reason stays None until the request has finished (see
    # `finished` below).
    finish_reason: Optional[FinishReason] = None
    stop_reason: Union[int, str, None] = None
    events: Optional[List[EngineCoreEvent]] = None

    @property
    def finished(self) -> bool:
        """Whether a finish reason has been set on this output."""
        return self.finish_reason is not None
|
|
|
|
|
|
class UtilityOutput(
    msgspec.Struct,
    array_like=True,  # type: ignore[call-arg]
    gc=False,  # type: ignore[call-arg]
):
    # Declared with array_like=True, so instances are encoded as arrays in
    # field-declaration order; keep the order of the fields below stable.

    call_id: int

    # A non-None failure_message implies the call failed; in that case
    # result should be None.
    failure_message: Optional[str] = None
    result: Any = None
|
|
|
|
|
|
class EngineCoreOutputs(
    msgspec.Struct,
    array_like=True,  # type: ignore[call-arg]
    omit_defaults=True,  # type: ignore[call-arg]
    gc=False,  # type: ignore[call-arg]
):
    # NOTE(Nick): We could consider ways to make this more compact,
    # e.g. a columnwise layout.

    # [num_reqs]
    outputs: List[EngineCoreOutput] = []
    scheduler_stats: Optional[SchedulerStats] = None
    # 0.0 acts as the "not set" sentinel; see __post_init__.
    timestamp: float = 0.0

    utility_output: Optional[UtilityOutput] = None

    def __post_init__(self) -> None:
        """Stamp the batch with time.monotonic() if no timestamp was given."""
        if self.timestamp != 0.0:
            # An explicit timestamp was supplied; keep it.
            return
        self.timestamp = time.monotonic()
|
|
|
|
|
|
class EngineCoreRequestType(enum.Enum):
    """
    Request types, defined as single-byte strings so that they can be sent
    over sockets without a separate encoding step.
    """

    ADD = b'\x00'
    ABORT = b'\x01'
    UTILITY = b'\x02'
|