Fixes the misuse/mixuse of time.time()/time.monotonic() (#3220)

Signed-off-by: Tao He <sighingnow@gmail.com>
Co-authored-by: simon-mo <simon.mo@hey.com>

This commit is contained in:
parent 03d37f2441
commit 14b8ae02e7
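
For context (not part of the diff): time.time() returns wall-clock seconds since the Unix epoch, which is what any timestamp that is logged or returned to clients must use, while time.monotonic() counts seconds from an arbitrary, process-local reference point and is only meaningful for measuring durations. A minimal sketch of the distinction:

import time

wall = time.time()       # seconds since 1970-01-01 UTC, e.g. ~1.7e9
mono = time.monotonic()  # seconds since an arbitrary start point, e.g. ~8.6e4

# time.monotonic() is the right stopwatch: it never jumps when NTP or
# an operator adjusts the system clock.
start = time.monotonic()
_ = sum(range(10_000))   # stand-in for real work
elapsed = time.monotonic() - start

# Only time.time() produces values that other systems can interpret
# as points in time:
created = int(time.time())  # a valid Unix timestamp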
@@ -160,7 +160,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.monotonic()
+        now = time.time()

         # Join waiting sequences if possible.
         if not self.swapped:
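
The scheduler's `now` is later compared against request arrival timestamps, so it must be read from the same clock that produced them; this commit moves both sides of that comparison to time.time(). A hedged sketch of the failure mode when the clocks are mixed (variable names illustrative, not vLLM's):

import time

arrival_time = time.time()       # stamped when the request arrived
# ... the request waits in the queue ...
now = time.monotonic()           # wrong clock: boot-relative, not epoch-based
queue_wait = now - arrival_time  # hugely negative; not a duration at all

# Reading both sides from time.time(), as this commit does, makes the
# subtraction a real waiting time:
queue_wait = time.time() - arrival_time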
@@ -604,8 +604,7 @@ class AsyncLLMEngine:
             >>> ...
         """
         # Preprocess the request.
-        # This should not be used for logging, as it is monotonic time.
-        arrival_time = time.monotonic()
+        arrival_time = time.time()

         try:
             stream = await self.add_request(
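
The deleted comment existed precisely because a monotonic reading must not be logged; with time.time() the arrival timestamp is a genuine point in time, so the warning can go away. A small sketch of why (illustrative only):

import time

arrival_time = time.time()
print(f"request arrived at unix={arrival_time:.3f}")  # interpretable

# With time.monotonic() the same line would print something like
# unix=84321.700: seconds since an arbitrary start, which no log reader
# can map back to a point in time.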
@@ -244,7 +244,7 @@ class LLMEngine:
             raise ValueError(f"Cannot request more than "
                              f"{max_logprobs} logprobs.")
         if arrival_time is None:
-            arrival_time = time.monotonic()
+            arrival_time = time.time()
         prompt_token_ids = self.encode_request(
             request_id=request_id,
             prompt=prompt,
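
add_request accepts an optional caller-supplied arrival_time, and after this commit the callers (such as AsyncLLMEngine above) stamp requests with time.time(); the fallback must use the same clock, or defaulted requests would age differently from explicit ones. A hedged sketch of the pattern (signature heavily simplified, not vLLM's actual one):

import time
from typing import Optional

def add_request(request_id: str, arrival_time: Optional[float] = None) -> None:
    # The default must come from the same clock callers use, so that later
    # "now - arrival_time" computations treat all requests alike.
    if arrival_time is None:
        arrival_time = time.time()
    print(f"queued {request_id} at unix={arrival_time:.3f}")

add_request("cmpl-example")               # defaulted timestamp
add_request("cmpl-example", time.time())  # caller-supplied, same clock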
@@ -628,7 +628,7 @@ class LLMEngine:
     def _get_stats(self,
                    scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
         """Get Stats to be Logged to Prometheus."""
-        now = time.monotonic()
+        now = time.time()

         # KV Cache Usage in %.
         num_total_gpu = self.cache_config.num_gpu_blocks
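
_get_stats derives the latencies exported to Prometheus by subtracting recorded timestamps from `now`, and a further property of the wall clock matters for metrics: monotonic readings share no epoch between processes or across restarts, while Unix timestamps do. A brief sketch (values illustrative):

import time

# Within a single process, either clock measures durations correctly:
t0 = time.monotonic()
duration = time.monotonic() - t0          # fine

# Across processes or restarts, only the wall clock is comparable:
#   process A: time.monotonic() -> 84321.7       (since A's clock started)
#   process B: time.monotonic() -> 12.3          (since B's clock started)
#   process A: time.time()      -> 1709942400.5  (shared Unix epoch)
#   process B: time.time()      -> 1709942400.6  (shared Unix epoch)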
@@ -103,7 +103,7 @@ class OpenAIServingChat(OpenAIServing):
     ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:

         model_name = request.model
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         chunk_object_type = "chat.completion.chunk"
         first_iteration = True

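
In OpenAI-compatible responses the created field is defined as a Unix timestamp, so int(time.monotonic()) (typically seconds since boot) produced values that clients would render as dates in 1970. A hedged sketch of the chunk this value flows into (fields abbreviated, id and model illustrative):

import time

chunk = {
    "id": "chatcmpl-...",                # illustrative id
    "object": "chat.completion.chunk",
    "created": int(time.time()),         # clients parse this as epoch seconds
    "model": "example-model",
}
# int(time.monotonic()) here would yield e.g. 84321 -> 1970-01-01T23:25:21Z.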
@@ -244,7 +244,7 @@ class OpenAIServingChat(OpenAIServing):
             request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:

         model_name = request.model
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         final_res: RequestOutput = None

         async for res in result_generator:
@@ -118,7 +118,7 @@ class OpenAIServingCompletion(OpenAIServing):

         model_name = request.model
         request_id = f"cmpl-{random_uuid()}"
-        created_time = int(time.monotonic())
+        created_time = int(time.time())

         # Schedule the request and get the result generator.
         generators = []
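
With all three endpoints fixed (chat streaming, chat completion, and text completion), the user-visible effect is that created now decodes to the actual request time. A client-side sketch (response dict illustrative):

from datetime import datetime, timezone

response = {"created": 1709942400}  # as returned after this commit
print(datetime.fromtimestamp(response["created"], tz=timezone.utc))
# -> 2024-03-09 00:00:00+00:00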