diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 9255f91b..c3f93a29 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -160,7 +160,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}
 
         # Fix the current time.
-        now = time.monotonic()
+        now = time.time()
 
         # Join waiting sequences if possible.
         if not self.swapped:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 0cee604c..8bcd1e0e 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -604,8 +604,7 @@ class AsyncLLMEngine:
         >>> ...
         """
         # Preprocess the request.
-        # This should not be used for logging, as it is monotonic time.
-        arrival_time = time.monotonic()
+        arrival_time = time.time()
 
         try:
             stream = await self.add_request(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 4cdad418..691c9e83 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -244,7 +244,7 @@ class LLMEngine:
             raise ValueError(f"Cannot request more than "
                              f"{max_logprobs} logprobs.")
         if arrival_time is None:
-            arrival_time = time.monotonic()
+            arrival_time = time.time()
         prompt_token_ids = self.encode_request(
             request_id=request_id,
             prompt=prompt,
@@ -628,7 +628,7 @@ class LLMEngine:
     def _get_stats(self,
                    scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
         """Get Stats to be Logged to Prometheus."""
-        now = time.monotonic()
+        now = time.time()
 
         # KV Cache Usage in %.
         num_total_gpu = self.cache_config.num_gpu_blocks
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index d2fb9ca0..bfdfe39f 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -103,7 +103,7 @@ class OpenAIServingChat(OpenAIServing):
     ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:
 
         model_name = request.model
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         chunk_object_type = "chat.completion.chunk"
         first_iteration = True
 
@@ -244,7 +244,7 @@ class OpenAIServingChat(OpenAIServing):
             request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:
 
         model_name = request.model
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         final_res: RequestOutput = None
 
         async for res in result_generator:
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index b78f0538..bfd7c9b5 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -118,7 +118,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
         model_name = request.model
         request_id = f"cmpl-{random_uuid()}"
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
 
         # Schedule the request and get the result generator.
         generators = []
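
For reference, a minimal standalone sketch of the distinction this change relies on: time.monotonic() counts seconds from an arbitrary, unspecified start point and is only meaningful for measuring durations, while time.time() returns seconds since the Unix epoch, which is what wall-clock values such as the OpenAI-style "created" field and logged arrival times expect.

    import time

    # Monotonic clock: arbitrary reference point, suitable only for durations.
    start = time.monotonic()
    time.sleep(0.1)
    elapsed = time.monotonic() - start  # roughly 0.1 seconds

    # Wall clock: seconds since the Unix epoch, suitable for timestamps
    # such as the "created" field in OpenAI-style API responses.
    created_time = int(time.time())

    print(f"elapsed={elapsed:.3f}s created={created_time}")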