Fixes the misuse/mixuse of time.time()/time.monotonic() (#3220)
Signed-off-by: Tao He <sighingnow@gmail.com>
Co-authored-by: simon-mo <simon.mo@hey.com>
parent 03d37f2441
commit 14b8ae02e7
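For context, the distinction this patch enforces (a minimal illustration, not part of the diff): time.time() returns wall-clock seconds since the Unix epoch, suitable for user-facing timestamps and comparable across processes, while time.monotonic() has an undefined reference point, so only the difference between two of its readings is meaningful.

import time

# Wall clock: seconds since the Unix epoch; meaningful as an absolute
# timestamp and comparable across processes and machines.
created = int(time.time())

# Monotonic clock: reference point undefined (often time since boot);
# the absolute value means nothing, only deltas between readings do.
start = time.monotonic()
time.sleep(0.1)
print(f"created={created}, elapsed={time.monotonic() - start:.3f}s")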
@@ -160,7 +160,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.monotonic()
+        now = time.time()

         # Join waiting sequences if possible.
         if not self.swapped:
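The `now` taken here is later compared against per-request arrival stamps recorded elsewhere in the engine, so both sides must read the same clock. A sketch of the failure mode this hunk removes (variable names illustrative, not vLLM's):

import time

arrival_time = time.time()  # stamped with the wall clock on arrival

# Same clock on both sides: a small, sensible queueing delay.
waiting = time.time() - arrival_time

# Mixed clocks: a boot-relative counter minus an epoch timestamp
# (~1.7e9 seconds) yields a huge negative, meaningless "delay".
broken = time.monotonic() - arrival_time
print(f"waiting={waiting:.6f}s  broken={broken:.0f}s")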
@@ -604,8 +604,7 @@ class AsyncLLMEngine:
         >>> ...
         """
         # Preprocess the request.
-        # This should not be used for logging, as it is monotonic time.
-        arrival_time = time.monotonic()
+        arrival_time = time.time()

         try:
             stream = await self.add_request(
@@ -244,7 +244,7 @@ class LLMEngine:
             raise ValueError(f"Cannot request more than "
                              f"{max_logprobs} logprobs.")
         if arrival_time is None:
-            arrival_time = time.monotonic()
+            arrival_time = time.time()
         prompt_token_ids = self.encode_request(
             request_id=request_id,
             prompt=prompt,
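The two hunks above stamp the same quantity at different layers: AsyncLLMEngine.generate() records arrival_time at the entrypoint, and LLMEngine.add_request() only fills in a default when the caller passed none. A hedged sketch of that hand-off (a simplified free-standing function, not the real methods):

import time
from typing import Optional

def add_request(arrival_time: Optional[float] = None) -> float:
    # Engine-side default: only used when the caller did not stamp one.
    if arrival_time is None:
        arrival_time = time.time()
    return arrival_time

# Entrypoint-side stamp; both sites must agree on the clock, otherwise
# the two defaults produce incomparable values for the same field.
print(add_request(arrival_time=time.time()))
print(add_request())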
@@ -628,7 +628,7 @@ class LLMEngine:
     def _get_stats(self,
                    scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
         """Get Stats to be Logged to Prometheus."""
-        now = time.monotonic()
+        now = time.time()

         # KV Cache Usage in %.
         num_total_gpu = self.cache_config.num_gpu_blocks
@@ -103,7 +103,7 @@ class OpenAIServingChat(OpenAIServing):
     ) -> Union[ErrorResponse, AsyncGenerator[str, None]]:

         model_name = request.model
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         chunk_object_type = "chat.completion.chunk"
         first_iteration = True

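created_time feeds the `created` field of the OpenAI-compatible response, which the OpenAI API defines as a Unix timestamp in seconds; int(time.monotonic()) would instead emit a small boot-relative integer that clients cannot decode as a date. A minimal sketch of the field (abbreviated chunk, illustrative id and model name):

import time

chunk = {
    "id": "chatcmpl-123",              # illustrative id
    "object": "chat.completion.chunk",
    "created": int(time.time()),       # Unix timestamp, as the API expects
    "model": "example-model",          # illustrative model name
}
print(chunk)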
@@ -244,7 +244,7 @@ class OpenAIServingChat(OpenAIServing):
             request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:

         model_name = request.model
-        created_time = int(time.monotonic())
+        created_time = int(time.time())
         final_res: RequestOutput = None

         async for res in result_generator:
@@ -118,7 +118,7 @@ class OpenAIServingCompletion(OpenAIServing):

         model_name = request.model
         request_id = f"cmpl-{random_uuid()}"
-        created_time = int(time.monotonic())
+        created_time = int(time.time())

         # Schedule the request and get the result generator.
         generators = []
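A quick REPL sanity check of the two clocks (exact values depend on the machine):

>>> import time
>>> int(time.time())       # epoch seconds: decodes to the current date
1709942400
>>> int(time.monotonic())  # boot-relative: small, and not a date
84321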