Use monotonic time where appropriate (#1249)

parent 66d18a7fb0
commit acbed3ef40
@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()

         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)

-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
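Not part of the diff, just context for the change above: time.time() follows the wall clock, which NTP or a manual adjustment can move mid-measurement, while time.perf_counter() is a monotonic, high-resolution clock intended for interval timing. A minimal sketch, with an illustrative helper name:

import time


def timed(fn, *args, **kwargs):
    """Run fn and report how long it took on a clock that never goes backwards."""
    start = time.perf_counter()  # monotonic, high resolution; unaffected by clock adjustments
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed


_, latency = timed(sum, range(1_000_000))
print(f"latency: {latency * 1e3:.3f} ms")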
@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()

     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break

-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))

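A self-contained sketch (not from this commit) of the pattern the serving benchmark uses: stamp each request with perf_counter() at start and end and collect the deltas. The HTTP round trip is replaced here with asyncio.sleep so the snippet runs standalone; the names are simplified stand-ins.

import asyncio
import random
import time

REQUEST_LATENCY = []  # one latency (in seconds) per completed request


async def send_request(i: int) -> None:
    start = time.perf_counter()                      # per-request start stamp
    await asyncio.sleep(random.uniform(0.01, 0.05))  # stands in for the HTTP call
    REQUEST_LATENCY.append(time.perf_counter() - start)


async def main() -> None:
    await asyncio.gather(*(send_request(i) for i in range(20)))
    mean_ms = sum(REQUEST_LATENCY) / len(REQUEST_LATENCY) * 1e3
    print(f"mean latency: {mean_ms:.1f} ms")


asyncio.run(main())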
@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )

-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start


@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()

     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0
@@ -156,7 +156,7 @@ def run_hf(
             batch = []
             max_prompt_len = 0
             max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start


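The throughput benchmark repeats the same start/end pattern in run_vllm and run_hf. Purely as an illustration (this commit does not add such a helper), the pattern could be factored into a small context manager:

import time
from contextlib import contextmanager


@contextmanager
def stopwatch(results: dict, key: str):
    """Store the elapsed time of the enclosed block under results[key]."""
    start = time.perf_counter()
    try:
        yield
    finally:
        results[key] = time.perf_counter() - start


timings = {}
with stopwatch(timings, "generate"):
    sum(range(1_000_000))  # stands in for llm._run_engine(...) or model.generate(...)
print(timings)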
@@ -121,7 +121,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()

         # Join waiting sequences if possible.
         if not self.swapped:
@@ -417,7 +417,8 @@ class AsyncLLMEngine:
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()

         try:
             stream = await self.add_request(request_id,
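A hedged sketch of why arrival_time switches to time.monotonic(): the scheduler later subtracts it from another monotonic reading to get a waiting time that cannot jump if the system clock is adjusted. The PendingRequest class below is illustrative, not vLLM's actual data structure:

import time
from dataclasses import dataclass, field


@dataclass
class PendingRequest:
    request_id: str
    # Monotonic stamp: only differences are meaningful, never the absolute value.
    arrival_time: float = field(default_factory=time.monotonic)

    def waiting_time(self) -> float:
        return time.monotonic() - self.arrival_time


req = PendingRequest("cmpl-123")
time.sleep(0.1)
print(f"waited {req.waiting_time():.3f} s")  # ~0.1 s even if the wall clock changes meanwhile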
@@ -256,10 +256,10 @@ class LLMEngine:
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@ class LLMEngine:
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
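The (now, num_batched_tokens) pairs recorded above feed periodic throughput logging; a sketch of that kind of windowed computation, with made-up names and a made-up 10-second window:

import time
from typing import List, Tuple


def tokens_per_second(samples: List[Tuple[float, int]], window: float = 10.0) -> float:
    """Average token throughput over the last `window` seconds.

    Each sample is (monotonic_timestamp, num_tokens).
    """
    now = time.monotonic()
    recent = [(t, n) for t, n in samples if now - t <= window]
    if len(recent) < 2:
        return 0.0
    span = recent[-1][0] - recent[0][0]
    return sum(n for _, n in recent) / span if span > 0 else 0.0


now = time.monotonic()
samples = [(now - 1.0, 512), (now - 0.5, 512), (now, 512)]
print(f"{tokens_per_second(samples):.0f} tokens/s")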
@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,

     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret

-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
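One property of the two hunks above worth keeping in mind: time.monotonic() counts from an unspecified reference point, so int(time.monotonic()) is not an epoch timestamp the way int(time.time()) was; only differences between monotonic readings are meaningful. A tiny illustration:

import time

wall = time.time()       # seconds since the Unix epoch (1970-01-01 UTC)
mono = time.monotonic()  # seconds since an unspecified reference point (often system boot)

print(f"time.time():      {wall:.0f}")   # e.g. a ten-digit epoch timestamp
print(f"time.monotonic(): {mono:.0f}")   # typically a much smaller number; only deltas matter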