[Misc] Make Serving Benchmark More User-friendly (#5044)
This commit is contained in:
parent d5a1697772
commit f17a1a8f96
@@ -89,6 +89,9 @@ async def async_request_tgi(
                     output.latency = most_recent_timestamp - st
                     output.success = True
                     output.generated_text = data["generated_text"]
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
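The added `else:` branch above (and the identical one in the OpenAI-completions hunk below) is what makes a failing backend visible: a non-200 response now records the HTTP reason phrase in `output.error` and marks the request as failed, instead of leaving the result object half-populated. A minimal sketch of that pattern, assuming aiohttp and a simplified stand-in for the benchmark's `RequestFuncOutput` (the names `DummyOutput` and `probe` are illustrative, not from the commit):

```python
import asyncio
from dataclasses import dataclass

import aiohttp


@dataclass
class DummyOutput:
    # Simplified stand-in for the benchmark's RequestFuncOutput.
    generated_text: str = ""
    success: bool = False
    error: str = ""


async def probe(url: str) -> DummyOutput:
    output = DummyOutput()
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json={"inputs": "hi"}) as response:
                if response.status == 200:
                    output.generated_text = await response.text()
                    output.success = True
                else:
                    # Same idea as the diff: keep the reason phrase, never None.
                    output.error = response.reason or ""
                    output.success = False
    except Exception as e:
        output.success = False
        output.error = str(e)
    return output


# asyncio.run(probe("http://localhost:8000/generate_stream"))
```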
@@ -276,6 +279,9 @@ async def async_request_openai_completions(
                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
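One small detail shared by both branches: aiohttp's `ClientResponse.reason` is optional and can be `None` (for example when the server omits the reason phrase), so the diff stores `response.reason or ""` to keep `output.error` a plain string. In isolation:

```python
# reason may be None when the server sends no reason phrase;
# "or ''" keeps the stored error a string rather than None.
reason = None
error = reason or ""
assert error == ""
```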
@@ -215,6 +215,11 @@ def calculate_metrics(
         else:
             actual_output_lens.append(0)
 
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
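With this guard, `calculate_metrics` no longer computes summary statistics over an all-failure run in silence: if `completed == 0`, a warning points at the benchmark arguments before the (now all-zero) metrics are built. A minimal sketch of the guard on its own, with a made-up `completed` value:

```python
import warnings

completed = 0  # e.g. every request came back with success=False

if completed == 0:
    # stacklevel=2 attributes the warning to the caller of calculate_metrics,
    # which is more useful when the benchmark invocation is what is wrong.
    warnings.warn(
        "All requests failed. This is likely due to a misconfiguration "
        "on the benchmark arguments.",
        stacklevel=2)
```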
@@ -226,9 +231,9 @@ def calculate_metrics(
         1000, # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
-        mean_tpot_ms=np.mean(tpots) * 1000,
-        median_tpot_ms=np.median(tpots) * 1000,
-        p99_tpot_ms=np.percentile(tpots, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
     )
 
     return metrics, actual_output_lens
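The `tpots or 0` change mirrors what was already done for `ttfts`: when no request produced more than one token (for example, when everything failed), `tpots` is an empty list, and NumPy's reductions on an empty input return `nan` with a RuntimeWarning (`mean`, `median`) or, depending on the NumPy version, warn or raise (`percentile`). With the guard, the empty list is falsy and the scalar `0` is used instead, so the summary stays well defined. A quick check of the difference:

```python
import numpy as np

tpots = []  # no successful streamed requests

# Without the guard: nan, plus a "Mean of empty slice" RuntimeWarning.
print(np.mean(tpots) * 1000)                 # nan

# With the guard: [] is falsy, so the scalar 0 is used instead.
print(np.mean(tpots or 0) * 1000)            # 0.0
print(np.percentile(tpots or 0, 99) * 1000)  # 0.0
```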
@@ -250,6 +255,24 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    print("Starting initial single prompt test run...")
+    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_prompt,
+        api_url=api_url,
+        prompt_len=test_prompt_len,
+        output_len=test_output_len,
+        best_of=best_of,
+        use_beam_search=use_beam_search,
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}")
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
     print(f"Traffic request rate: {request_rate}")
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
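The larger addition in `benchmark()` is a fail-fast probe: before the main load is generated, a single request is sent with the first prompt, and the run aborts with the server-side error if that probe fails. A simplified, self-contained sketch of the same pattern (the `send_request` coroutine and `Result` dataclass are illustrative stand-ins for the commit's `request_func` and `RequestFuncOutput`):

```python
import asyncio
from dataclasses import dataclass


@dataclass
class Result:
    success: bool
    error: str = ""


async def send_request(prompt: str) -> Result:
    # Stand-in for the real backend request function.
    return Result(success=bool(prompt), error="" if prompt else "empty prompt")


async def benchmark(prompts: list[str]) -> None:
    print("Starting initial single prompt test run...")
    test_output = await send_request(prompts[0])
    if not test_output.success:
        raise ValueError(
            "Initial test run failed - Please make sure benchmark arguments "
            f"are correctly specified. Error: {test_output.error}")
    print("Initial test run completed. Starting main benchmark run...")

    # Only now fan out the full request load.
    results = await asyncio.gather(*(send_request(p) for p in prompts))
    print(f"{sum(r.success for r in results)}/{len(results)} requests succeeded")


asyncio.run(benchmark(["hello", "world"]))
```

In the real script the probe reuses the same input fields as the main run, so a misconfigured model name or endpoint fails once with a readable error instead of once per prompt.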