[CI] Improve the readability of benchmarking and prepare for dashboard (#5571)

[CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571)
Kuntai Du 2024-06-17 11:41:08 -07:00 committed by GitHub
parent ab66536dbf
commit 9e4e6fe207
8 changed files with 213 additions and 111 deletions

View File

@@ -13,9 +13,17 @@ This benchmark will be *triggered* upon:
**Benchmarking Duration**: about 1hr.

**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.

## Configuring the workload

The benchmarking workload contains three parts:
- Latency tests in `latency-tests.json`.
- Throughput tests in `throughput-tests.json`.
- Serving tests in `serving-tests.json`.

See [descriptions.md](tests/descriptions.md) for detailed descriptions.

### Latency test
@@ -23,7 +31,6 @@ Here is an example of one test inside `latency-tests.json`:

```json
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
@@ -34,7 +41,6 @@ Here is an example of one test inside `latency-tests.json`:
            "num_iters": 15
        }
    },
]
```
@@ -57,7 +63,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t

```
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
@@ -77,7 +82,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
            "num_prompts": 200
        }
    },
]
```
@@ -92,7 +96,8 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

## Visualizing the results
The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (as JSON files) are available in the `Artifacts` tab of the benchmarking job.
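
To make the mechanism concrete, here is a minimal sketch of that formatting step. It assumes the working directory contains `tests/descriptions.md` and uses a hypothetical one-row latency table; the real logic lives in `convert-results-json-to-markdown.py`.

```python
import pandas as pd
from tabulate import tabulate

# Hypothetical one-row result, just to show the shape of the data.
latency_df = pd.DataFrame([{
    "Test name": "latency_llama8B_tp1",
    "GPU": "A100",
    "Mean latency (ms)": 1234.5,
}])

with open("tests/descriptions.md") as f:
    template = f.read()

report = template.format(
    latency_tests_markdown_table=tabulate(
        latency_df, headers="keys", tablefmt="pipe", showindex=False),
    throughput_tests_markdown_table="(omitted in this sketch)",
    serving_tests_markdown_table="(omitted in this sketch)",
    benchmarking_results_in_json_string="{}")
print(report)
```

The placeholder names match the ones defined in [descriptions.md](tests/descriptions.md).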

View File

@@ -343,9 +343,9 @@ main() {
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

  # postprocess benchmarking results

View File

@@ -1,4 +1,5 @@
import json
import os
from pathlib import Path

import pandas as pd
@@ -11,12 +12,13 @@ latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99",
}

# throughput tests and the keys that will be printed into markdown
@@ -24,11 +26,11 @@ throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    # "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
@@ -36,120 +38,148 @@ serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    # do not say TTFT again to avoid the table getting too wide
    "median_ttft_ms": "Median",
    "p99_ttft_ms": "P99",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median",
    "p99_itl_ms": "P99",
}


def read_markdown(file):
    if os.path.exists(file):
        with open(file, "r") as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })


if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to raw_result
            serving_results.append(raw_result)
            continue

        elif "latency" in f.name:
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
            latency_results.append(raw_result)
            continue

        elif "throughput" in f.name:
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to raw_result
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(
            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)
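
For downstream consumers such as the upcoming performance dashboard, here is a minimal sketch of recovering the processed results from a generated `benchmark_results.md`. It assumes the report layout produced above, where the processed results sit in the report's last ```json fenced block with the `latency`/`throughput`/`serving` keys emitted by `results_to_json()`.

```python
import json
import re

import pandas as pd

with open("benchmark_results.md") as f:
    report = f.read()

# Grab the last ```json fenced block, which holds the processed tables.
json_blocks = re.findall(r"```json\n(.*?)\n```", report, flags=re.DOTALL)
tables = json.loads(json_blocks[-1])

# Same keys as produced by results_to_json(): latency, throughput, serving.
serving_df = pd.DataFrame.from_dict(tables["serving"])
print(serving_df.head())
```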

View File

@@ -0,0 +1,67 @@
## Latency tests
This test suite aims to test vllm's end-to-end latency under a controlled setup.
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
### Latency benchmarking results
{latency_tests_markdown_table}
## Throughput tests
This test suite aims to test vllm's throughput.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
### Throughput benchmarking results
{throughput_tests_markdown_table}
## Serving tests
This test suite aims to test vllm's real serving metrics.
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed); see the sketch after this list.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
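
For intuition, here is a minimal sketch of how such Poisson arrivals can be generated. The helper name and seed are illustrative only and assume numpy; the actual sampling is done inside `benchmark_serving.py`.

```python
import numpy as np


def poisson_arrival_times(num_requests, qps, seed=0):
    # Exponential inter-arrival gaps with mean 1/qps produce a Poisson
    # arrival process averaging `qps` requests per second.
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)


# Send times (in seconds) for 200 requests at an average of 4 QPS.
send_times = poisson_arrival_times(200, 4.0)
```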
### Serving benchmarking results
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
import json
import pandas as pd
benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.

View File

@@ -29,4 +29,4 @@
            "num-iters": 15
        }
    }
]

View File

@@ -56,4 +56,4 @@
            "num_prompts": 200
        }
    }
]

View File

@@ -32,4 +32,4 @@
            "backend": "vllm"
        }
    }
]

View File

@@ -98,7 +98,7 @@ def main(args: argparse.Namespace):
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):