vllm/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
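
"""Convert nightly benchmark JSON results into a single markdown report.

Reads the ``*.json`` result files (and their matching ``*.commands`` metadata
files) from ``results/`` and writes ``results/benchmark_results.md`` with one
table each for the latency, throughput, and serving benchmarks.
"""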

import json
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Average latency (s)",
    "P10": "P10 (s)",
    "P25": "P25 (s)",
    "P50": "P50 (s)",
    "P75": "P75 (s)",
    "P90": "P90 (s)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    "input_throughput": "Input Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    # do not repeat "TTFT" in later columns so the table does not get too wide
    "median_ttft_ms": "Median",
    "p99_ttft_ms": "P99",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median",
    "p99_itl_ms": "P99",
}
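
# Each benchmark run is expected to leave a pair of files in results/
# (file names below are illustrative, not prescribed -- routing is by whether
# "serving", "latency", or "throughput" appears in the file name):
#   serving_test1.json      <- raw output of the benchmark script
#   serving_test1.commands  <- JSON dict of run metadata, merged into the result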
for test_file in results_folder.glob("*.json"):

    with open(test_file, "r") as f:
        raw_result = json.loads(f.read())

    if "serving" in str(test_file):
        # this result is generated via `benchmark_serving.py`

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to serving_results
        serving_results.append(raw_result)
        continue

    elif "latency" in str(test_file):
        # this result is generated via `benchmark_latency.py`

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # get different percentiles
        for perc in [10, 25, 50, 75, 90]:
            raw_result.update(
                {f"P{perc}": raw_result["percentiles"][str(perc)]})

        # add the result to latency_results
        latency_results.append(raw_result)
        continue

    elif "throughput" in str(test_file):
        # this result is generated via `benchmark_throughput.py`

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands"), "r") as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to throughput_results
        throughput_results.append(raw_result)
        continue

    print(f"Skipping {test_file}")

latency_results = pd.DataFrame.from_dict(latency_results)
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
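# Note: pd.DataFrame.from_dict also accepts a list of row dicts, as used here;
# an empty list yields an empty DataFrame, which the .empty checks below rely on.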

# keep only the columns listed above, in display order, and rename them
if not latency_results.empty:
    latency_results = latency_results[list(
        latency_column_mapping.keys())].rename(columns=latency_column_mapping)

if not serving_results.empty:
    serving_results = serving_results[list(
        serving_column_mapping.keys())].rename(columns=serving_column_mapping)

if not throughput_results.empty:
    throughput_results = throughput_results[list(
        throughput_results_column_mapping.keys())].rename(
            columns=throughput_results_column_mapping)

# get markdown tables
latency_md_table = tabulate(latency_results,
                            headers='keys',
                            tablefmt='pipe',
                            showindex=False)
serving_md_table = tabulate(serving_results,
                            headers='keys',
                            tablefmt='pipe',
                            showindex=False)
throughput_md_table = tabulate(throughput_results,
                               headers='keys',
                               tablefmt='pipe',
                               showindex=False)
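# (tablefmt='pipe' is tabulate's GitHub-flavored pipe-table format;
# showindex=False drops the DataFrame's integer index column.)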

# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
    if not latency_results.empty:
        f.write("## Latency tests\n")
        f.write(latency_md_table)
        f.write("\n")

    if not throughput_results.empty:
        f.write("## Throughput tests\n")
        f.write(throughput_md_table)
        f.write("\n")

    if not serving_results.empty:
        f.write("## Serving tests\n")
        f.write(serving_md_table)
        f.write("\n")