[Benchmark] Add BurstGPT to benchmark_serving (#13063)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
This commit is contained in:
Woosuk Kwon 2025-02-10 21:25:30 -08:00 committed by GitHub
parent cb080f32e3
commit 58047c6f04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 47 additions and 1 deletions

View File

@ -19,3 +19,11 @@ mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/ unzip coco/train2017.zip -d coco/
``` ```
# Downloading the BurstGPT dataset
You can download the BurstGPT v1.1 dataset by running:
```bash
wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
```

View File

@ -38,6 +38,7 @@ from datetime import datetime
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
import numpy as np import numpy as np
import pandas as pd
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput) RequestFuncOutput)
from datasets import load_dataset from datasets import load_dataset
@ -131,6 +132,35 @@ def sample_sharegpt_requests(
return filtered_dataset return filtered_dataset
def sample_burstgpt_requests(
dataset_path: str,
num_requests: int,
random_seed: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int, None]]:
df = pd.read_csv(dataset_path)
gpt4_df = df[df["Model"] == "GPT-4"]
# Remove the failed requests (i.e., response length is 0)
gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
# Randomly sample num_requests from the dataset
if num_requests <= len(gpt4_df):
gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
else:
gpt4_df = gpt4_df.sample(n=num_requests,
random_state=random_seed,
replace=True)
# Convert the dataframe to a list of tuples
dataset = gpt4_df.values.tolist()
input_requests = []
for i in range(num_requests):
input_len = int(dataset[i][2])
output_len = int(dataset[i][3])
prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
for j in range(input_len)])
input_requests.append((prompt, input_len, output_len, None))
return input_requests
def sample_sonnet_requests( def sample_sonnet_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
@ -830,6 +860,14 @@ def main(args: argparse.Namespace):
fixed_output_len=args.sharegpt_output_len, fixed_output_len=args.sharegpt_output_len,
) )
elif args.dataset_name == "burstgpt":
input_requests = sample_burstgpt_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
random_seed=args.seed,
tokenizer=tokenizer,
)
elif args.dataset_name == "sonnet": elif args.dataset_name == "sonnet":
# Do not format the prompt, pass to message directly # Do not format the prompt, pass to message directly
if args.backend == "openai-chat": if args.backend == "openai-chat":
@ -995,7 +1033,7 @@ if __name__ == "__main__":
"--dataset-name", "--dataset-name",
type=str, type=str,
default="sharegpt", default="sharegpt",
choices=["sharegpt", "sonnet", "random", "hf"], choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
) )
parser.add_argument("--dataset-path", parser.add_argument("--dataset-path",