From 3e397a948431755b668fb95af679ec0a33daea89 Mon Sep 17 00:00:00 2001 From: Alexey Belyakov Date: Fri, 11 Apr 2025 03:15:06 +0100 Subject: [PATCH] check input length of sonnet samples (#16423) Signed-off-by: alexey-belyakov --- benchmarks/benchmark_dataset.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 8bb7cfe0..63f17427 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -489,7 +489,7 @@ class SonnetDataset(BenchmarkDataset): prefix_lines = self.data[:num_prefix_lines] samples = [] - for _ in range(num_requests): + while len(samples) < num_requests: extra_lines = random.choices(self.data, k=num_input_lines - num_prefix_lines) prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" @@ -497,13 +497,14 @@ class SonnetDataset(BenchmarkDataset): prompt_formatted = tokenizer.apply_chat_template( msg, add_generation_prompt=True, tokenize=False) prompt_len = len(tokenizer(prompt_formatted).input_ids) - samples.append( - SampleRequest( - prompt=prompt_formatted - if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - )) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted + if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) return samples