[Bugfix] Add warmup for prefix caching example (#5235)

Author: Zhuohan Li (2024-06-03 19:36:41 -07:00), committed by GitHub
parent 06b2550cbb
commit bd0e7802e0


@@ -51,8 +51,10 @@ for output in outputs:
 print("-" * 80)
 # The llm.generate call will batch all prompts and send the batch at once
 # if resources allow.
+# Warmup so that the shared prompt's KV cache is computed.
+prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
 # Generate with prefix caching.
 start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
 duration_cached = time() - start_time_cached
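
For context, the example this hunk patches times the same batch with and without prefix caching and reports the speedup. Without the warmup, the first cached call also pays the cost of computing the shared prefix's KV cache, which skews the timed run. Below is a minimal sketch of the full comparison; the model name, prompt contents, and memory settings are placeholders, not the values used in the real examples/offline_inference_with_prefix.py script.

from time import time

from vllm import LLM, SamplingParams

# Placeholder shared prefix and prompts (the real example uses its own).
prefix = "You are an expert assistant. Answer the question concisely.\n"
questions = ["What is KV caching?", "What is paged attention?"]
generating_prompts = [prefix + q for q in questions]
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

# Two engines on one GPU; gpu_memory_utilization is capped so both fit.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
prefix_cached_llm = LLM(model="facebook/opt-125m",
                        enable_prefix_caching=True,
                        gpu_memory_utilization=0.4)

# Baseline: generate without prefix caching.
start_time_regular = time()
regular_llm.generate(generating_prompts, sampling_params)
duration_regular = time() - start_time_regular

# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Timed run: the shared prefix is now served from the KV cache.
start_time_cached = time()
prefix_cached_llm.generate(generating_prompts, sampling_params)
duration_cached = time() - start_time_cached

print(f"Speedup with prefix caching: {duration_regular / duration_cached:.2f}x")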