[Bugfix] Add warmup for prefix caching example (#5235)

Author: Zhuohan Li (2024-06-03 19:36:41 -07:00), committed by GitHub
parent 06b2550cbb
commit bd0e7802e0


@@ -51,8 +51,10 @@ for output in outputs:
 print("-" * 80)
 # The llm.generate call will batch all prompts and send the batch at once
 # if resources allow.
+# Warmup so that the shared prompt's KV cache is computed.
+prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
 # Generate with prefix caching.
 start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
 duration_cached = time() - start_time_cached
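
For context, the example this hunk patches times the same batch with and without prefix caching and reports the speedup. Without the warmup, the first cached call also pays the cost of computing the shared prefix's KV cache, which skews the timed run. Below is a minimal sketch of the full comparison; the model name, prompt contents, and memory settings are placeholders, not the values used in the real examples/offline_inference_with_prefix.py script.

from time import time

from vllm import LLM, SamplingParams

# Placeholder shared prefix and prompts (the real example uses its own).
prefix = "You are an expert assistant. Answer the question concisely.\n"
questions = ["What is KV caching?", "What is paged attention?"]
generating_prompts = [prefix + q for q in questions]
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

# Two engines on one GPU; gpu_memory_utilization is capped so both fit.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
prefix_cached_llm = LLM(model="facebook/opt-125m",
                        enable_prefix_caching=True,
                        gpu_memory_utilization=0.4)

# Baseline: generate without prefix caching.
start_time_regular = time()
regular_llm.generate(generating_prompts, sampling_params)
duration_regular = time() - start_time_regular

# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Timed run: the shared prefix is now served from the KV cache.
start_time_cached = time()
prefix_cached_llm.generate(generating_prompts, sampling_params)
duration_cached = time() - start_time_cached

print(f"Speedup with prefix caching: {duration_regular / duration_cached:.2f}x")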