[Bugfix] Add warmup for prefix caching example (#5235)
This commit is contained in:
parent
06b2550cbb
commit
bd0e7802e0
@ -51,8 +51,10 @@ for output in outputs:
|
||||
|
||||
print("-" * 80)
|
||||
|
||||
# The llm.generate call will batch all prompts and send the batch at once
|
||||
# if resources allow.
|
||||
# Warm up so that the shared prompt's KV cache is computed.
|
||||
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
|
||||
|
||||
# Generate with prefix caching.
|
||||
start_time_cached = time()
|
||||
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
|
||||
duration_cached = time() - start_time_cached
|
||||
|
Loading…
x
Reference in New Issue
Block a user