[Bugfix] Fix offline_inference_with_prefix.py (#9505)

commit ae8b633ba3 (parent 1bbbcc0b1d)
Author: Tyler Michael Smith
Date:   2024-10-18 12:59:19 -04:00 (committed by GitHub)

examples/offline_inference_with_prefix.py

@@ -29,11 +29,13 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 sampling_params = SamplingParams(temperature=0.0)
 
 # Create an LLM.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
+regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3)
 
+# The second LLM needs to request a higher gpu_memory_utilization because
+# the first LLM has already allocated a full 30% of the gpu memory.
 prefix_cached_llm = LLM(model="facebook/opt-125m",
                         enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
+                        gpu_memory_utilization=0.6)
 print("Results without `enable_prefix_caching`")
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
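
For context, below is a minimal sketch of how the rest of this example exercises the two engines under the adjusted memory budget (0.3 of GPU memory for the regular engine, 0.6 for the prefix-cached one). It assumes the vLLM offline API visible in the hunk (`LLM.generate`, `SamplingParams`, `RequestOutput.outputs[0].text`) and the `generating_prompts` / `sampling_params` objects from the diff context; the comparison printout is illustrative, not the verbatim file contents.

# Generate with prefix caching disabled (greedy decoding, temperature=0.0).
regular_outputs = regular_llm.generate(generating_prompts, sampling_params)
regular_texts = [out.outputs[0].text for out in regular_outputs]

print("Results with `enable_prefix_caching`")

# Warm up the prefix cache with one request, then run the full batch;
# later requests sharing the common prefix reuse its cached KV blocks.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
cached_outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
cached_texts = [out.outputs[0].text for out in cached_outputs]

# With temperature=0.0, both engines should produce identical completions.
print("Generated answers are the same:", regular_texts == cached_texts)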