[Misc] refactor examples series - lmcache (#16758)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 19:02:35 +08:00 · 2025-04-17 19:02:35 +08:00 · 99ed526101
commit 99ed526101
parent 207da28186
1 changed files with 78 additions and 43 deletions
--- a/examples/offline_inference/cpu_offload_lmcache.py
+++ b/examples/offline_inference/cpu_offload_lmcache.py
@ -3,9 +3,12 @@
 This file demonstrates the example usage of cpu offloading
 with LMCache.
-Note that `pip install lmcache` is needed to run this example.
+Note that `lmcache` is needed to run this example.
-Learn more about LMCache in https://github.com/LMCache/LMCache.
+Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
 To learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
 import contextlib
 import os
 import time
@ -15,6 +18,8 @@ from lmcache.integration.vllm.utils import ENGINE_NAME
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
 def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@ -25,17 +30,9 @@ os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
 # This example script runs two requests with a shared prefix.
 shared_prompt = "Hello, how are you?" * 1000
 first_prompt = [
    shared_prompt + "Hello, my name is",
 ]
 second_prompt = [
    shared_prompt + "Tell me a very long story",
 ]
 sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
@contextlib.contextmanager
 def build_llm_with_lmcache():
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
@ -47,19 +44,57 @@ llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              enable_chunked_prefill=False,
              gpu_memory_utilization=0.8)
-outputs = llm.generate(first_prompt, sampling_params)
+    try:
        yield llm
    finally:
        # Clean up lmcache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)
 def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
 ):
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
-print("First request done.")
+    print(f"Generation took {time.time() - start:.2f} seconds, "
          f"{req_str} request done.")
    print("-" * 50)
 def main():
    setup_environment_variables()
    with build_llm_with_lmcache() as llm:
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]
        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)
        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")
        time.sleep(1)
-outputs = llm.generate(second_prompt, sampling_params)
+        # Print the second output
-for output in outputs:
+        print_output(llm, second_prompt, sampling_params, "second")
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")
 print("Second request done.")
-# Clean up lmcache backend
+
-LMCacheEngineBuilder.destroy(ENGINE_NAME)
+if __name__ == "__main__":
    main()