diff --git a/examples/offline_inference/cpu_offload_lmcache.py b/examples/offline_inference/cpu_offload_lmcache.py
index 8211629b..02544423 100644
--- a/examples/offline_inference/cpu_offload_lmcache.py
+++ b/examples/offline_inference/cpu_offload_lmcache.py
@@ -3,9 +3,12 @@
 This file demonstrates the example usage of cpu offloading
 with LMCache.
 
-Note that `pip install lmcache` is needed to run this example.
-Learn more about LMCache in https://github.com/LMCache/LMCache.
+Note that the `lmcache` package is needed to run this example.
+Requirements: Linux, Python 3.10 or higher, CUDA 12.1.
+To learn more about the LMCache environment setup, please refer to:
+https://docs.lmcache.ai/getting_started/installation.html
 """
+import contextlib
 import os
 import time
 
@@ -15,51 +18,83 @@ from lmcache.integration.vllm.utils import ENGINE_NAME
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
 
-# LMCache-related environment variables
-# Use experimental features in LMCache
-os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
-# LMCache is set to use 256 tokens per chunk
-os.environ["LMCACHE_CHUNK_SIZE"] = "256"
-# Enable local CPU backend in LMCache
-os.environ["LMCACHE_LOCAL_CPU"] = "True"
-# Set local CPU memory limit to 5.0 GB
-os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
 
-# This example script runs two requests with a shared prefix.
-shared_prompt = "Hello, how are you?" * 1000
-first_prompt = [
-    shared_prompt + "Hello, my name is",
-]
-second_prompt = [
-    shared_prompt + "Tell me a very long story",
-]
+def setup_environment_variables():
+    # LMCache-related environment variables
+    # Use experimental features in LMCache
+    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
+    # LMCache is set to use 256 tokens per chunk
+    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
+    # Enable local CPU backend in LMCache
+    os.environ["LMCACHE_LOCAL_CPU"] = "True"
+    # Set local CPU memory limit to 5.0 GB
+    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
 
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-ktc = KVTransferConfig.from_cli(
-    '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
-# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
-# memory. Reduce the value if your GPU has less memory.
-# Note that LMCache is not compatible with chunked prefill for now.
-llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-          kv_transfer_config=ktc,
-          max_model_len=8000,
-          enable_chunked_prefill=False,
-          gpu_memory_utilization=0.8)
+@contextlib.contextmanager
+def build_llm_with_lmcache():
+    ktc = KVTransferConfig.from_cli(
+        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+    # memory. Reduce the value if your GPU has less memory.
+    # Note that LMCache is not compatible with chunked prefill for now.
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+              kv_transfer_config=ktc,
+              max_model_len=8000,
+              enable_chunked_prefill=False,
+              gpu_memory_utilization=0.8)
 
-outputs = llm.generate(first_prompt, sampling_params)
-for output in outputs:
-    generated_text = output.outputs[0].text
-    print(f"Generated text: {generated_text!r}")
-print("First request done.")
+    try:
+        yield llm
+    finally:
+        # Clean up lmcache backend
+        LMCacheEngineBuilder.destroy(ENGINE_NAME)
 
-time.sleep(1)
 
-outputs = llm.generate(second_prompt, sampling_params)
-for output in outputs:
-    generated_text = output.outputs[0].text
-    print(f"Generated text: {generated_text!r}")
-print("Second request done.")
+def print_output(
+    llm: LLM,
+    prompt: list[str],
+    sampling_params: SamplingParams,
+    req_str: str,
+):
+    start = time.time()
+    outputs = llm.generate(prompt, sampling_params)
+    print("-" * 50)
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Generated text: {generated_text!r}")
+    print(f"Generation took {time.time() - start:.2f} seconds, "
+          f"{req_str} request done.")
+    print("-" * 50)
 
-# Clean up lmcache backend
-LMCacheEngineBuilder.destroy(ENGINE_NAME)
+
+def main():
+    setup_environment_variables()
+
+    with build_llm_with_lmcache() as llm:
+
+        # This example script runs two requests with a shared prefix.
+        # Define the shared prompt and specific prompts
+        shared_prompt = "Hello, how are you?" * 1000
+        first_prompt = [
+            shared_prompt + "Hello, my name is",
+        ]
+        second_prompt = [
+            shared_prompt + "Tell me a very long story",
+        ]
+
+        sampling_params = SamplingParams(temperature=0,
+                                         top_p=0.95,
+                                         max_tokens=10)
+
+        # Print the first output
+        print_output(llm, first_prompt, sampling_params, "first")
+
+        time.sleep(1)
+
+        # print the second output
+        print_output(llm, second_prompt, sampling_params, "second")
+
+
+if __name__ == "__main__":
+    main()
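For context, the core of this refactor is wrapping engine construction and teardown in a contextlib.contextmanager, so that LMCacheEngineBuilder.destroy(ENGINE_NAME) runs even if generation raises. A minimal, self-contained sketch of that pattern (using a hypothetical stand-in resource instead of the real LLM/LMCache objects, so it runs without a GPU) is:

import contextlib

@contextlib.contextmanager
def build_resource():
    # Stand-in for constructing the LLM with its KV-transfer config.
    resource = {"name": "llm"}
    try:
        yield resource
    finally:
        # Stand-in for LMCacheEngineBuilder.destroy(ENGINE_NAME): the cleanup
        # runs even if the body of the `with` block raises.
        print(f"cleaned up {resource['name']}")

with build_resource() as r:
    print(f"using {r['name']}")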