# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates example usage of CPU offloading with LMCache.

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python 3.10 or higher, CUDA 12.1.
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import contextlib
import os
import time

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    # LMCache is set to use 256 tokens per chunk
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Enable local CPU backend in LMCache
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"


@contextlib.contextmanager
def build_llm_with_lmcache():
    # Use LMCache as the KV connector; "kv_both" lets this instance both
    # store and load KV caches.
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
    # Set GPU memory utilization to 0.8 for an A40 GPU (48 GB of memory).
    # Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              kv_transfer_config=ktc,
              max_model_len=8000,
              enable_chunked_prefill=True,
              gpu_memory_utilization=0.8)

    try:
        yield llm
    finally:
        # Clean up the LMCache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
):
    # Generate output for the given prompt and report the elapsed time.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print(f"Generation took {time.time() - start:.2f} seconds, "
          f"{req_str} request done.")
    print("-" * 50)


def main():
    setup_environment_variables()

    with build_llm_with_lmcache() as llm:
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and the request-specific prompts.
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]

        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

        time.sleep(1)

        # Print the second output
        print_output(llm, second_prompt, sampling_params, "second")


if __name__ == "__main__":
    main()
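
# A rough sketch of how to run this example (assuming `vllm` and `lmcache`
# are installed in the current environment and a CUDA GPU is available;
# the filename below is hypothetical):
#
#   python cpu_offload_lmcache.py
#
# Because both prompts share the same long prefix, the second request can
# reuse that prefix's KV cache from LMCache's local CPU backend and should
# typically finish noticeably faster than the first.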