# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates example usage of CPU offloading with LMCache.

Note that `lmcache` is needed to run this example.

Requirements: Linux, Python 3.10 or higher, CUDA 12.1.

To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import contextlib
import os
import time

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    # LMCache is set to use 256 tokens per chunk
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Enable local CPU backend in LMCache
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
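

# NOTE: LMCache reads these settings when its cache engine is created, so
# setup_environment_variables() must run before the LLM is constructed
# (main() below calls it first).

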
@contextlib.contextmanager
def build_llm_with_lmcache():
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
    # Set GPU memory utilization to 0.8 for a GPU with 40GB of memory
    # (e.g. an A100 40GB). Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              kv_transfer_config=ktc,
              max_model_len=8000,
              enable_chunked_prefill=True,
              gpu_memory_utilization=0.8)

    try:
        yield llm
    finally:
        # Clean up the LMCache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
):
    # Time the whole generate() call so cache reuse shows up as lower latency.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print(f"Generation took {time.time() - start:.2f} seconds, "
          f"{req_str} request done.")
    print("-" * 50)


def main():
    setup_environment_variables()

    with build_llm_with_lmcache() as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and the request-specific prompts.
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]
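
        # Greedy decoding (temperature=0) keeps the generated text
        # deterministic, so timing differences between the two requests
        # reflect caching rather than sampling variance.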
        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")
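
        # At this point LMCache has stored the prompt's KV cache (offloaded
        # to CPU), since kv_role "kv_both" both stores and retrieves.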

        # Brief pause between the two requests.
        time.sleep(1)

        # Print the second output
        print_output(llm, second_prompt, sampling_params, "second")
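
        # With CPU offloading enabled, the second request should typically
        # finish faster than the first: the KV cache of the long shared
        # prefix is fetched from LMCache's CPU pool instead of recomputed.

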
if __name__ == "__main__":
    main()