import torch

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.worker import Worker


def test_swap() -> None:
    # Configure the engine.
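    # A dummy load format fills the model with random weights, so the test does
    # not need to download a real checkpoint.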
    engine_args = EngineArgs(model="facebook/opt-125m",
                             dtype="half",
                             load_format="dummy")
    (model_config, cache_config, parallel_config, scheduler_config,
     device_config, _) = engine_args.create_engine_configs()
    cache_config.num_gpu_blocks = 100
    cache_config.num_cpu_blocks = 100

    # Create the worker.
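    # Single-process setup: one worker with rank 0 that also acts as the driver,
    # using a distributed init method bound to a free local port.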
    distributed_init_method = get_distributed_init_method(
        get_ip(), get_open_port())
    worker = Worker(
        model_config=model_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        device_config=device_config,
        local_rank=0,
        rank=0,
        distributed_init_method=distributed_init_method,
        is_driver_worker=True,
    )

    # Initialize the worker.
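    # Bring the worker up the way the engine normally would: device, weights,
    # KV cache, then model warm-up.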
    worker.init_device()
    worker.load_model()
    worker.init_cache_engine(cache_config)
    worker.warm_up_model()

    # Randomly initialize the cache.
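    # Fill both caches with random values so that a swapped block can be checked
    # against its source.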
    gpu_cache = worker.cache_engine.gpu_cache
    cpu_cache = worker.cache_engine.cpu_cache
    num_layers = len(gpu_cache)
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]
        gpu_key_cache.random_()
        gpu_value_cache.random_()
        cpu_key_cache, cpu_value_cache = cpu_cache[i]
        cpu_key_cache.random_()
        cpu_value_cache.random_()
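
    # With rtol=0.0 and atol=0.0 this is an exact, element-wise equality check:
    # swapping must copy blocks verbatim.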
    allclose = lambda a, b: torch.allclose(
        a.cuda(), b.cuda(), rtol=0.0, atol=0.0)

    # Test swap out.
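    # blocks_to_swap_out maps GPU block numbers to CPU block numbers. With an
    # empty seq_group_metadata_list, execute_model only performs the cache ops.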
    blocks_to_swap_out = {3: 72, 56: 35, 84: 34}
    worker.execute_model(seq_group_metadata_list=[],
                         blocks_to_swap_in={},
                         blocks_to_swap_out=blocks_to_swap_out,
                         blocks_to_copy={})
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]
        cpu_key_cache, cpu_value_cache = cpu_cache[i]
        for src, dst in blocks_to_swap_out.items():
            assert allclose(gpu_key_cache[src], cpu_key_cache[dst])
            assert allclose(gpu_value_cache[src], cpu_value_cache[dst])

    # Test swap in.
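    # blocks_to_swap_in maps CPU block numbers back to GPU block numbers.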
    blocks_to_swap_in = {19: 45, 67: 23, 12: 78, 40: 99, 1: 71}
    worker.execute_model(seq_group_metadata_list=[],
                         blocks_to_swap_in=blocks_to_swap_in,
                         blocks_to_swap_out={},
                         blocks_to_copy={})
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]
        cpu_key_cache, cpu_value_cache = cpu_cache[i]
        for src, dst in blocks_to_swap_in.items():
            assert allclose(gpu_key_cache[dst], cpu_key_cache[src])
            assert allclose(gpu_value_cache[dst], cpu_value_cache[src])