# vllm/tests/lora/test_worker.py
# SPDX-License-Identifier: Apache-2.0
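"""Worker-level test of LoRA adapter loading and activation.

Runs against both the v0 Worker and the v1 GPU worker (via the autouse
fixture below) and checks list_loras() after activating an empty set,
the full set, and random subsets of adapters.
"""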

import os
import random
import tempfile
from typing import Union
from unittest.mock import patch

import pytest

import vllm.envs as envs
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ParallelConfig, SchedulerConfig,
                         VllmConfig)
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker as V1Worker
from vllm.worker.worker import Worker


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test.
    # This can be promoted up to conftest.py to run for every
    # test in a package.
    pass


@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):

    def set_active_loras(worker: Union[Worker, V1Worker],
                         lora_requests: list[LoRARequest]):
        lora_mapping = LoRAMapping([], [])
        if isinstance(worker, Worker):
            # v0 case
            worker.model_runner.set_active_loras(lora_requests, lora_mapping)
        else:
            # v1 case
            worker.model_runner.lora_manager.set_active_adapters(
                lora_requests, lora_mapping)
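
    # Pick the worker implementation for the engine under test
    # (VLLM_USE_V1 is expected to be toggled per run by the
    # run_with_both_engines_lora fixture).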
    worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker

    vllm_config = VllmConfig(
        model_config=ModelConfig(
            "meta-llama/Llama-2-7b-hf",
            task="auto",
            tokenizer="meta-llama/Llama-2-7b-hf",
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
            dtype="float16",
            revision=None,
            enforce_eager=True,
        ),
        load_config=LoadConfig(
            download_dir=None,
            # Dummy-initialized weights keep the test independent of a
            # real checkpoint download.
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(1, 1, False),
        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(block_size=16,
                                 gpu_memory_utilization=1.,
                                 swap_space=0,
                                 cache_dtype="auto"),
        lora_config=LoRAConfig(max_lora_rank=8,
                               max_cpu_loras=32,
                               max_loras=32),
    )
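
    # A file:// distributed_init_method lets torch.distributed rendezvous
    # through a temp file, so this single-process test needs no open port.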
    worker = worker_cls(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
    )
    worker.init_device()
    worker.load_model()
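
    # With no adapters activated, the worker should report an empty set.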
    set_active_loras(worker, [])
    assert worker.list_loras() == set()
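
    # One request per adapter slot (max_loras=32); LoRA ids are 1-based.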
    n_loras = 32
    lora_requests = [
        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
    ]

    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
        lora_request.lora_int_id
        for lora_request in lora_requests
    }
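
    # Activate random subsets; afterwards every adapter requested in the
    # iteration must be among the worker's loaded LoRAs.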
    for i in range(32):
        random.seed(i)
        iter_lora_requests = random.choices(lora_requests,
                                            k=random.randint(1, n_loras))
        random.shuffle(iter_lora_requests)
        # Keep a random non-empty prefix of the sampled requests.
        iter_lora_requests = iter_lora_requests[:random.randint(
            1, len(iter_lora_requests))]
        set_active_loras(worker, iter_lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id
             for lora_request in iter_lora_requests})
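
# A sketch of how to run this test locally (assumes a CUDA GPU and access
# to the meta-llama/Llama-2-7b-hf tokenizer, which is gated on HF Hub):
#   pytest tests/lora/test_worker.py -v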