import tempfile
from collections import OrderedDict
from typing import Dict, List, TypedDict
from unittest.mock import MagicMock, patch

import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download

import vllm
from vllm.config import LoRAConfig
from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model

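# Typed metadata for the long-context LoRA checkpoints exercised by these
# tests: ContextIDInfo pairs a lora_id with a context length, and ContextInfo
# pairs a downloaded adapter path with a context length.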
class ContextIDInfo(TypedDict):
    lora_id: int
    context_length: str

class ContextInfo(TypedDict):
    lora: str
    context_length: str

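# Three long-context adapters: two with 16k context and one with 32k.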
LONG_LORA_INFOS: List[ContextIDInfo] = [{
    "lora_id": 1,
    "context_length": "16k",
}, {
    "lora_id": 2,
    "context_length": "16k",
}, {
    "lora_id": 3,
    "context_length": "32k",
}]

@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Allow subdirectories to skip global cleanup by overriding this fixture.

    This can provide a ~10x speedup for non-GPU unit tests since they don't
    need to initialize torch.
    """
    return not request.node.get_closest_marker("skip_global_cleanup")

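# A test can opt out of the global cleanup performed by cleanup_fixture below
# by declaring the marker checked above, e.g. (hypothetical test; assumes the
# "skip_global_cleanup" marker is registered for the suite):
#
#     @pytest.mark.skip_global_cleanup
#     def test_metadata_only():
#         ...
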
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
    if should_do_global_cleanup_after_test:
        cleanup_dist_env_and_memory(shutdown_ray=True)

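# Single-process distributed setup: the fixtures below rendezvous through a
# temporary file and use the NCCL backend, so they assume a CUDA-capable GPU
# is available.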
@pytest.fixture
def dist_init():
    temp_file = tempfile.mkstemp()[1]
    init_distributed_environment(
        world_size=1,
        rank=0,
        distributed_init_method=f"file://{temp_file}",
        local_rank=0,
        backend="nccl",
    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory(shutdown_ray=True)

@pytest.fixture
def dist_init_torch_only():
    if torch.distributed.is_initialized():
        return
    temp_file = tempfile.mkstemp()[1]
    torch.distributed.init_process_group(
        backend="nccl",
        world_size=1,
        rank=0,
        init_method=f"file://{temp_file}",
    )

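# A small toy model assembled from vLLM's tensor-parallel layers. The trailing
# lm_head / logits_processor / sampler mirror the tail of a real causal LM so
# tests can exercise LoRA wrapping on both linear and vocab-parallel layers.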
@pytest.fixture
def dummy_model() -> nn.Module:
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("output", ColumnParallelLinear(50, 10)),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
            ("sampler", Sampler())
        ]))
    model.config = MagicMock()
    return model

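# Same toy model as dummy_model, but with a MergedColumnParallelLinear
# "gate_up_proj" in place of the plain output projection, matching the fused
# gate/up naming used by LLaMA-style MLPs.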
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
            ("sampler", Sampler())
        ]))
    model.config = MagicMock()
    return model

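# The session-scoped fixtures below download LoRA adapter checkpoints from
# the Hugging Face Hub once per test session; each returns the local
# snapshot directory path.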
@pytest.fixture(scope="session")
|
2024-07-22 15:42:40 -07:00
|
|
|
def sql_lora_huggingface_id():
|
|
|
|
# huggingface repo id is used to test lora runtime downloading.
|
|
|
|
return "yard1/llama-2-7b-sql-lora-test"
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def sql_lora_files(sql_lora_huggingface_id):
|
|
|
|
return snapshot_download(repo_id=sql_lora_huggingface_id)
|
2024-01-24 00:26:37 +01:00
|
|
|
|
|
|
|
|
2024-02-13 15:55:45 -08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def mixtral_lora_files():
|
2024-07-01 02:11:15 +09:00
|
|
|
# Note: this module has incorrect adapter_config.json to test
|
|
|
|
# https://github.com/vllm-project/vllm/pull/5909/files.
|
|
|
|
return snapshot_download(repo_id="SangBinCho/mixtral-lora")
|
2024-02-13 15:55:45 -08:00
|
|
|
|
|
|
|
|
2024-10-04 09:24:40 -07:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def mixtral_lora_files_all_target_modules():
|
|
|
|
return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
|
|
|
|
|
|
|
|
|
2024-02-28 13:03:28 -08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def gemma_lora_files():
|
|
|
|
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
|
|
|
|
|
|
|
|
|
2024-03-26 09:09:31 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def chatglm3_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def baichuan_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
|
|
|
|
|
|
|
|
|
2024-04-19 15:59:54 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def baichuan_zero_lora_files():
|
|
|
|
# all the lora_B weights are initialized to zero.
|
|
|
|
return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
|
|
|
|
|
|
|
|
|
2024-10-11 20:31:21 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def baichuan_regex_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex")
|
|
|
|
|
|
|
|
|
2024-09-29 14:59:45 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def minicpmv_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
|
|
|
|
|
|
|
|
|
2024-04-12 12:02:44 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def tinyllama_lora_files():
|
|
|
|
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
|
|
|
|
|
|
|
|
2024-05-21 13:24:17 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def phi2_lora_files():
|
|
|
|
return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
|
|
|
|
|
|
|
|
|
2024-05-18 16:05:23 +09:00
|
|
|
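# Long-context LoRA checkpoints, one per entry in LONG_LORA_INFOS above.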
@pytest.fixture(scope="session")
|
|
|
|
def long_context_lora_files_16k_1():
|
|
|
|
return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def long_context_lora_files_16k_2():
|
|
|
|
return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def long_context_lora_files_32k():
|
|
|
|
return snapshot_download(repo_id="SangBinCho/long_context_32k_testing")
|
|
|
|
|
|
|
|
|
|
|
|
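# Assembles the downloaded long-context adapters into a
# {lora_id: ContextInfo} mapping keyed by the ids declared in
# LONG_LORA_INFOS.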
@pytest.fixture(scope="session")
|
|
|
|
def long_context_infos(long_context_lora_files_16k_1,
|
|
|
|
long_context_lora_files_16k_2,
|
|
|
|
long_context_lora_files_32k):
|
2024-10-18 14:30:55 -07:00
|
|
|
cleanup_dist_env_and_memory(shutdown_ray=True)
|
2024-06-15 12:45:31 +08:00
|
|
|
infos: Dict[int, ContextInfo] = {}
|
2024-05-18 16:05:23 +09:00
|
|
|
for lora_checkpoint_info in LONG_LORA_INFOS:
|
|
|
|
lora_id = lora_checkpoint_info["lora_id"]
|
|
|
|
if lora_id == 1:
|
|
|
|
lora = long_context_lora_files_16k_1
|
|
|
|
elif lora_id == 2:
|
|
|
|
lora = long_context_lora_files_16k_2
|
|
|
|
elif lora_id == 3:
|
|
|
|
lora = long_context_lora_files_32k
|
|
|
|
else:
|
|
|
|
raise AssertionError("Unknown lora id")
|
|
|
|
infos[lora_id] = {
|
|
|
|
"context_length": lora_checkpoint_info["context_length"],
|
|
|
|
"lora": lora,
|
|
|
|
}
|
|
|
|
return infos
|
|
|
|
|
|
|
|
|
2024-01-24 00:26:37 +01:00
|
|
|
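# Builds a full LLM engine with get_model patched so the model is loaded with
# a LoRAConfig (and hence the extra LoRA vocab embeddings) even though LoRA is
# disabled on the engine itself.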
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
    cleanup_dist_env_and_memory(shutdown_ray=True)
    get_model_old = get_model

    def get_model_patched(**kwargs):
        kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
                                                       max_lora_rank=8)
        return get_model_old(**kwargs)

    with patch("vllm.worker.model_runner.get_model", get_model_patched):
        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
    yield engine.llm_engine
    del engine
    cleanup_dist_env_and_memory(shutdown_ray=True)

@pytest.fixture
def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
           model_runner.model)