import tempfile
from collections import OrderedDict
from typing import Dict, List, TypedDict
from unittest.mock import MagicMock, patch

import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download

import vllm
from vllm.config import LoRAConfig
from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model

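# Typed metadata for the long-context LoRA checkpoints exercised by these
# tests: ContextIDInfo pairs a lora_id with a context length, and ContextInfo
# pairs a downloaded adapter path with a context length.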
class ContextIDInfo(TypedDict):
    lora_id: int
    context_length: str

class ContextInfo(TypedDict):
    lora: str
    context_length: str

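# Three long-context adapters: two with 16k context and one with 32k.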
LONG_LORA_INFOS: List[ContextIDInfo] = [{
    "lora_id": 1,
    "context_length": "16k",
}, {
    "lora_id": 2,
    "context_length": "16k",
}, {
    "lora_id": 3,
    "context_length": "32k",
}]

@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Allow subdirectories to skip global cleanup by overriding this fixture.

    This can provide a ~10x speedup for non-GPU unit tests since they don't
    need to initialize torch.
    """
    return not request.node.get_closest_marker("skip_global_cleanup")

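# A test can opt out of the global cleanup performed by cleanup_fixture below
# by declaring the marker checked above, e.g. (hypothetical test; assumes the
# "skip_global_cleanup" marker is registered for the suite):
#
#     @pytest.mark.skip_global_cleanup
#     def test_metadata_only():
#         ...
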
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
    if should_do_global_cleanup_after_test:
        cleanup_dist_env_and_memory(shutdown_ray=True)

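# Single-process distributed setup: the fixtures below rendezvous through a
# temporary file and use the NCCL backend, so they assume a CUDA-capable GPU
# is available.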
@pytest.fixture
def dist_init():
    temp_file = tempfile.mkstemp()[1]
    init_distributed_environment(
        world_size=1,
        rank=0,
        distributed_init_method=f"file://{temp_file}",
        local_rank=0,
        backend="nccl",
    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory(shutdown_ray=True)

@pytest.fixture
def dist_init_torch_only():
    if torch.distributed.is_initialized():
        return
    temp_file = tempfile.mkstemp()[1]
    torch.distributed.init_process_group(
        backend="nccl",
        world_size=1,
        rank=0,
        init_method=f"file://{temp_file}",
    )

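# A small toy model assembled from vLLM's tensor-parallel layers. The trailing
# lm_head / logits_processor / sampler mirror the tail of a real causal LM so
# tests can exercise LoRA wrapping on both linear and vocab-parallel layers.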
@pytest.fixture
def dummy_model() -> nn.Module:
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("output", ColumnParallelLinear(50, 10)),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
            ("sampler", Sampler())
        ]))
    model.config = MagicMock()
    return model

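# Same toy model as dummy_model, but with a MergedColumnParallelLinear
# "gate_up_proj" in place of the plain output projection, matching the fused
# gate/up naming used by LLaMA-style MLPs.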
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
            ("sampler", Sampler())
        ]))
    model.config = MagicMock()
    return model

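# The session-scoped fixtures below download LoRA adapter checkpoints from
# the Hugging Face Hub once per test session; each returns the local
# snapshot directory path.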
@pytest.fixture(scope="session")
|
2024-07-22 15:42:40 -07:00
|
|
|
def sql_lora_huggingface_id():
|
|
|
|
# huggingface repo id is used to test lora runtime downloading.
|
|
|
|
return "yard1/llama-2-7b-sql-lora-test"
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def sql_lora_files(sql_lora_huggingface_id):
|
|
|
|
return snapshot_download(repo_id=sql_lora_huggingface_id)
|
2024-01-24 00:26:37 +01:00
|
|
|
|
|
|
|
|
2024-02-13 15:55:45 -08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def mixtral_lora_files():
|
2024-07-01 02:11:15 +09:00
|
|
|
# Note: this module has incorrect adapter_config.json to test
|
|
|
|
# https://github.com/vllm-project/vllm/pull/5909/files.
|
|
|
|
return snapshot_download(repo_id="SangBinCho/mixtral-lora")
|
2024-02-13 15:55:45 -08:00
|
|
|
|
|
|
|
|
2024-10-04 09:24:40 -07:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def mixtral_lora_files_all_target_modules():
|
|
|
|
return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
|
|
|
|
|
|
|
|
|
2024-02-28 13:03:28 -08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def gemma_lora_files():
|
|
|
|
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
|
|
|
|
|
|
|
|
|
2024-03-26 09:09:31 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def chatglm3_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def baichuan_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
|
|
|
|
|
|
|
|
|
2024-04-19 15:59:54 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def baichuan_zero_lora_files():
|
|
|
|
# all the lora_B weights are initialized to zero.
|
|
|
|
return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
|
|
|
|
|
|
|
|
|
2024-10-11 20:31:21 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def baichuan_regex_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex")
|
|
|
|
|
|
|
|
|
2024-09-29 14:59:45 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def minicpmv_lora_files():
|
|
|
|
return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
|
|
|
|
|
|
|
|
|
2024-04-12 12:02:44 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def tinyllama_lora_files():
|
|
|
|
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
|
|
|
|
|
|
|
|
2024-05-21 13:24:17 +08:00
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def phi2_lora_files():
|
|
|
|
return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
|
|
|
|
|
|
|
|
|
2024-05-18 16:05:23 +09:00
|
|
|
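# Long-context LoRA checkpoints, one per entry in LONG_LORA_INFOS above.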
@pytest.fixture(scope="session")
|
|
|
|
def long_context_lora_files_16k_1():
|
|
|
|
return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def long_context_lora_files_16k_2():
|
|
|
|
return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
|
|
def long_context_lora_files_32k():
|
|
|
|
return snapshot_download(repo_id="SangBinCho/long_context_32k_testing")
|
|
|
|
|
|
|
|
|
|
|
|
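# Assembles the downloaded long-context adapters into a
# {lora_id: ContextInfo} mapping keyed by the ids declared in
# LONG_LORA_INFOS.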
@pytest.fixture(scope="session")
|
|
|
|
def long_context_infos(long_context_lora_files_16k_1,
|
|
|
|
long_context_lora_files_16k_2,
|
|
|
|
long_context_lora_files_32k):
|
2024-10-18 14:30:55 -07:00
|
|
|
cleanup_dist_env_and_memory(shutdown_ray=True)
|
2024-06-15 12:45:31 +08:00
|
|
|
infos: Dict[int, ContextInfo] = {}
|
2024-05-18 16:05:23 +09:00
|
|
|
for lora_checkpoint_info in LONG_LORA_INFOS:
|
|
|
|
lora_id = lora_checkpoint_info["lora_id"]
|
|
|
|
if lora_id == 1:
|
|
|
|
lora = long_context_lora_files_16k_1
|
|
|
|
elif lora_id == 2:
|
|
|
|
lora = long_context_lora_files_16k_2
|
|
|
|
elif lora_id == 3:
|
|
|
|
lora = long_context_lora_files_32k
|
|
|
|
else:
|
|
|
|
raise AssertionError("Unknown lora id")
|
|
|
|
infos[lora_id] = {
|
|
|
|
"context_length": lora_checkpoint_info["context_length"],
|
|
|
|
"lora": lora,
|
|
|
|
}
|
|
|
|
return infos
|
|
|
|
|
|
|
|
|
2024-01-24 00:26:37 +01:00
|
|
|
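# Builds a full LLM engine with get_model patched so the model is loaded with
# a LoRAConfig (and hence the extra LoRA vocab embeddings) even though LoRA is
# disabled on the engine itself.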
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
    cleanup_dist_env_and_memory(shutdown_ray=True)
    get_model_old = get_model

    def get_model_patched(**kwargs):
        kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
                                                       max_lora_rank=8)
        return get_model_old(**kwargs)

    with patch("vllm.worker.model_runner.get_model", get_model_patched):
        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
    yield engine.llm_engine
    del engine
    cleanup_dist_env_and_memory(shutdown_ray=True)

@pytest.fixture
def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
           model_runner.model)