[CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 2755c34a8f
commit 86c3369eb8
@@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
         monkeypatch.setenv('VLLM_USE_V1', '0')
 
     yield
+
+
+@pytest.fixture
+def reset_default_device():
+    """
+    Some tests, such as `test_punica_ops.py`, explicitly set the
+    default device, which can affect subsequent tests. Adding this fixture
+    helps avoid this problem.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
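For context, here is a minimal sketch (not part of the diff) of how the new reset_default_device fixture is meant to be used from a test module sitting next to this conftest.py; the test names below are hypothetical.

import torch


def test_that_sets_the_device(reset_default_device):
    # A test like those in test_punica_ops.py may change the process-wide
    # default device for convenience.
    torch.set_default_device("meta")
    assert torch.empty(2).device.type == "meta"
    # On teardown the fixture restores the device it snapshotted before the
    # test ran, so the change does not leak into later tests.


def test_that_runs_afterwards():
    # Without the fixture on the previous test, this tensor would silently
    # be created on the "meta" device as well.
    assert torch.empty(2).device.type != "meta"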
@@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
                        max_num_seqs=16,
                        max_loras=4,
                        max_lora_rank=64,
-                       tensor_parallel_size=1,
                        trust_remote_code=True,
                        fully_sharded_loras=fully_sharded)
     output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
@@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=64,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 
@@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
 
 @pytest.fixture(autouse=True)
-def clean_cache():
+def clean_cache_reset_device(reset_default_device):
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
@@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
                    # also test odd max_num_seqs
                    max_num_seqs=13,
                    max_loras=4,
-                   tensor_parallel_size=1,
                    enable_chunked_prefill=True)
     generate_and_test(llm, sql_lora_files)
 
@@ -13,6 +13,11 @@ from vllm.platforms import current_platform
 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
 
 
+@pytest.fixture(autouse=True)
+def reset_device(reset_default_device):
+    pass
+
+
 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
         nslices: int, inputs_tensor: torch.Tensor,
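The clean_cache_reset_device and reset_device changes above rely on pytest fixture chaining: an autouse fixture in the test module simply requests the shared conftest fixture, so it runs around every test in that module without being repeated. A self-contained sketch of the pattern, with illustrative names only:

import pytest

calls = []


@pytest.fixture
def shared_cleanup():
    # Stands in for a conftest-level fixture such as reset_default_device.
    calls.append("setup")
    yield
    calls.append("teardown")


@pytest.fixture(autouse=True)
def per_test_cleanup(shared_cleanup):
    # Empty autouse wrapper, mirroring reset_device(reset_default_device):
    # merely depending on the shared fixture makes it run for every test
    # in this module.
    pass


def test_fixture_ran():
    assert calls == ["setup"]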
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", [1])
-def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-                          tp_size):
-    if num_gpus_available < tp_size and \
-            tp_size > 1 and current_platform.is_cuda_alike():
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+def test_quant_model_lora(tinyllama_lora_files, model):
 
     llm = vllm.LLM(
         model=model.model_path,
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
         max_num_seqs=16,
         max_loras=4,
         max_model_len=400,
-        tensor_parallel_size=tp_size,
         gpu_memory_utilization=0.2, #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
-        tensor_parallel_size=1,
         gpu_memory_utilization=0.2, #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
@@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
                    enable_lora=True,
                    max_loras=4,
                    max_lora_rank=16,
-                   tensor_parallel_size=1,
                    trust_remote_code=True,
                    enable_chunked_prefill=True)
 