[CI/Build] LoRA: Delete long context tests (#15503)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
parent a5cfbab3c8
commit ff38f0a32c
@@ -287,7 +287,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
   parallelism: 4

 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -592,8 +592,6 @@ steps:
     # FIXIT: find out which code initialize cuda before running the test
     # before the fix, we need to use spawn to test it
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    # This test runs llama 13B, so it is required to run on 4 GPUs.
-    - pytest -v -s -x lora/test_long_context.py
     # There is some Tensor Parallelism related processing logic in LoRA that
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
tests/lora/test_long_context.py (deleted)
@@ -1,301 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

import ast
from typing import Optional

import numpy as np
import pytest

import vllm
from vllm import SamplingParams
from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLoRA
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.rotary_embedding import (
    LinearScalingRotaryEmbedding)

from .data.long_context_test_data import prompts_and_responses

context_len_to_scaling_factor = {
    "16k": 4,
    "32k": 8,
}

# We use the same sampling params for all requests
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=100,
)

def _create_lora_request(lora_id, long_context_infos):
    context_len = long_context_infos[lora_id]["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
    return LoRARequest(
        # There are 2 LoRAs for 16K, so we append lora_id to the name
        # to distinguish them.
        context_len + str(lora_id),
        lora_id,
        long_context_infos[lora_id]["lora"],
        None,
        4096 * scaling_factor,
    )
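# Illustrative sketch (not part of the original test file): with a
# hypothetical `long_context_infos` entry shaped like the conftest fixture,
# i.e. {lora_id: {"context_length": "16k" or "32k", "lora": <adapter path>}},
# the helper above derives both the request name and the long-context cap.
# The adapter path is a placeholder, and the attribute names assume the
# LoRARequest fields (`lora_name`, `long_lora_max_len`) that this test
# targeted.
def _example_create_lora_request_sketch():
    example_infos = {
        1: {"context_length": "16k", "lora": "/placeholder/lora-16k"}
    }
    request = _create_lora_request(1, example_infos)
    assert request.lora_name == "16k1"
    assert request.long_lora_max_len == 4096 * 4  # "16k" -> scaling factor 4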
def evaluate_json_response(model_response, golden_response):
    """Evaluates the model response against the golden response.

    Returns a score between 0 and 1, where 1 is a perfect match and 0 is no
    match. The score quantifies how well the model is able to extract the
    golden JSON from the long context.
    """
    try:
        model_response = ast.literal_eval(model_response)
    except Exception as e:
        raise ValueError(
            f"Model response is not a valid JSON. Expected {golden_response}, "
            f"got {model_response}") from e

    # Normally, we would flatten the dictionary and compare the values, but in
    # this case, we know that the dictionary is only 2 levels deep
    positive_values = 0
    total_values = 0
    # We look at all the attributes of the person that we are extracting a
    # biography of and compare them to the golden response
    for person_attribute, person_attribute_value in golden_response.items():
        if person_attribute in model_response:
            if isinstance(person_attribute_value, dict):
                for (sub_attribute,
                     sub_attribute_value) in person_attribute_value.items():
                    total_values += 1
                    if sub_attribute in model_response[
                            person_attribute] and model_response[
                                person_attribute][
                                    sub_attribute] == sub_attribute_value:
                        positive_values += 1
            else:
                total_values += 1
                if model_response[person_attribute] == person_attribute_value:
                    positive_values += 1
        else:
            # We count a missing sub-dict as a single missed value.
            total_values += 1

    # Return a score between 0 and 1
    return positive_values / total_values
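# Illustrative sketch (not part of the original test file): the scorer above
# counts leaf values of the golden dict. With a hypothetical golden biography
# holding three leaf values and a model reply that matches two of them, the
# score is 2/3.
def _example_evaluate_json_response_sketch():
    golden = {"name": "Ada", "work": {"field": "math", "era": "1800s"}}
    model = "{'name': 'Ada', 'work': {'field': 'math', 'era': '1900s'}}"
    assert abs(evaluate_json_response(model, golden) - 2 / 3) < 1e-9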
def generate(
    llm: vllm.LLM,
    inputs: tuple[str, SamplingParams, Optional[LoRARequest]],
):
    prompts, sampling_param, lora_request = inputs
    outputs = llm.generate(prompts, sampling_param, lora_request=lora_request)
    return outputs[0].outputs[0].text.strip()


def batched_generate(
    llm: vllm.LLM,
    inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    for input in inputs:
        prompt, sampling_param, lora_req = input
        # Add requests to the engine and run the engine
        llm._validate_and_add_requests(prompt,
                                       sampling_param,
                                       lora_request=lora_req,
                                       prompt_adapter_request=None)

    outputs = llm._run_engine(use_tqdm=True)
    return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
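# Hedged sketch (not part of the original test file): `batched_generate`
# drives private LLM internals so that every request is queued before the
# engine runs. Assuming `LLM.generate` accepts parallel lists of sampling
# params and LoRA requests (as recent vLLM releases do), a rough public-API
# variant would look like this.
def _batched_generate_public_api_sketch(
    llm: vllm.LLM,
    inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    prompts = [prompt for prompt, _, _ in inputs]
    params = [param for _, param, _ in inputs]
    lora_requests = [lora for _, _, lora in inputs]
    outputs = llm.generate(prompts, params, lora_request=lora_requests)
    return [output.outputs[0].text.strip() for output in outputs]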
@pytest.fixture(scope="module")
def lora_llm(long_context_infos):
    scaling_factors = [
        context_len_to_scaling_factor[info["context_length"]]
        for info in long_context_infos.values()
    ]

    llm = vllm.LLM(
        "meta-llama/Llama-2-13b-chat-hf",
        enable_lora=True,
        max_num_seqs=16,
        max_loras=2,
        long_lora_scaling_factors=tuple(scaling_factors),
        max_num_batched_tokens=4096 * 8,
        tensor_parallel_size=4,
        # FIXME enable async output processor
        disable_async_output_proc=True,
        distributed_executor_backend="mp",
        enable_chunked_prefill=True)
    yield llm
    del llm
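# Illustrative sketch (not part of the original test file): for a
# hypothetical fixture with one 16k and one 32k adapter, the scaling factors
# handed to the LLM above are (4, 8), stretching Llama-2's 4096-token window
# to 16384 and 32768 tokens respectively. The adapter paths are placeholders.
def _example_scaling_factors_sketch():
    example_infos = {
        1: {"context_length": "16k", "lora": "/placeholder/lora-16k"},
        2: {"context_length": "32k", "lora": "/placeholder/lora-32k"},
    }
    scaling_factors = [
        context_len_to_scaling_factor[info["context_length"]]
        for info in example_infos.values()
    ]
    assert tuple(scaling_factors) == (4, 8)
    assert [4096 * s for s in scaling_factors] == [16384, 32768]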
def test_rotary_emb_replaced(dist_init):
    """Verify that the rotary embedding in all the layers is replaced."""
    from vllm.engine.arg_utils import EngineArgs
    from vllm.worker.model_runner import ModelRunner
    engine_args = EngineArgs("meta-llama/Llama-2-7b-hf",
                             long_lora_scaling_factors=(4.0, ),
                             enable_lora=True)
    engine_config = engine_args.create_engine_config()
    model_runner = ModelRunner(
        vllm_config=engine_config,
        is_driver_worker=True,
    )
    model_runner.load_model()
    rotary_emb_count = 0
    for module_name, module in model_runner.model.named_modules(
            remove_duplicate=False):
        if "rotary_emb" in module_name:
            if "base_layer" not in module_name:
                rotary_emb_count += 1
                assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
            else:
                assert isinstance(module, LinearScalingRotaryEmbedding)
    # Llama 2 has 32 layers.
    assert rotary_emb_count == 32

@pytest.mark.skip_global_cleanup
def test_batched_rope_kernel(lora_llm, long_context_infos):
    """We test the batched kernel by comparing the results of batched and
    non-batched generation.
    """
    # Create non batched results first to compare against batched results
    non_batched_results: list[str] = []

    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        lora_prompt = (prompts_and_responses[context_len][0]["prompt"],
                       sampling_params,
                       _create_lora_request(lora_id, long_context_infos))
        lora_output = generate(lora_llm, lora_prompt)
        non_batched_results.append(lora_output)

    # Create batched results
    # Each element of the batch must be
    # (prompt, prompt_sampling_params, prompt_lora_request)
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        batched_prompts.extend([
            (prompts_and_responses[context_len][0]["prompt"], sampling_params,
             _create_lora_request(lora_id, long_context_infos))
        ])
    batched_results = batched_generate(lora_llm, batched_prompts)

    # Results should be the same
    for non_batched, batched in zip(non_batched_results, batched_results):
        assert non_batched == batched, (
            "Non batched and batched results should be the "
            f"same:\n{batched}\n{non_batched}")

@pytest.mark.skip_global_cleanup
def test_self_consistency(lora_llm, long_context_infos):
    """We test consistency of the batched kernel by permuting batched
    inputs and comparing the results to the non-permuted batched results.
    """
    num_loras = len(long_context_infos)

    # Create results in order of long_context_infos
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        batched_prompts.extend([
            (prompts_and_responses[context_len][0]["prompt"], sampling_params,
             _create_lora_request(lora_id, long_context_infos))
        ])

    batched_results = batched_generate(lora_llm, batched_prompts)

    permutation = np.random.default_rng(seed=42).permutation(num_loras)

    # Create results in random order of permutation
    batched_prompts = []
    for i in permutation:
        lora_id, info = list(long_context_infos.items())[i]
        context_len = info["context_length"]
        batched_prompts.extend([
            (prompts_and_responses[context_len][0]["prompt"], sampling_params,
             _create_lora_request(lora_id, long_context_infos))
        ])

    permutated_batched_results = batched_generate(lora_llm, batched_prompts)

    # Results should be the same
    for i in range(num_loras):
        assert batched_results[i] == permutated_batched_results[
            permutation[i]], (
                f"Results should be the same:\n{batched_results[i]}"
                f"\n{permutated_batched_results[permutation[i]]}")

@pytest.mark.skip_global_cleanup
def test_quality(lora_llm, long_context_infos):
    """We test the quality of the answers given by the LoRA model by
    comparing the generated text to the merged model's outputs.

    This is effectively a mini-benchmark over four prompts.
    If this test fails, this indicates that the quality of the LoRA model
    is suboptimal compared to the merged model. For example, if the model
    does not output valid dictionaries, this test will fail.

    If needed for testing, the merged versions of the models are available
    as part of the `conftest`.

    The test is expected to run for about 1 minute on a p4de.24xlarge
    instance.
    """
    scores: list[float] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        for prompt_and_response in prompts_and_responses[context_len]:
            lora_prompt = (prompt_and_response["prompt"], sampling_params,
                           _create_lora_request(lora_id, long_context_infos))
            response = generate(lora_llm, lora_prompt)
            golden_answer = prompt_and_response["golden_answer"]
            score = evaluate_json_response(response, golden_answer)
            scores.append(score)
            assert score > 0.3, ("Quality of the answer is not good enough. "
                                 f"Expected {golden_answer}, got {response}")
    assert np.mean(scores) > 0.5

@pytest.mark.skip_global_cleanup
def test_max_len(lora_llm, long_context_infos):
    """Test that we raise a ValueError when the input of a given LoRA
    model exceeds the maximum length."""
    # Since each LoRA model has a different maximum length, we need to
    # test each one separately
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        lora_request = _create_lora_request(lora_id, long_context_infos)
        # Good prompt should be fine
        good_prompt = prompts_and_responses[context_len][0]["prompt"]
        generate(lora_llm, (good_prompt, sampling_params, lora_request))
        # Bad prompt should raise an error
        bad_prompt = good_prompt * 2
        with pytest.raises(ValueError):
            generate(lora_llm, (bad_prompt, sampling_params, lora_request))

    # Also test batched
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id_with_bad_inputs in long_context_infos:
        for lora_id, info in long_context_infos.items():
            context_len = info["context_length"]
            batched_prompts.extend([
                (prompts_and_responses[context_len][0]["prompt"] *
                 (2 if lora_id == lora_id_with_bad_inputs else 1),
                 sampling_params,
                 _create_lora_request(lora_id, long_context_infos))
            ])
    # Turn good prompt into bad prompt inside of batched prompts

    with pytest.raises(ValueError):
        batched_generate(lora_llm, batched_prompts)