"""Compare vLLM and HuggingFace outputs for the Ultravox audio model."""
from typing import List, Optional, Tuple, Type

import numpy as np
import pytest
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import HfRunner, VllmRunner
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"

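# A (waveform, sample_rate) pair, as produced by
# AudioAsset.audio_and_sample_rate.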
AudioTuple = Tuple[np.ndarray, int]

# vLLM and HF expect different placeholder strings for the audio input.
VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
HF_PLACEHOLDER = "<|audio|>"


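# Audio fixtures: `audio` yields one clip per test case, while `audio_assets`
# provides both clips for the multi-audio test.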
@pytest.fixture(scope="session")
def audio_assets():
    from vllm.assets.audio import AudioAsset
    return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]


@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
def audio(request):
    from vllm.assets.audio import AudioAsset
    return AudioAsset(request.param)


def _get_prompt(audio_count, question, placeholder):
    """Build a chat prompt containing ``audio_count`` audio placeholders."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    placeholder = f"{placeholder}\n" * audio_count

    return tokenizer.apply_chat_template([{
        'role': 'user',
        'content': f"{placeholder}{question}"
    }],
                                         tokenize=False,
                                         add_generation_prompt=True)


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = output_ids[:]
    hf_output_str = output_str
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)

    return hf_output_ids, hf_output_str, out_logprobs


def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    prompts_and_audios: List[Tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Inference result should be the same between hf and vllm."""
    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

    # NOTE: run vLLM first, and only then HF.
    # vLLM needs a fresh process without CUDA initialization; if HF runs
    # first, CUDA is already initialized and that breaks the multiprocessing
    # backend when it uses the fork start method (the default).
    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
        vllm_outputs_per_audio = [
            vllm_model.generate_greedy_logprobs([vllm_prompt],
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                audios=[audio])
            for vllm_prompt, _, audio in prompts_and_audios
        ]

    def process(hf_inputs: BatchEncoding, **kwargs):
        # Cast the audio features to the torch dtype the HF model runs in.
        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
            .to(torch_dtype)  # type: ignore
        return hf_inputs

    with hf_runner(model,
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModel) as hf_model:
        import librosa

        # Resample the audio to 16 kHz before passing it to the HF model.
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[(librosa.resample(audio[0],
                                          orig_sr=audio[1],
                                          target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
                                        vllm_outputs_per_audio):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )


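# Multi-audio prompts are exercised through vLLM only; the HF Ultravox model
# does not support multiple audios per prompt yet.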
def run_multi_audio_test(
    vllm_runner: Type[VllmRunner],
    prompts_and_audios: List[Tuple[str, List[AudioTuple]]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True,
                     limit_mm_per_prompt={
                         "audio":
                         max((len(audio) for _, audio in prompts_and_audios))
                     }) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [prompt for prompt, _ in prompts_and_audios],
            max_tokens,
            num_logprobs=num_logprobs,
            audios=[audios for _, audios in prompts_and_audios])

    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
    assert all(tokens for tokens, *_ in vllm_outputs)


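# End-to-end tests: single-audio outputs are compared against HF via logprobs;
# the multi-audio test only checks that vLLM generates some output.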
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                num_logprobs: int) -> None:

    vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER)
    hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER)
    run_test(
        hf_runner,
        vllm_runner,
        [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )


@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                     max_tokens: int,
                                     num_logprobs: int) -> None:

    vllm_prompt = _get_prompt(len(audio_assets),
                              "Describe each of the audios above.",
                              VLLM_PLACEHOLDER)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate
                        for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )