# Test the LLMEngine with multi-step decoding

import pytest

from ..models.utils import check_outputs_equal

MODELS = [
    "JackFram/llama-160m",
]
NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
NUM_PROMPTS = [10]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
def test_multi_step_llm(hf_runner, vllm_runner, example_prompts, model: str,
                        dtype: str, tp_size: int, max_tokens: int,
                        enforce_eager: bool, num_scheduler_steps: int,
                        num_prompts: int) -> None:
    """Compare greedy outputs from multi-step vLLM decoding against HF."""
    # Repeat the example prompts until there are at least num_prompts of them,
    # then truncate to exactly num_prompts.
    prompts = example_prompts
    if len(prompts) < num_prompts:
        prompts = prompts * ((num_prompts // len(prompts)) + 1)
    prompts = prompts[:num_prompts]
    assert len(prompts) == num_prompts

    with vllm_runner(model,
                     dtype=dtype,
                     enforce_eager=enforce_eager,
                     gpu_memory_utilization=0.7,
                     tensor_parallel_size=tp_size,
                     use_v2_block_manager=True,
                     num_scheduler_steps=num_scheduler_steps) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )