From f2b20fe491073dad7e3c3fe8ab0303b97fb50643 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 14 Feb 2025 06:18:03 +0000
Subject: [PATCH] Consolidate Llama model usage in tests (#13094)

---
 .../basic_correctness/test_basic_correctness.py  | 10 +++++-----
 tests/basic_correctness/test_chunked_prefill.py  |  6 +++---
 tests/basic_correctness/test_cpu_offload.py      |  2 +-
 tests/basic_correctness/test_cumem.py            |  2 +-
 tests/compile/test_basic_correctness.py          |  2 +-
 tests/compile/utils.py                           | 16 +++++-----------
 tests/distributed/test_pipeline_parallel.py      |  4 ++--
 tests/entrypoints/openai/test_serving_models.py  |  2 +-
 tests/entrypoints/openai/test_shutdown.py        |  2 +-
 tests/kv_transfer/disagg_test.py                 | 10 ++++------
 tests/models/decoder_only/language/test_fp8.py   |  8 ++++----
 tests/models/registry.py                         |  2 +-
 .../test_register_quantization_config.py         |  2 +-
 tests/samplers/test_ignore_eos.py                |  2 +-
 tests/spec_decode/e2e/test_compatibility.py      |  6 +++---
 tests/test_config.py                             |  2 +-
 tests/test_sharded_state_loader.py               |  8 ++++----
 tests/tokenization/test_detokenize.py            |  2 +-
 tests/tokenization/test_get_eos.py               |  4 ++--
 tests/v1/engine/test_async_llm.py                |  2 +-
 tests/v1/sample/test_logprobs.py                 |  2 +-
 tests/v1/sample/test_logprobs_e2e.py             |  2 +-
 22 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index f001a893..bd97dd94 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -17,7 +17,7 @@ from ..utils import multi_gpu_test
 
 MODELS = [
     "google/gemma-2-2b-it",
-    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
 ]
 
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -96,12 +96,12 @@ def test_models(
     "test_suite", [
         ("facebook/opt-125m", "ray", "", "L4"),
         ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
         ("facebook/opt-125m", "ray", "", "A100"),
         ("facebook/opt-125m", "mp", "", "A100"),
         ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
     ])
 def test_models_distributed(
     hf_runner,
@@ -116,7 +116,7 @@ def test_models_distributed(
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
-    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+    if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
         # test ray adag
         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
         os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index cefd54d1..d041f0c4 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
-    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
 ]
 
 
@@ -92,7 +92,7 @@ def test_models_distributed(
 ) -> None:
     override_backend_env_variable(monkeypatch, attention_backend)
 
-    if (model == "meta-llama/Llama-2-7b-hf"
+    if (model == "meta-llama/Llama-3.2-1B-Instruct"
             and distributed_executor_backend == "ray"):
         # test ray adag
         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
     Checks exact match decode with and without prefix caching
     with chunked prefill enabled.
     """
-    model = "meta-llama/Llama-2-7b-chat-hf"
+    model = "meta-llama/Llama-3.2-1B-Instruct"
     # The common prompt has 142 tokens with Llama-2 tokenizer.
     common_prompt = "You are a helpful AI assistant " * 20
     unique_prompts = [
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index b4d558ce..be3ad123 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -4,5 +4,5 @@ from ..utils import compare_two_settings
 
 
 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-3.2-1B", [],
+    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
                          ["--cpu-offload-gb", "1"])
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 3ac94879..f16b8007 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -118,7 +118,7 @@ def test_cumem_with_cudagraph():
 @pytest.mark.parametrize(
     "model",
     [
-        "meta-llama/Llama-3.2-1B",  # sleep mode with safetensors
+        "meta-llama/Llama-3.2-1B-Instruct",  # sleep mode with safetensors
         "facebook/opt-125m"  # sleep mode with pytorch checkpoint
     ])
 def test_end_to_end(model):
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index d7acec69..587c0a60 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -26,7 +26,7 @@ class TestSetting:
 test_settings = [
     # basic llama model
     TestSetting(
-        model="meta-llama/Llama-3.2-1B",
+        model="meta-llama/Llama-3.2-1B-Instruct",
         model_args=[],
         pp_size=2,
         tp_size=2,
diff --git a/tests/compile/utils.py b/tests/compile/utils.py
index e4a88584..fb8270c2 100644
--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -6,7 +6,6 @@ import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
 from vllm.platforms import current_platform
 
 TEST_MODELS = [
@@ -15,14 +14,14 @@ TEST_MODELS = [
         "dtype": torch.float16,
         "quantization": "compressed-tensors"
     }),
-    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
+    ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
         "dtype": torch.float16,
-        "quantization": "fp8"
-    }),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
         "quantization": "compressed-tensors"
     }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
+    ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+        "quantization": "compressed-tensors"
+    }),
+    ("meta-llama/Llama-3.2-1B-Instruct", {}),
 ]
 
 if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
     # make sure these models can be captured in full graph mode
     os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
 
-    # The base meta llama uses too much memory.
-    if (model == "meta-llama/Meta-Llama-3-8B"
-            and optimization_level >= CompilationLevel.PIECEWISE):
-        return
-
     print(f"MODEL={model}")
 
     prompts = [
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index eb9cd5db..06f94358 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -162,7 +162,7 @@ TEXT_GENERATION_MODELS = {
     "internlm/internlm2-chat-7b": PPTestSettings.fast(),
     "inceptionai/jais-13b-chat": PPTestSettings.fast(),
     "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
-    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
+    "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
     "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
     "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
     # Uses Llama
@@ -230,7 +230,7 @@ MULTIMODAL_MODELS = {
 TEST_MODELS = [
     # [LANGUAGE GENERATION]
     "microsoft/Phi-3.5-MoE-instruct",
-    "meta-llama/Meta-Llama-3-8B",
+    "meta-llama/Llama-3.2-1B-Instruct",
     "ibm/PowerLM-3b",
     # [LANGUAGE EMBEDDING]
     "intfloat/e5-mistral-7b-instruct",
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 70ca8507..55900163 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                      OpenAIServingModels)
 from vllm.lora.request import LoRARequest
 
-MODEL_NAME = "meta-llama/Llama-2-7b"
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = (
     "Success: LoRA adapter '{lora_name}' added successfully.")
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 5edf85ab..0f12ac9b 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -5,7 +5,7 @@ import pytest
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B"
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
 
 @pytest.mark.asyncio
diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py
index 97e0d6eb..5b9ea6db 100644
--- a/tests/kv_transfer/disagg_test.py
+++ b/tests/kv_transfer/disagg_test.py
@@ -28,7 +28,7 @@ def setup_servers():
         "-m",
         "vllm.entrypoints.openai.api_server",
         "--model",
-        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        "meta-llama/Llama-3.2-1B-Instruct",
         "--port",
         "8100",
         "--gpu-memory-utilization",
@@ -49,7 +49,7 @@ def setup_servers():
         "-m",
         "vllm.entrypoints.openai.api_server",
         "--model",
-        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        "meta-llama/Llama-3.2-1B-Instruct",
         "--port",
         "8200",
         "--gpu-memory-utilization",
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
     response = requests.post("http://localhost:8100/v1/completions",
                              headers={"Content-Type": "application/json"},
                              json={
-                                 "model":
-                                 "meta-llama/Meta-Llama-3.1-8B-Instruct",
+                                 "model": "meta-llama/Llama-3.2-1B-Instruct",
                                  "prompt": prompt,
                                  "max_tokens": 1,
                                  "temperature": 0
@@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
     response = requests.post("http://localhost:8200/v1/completions",
                              headers={"Content-Type": "application/json"},
                              json={
-                                 "model":
-                                 "meta-llama/Meta-Llama-3.1-8B-Instruct",
+                                 "model": "meta-llama/Llama-3.2-1B-Instruct",
                                  "prompt": prompt,
                                  "max_tokens": 10,
                                  "temperature": 0
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
index 6a0e148d..27c12516 100644
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
         ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
          "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
-        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
         ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
          "meta-llama/Llama-3.2-1B-Instruct"),
-        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-         "meta-llama/Llama-2-7b-chat-hf")
+        # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct")
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 9c0e6b33..c3e1c785 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -141,7 +141,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
                                         extras={"tiny": "ai21labs/Jamba-tiny-dev"}),  # noqa: E501
-    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
+    "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
index 9e1867f9..da59dc75 100644
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -99,7 +99,7 @@ def test_register_quantization_config():
 
 @pytest.mark.parametrize(argnames="model",
                          argvalues=[
-                             "meta-llama/Meta-Llama-3-8B-Instruct",
+                             "meta-llama/Llama-3.2-1B-Instruct",
                          ])
 def test_custom_quant(vllm_runner, model):
     """Test infer with the custom quantization method."""
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index 7f26698c..9a92b08f 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -10,7 +10,7 @@ from vllm import SamplingParams
 
 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
-MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
+MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index 14a0ebf1..83d1551a 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -8,7 +8,7 @@ from .conftest import get_output_from_llm_generator
 
 
 @pytest.mark.parametrize("common_llm_kwargs", [{
-    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "model": "meta-llama/Llama-3.2-1B-Instruct",
     "speculative_model": "JackFram/llama-68m",
     "num_speculative_tokens": 5,
 }])
@@ -27,8 +27,8 @@ from .conftest import get_output_from_llm_generator
     },
     {
         # Speculative max model len > target max model len should raise.
-        # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
-        "speculative_max_model_len": 4096 + 1,
+        # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
+        "speculative_max_model_len": 131072 + 1,
     },
 ])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
diff --git a/tests/test_config.py b/tests/test_config.py
index 3fb83b4c..746ca729 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -251,7 +251,7 @@ def test_rope_customization():
 @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
     ("facebook/opt-125m", False),
     ("facebook/bart-base", True),
-    ("meta-llama/Llama-3.2-1B", False),
+    ("meta-llama/Llama-3.2-1B-Instruct", False),
     ("meta-llama/Llama-3.2-11B-Vision", True),
 ])
 def test_is_encoder_decoder(model_id, is_encoder_decoder):
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index 088b95be..8406f305 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -46,9 +46,9 @@ def test_filter_subtensors():
 
 
 @pytest.fixture(scope="module")
-def llama_2_7b_files():
+def llama_3p2_1b_files():
     with TemporaryDirectory() as cache_dir:
-        input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
+        input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
                                       cache_dir=cache_dir,
                                       ignore_patterns=["*.bin*", "original/*"])
 
@@ -81,13 +81,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
-                              llama_2_7b_files):
+                              llama_3p2_1b_files):
     if num_gpus_available < tp_size:
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     weights_patterns = ("*.safetensors", )
     gpu_memory_utilization = 0.8
-    input_dir = llama_2_7b_files
+    input_dir = llama_3p2_1b_files
     ctx = mp.get_context("spawn")
 
     # Run in separate processes for memory & CUDA isolation
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 57832394..851c79d2 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -31,7 +31,7 @@ TOKENIZERS = [
     "bigscience/bloom-560m",
     "mosaicml/mpt-7b",
     "tiiuae/falcon-7b",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B-Instruct",
     "codellama/CodeLlama-7b-hf",
     "mistralai/Pixtral-12B-2409",
 ]
diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py
index 787fb6ea..fc47bcb9 100644
--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 
 
 def test_get_llama3_eos_token():
-    model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
 
     tokenizer = get_tokenizer(model_name)
     assert tokenizer.eos_token_id == 128009
@@ -17,7 +17,7 @@
     generation_config = try_get_generation_config(model_name,
                                                   trust_remote_code=False)
     assert generation_config is not None
-    assert generation_config.eos_token_id == [128001, 128009]
+    assert generation_config.eos_token_id == [128001, 128008, 128009]
 
 
 def test_get_blip2_eos_token():
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 94e18289..05197f44 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -17,7 +17,7 @@ if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
                 allow_module_level=True)
 
-ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
+ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
                               enforce_eager=True,
                               disable_log_requests=True)
 
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index 86c576cd..a26a8c4e 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -14,7 +14,7 @@ from vllm import SamplingParams
 
 from ...conftest import VllmRunner
 
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 DTYPE = "half"
 
 
diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py
index 28c177fd..f6277006 100644
--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
@@ -11,7 +11,7 @@ RTOL = 0.03
 EXPECTED_VALUE = 0.62
 
 # FIXME(rob): enable prefix caching once supported.
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False"  # noqa: E501
 SERVER_ARGS = [
     "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"