Consolidate Llama model usage in tests (#13094)

Harry Mellor 2025-02-14 06:18:03 +00:00 committed by GitHub
parent 40932d7a05
commit f2b20fe491
22 changed files with 45 additions and 53 deletions

View File

@@ -17,7 +17,7 @@ from ..utils import multi_gpu_test
MODELS = [
"google/gemma-2-2b-it",
- "meta-llama/Llama-3.2-1B",
+ "meta-llama/Llama-3.2-1B-Instruct",
]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -96,12 +96,12 @@ def test_models(
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
- ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
- ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+ ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+ ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
- ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+ ("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
@@ -116,7 +116,7 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
- if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
+ if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
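
For readers unfamiliar with the Ray aDAG toggles above, a small sketch (not part of this commit) of setting the same environment variables through pytest's monkeypatch fixture so they are reset after each test:

    import pytest

    @pytest.fixture
    def ray_adag_env(monkeypatch: pytest.MonkeyPatch):
        # Assumed helper, not from the diff: enable the Ray SPMD worker and the
        # compiled (accelerated) DAG path only for the duration of one test.
        monkeypatch.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
        monkeypatch.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")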

View File

@@ -20,7 +20,7 @@ from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
- "meta-llama/Llama-3.2-1B",
+ "meta-llama/Llama-3.2-1B-Instruct",
]
@@ -92,7 +92,7 @@ def test_models_distributed(
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)
- if (model == "meta-llama/Llama-2-7b-hf"
+ if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
- model = "meta-llama/Llama-2-7b-chat-hf"
+ model = "meta-llama/Llama-3.2-1B-Instruct"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [

View File

@@ -4,5 +4,5 @@ from ..utils import compare_two_settings
def test_cpu_offload():
- compare_two_settings("meta-llama/Llama-3.2-1B", [],
+ compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"])
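
For reference, a hedged sketch (not part of this commit) of exercising the same offload setting through the Python API, assuming cpu_offload_gb is the in-process counterpart of the --cpu-offload-gb flag compared above:

    from vllm import LLM, SamplingParams

    # Assumed usage mirroring the CLI flag under test.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", cpu_offload_gb=1)
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(max_tokens=8))
    print(outputs[0].outputs[0].text)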

View File

@@ -118,7 +118,7 @@ def test_cumem_with_cudagraph():
@pytest.mark.parametrize(
"model",
[
- "meta-llama/Llama-3.2-1B", # sleep mode with safetensors
+ "meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
])
def test_end_to_end(model):
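
A minimal hedged sketch (not from this commit) of the sleep-mode flow this test targets, assuming the enable_sleep_mode engine flag and the sleep()/wake_up() methods:

    from vllm import LLM

    # Assumed flow: offload weights and drop the KV cache, then restore them.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_sleep_mode=True)
    llm.sleep(level=1)
    llm.wake_up()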

View File

@@ -26,7 +26,7 @@ class TestSetting:
test_settings = [
# basic llama model
TestSetting(
- model="meta-llama/Llama-3.2-1B",
+ model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,

View File

@@ -6,7 +6,6 @@ import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
- from vllm.config import CompilationLevel
from vllm.platforms import current_platform
TEST_MODELS = [
@@ -15,14 +14,14 @@ TEST_MODELS = [
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
- ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
+ ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "fp8"
}),
- ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
- "quantization": "compressed-tensors"
- }),
- ("meta-llama/Meta-Llama-3-8B", {}),
+ ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+ "quantization": "compressed-tensors"
+ }),
+ ("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
- # The base meta llama uses too much memory.
- if (model == "meta-llama/Meta-Llama-3-8B"
- and optimization_level >= CompilationLevel.PIECEWISE):
- return
print(f"MODEL={model}")
prompts = [

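For context, a hedged sketch (not part of this commit) of how one of the (model, kwargs) pairs above is presumably forwarded to the LLM constructor by this test:

    import torch
    from vllm import LLM

    # Assumed construction; the kwargs mirror one TEST_MODELS entry above.
    llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",
              dtype=torch.float16,
              quantization="fp8")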
View File

@@ -162,7 +162,7 @@ TEXT_GENERATION_MODELS = {
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
- "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
+ "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama
@@ -230,7 +230,7 @@ MULTIMODAL_MODELS = {
TEST_MODELS = [
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct",
- "meta-llama/Meta-Llama-3-8B",
+ "meta-llama/Llama-3.2-1B-Instruct",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",

View File

@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.lora.request import LoRARequest
- MODEL_NAME = "meta-llama/Llama-2-7b"
+ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.")

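For reference, a hedged illustration (not from this commit; the adapter name and path are hypothetical) of constructing the LoRARequest type imported above:

    from vllm.lora.request import LoRARequest

    # Hypothetical adapter; only the LoRARequest fields themselves are real.
    lora = LoRARequest(lora_name="my-adapter",
                       lora_int_id=1,
                       lora_path="/path/to/my-adapter")
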
View File

@@ -5,7 +5,7 @@ import pytest
from ...utils import RemoteOpenAIServer
- MODEL_NAME = "meta-llama/Llama-3.2-1B"
+ MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.asyncio

View File

@@ -28,7 +28,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
- "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8100",
"--gpu-memory-utilization",
@@ -49,7 +49,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
- "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8200",
"--gpu-memory-utilization",
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8100/v1/completions",
headers={"Content-Type": "application/json"},
json={
- "model":
- "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 1,
"temperature": 0
@@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8200/v1/completions",
headers={"Content-Type": "application/json"},
json={
- "model":
- "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 10,
"temperature": 0

View File

@@ -26,12 +26,12 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
- # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+ # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
- # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
- ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
- "meta-llama/Llama-2-7b-chat-hf")
+ # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+ ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+ "meta-llama/Llama-3.2-1B-Instruct")
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
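
For reference, a hedged sketch (not part of this commit) of the engine argument the parametrized kv-cache dtypes above map onto:

    from vllm import LLM

    # Assumed mapping: "fp8_e5m2" / "fp8_e4m3" correspond to kv_cache_dtype.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              kv_cache_dtype="fp8_e5m2")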

View File

@@ -141,7 +141,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
- "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
+ "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),

View File

@@ -99,7 +99,7 @@ def test_register_quantization_config():
@pytest.mark.parametrize(argnames="model",
argvalues=[
- "meta-llama/Meta-Llama-3-8B-Instruct",
+ "meta-llama/Llama-3.2-1B-Instruct",
])
def test_custom_quant(vllm_runner, model):
"""Test infer with the custom quantization method."""

View File

@@ -10,7 +10,7 @@ from vllm import SamplingParams
# We also test with llama because it has generation_config to specify EOS
# (past regression).
- MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
+ MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]
@pytest.mark.parametrize("model", MODELS)

View File

@@ -8,7 +8,7 @@ from .conftest import get_output_from_llm_generator
@pytest.mark.parametrize("common_llm_kwargs", [{
- "model": "meta-llama/Llama-2-7b-chat-hf",
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@@ -27,8 +27,8 @@ from .conftest import get_output_from_llm_generator
},
{
# Speculative max model len > target max model len should raise.
- # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
- "speculative_max_model_len": 4096 + 1,
+ # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
+ "speculative_max_model_len": 131072 + 1,
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
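
For context, a hedged sketch (not part of this commit) of the engine arguments the common_llm_kwargs above roughly translate to; later vLLM releases may expect them inside a single speculative_config instead:

    from vllm import LLM

    # Assumed construction mirroring the kwargs parametrized above.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              speculative_model="JackFram/llama-68m",
              num_speculative_tokens=5)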

View File

@@ -251,7 +251,7 @@ def test_rope_customization():
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
("facebook/opt-125m", False),
("facebook/bart-base", True),
- ("meta-llama/Llama-3.2-1B", False),
+ ("meta-llama/Llama-3.2-1B-Instruct", False),
("meta-llama/Llama-3.2-11B-Vision", True),
])
def test_is_encoder_decoder(model_id, is_encoder_decoder):

View File

@@ -46,9 +46,9 @@ def test_filter_subtensors():
@pytest.fixture(scope="module")
- def llama_2_7b_files():
+ def llama_3p2_1b_files():
with TemporaryDirectory() as cache_dir:
- input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
+ input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
cache_dir=cache_dir,
ignore_patterns=["*.bin*", "original/*"])
@@ -81,13 +81,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
- llama_2_7b_files):
+ llama_3p2_1b_files):
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
weights_patterns = ("*.safetensors", )
gpu_memory_utilization = 0.8
- input_dir = llama_2_7b_files
+ input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")
# Run in separate processes for memory & CUDA isolation

View File

@@ -31,7 +31,7 @@ TOKENIZERS = [
"bigscience/bloom-560m",
"mosaicml/mpt-7b",
"tiiuae/falcon-7b",
- "meta-llama/Llama-2-7b-hf",
+ "meta-llama/Llama-3.2-1B-Instruct",
"codellama/CodeLlama-7b-hf",
"mistralai/Pixtral-12B-2409",
]

View File

@@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
def test_get_llama3_eos_token():
- model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+ model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = get_tokenizer(model_name)
assert tokenizer.eos_token_id == 128009
@@ -17,7 +17,7 @@ def test_get_llama3_eos_token():
generation_config = try_get_generation_config(model_name,
trust_remote_code=False)
assert generation_config is not None
- assert generation_config.eos_token_id == [128001, 128009]
+ assert generation_config.eos_token_id == [128001, 128008, 128009]
def test_get_blip2_eos_token():

View File

@@ -17,7 +17,7 @@ if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.",
allow_module_level=True)
- ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
+ ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
disable_log_requests=True)

View File

@@ -14,7 +14,7 @@ from vllm import SamplingParams
from ...conftest import VllmRunner
- MODEL = "meta-llama/Llama-3.2-1B"
+ MODEL = "meta-llama/Llama-3.2-1B-Instruct"
DTYPE = "half"

View File

@@ -11,7 +11,7 @@ RTOL = 0.03
EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported.
- MODEL = "meta-llama/Llama-3.2-1B"
+ MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
SERVER_ARGS = [
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"