From 2c5e637b57bcb2d5b3c7d992fd8b75a8bbeafcc3 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Sat, 22 Feb 2025 19:19:45 -0800 Subject: [PATCH] [ci] Use env var to control whether to use S3 bucket in CI (#13634) --- .buildkite/test-pipeline.yaml | 4 +- .../test_basic_correctness.py | 11 +- tests/basic_correctness/test_cumem.py | 9 +- tests/conftest.py | 73 +--------- tests/engine/test_computed_prefix_blocks.py | 7 +- tests/engine/test_detokenization.py | 8 +- tests/engine/test_executor.py | 21 +-- tests/engine/test_skip_tokenizer_init.py | 13 +- tests/entrypoints/llm/test_chat.py | 13 +- tests/entrypoints/llm/test_collective_rpc.py | 2 +- tests/entrypoints/llm/test_encode.py | 4 +- tests/entrypoints/llm/test_generate.py | 4 +- .../llm/test_generate_multiple_loras.py | 4 +- tests/entrypoints/llm/test_guided_generate.py | 7 +- tests/entrypoints/llm/test_lazy_outlines.py | 7 +- .../entrypoints/llm/test_prompt_validation.py | 9 +- tests/metrics/test_metrics.py | 55 ++++---- tests/models/test_initialization.py | 6 +- tests/mq_llm_engine/test_abort.py | 4 +- tests/mq_llm_engine/test_error_handling.py | 6 +- tests/mq_llm_engine/test_load.py | 6 +- tests/multimodal/test_processing.py | 6 +- tests/prefix_caching/test_prefix_caching.py | 2 +- tests/test_config.py | 14 +- tests/test_regression.py | 13 +- tests/worker/test_swap.py | 2 +- vllm/engine/arg_utils.py | 9 ++ vllm/envs.py | 4 + vllm/model_executor/model_loader/loader.py | 1 - vllm/test_utils.py | 129 ++++++++++++++++++ 30 files changed, 222 insertions(+), 231 deletions(-) create mode 100644 vllm/test_utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d96f0183..931057e6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -278,7 +278,7 @@ steps: command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py parallelism: 4 -- label: "PyTorch Fullgraph Smoke Test" # 9min +- label: PyTorch Fullgraph Smoke Test # 9min fast_check: true source_file_dependencies: - vllm/ @@ -289,7 +289,7 @@ steps: - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py -- label: "PyTorch Fullgraph Test" # 18min +- label: PyTorch Fullgraph Test # 18min source_file_dependencies: - vllm/ - tests/compile diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index cc25c879..d2fc0916 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -9,7 +9,6 @@ import weakref import pytest from vllm import LLM -from vllm.config import LoadFormat from vllm.platforms import current_platform from ..conftest import VllmRunner @@ -34,7 +33,7 @@ def v1(run_with_both_engines): def test_vllm_gc_ed(): """Verify vllm instance is GC'ed when it is deleted""" - llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER) + llm = LLM("distilbert/distilgpt2") weak_llm = weakref.ref(llm) del llm # If there's any circular reference to vllm, this fails @@ -43,10 +42,10 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) 
-@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("enforce_eager", [False]) def test_models( hf_runner, model: str, @@ -97,8 +96,8 @@ def test_models( "test_suite", [ ("distilbert/distilgpt2", "ray", "", "L4"), ("distilbert/distilgpt2", "mp", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), + ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"), + ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"), ("distilbert/distilgpt2", "ray", "", "A100"), ("distilbert/distilgpt2", "mp", "", "A100"), ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"), diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index f1148fc8..61c79a7b 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -4,11 +4,9 @@ import pytest import torch from vllm import LLM, SamplingParams -from vllm.config import LoadFormat from vllm.device_allocator.cumem import CuMemAllocator from vllm.utils import GiB_bytes -from ..conftest import MODEL_WEIGHTS_S3_BUCKET from ..utils import fork_new_process_for_each_test @@ -121,7 +119,7 @@ def test_cumem_with_cudagraph(): "model, use_v1", [ # sleep mode with safetensors - (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True), + ("meta-llama/Llama-3.2-1B", True), # sleep mode with pytorch checkpoint ("facebook/opt-125m", False), ]) @@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool): os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" free, total = torch.cuda.mem_get_info() used_bytes_baseline = total - free # in case other process is running - load_format = LoadFormat.AUTO - if "Llama" in model: - load_format = LoadFormat.RUNAI_STREAMER - llm = LLM(model, load_format=load_format, enable_sleep_mode=True) + llm = LLM(model, enable_sleep_mode=True) prompt = "How are you?" 
sampling_params = SamplingParams(temperature=0, max_tokens=10) output = llm.generate(prompt, sampling_params) diff --git a/tests/conftest.py b/tests/conftest.py index 9304b8f1..dd339030 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs, from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig +from vllm.config import TaskOption, TokenizerPoolConfig from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, @@ -47,70 +47,6 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") -MODELS_ON_S3 = [ - "distilbert/distilgpt2", - "meta-llama/Llama-2-7b-hf", - "meta-llama/Meta-Llama-3-8B", - "meta-llama/Llama-3.2-1B", - "meta-llama/Llama-3.2-1B-Instruct", - "openai-community/gpt2", - "ArthurZ/Ilama-3.2-1B", - "llava-hf/llava-1.5-7b-hf", - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "ai21labs/Jamba-tiny-random", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", - "nm-testing/Phi-3-mini-128k-instruct-FP8", - "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", - "AMead10/Llama-3.2-1B-Instruct-AWQ", - "shuyuej/Llama-3.2-1B-Instruct-GPTQ", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", - "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", - "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", - "neuralmagic/Meta-Llama-3-8B-Instruct-FP8", - "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test", - "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", - "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", - "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", - "neuralmagic/Llama-3.2-1B-quantized.w8a8", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", - "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", - "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", - "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", - "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", - "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", - "nm-testing/tinyllama-oneshot-w4a16-channel-v2", - "nm-testing/tinyllama-oneshot-w4a16-group128-v2", - "nm-testing/tinyllama-oneshot-w8a16-per-channel", - "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", - "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", - 
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", - "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor", - "nm-testing/llama2.c-stories42M-pruned2.4-compressed", -] - -MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights" - _PromptMultiModalInput = Union[List[_M], List[List[_M]]] PromptImageInput = _PromptMultiModalInput[Image.Image] @@ -742,14 +678,8 @@ class VllmRunner: enable_chunked_prefill: bool = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, - load_format: Optional[LoadFormat] = None, **kwargs, ) -> None: - if model_name in MODELS_ON_S3 and not load_format: - model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}") - load_format = LoadFormat.RUNAI_STREAMER - if not load_format: - load_format = LoadFormat.AUTO self.model = LLM( model=model_name, task=task, @@ -764,7 +694,6 @@ class VllmRunner: max_model_len=max_model_len, block_size=block_size, enable_chunked_prefill=enable_chunked_prefill, - load_format=load_format, **kwargs, ) diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index 51e7c8e7..049fa2c8 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -2,16 +2,12 @@ import pytest -from vllm.config import LoadFormat from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams -from ..conftest import MODEL_WEIGHTS_S3_BUCKET - -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("block_size", [16]) def test_computed_prefix_blocks(model: str, block_size: int): # This test checks if we are able to run the engine to completion @@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int): "decoration.") engine_args = EngineArgs(model=model, - load_format=LoadFormat.RUNAI_STREAMER, block_size=block_size, enable_prefix_caching=True) diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index 6ae4be2e..2b7ebf70 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -2,15 +2,11 @@ import pytest -from vllm.config import LoadFormat from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams -from ..conftest import MODEL_WEIGHTS_S3_BUCKET - -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_computed_prefix_blocks(model: str): # This test checks if the engine generates completions both with and # without optional detokenization, that detokenization includes text @@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str): "paper clips? 
Is there an easy to follow video tutorial available " "online for free?") - llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER) + llm = LLM(model=model) sampling_params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False) diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index 6a86401c..c0a339e4 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -6,17 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pytest -from vllm.config import LoadFormat from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine from vllm.executor.uniproc_executor import UniProcExecutor from vllm.sampling_params import SamplingParams -from ..conftest import MODEL_WEIGHTS_S3_BUCKET - -RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER - class Mock: ... @@ -38,12 +33,10 @@ class CustomUniExecutor(UniProcExecutor): CustomUniExecutorAsync = CustomUniExecutor -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_custom_executor_type_checking(model): with pytest.raises(ValueError): engine_args = EngineArgs(model=model, - load_format=RUNAI_STREAMER_LOAD_FORMAT, distributed_executor_backend=Mock) LLMEngine.from_engine_args(engine_args) with pytest.raises(ValueError): @@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model): AsyncLLMEngine.from_engine_args(engine_args) -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_custom_executor(model, tmp_path): cwd = os.path.abspath(".") os.chdir(tmp_path) @@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path): engine_args = EngineArgs( model=model, - load_format=RUNAI_STREAMER_LOAD_FORMAT, distributed_executor_backend=CustomUniExecutor, enforce_eager=True, # reduce test time ) @@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path): os.chdir(cwd) -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_custom_executor_async(model, tmp_path): cwd = os.path.abspath(".") os.chdir(tmp_path) @@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path): engine_args = AsyncEngineArgs( model=model, - load_format=RUNAI_STREAMER_LOAD_FORMAT, distributed_executor_backend=CustomUniExecutorAsync, enforce_eager=True, # reduce test time ) @@ -106,8 +95,7 @@ def test_custom_executor_async(model, tmp_path): os.chdir(cwd) -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_respect_ray(model): # even for TP=1 and PP=1, # if users specify ray, we should use ray. 
@@ -116,7 +104,6 @@ def test_respect_ray(model): engine_args = EngineArgs( model=model, distributed_executor_backend="ray", - load_format=RUNAI_STREAMER_LOAD_FORMAT, enforce_eager=True, # reduce test time ) engine = LLMEngine.from_engine_args(engine_args) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index b0930eaa..5e197f5f 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -2,22 +2,19 @@ import pytest -from vllm.config import LoadFormat from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams -from ..conftest import MODEL_WEIGHTS_S3_BUCKET - -@pytest.mark.parametrize("model", - [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"]) +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization # of tokenizer and detokenizer. The generated output is expected to contain # token ids. - llm = LLM(model=model, - skip_tokenizer_init=True, - load_format=LoadFormat.RUNAI_STREAMER) + llm = LLM( + model=model, + skip_tokenizer_init=True, + ) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) with pytest.raises(ValueError, match="cannot pass text prompts when"): diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index f6fda512..77c80b2f 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -5,17 +5,12 @@ from typing import List import pytest from vllm import LLM -from vllm.config import LoadFormat -from ...conftest import MODEL_WEIGHTS_S3_BUCKET from ..openai.test_vision import TEST_IMAGE_URLS -RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER - def test_chat(): - llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct", - load_format=RUNAI_STREAMER_LOAD_FORMAT) + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") prompt1 = "Explain the concept of entropy." messages = [ @@ -33,8 +28,7 @@ def test_chat(): def test_multi_chat(): - llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct", - load_format=RUNAI_STREAMER_LOAD_FORMAT) + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") prompt1 = "Explain the concept of entropy." prompt2 = "Explain what among us is." 
@@ -71,8 +65,7 @@ def test_multi_chat(): [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) def test_chat_multi_image(image_urls: List[str]): llm = LLM( - model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct", - load_format=RUNAI_STREAMER_LOAD_FORMAT, + model="microsoft/Phi-3.5-vision-instruct", dtype="bfloat16", max_model_len=4096, max_num_seqs=5, diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 69c60bbe..39d4810d 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend): def echo_rank(self): return self.rank - llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct", + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, load_format="dummy", tensor_parallel_size=tp_size, diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 61085bf4..ebec8bab 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -6,10 +6,9 @@ from typing import List import pytest from vllm import LLM, PoolingParams, PoolingRequestOutput -from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory -MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct" +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" PROMPTS = [ "Hello, my name is", @@ -33,7 +32,6 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, - load_format=LoadFormat.RUNAI_STREAMER, max_num_batched_tokens=32768, tensor_parallel_size=1, gpu_memory_utilization=0.75, diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index f1bad876..910e1a45 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -6,10 +6,9 @@ from typing import List import pytest from vllm import LLM, RequestOutput, SamplingParams -from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory -MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2" +MODEL_NAME = "distilbert/distilgpt2" PROMPTS = [ "Hello, my name is", @@ -31,7 +30,6 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, - load_format=LoadFormat.RUNAI_STREAMER, max_num_batched_tokens=4096, tensor_parallel_size=1, gpu_memory_utilization=0.10, diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 487c0046..90e1d581 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -7,11 +7,10 @@ import pytest from huggingface_hub import snapshot_download from vllm import LLM -from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta" +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" PROMPTS = [ "Hello, my name is", @@ -28,7 +27,6 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, - load_format=LoadFormat.RUNAI_STREAMER, tensor_parallel_size=1, max_model_len=8192, enable_lora=True, diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 252eb3fb..314dc593 100644 --- 
a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -7,13 +7,12 @@ import weakref import jsonschema import pytest -from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct" +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] @@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, - load_format=LoadFormat.RUNAI_STREAMER, - max_model_len=1024) + llm = LLM(model=MODEL_NAME, max_model_len=1024) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 07608e15..0598e399 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -6,7 +6,6 @@ from contextlib import nullcontext from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams -from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory @@ -44,8 +43,7 @@ def run_normal(): sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM without guided decoding as a baseline. - llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2", - load_format=LoadFormat.RUNAI_STREAMER, + llm = LLM(model="distilbert/distilgpt2", enforce_eager=True, gpu_memory_utilization=0.3) outputs = llm.generate(prompts, sampling_params) @@ -61,8 +59,7 @@ def run_normal(): def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. 
- llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2", - load_format=LoadFormat.RUNAI_STREAMER, + llm = LLM(model="distilbert/distilgpt2", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", gpu_memory_utilization=0.3) diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 04848131..61bd1d46 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -3,7 +3,6 @@ import pytest from vllm import LLM -from vllm.config import LoadFormat @pytest.fixture(autouse=True) @@ -15,17 +14,13 @@ def v1(run_with_both_engines): def test_empty_prompt(): - llm = LLM(model="s3://vllm-ci-model-weights/gpt2", - load_format=LoadFormat.RUNAI_STREAMER, - enforce_eager=True) + llm = LLM(model="openai-community/gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) @pytest.mark.skip_v1 def test_out_of_vocab_token(): - llm = LLM(model="s3://vllm-ci-model-weights/gpt2", - load_format=LoadFormat.RUNAI_STREAMER, - enforce_eager=True) + llm = LLM(model="openai-community/gpt2", enforce_eager=True) with pytest.raises(ValueError, match='out of vocabulary'): llm.generate({"prompt_token_ids": [999999]}) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 45a13488..d6183379 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -8,21 +8,17 @@ import ray from prometheus_client import REGISTRY from vllm import EngineArgs, LLMEngine -from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams - -from ..conftest import MODEL_WEIGHTS_S3_BUCKET +from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET MODELS = [ "distilbert/distilgpt2", ] -RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @@ -146,9 +142,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, metrics_tag_content = stat_logger.labels["model_name"] if served_model_name is None or served_model_name == []: - actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - assert metrics_tag_content == actual_model_name, ( - f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n" + assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", ( + f"Metrics tag model_name is wrong! 
expect: {model!r}\n" f"actual: {metrics_tag_content!r}") else: assert metrics_tag_content == served_model_name[0], ( @@ -174,10 +169,11 @@ async def test_async_engine_log_metrics_regression( when disable_log_stats=False (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678) """ - engine_args = AsyncEngineArgs(model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - load_format=RUNAI_STREAMER_LOAD_FORMAT) + engine_args = AsyncEngineArgs( + model=model, + dtype=dtype, + disable_log_stats=disable_log_stats, + ) async_engine = AsyncLLMEngine.from_engine_args(engine_args) for i, prompt in enumerate(example_prompts): results = async_engine.generate( @@ -189,7 +185,7 @@ async def test_async_engine_log_metrics_regression( async for _ in results: pass - assert_metrics(async_engine.engine, disable_log_stats, + assert_metrics(model, async_engine.engine, disable_log_stats, len(example_prompts)) @@ -204,10 +200,11 @@ def test_engine_log_metrics_regression( max_tokens: int, disable_log_stats: bool, ) -> None: - engine_args = EngineArgs(model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - load_format=RUNAI_STREAMER_LOAD_FORMAT) + engine_args = EngineArgs( + model=model, + dtype=dtype, + disable_log_stats=disable_log_stats, + ) engine = LLMEngine.from_engine_args(engine_args) for i, prompt in enumerate(example_prompts): engine.add_request( @@ -218,7 +215,8 @@ def test_engine_log_metrics_regression( while engine.has_unfinished_requests(): engine.step() - assert_metrics(engine, disable_log_stats, len(example_prompts)) + assert_metrics(f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", engine, + disable_log_stats, len(example_prompts)) @pytest.mark.parametrize("model", MODELS) @@ -285,14 +283,15 @@ def test_metric_spec_decode_interval( ) -> None: k = 5 - engine_args = EngineArgs(model=model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_model=model, - num_speculative_tokens=k, - enforce_eager=True, - load_format=RUNAI_STREAMER_LOAD_FORMAT) + engine_args = EngineArgs( + model=model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4, + speculative_model=model, + num_speculative_tokens=k, + enforce_eager=True, + ) engine = LLMEngine.from_engine_args(engine_args) @@ -359,7 +358,7 @@ def test_metric_spec_decode_interval( cleanup_dist_env_and_memory() -def assert_metrics(engine: LLMEngine, disable_log_stats: bool, +def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, num_requests: int) -> None: if disable_log_stats: with pytest.raises(AttributeError): @@ -370,7 +369,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, # Ensure the count bucket of request-level histogram metrics matches # the number of requests as a simple sanity check to ensure metrics are # generated - labels = {'model_name': engine.model_config.model} + labels = {'model_name': model} request_histogram_metrics = [ "vllm:e2e_request_latency_seconds", "vllm:request_prompt_tokens", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index e0d5e003..c58c6372 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,7 +7,6 @@ from transformers import PretrainedConfig from vllm import LLM -from ..conftest import MODELS_ON_S3 from .registry import HF_EXAMPLE_MODELS @@ -43,11 +42,8 @@ def test_can_initialize(model_arch): with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", _initialize_kv_caches): - model_name = model_info.default 
- if model_name in MODELS_ON_S3: - model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}" LLM( - model_name, + model_info.default, tokenizer=model_info.tokenizer, tokenizer_mode=model_info.tokenizer_mode, speculative_model=model_info.speculative_model, diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py index b0ac0fb3..808346b5 100644 --- a/tests/mq_llm_engine/test_abort.py +++ b/tests/mq_llm_engine/test_abort.py @@ -10,8 +10,8 @@ import pytest from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate from vllm.engine.arg_utils import AsyncEngineArgs -MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer") +MODEL = "google/gemma-1.1-2b-it" +ENGINE_ARGS = AsyncEngineArgs(model=MODEL) RAISED_ERROR = KeyError RAISED_VALUE = "foo" EXPECTED_TOKENS = 250 diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 4eac7341..35d00178 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -21,10 +21,8 @@ from vllm.lora.request import LoRARequest from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser -MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, - load_format="runai_streamer", - enforce_eager=True) +MODEL = "google/gemma-1.1-2b-it" +ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True) RAISED_ERROR = KeyError RAISED_VALUE = "foo" diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index 3162d56c..2069ff98 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -10,14 +10,12 @@ import pytest from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate from vllm.engine.arg_utils import AsyncEngineArgs -MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it" +MODEL = "google/gemma-1.1-2b-it" NUM_EXPECTED_TOKENS = 10 NUM_REQUESTS = 10000 # Scenarios to test for num generated token. 
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL, - load_format="runai_streamer", - disable_log_requests=True) +ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) @pytest.fixture(scope="function") diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index b247321e..c2fbe83a 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -553,8 +553,7 @@ def test_find_mm_placeholders( assert result == expected -@pytest.mark.parametrize( - "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), @@ -593,8 +592,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): profiler.get_dummy_data(model_config.max_model_len) -@pytest.mark.parametrize( - "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("num_images", "limit", "is_valid"), [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 90d424fe..2773d27a 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -16,7 +16,7 @@ from vllm.engine.llm_engine import LLMEngine from ..models.utils import check_outputs_equal MODELS = [ - "facebook/opt-125m", + "distilbert/distilgpt2", ] UNSTABLE_PROMPT_SEQUENCE = [ diff --git a/tests/test_config.py b/tests/test_config.py index bc87e6cc..8927a14d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,20 +8,14 @@ from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform -from .conftest import MODEL_WEIGHTS_S3_BUCKET - @pytest.mark.parametrize( ("model_id", "expected_runner_type", "expected_task"), [ - (f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate", - "generate"), - (f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct", - "pooling", "embed"), - (f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling", - "classify"), - (f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2", - "pooling", "score"), + ("distilbert/distilgpt2", "generate", "generate"), + ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"), + ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), + ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), ("openai/whisper-small", "transcription", "transcription"), ], diff --git a/tests/test_regression.py b/tests/test_regression.py index 8cecc289..ce9498e8 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -10,9 +10,6 @@ import gc import torch from vllm import LLM, SamplingParams -from vllm.config import LoadFormat - -from .conftest import MODEL_WEIGHTS_S3_BUCKET def test_duplicated_ignored_sequence_group(): @@ -21,8 +18,7 @@ def test_duplicated_ignored_sequence_group(): sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=256) - llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", - load_format=LoadFormat.RUNAI_STREAMER, + llm = LLM(model="distilbert/distilgpt2", max_num_batched_tokens=4096, 
tensor_parallel_size=1) prompts = ["This is a short prompt", "This is a very long prompt " * 1000] @@ -35,8 +31,7 @@ def test_max_tokens_none(): sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) - llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", - load_format=LoadFormat.RUNAI_STREAMER, + llm = LLM(model="distilbert/distilgpt2", max_num_batched_tokens=4096, tensor_parallel_size=1) prompts = ["Just say hello!"] @@ -46,9 +41,7 @@ def test_max_tokens_none(): def test_gc(): - llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", - load_format=LoadFormat.RUNAI_STREAMER, - enforce_eager=True) + llm = LLM(model="distilbert/distilgpt2", enforce_eager=True) del llm gc.collect() diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 2c337cc9..3ab80709 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -10,7 +10,7 @@ from vllm.worker.worker import Worker def test_swap() -> None: # Configure the engine. - engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2", + engine_args = EngineArgs(model="distilbert/distilgpt2", dtype="half", load_format="dummy") engine_config = engine_args.create_engine_config() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d75e2324..bab7cfe2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -22,6 +22,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.plugins import load_general_plugins +from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -1141,6 +1142,14 @@ class EngineArgs: f", but got {self.cpu_offload_gb}") device_config = DeviceConfig(device=self.device) + + # NOTE: This is to allow model loading from S3 in CI + if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3 + and self.model in MODELS_ON_S3 + and self.load_format == LoadFormat.AUTO): # noqa: E501 + self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}" + self.load_format = LoadFormat.RUNAI_STREAMER + model_config = self.create_model_config() if (model_config.is_multimodal_model and not envs.VLLM_USE_V1 diff --git a/vllm/envs.py b/vllm/envs.py index 8be9ebb9..dbf1d462 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -618,6 +618,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { # Port of the master node in the data parallel setting "VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")), + + # Whether to use S3 path for model loading in CI via RunAI Streamer + "VLLM_CI_USE_S3": + lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", } # end-env-vars-definition diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index df957cfc..8736cf1c 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1394,7 +1394,6 @@ class RunaiModelStreamerLoader(BaseModelLoader): def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: """Get a model loader based on the load format.""" - if isinstance(load_config.load_format, type): return load_config.load_format(load_config) diff --git a/vllm/test_utils.py b/vllm/test_utils.py new file mode 100644 index 00000000..eb9a4d80 --- /dev/null +++ b/vllm/test_utils.py @@ -0,0 +1,129 @@ 
+# SPDX-License-Identifier: Apache-2.0 +MODELS_ON_S3 = [ + "adept/fuyu-8b", + "ai21labs/AI21-Jamba-1.5-Mini", + "ai21labs/Jamba-tiny-random", + "ai21labs/Jamba-tiny-reward-dev", + "allenai/Molmo-7B-D-0924", + "allenai/OLMo-1B-hf", + "allenai/OLMoE-1B-7B-0924-Instruct", + "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test", + "AMead10/Llama-3.2-1B-Instruct-AWQ", + "ArthurZ/Ilama-3.2-1B", + "BAAI/bge-base-en-v1.5", + "BAAI/bge-multilingual-gemma2", + "BAAI/bge-reranker-v2-m3", + "bigcode/starcoder2-3b", + "cross-encoder/ms-marco-MiniLM-L-6-v2", + "cross-encoder/quora-roberta-base", + "deepseek-ai/deepseek-vl2-tiny", + "distilbert/distilgpt2", + "facebook/bart-base", + "facebook/bart-large-cnn", + # "fixie-ai/ultravox-v0_5-llama-3_2-1b", + "google/gemma-1.1-2b-it", + "google/gemma-2-2b-it", + "google/paligemma-3b-pt-224", + "h2oai/h2ovl-mississippi-800m", + "HuggingFaceM4/Idefics3-8B-Llama3", + "internlm/internlm2-1_8b-reward", + "intfloat/e5-mistral-7b-instruct", + "intfloat/multilingual-e5-large", + "jason9693/Qwen2.5-1.5B-apeach", + "llava-hf/llava-1.5-7b-hf", + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + "llava-hf/LLaVA-NeXT-Video-7B-hf", + # "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-11B-Vision-Instruct", + "meta-llama/Llama-3.2-1B", + "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Meta-Llama-3-8B", + "microsoft/phi-2", + "microsoft/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-small-8k-instruct", + "microsoft/Phi-3-vision-128k-instruct", + "microsoft/Phi-3.5-MoE-instruct", + "microsoft/Phi-3.5-vision-instruct", + # "mistralai/Mistral-7B-Instruct-v0.1", + "mistralai/Mixtral-8x7B-Instruct-v0.1", + "mistralai/Pixtral-12B-2409", + "mistral-community/Mixtral-8x22B-v0.1-AWQ", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8", + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", + "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", + "nm-testing/llama2.c-stories42M-pruned2.4-compressed", + "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", + "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Phi-3-mini-128k-instruct-FP8", + "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", + "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", + "nm-testing/tinyllama-oneshot-w4a16-channel-v2", + "nm-testing/tinyllama-oneshot-w4a16-group128-v2", + "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", + "nm-testing/tinyllama-oneshot-w8a16-per-channel", + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", + "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", + "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", + "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", + 
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", + "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme", + "nvidia/NVLM-D-72B", + "openai-community/gpt2", + # "openai/whisper-large-v3", + "openbmb/MiniCPM-o-2_6", + "openbmb/MiniCPM-V-2_6", + "OpenGVLab/InternVL2-1B", + "parasail-ai/GritLM-7B-vllm", + "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen2-Audio-7B-Instruct", + "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-Math-PRM-7B", + "Qwen/Qwen2.5-Math-RM-72B", + "Qwen/Qwen2.5-VL-3B-Instruct", + "royokong/e5-v", + "sentence-transformers/all-roberta-large-v1", + "sentence-transformers/stsb-roberta-base-v2", + "shanearora/OLMo-7B-1124-hf", + "shuyuej/Llama-3.2-1B-Instruct-GPTQ", + "ssmits/Qwen2-7B-Instruct-embed-base", + "stabilityai/stablelm-3b-4e1t", + "stabilityai/stablelm-zephyr-3b", + "state-spaces/mamba-130m-hf", + "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", + "THUDM/glm-4v-9b", + "TIGER-Lab/Mantis-8B-siglip-llama3", + "TIGER-Lab/VLM2Vec-Full", + "tiiuae/falcon-40b", + "tiiuae/falcon-mamba-7b-instruct", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "upstage/solar-pro-preview-instruct", +] + +MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"