[ci] Use env var to control whether to use S3 bucket in CI (#13634)

Kevin H. Luu 2025-02-22 19:19:45 -08:00 committed by GitHub
parent 322d2a27d6
commit 2c5e637b57
30 changed files with 222 additions and 231 deletions
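
The new switch is a single environment variable, VLLM_CI_USE_S3, read through vllm.envs and consumed by EngineArgs. A minimal sketch of its contract, mirroring the vllm/envs.py entry added below; the helper name here is illustrative and not part of the commit:

import os

def ci_use_s3() -> bool:
    # Mirrors the new envs.py entry: the flag defaults to "0" and only the
    # exact value "1" turns on S3-backed weight loading in CI.
    return os.environ.get("VLLM_CI_USE_S3", "0") == "1"

os.environ["VLLM_CI_USE_S3"] = "1"  # what a CI job would export
assert ci_use_s3()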

View File

@@ -278,7 +278,7 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
-- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: PyTorch Fullgraph Smoke Test # 9min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -289,7 +289,7 @@ steps:
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
-- label: "PyTorch Fullgraph Test" # 18min
+- label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile

View File

@@ -9,7 +9,6 @@ import weakref
 import pytest
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.platforms import current_platform
 from ..conftest import VllmRunner
@@ -34,7 +33,7 @@ def v1(run_with_both_engines):
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM("distilbert/distilgpt2")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -43,10 +42,10 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
     hf_runner,
     model: str,
@@ -97,8 +96,8 @@ def test_models(
     "test_suite", [
         ("distilbert/distilgpt2", "ray", "", "L4"),
         ("distilbert/distilgpt2", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
         ("distilbert/distilgpt2", "ray", "", "A100"),
         ("distilbert/distilgpt2", "mp", "", "A100"),
         ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),

View File

@@ -4,11 +4,9 @@ import pytest
 import torch
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
@@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
     "model, use_v1",
     [
         # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+        ("meta-llama/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
         ("facebook/opt-125m", False),
     ])
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
     os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    load_format = LoadFormat.AUTO
-    if "Llama" in model:
-        load_format = LoadFormat.RUNAI_STREAMER
-    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
+    llm = LLM(model, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)

View File

@@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -47,70 +47,6 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 _M = TypeVar("_M")
-MODELS_ON_S3 = [
-    "distilbert/distilgpt2",
-    "meta-llama/Llama-2-7b-hf",
-    "meta-llama/Meta-Llama-3-8B",
-    "meta-llama/Llama-3.2-1B",
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "openai-community/gpt2",
-    "ArthurZ/Ilama-3.2-1B",
-    "llava-hf/llava-1.5-7b-hf",
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "ai21labs/Jamba-tiny-random",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-    "AMead10/Llama-3.2-1B-Instruct-AWQ",
-    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
-    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
-    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
-    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
-    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-]
-MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -742,14 +678,8 @@ class VllmRunner:
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
-        load_format: Optional[LoadFormat] = None,
         **kwargs,
     ) -> None:
-        if model_name in MODELS_ON_S3 and not load_format:
-            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
-            load_format = LoadFormat.RUNAI_STREAMER
-        if not load_format:
-            load_format = LoadFormat.AUTO
         self.model = LLM(
             model=model_name,
             task=task,
@@ -764,7 +694,6 @@ class VllmRunner:
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
-            load_format=load_format,
             **kwargs,
         )

View File

@@ -2,16 +2,12 @@
 import pytest
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
     # This test checks if we are able to run the engine to completion
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
               "decoration.")
     engine_args = EngineArgs(model=model,
-                             load_format=LoadFormat.RUNAI_STREAMER,
                              block_size=block_size,
                              enable_prefix_caching=True)

View File

@@ -2,15 +2,11 @@
 import pytest
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
     # This test checks if the engine generates completions both with and
     # without optional detokenization, that detokenization includes text
@@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
               "paper clips? Is there an easy to follow video tutorial available "
               "online for free?")
-    llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM(model=model)
     sampling_params = SamplingParams(max_tokens=10,
                                      temperature=0.0,
                                      detokenize=False)

View File

@@ -6,17 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import pytest
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 class Mock:
     ...
@@ -38,12 +33,10 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
     with pytest.raises(ValueError):
         engine_args = EngineArgs(model=model,
-                                 load_format=RUNAI_STREAMER_LOAD_FORMAT,
                                  distributed_executor_backend=Mock)
         LLMEngine.from_engine_args(engine_args)
     with pytest.raises(ValueError):
@@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
         AsyncLLMEngine.from_engine_args(engine_args)
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):
     engine_args = EngineArgs(
         model=model,
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         distributed_executor_backend=CustomUniExecutor,
         enforce_eager=True,  # reduce test time
     )
@@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
     os.chdir(cwd)
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):
     engine_args = AsyncEngineArgs(
         model=model,
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         distributed_executor_backend=CustomUniExecutorAsync,
         enforce_eager=True,  # reduce test time
     )
@@ -106,8 +95,7 @@ def test_custom_executor_async(model, tmp_path):
     os.chdir(cwd)
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_respect_ray(model):
     # even for TP=1 and PP=1,
     # if users specify ray, we should use ray.
@@ -116,7 +104,6 @@ def test_respect_ray(model):
     engine_args = EngineArgs(
         model=model,
         distributed_executor_backend="ray",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         enforce_eager=True,  # reduce test time
     )
     engine = LLMEngine.from_engine_args(engine_args)

View File

@@ -2,22 +2,19 @@
 import pytest
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
     # This test checks if the flag skip_tokenizer_init skips the initialization
     # of tokenizer and detokenizer. The generated output is expected to contain
     # token ids.
-    llm = LLM(model=model,
-              skip_tokenizer_init=True,
-              load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM(
+        model=model,
+        skip_tokenizer_init=True,
+    )
     sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
     with pytest.raises(ValueError, match="cannot pass text prompts when"):

View File

@@ -5,17 +5,12 @@ from typing import List
 import pytest
 from vllm import LLM
-from vllm.config import LoadFormat
-from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 def test_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -33,8 +28,7 @@ def test_chat():
 def test_multi_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
     prompt1 = "Explain the concept of entropy."
     prompt2 = "Explain what among us is."
@@ -71,8 +65,7 @@ def test_multi_chat():
                          [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
     llm = LLM(
-        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
+        model="microsoft/Phi-3.5-vision-instruct",
         dtype="bfloat16",
         max_model_len=4096,
         max_num_seqs=5,

View File

@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
         def echo_rank(self):
             return self.rank
-    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
             tensor_parallel_size=tp_size,

View File

@@ -6,10 +6,9 @@ from typing import List
 import pytest
 from vllm import LLM, PoolingParams, PoolingRequestOutput
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
-MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"
+MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 PROMPTS = [
     "Hello, my name is",
@@ -33,7 +32,6 @@ def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
     llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
               max_num_batched_tokens=32768,
               tensor_parallel_size=1,
               gpu_memory_utilization=0.75,

View File

@@ -6,10 +6,9 @@ from typing import List
 import pytest
 from vllm import LLM, RequestOutput, SamplingParams
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
-MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"
+MODEL_NAME = "distilbert/distilgpt2"
 PROMPTS = [
     "Hello, my name is",
@@ -31,7 +30,6 @@ def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
     llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
               max_num_batched_tokens=4096,
               tensor_parallel_size=1,
               gpu_memory_utilization=0.10,

View File

@@ -7,11 +7,10 @@ import pytest
 from huggingface_hub import snapshot_download
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
-MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 PROMPTS = [
     "Hello, my name is",
@@ -28,7 +27,6 @@ def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
     llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              tensor_parallel_size=1,
              max_model_len=8192,
              enable_lora=True,

View File

@@ -7,13 +7,12 @@ import weakref
 import jsonschema
 import pytest
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
-MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
-              max_model_len=1024)
+    llm = LLM(model=MODEL_NAME, max_model_len=1024)
     with llm.deprecate_legacy_api():
         yield weakref.proxy(llm)

View File

@@ -6,7 +6,6 @@ from contextlib import nullcontext
 from vllm_test_utils import BlameResult, blame
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
@@ -44,8 +43,7 @@ def run_normal():
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
     # Create an LLM without guided decoding as a baseline.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              enforce_eager=True,
              gpu_memory_utilization=0.3)
     outputs = llm.generate(prompts, sampling_params)
@@ -61,8 +59,7 @@ def run_normal():
 def run_lmfe(sample_regex):
     # Create an LLM with guided decoding enabled.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              enforce_eager=True,
             guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)

View File

@@ -3,7 +3,6 @@
 import pytest
 from vllm import LLM
-from vllm.config import LoadFormat
 @pytest.fixture(autouse=True)
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
 def test_empty_prompt():
-    llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
-              enforce_eager=True)
+    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
     with pytest.raises(ValueError, match='Prompt cannot be empty'):
         llm.generate([""])
 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
-    llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
-              enforce_eager=True)
+    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
     with pytest.raises(ValueError, match='out of vocabulary'):
         llm.generate({"prompt_token_ids": [999999]})

View File

@@ -8,21 +8,17 @@ import ray
 from prometheus_client import REGISTRY
 from vllm import EngineArgs, LLMEngine
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.metrics import RayPrometheusStatLogger
 from vllm.sampling_params import SamplingParams
+from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 MODELS = [
     "distilbert/distilgpt2",
 ]
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
@@ -146,9 +142,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
     metrics_tag_content = stat_logger.labels["model_name"]
     if served_model_name is None or served_model_name == []:
-        actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
-        assert metrics_tag_content == actual_model_name, (
-            f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
+        assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", (
+            f"Metrics tag model_name is wrong! expect: {model!r}\n"
             f"actual: {metrics_tag_content!r}")
     else:
         assert metrics_tag_content == served_model_name[0], (
@@ -174,10 +169,11 @@ async def test_async_engine_log_metrics_regression(
     when disable_log_stats=False
     (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
     """
-    engine_args = AsyncEngineArgs(model=model,
-                                  dtype=dtype,
-                                  disable_log_stats=disable_log_stats,
-                                  load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    engine_args = AsyncEngineArgs(
+        model=model,
+        dtype=dtype,
+        disable_log_stats=disable_log_stats,
+    )
     async_engine = AsyncLLMEngine.from_engine_args(engine_args)
     for i, prompt in enumerate(example_prompts):
         results = async_engine.generate(
@@ -189,7 +185,7 @@ async def test_async_engine_log_metrics_regression(
         async for _ in results:
             pass
-    assert_metrics(async_engine.engine, disable_log_stats,
+    assert_metrics(model, async_engine.engine, disable_log_stats,
                    len(example_prompts))
@@ -204,10 +200,11 @@ def test_engine_log_metrics_regression(
     max_tokens: int,
     disable_log_stats: bool,
 ) -> None:
-    engine_args = EngineArgs(model=model,
-                             dtype=dtype,
-                             disable_log_stats=disable_log_stats,
-                             load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    engine_args = EngineArgs(
+        model=model,
+        dtype=dtype,
+        disable_log_stats=disable_log_stats,
+    )
     engine = LLMEngine.from_engine_args(engine_args)
     for i, prompt in enumerate(example_prompts):
         engine.add_request(
@@ -218,7 +215,8 @@ def test_engine_log_metrics_regression(
     while engine.has_unfinished_requests():
         engine.step()
-    assert_metrics(engine, disable_log_stats, len(example_prompts))
+    assert_metrics(f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", engine,
+                   disable_log_stats, len(example_prompts))
 @pytest.mark.parametrize("model", MODELS)
@@ -285,14 +283,15 @@ def test_metric_spec_decode_interval(
 ) -> None:
     k = 5
-    engine_args = EngineArgs(model=model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.4,
-                             speculative_model=model,
-                             num_speculative_tokens=k,
-                             enforce_eager=True,
-                             load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    engine_args = EngineArgs(
+        model=model,
+        dtype=dtype,
+        disable_log_stats=False,
+        gpu_memory_utilization=0.4,
+        speculative_model=model,
+        num_speculative_tokens=k,
+        enforce_eager=True,
+    )
     engine = LLMEngine.from_engine_args(engine_args)
@@ -359,7 +358,7 @@ def test_metric_spec_decode_interval(
     cleanup_dist_env_and_memory()
-def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
+def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
                    num_requests: int) -> None:
     if disable_log_stats:
         with pytest.raises(AttributeError):
@@ -370,7 +369,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
     # Ensure the count bucket of request-level histogram metrics matches
     # the number of requests as a simple sanity check to ensure metrics are
     # generated
-    labels = {'model_name': engine.model_config.model}
+    labels = {'model_name': model}
     request_histogram_metrics = [
         "vllm:e2e_request_latency_seconds",
         "vllm:request_prompt_tokens",

View File

@@ -7,7 +7,6 @@ from transformers import PretrainedConfig
 from vllm import LLM
-from ..conftest import MODELS_ON_S3
 from .registry import HF_EXAMPLE_MODELS
@@ -43,11 +42,8 @@ def test_can_initialize(model_arch):
     with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
                       _initialize_kv_caches):
-        model_name = model_info.default
-        if model_name in MODELS_ON_S3:
-            model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}"
         LLM(
-            model_name,
+            model_info.default,
             tokenizer=model_info.tokenizer,
             tokenizer_mode=model_info.tokenizer_mode,
             speculative_model=model_info.speculative_model,

View File

@@ -10,8 +10,8 @@ import pytest
 from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
 from vllm.engine.arg_utils import AsyncEngineArgs
-MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer")
+MODEL = "google/gemma-1.1-2b-it"
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
 RAISED_ERROR = KeyError
 RAISED_VALUE = "foo"
 EXPECTED_TOKENS = 250

View File

@@ -21,10 +21,8 @@ from vllm.lora.request import LoRARequest
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
-MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
-                              load_format="runai_streamer",
-                              enforce_eager=True)
+MODEL = "google/gemma-1.1-2b-it"
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
 RAISED_ERROR = KeyError
 RAISED_VALUE = "foo"

View File

@@ -10,14 +10,12 @@ import pytest
 from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
 from vllm.engine.arg_utils import AsyncEngineArgs
-MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
+MODEL = "google/gemma-1.1-2b-it"
 NUM_EXPECTED_TOKENS = 10
 NUM_REQUESTS = 10000
 # Scenarios to test for num generated token.
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
-                              load_format="runai_streamer",
-                              disable_log_requests=True)
+ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
 @pytest.fixture(scope="function")

View File

@@ -553,8 +553,7 @@ def test_find_mm_placeholders(
     assert result == expected
-@pytest.mark.parametrize(
-    "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("limit", "num_supported", "is_valid"),
     [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
@@ -593,8 +592,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
     profiler.get_dummy_data(model_config.max_model_len)
-@pytest.mark.parametrize(
-    "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
     ("num_images", "limit", "is_valid"),
     [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),

View File

@@ -16,7 +16,7 @@ from vllm.engine.llm_engine import LLMEngine
 from ..models.utils import check_outputs_equal
 MODELS = [
-    "facebook/opt-125m",
+    "distilbert/distilgpt2",
 ]
 UNSTABLE_PROMPT_SEQUENCE = [

View File

@@ -8,20 +8,14 @@ from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
-from .conftest import MODEL_WEIGHTS_S3_BUCKET
 @pytest.mark.parametrize(
     ("model_id", "expected_runner_type", "expected_task"),
     [
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
-         "generate"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
-         "pooling", "embed"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
-         "classify"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
-         "pooling", "score"),
+        ("distilbert/distilgpt2", "generate", "generate"),
+        ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
         ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
         ("openai/whisper-small", "transcription", "transcription"),
     ],

View File

@@ -10,9 +10,6 @@ import gc
 import torch
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
-from .conftest import MODEL_WEIGHTS_S3_BUCKET
 def test_duplicated_ignored_sequence_group():
@@ -21,8 +18,7 @@ def test_duplicated_ignored_sequence_group():
     sampling_params = SamplingParams(temperature=0.01,
                                      top_p=0.1,
                                      max_tokens=256)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
     prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
@@ -35,8 +31,7 @@ def test_max_tokens_none():
     sampling_params = SamplingParams(temperature=0.01,
                                      top_p=0.1,
                                      max_tokens=None)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              max_num_batched_tokens=4096,
             tensor_parallel_size=1)
     prompts = ["Just say hello!"]
@@ -46,9 +41,7 @@ def test_max_tokens_none():
 def test_gc():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
-              enforce_eager=True)
+    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
     del llm
     gc.collect()

View File

@@ -10,7 +10,7 @@ from vllm.worker.worker import Worker
 def test_swap() -> None:
     # Configure the engine.
-    engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2",
+    engine_args = EngineArgs(model="distilbert/distilgpt2",
                              dtype="half",
                              load_format="dummy")
     engine_config = engine_args.create_engine_config()

View File

@@ -22,6 +22,7 @@ from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.plugins import load_general_plugins
+from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
@@ -1141,6 +1142,14 @@ class EngineArgs:
                 f", but got {self.cpu_offload_gb}")
         device_config = DeviceConfig(device=self.device)
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
         model_config = self.create_model_config()
         if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
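
The hunk above carries the core behavior: a model name is rewritten onto the CI bucket only when the flag is set, the model is mirrored in MODELS_ON_S3, and the caller left load_format at its default; the real check also skips AsyncEngineArgs. A standalone sketch of that gate for illustration, using the string load-format values ("auto", "runai_streamer", "dummy") seen elsewhere in this diff; resolve_ci_model and the abridged model set are illustrative and not part of the commit:

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = {"distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"}  # abridged

def resolve_ci_model(model: str, load_format: str, ci_use_s3: bool) -> tuple:
    # Rewrite only when the CI flag is on, the model is mirrored in the
    # bucket, and no explicit load format was requested.
    if ci_use_s3 and model in MODELS_ON_S3 and load_format == "auto":
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", "runai_streamer"
    return model, load_format

assert resolve_ci_model("distilbert/distilgpt2", "auto", True) == (
    "s3://vllm-ci-model-weights/distilbert/distilgpt2", "runai_streamer")
assert resolve_ci_model("distilbert/distilgpt2", "dummy", True) == (
    "distilbert/distilgpt2", "dummy")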

View File

@@ -618,6 +618,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Port of the master node in the data parallel setting
     "VLLM_DP_MASTER_PORT":
     lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
+    # Whether to use S3 path for model loading in CI via RunAI Streamer
+    "VLLM_CI_USE_S3":
+    lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
 }
 # end-env-vars-definition

View File

@@ -1394,7 +1394,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):
 def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     """Get a model loader based on the load format."""
     if isinstance(load_config.load_format, type):
         return load_config.load_format(load_config)

vllm/test_utils.py (new file, 129 lines)
View File

@@ -0,0 +1,129 @@
# SPDX-License-Identifier: Apache-2.0
MODELS_ON_S3 = [
"adept/fuyu-8b",
"ai21labs/AI21-Jamba-1.5-Mini",
"ai21labs/Jamba-tiny-random",
"ai21labs/Jamba-tiny-reward-dev",
"allenai/Molmo-7B-D-0924",
"allenai/OLMo-1B-hf",
"allenai/OLMoE-1B-7B-0924-Instruct",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"ArthurZ/Ilama-3.2-1B",
"BAAI/bge-base-en-v1.5",
"BAAI/bge-multilingual-gemma2",
"BAAI/bge-reranker-v2-m3",
"bigcode/starcoder2-3b",
"cross-encoder/ms-marco-MiniLM-L-6-v2",
"cross-encoder/quora-roberta-base",
"deepseek-ai/deepseek-vl2-tiny",
"distilbert/distilgpt2",
"facebook/bart-base",
"facebook/bart-large-cnn",
# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
"google/gemma-1.1-2b-it",
"google/gemma-2-2b-it",
"google/paligemma-3b-pt-224",
"h2oai/h2ovl-mississippi-800m",
"HuggingFaceM4/Idefics3-8B-Llama3",
"internlm/internlm2-1_8b-reward",
"intfloat/e5-mistral-7b-instruct",
"intfloat/multilingual-e5-large",
"jason9693/Qwen2.5-1.5B-apeach",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
# "meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Meta-Llama-3-8B",
"microsoft/phi-2",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-small-8k-instruct",
"microsoft/Phi-3-vision-128k-instruct",
"microsoft/Phi-3.5-MoE-instruct",
"microsoft/Phi-3.5-vision-instruct",
# "mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistralai/Pixtral-12B-2409",
"mistral-community/Mixtral-8x22B-v0.1-AWQ",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nvidia/NVLM-D-72B",
"openai-community/gpt2",
# "openai/whisper-large-v3",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"OpenGVLab/InternVL2-1B",
"parasail-ai/GritLM-7B-vllm",
"Qwen/Qwen1.5-MoE-A2.7B-Chat",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-1.5B-Instruct",
"Qwen/Qwen2.5-Math-PRM-7B",
"Qwen/Qwen2.5-Math-RM-72B",
"Qwen/Qwen2.5-VL-3B-Instruct",
"royokong/e5-v",
"sentence-transformers/all-roberta-large-v1",
"sentence-transformers/stsb-roberta-base-v2",
"shanearora/OLMo-7B-1124-hf",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"ssmits/Qwen2-7B-Instruct-embed-base",
"stabilityai/stablelm-3b-4e1t",
"stabilityai/stablelm-zephyr-3b",
"state-spaces/mamba-130m-hf",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"THUDM/glm-4v-9b",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"TIGER-Lab/VLM2Vec-Full",
"tiiuae/falcon-40b",
"tiiuae/falcon-mamba-7b-instruct",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"upstage/solar-pro-preview-instruct",
]
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
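
For reference, a short sketch of how these constants are meant to be combined: a mirrored model id is joined onto the bucket with a plain "/", matching the f-string used in vllm/engine/arg_utils.py above. It assumes a vLLM checkout that already contains this commit:

from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3

model = "distilbert/distilgpt2"
if model in MODELS_ON_S3:
    # e.g. s3://vllm-ci-model-weights/distilbert/distilgpt2
    print(f"{MODEL_WEIGHTS_S3_BUCKET}/{model}")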