[ci] Use env var to control whether to use S3 bucket in CI (#13634)
This commit is contained in: parent 322d2a27d6 · commit 2c5e637b57
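
In outline, the diff replaces per-test hardcoded `s3://vllm-ci-model-weights/...` paths with a single opt-in switch: when the `VLLM_CI_USE_S3` environment variable is set to `1`, `EngineArgs` rewrites a known model name to the CI bucket and loads it with the RunAI streamer. A minimal standalone sketch of that decision follows (names mirror `vllm/test_utils.py` and `vllm/envs.py`, but this is an illustration, not the vLLM code itself):

# Standalone sketch of the CI gating logic this commit introduces; the
# bucket name and env var come from the diff, the helper itself is invented
# here purely for illustration.
import os

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = {"distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"}  # subset for the sketch


def resolve_model(model: str, load_format: str = "auto") -> tuple[str, str]:
    """Redirect a known model to the CI bucket only when the env var opts in."""
    use_s3 = os.environ.get("VLLM_CI_USE_S3", "0") == "1"
    if use_s3 and model in MODELS_ON_S3 and load_format == "auto":
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", "runai_streamer"
    return model, load_format


os.environ["VLLM_CI_USE_S3"] = "1"
print(resolve_model("distilbert/distilgpt2"))
# ('s3://vllm-ci-model-weights/distilbert/distilgpt2', 'runai_streamer')
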
@@ -278,7 +278,7 @@ steps:
 command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
 parallelism: 4
 
-- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: PyTorch Fullgraph Smoke Test # 9min
 fast_check: true
 source_file_dependencies:
 - vllm/
@@ -289,7 +289,7 @@ steps:
 - pytest -v -s compile/piecewise/test_simple.py
 - pytest -v -s compile/piecewise/test_toy_llama.py
 
-- label: "PyTorch Fullgraph Test" # 18min
+- label: PyTorch Fullgraph Test # 18min
 source_file_dependencies:
 - vllm/
 - tests/compile
@@ -9,7 +9,6 @@ import weakref
 import pytest
 
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.platforms import current_platform
 
 from ..conftest import VllmRunner
@@ -34,7 +33,7 @@ def v1(run_with_both_engines):
 
 def test_vllm_gc_ed():
 """Verify vllm instance is GC'ed when it is deleted"""
-llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
+llm = LLM("distilbert/distilgpt2")
 weak_llm = weakref.ref(llm)
 del llm
 # If there's any circular reference to vllm, this fails
@@ -43,10 +42,10 @@ def test_vllm_gc_ed():
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
 hf_runner,
 model: str,
@@ -97,8 +96,8 @@ def test_models(
 "test_suite", [
 ("distilbert/distilgpt2", "ray", "", "L4"),
 ("distilbert/distilgpt2", "mp", "", "L4"),
-("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
 ("distilbert/distilgpt2", "ray", "", "A100"),
 ("distilbert/distilgpt2", "mp", "", "A100"),
 ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
@@ -4,11 +4,9 @@ import pytest
 import torch
 
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
 
 
@@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
 "model, use_v1",
 [
 # sleep mode with safetensors
-(f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+("meta-llama/Llama-3.2-1B", True),
 # sleep mode with pytorch checkpoint
 ("facebook/opt-125m", False),
 ])
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
 os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
 free, total = torch.cuda.mem_get_info()
 used_bytes_baseline = total - free # in case other process is running
-load_format = LoadFormat.AUTO
-if "Llama" in model:
-load_format = LoadFormat.RUNAI_STREAMER
-llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
+llm = LLM(model, enable_sleep_mode=True)
 prompt = "How are you?"
 sampling_params = SamplingParams(temperature=0, max_tokens=10)
 output = llm.generate(prompt, sampling_params)
@@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
 init_distributed_environment,
@@ -47,70 +47,6 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 
 _M = TypeVar("_M")
 
-MODELS_ON_S3 = [
-"distilbert/distilgpt2",
-"meta-llama/Llama-2-7b-hf",
-"meta-llama/Meta-Llama-3-8B",
-"meta-llama/Llama-3.2-1B",
-"meta-llama/Llama-3.2-1B-Instruct",
-"openai-community/gpt2",
-"ArthurZ/Ilama-3.2-1B",
-"llava-hf/llava-1.5-7b-hf",
-"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-"ai21labs/Jamba-tiny-random",
-"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-"nm-testing/Phi-3-mini-128k-instruct-FP8",
-"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
-"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-"AMead10/Llama-3.2-1B-Instruct-AWQ",
-"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
-"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
-"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
-"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
-"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
-"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
-"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
-"neuralmagic/Llama-3.2-1B-quantized.w8a8",
-"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
-"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
-"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
-"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
-"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-"nm-testing/tinyllama-oneshot-w8a16-per-channel",
-"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
-"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
-"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
-"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-]
-
-MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
-
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -742,14 +678,8 @@ class VllmRunner:
 enable_chunked_prefill: bool = False,
 swap_space: int = 4,
 enforce_eager: Optional[bool] = False,
-load_format: Optional[LoadFormat] = None,
 **kwargs,
 ) -> None:
-if model_name in MODELS_ON_S3 and not load_format:
-model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
-load_format = LoadFormat.RUNAI_STREAMER
-if not load_format:
-load_format = LoadFormat.AUTO
 self.model = LLM(
 model=model_name,
 task=task,
@@ -764,7 +694,6 @@ class VllmRunner:
 max_model_len=max_model_len,
 block_size=block_size,
 enable_chunked_prefill=enable_chunked_prefill,
-load_format=load_format,
 **kwargs,
 )
 
@@ -2,16 +2,12 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
 # This test checks if we are able to run the engine to completion
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
 "decoration.")
 
 engine_args = EngineArgs(model=model,
-load_format=LoadFormat.RUNAI_STREAMER,
 block_size=block_size,
 enable_prefix_caching=True)
 
@@ -2,15 +2,11 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
 # This test checks if the engine generates completions both with and
 # without optional detokenization, that detokenization includes text
@@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
 "paper clips? Is there an easy to follow video tutorial available "
 "online for free?")
 
-llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
+llm = LLM(model=model)
 sampling_params = SamplingParams(max_tokens=10,
 temperature=0.0,
 detokenize=False)
@@ -6,17 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
-
 
 class Mock:
 ...
@@ -38,12 +33,10 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor
 
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
 with pytest.raises(ValueError):
 engine_args = EngineArgs(model=model,
-load_format=RUNAI_STREAMER_LOAD_FORMAT,
 distributed_executor_backend=Mock)
 LLMEngine.from_engine_args(engine_args)
 with pytest.raises(ValueError):
@@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
 AsyncLLMEngine.from_engine_args(engine_args)
 
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
 cwd = os.path.abspath(".")
 os.chdir(tmp_path)
@@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):
 
 engine_args = EngineArgs(
 model=model,
-load_format=RUNAI_STREAMER_LOAD_FORMAT,
 distributed_executor_backend=CustomUniExecutor,
 enforce_eager=True, # reduce test time
 )
@@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
 os.chdir(cwd)
 
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
 cwd = os.path.abspath(".")
 os.chdir(tmp_path)
@@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):
 
 engine_args = AsyncEngineArgs(
 model=model,
-load_format=RUNAI_STREAMER_LOAD_FORMAT,
 distributed_executor_backend=CustomUniExecutorAsync,
 enforce_eager=True, # reduce test time
 )
@@ -106,8 +95,7 @@ def test_custom_executor_async(model, tmp_path):
 os.chdir(cwd)
 
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_respect_ray(model):
 # even for TP=1 and PP=1,
 # if users specify ray, we should use ray.
@@ -116,7 +104,6 @@ def test_respect_ray(model):
 engine_args = EngineArgs(
 model=model,
 distributed_executor_backend="ray",
-load_format=RUNAI_STREAMER_LOAD_FORMAT,
 enforce_eager=True, # reduce test time
 )
 engine = LLMEngine.from_engine_args(engine_args)
@@ -2,22 +2,19 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-
 
-@pytest.mark.parametrize("model",
-[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
 # This test checks if the flag skip_tokenizer_init skips the initialization
 # of tokenizer and detokenizer. The generated output is expected to contain
 # token ids.
-llm = LLM(model=model,
+llm = LLM(
+model=model,
 skip_tokenizer_init=True,
-load_format=LoadFormat.RUNAI_STREAMER)
+)
 sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
 
 with pytest.raises(ValueError, match="cannot pass text prompts when"):
@@ -5,17 +5,12 @@ from typing import List
 import pytest
 
 from vllm import LLM
-from vllm.config import LoadFormat
 
-from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS
 
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
-
 
 def test_chat():
-llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-load_format=RUNAI_STREAMER_LOAD_FORMAT)
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
 prompt1 = "Explain the concept of entropy."
 messages = [
@@ -33,8 +28,7 @@ def test_chat():
 
 
 def test_multi_chat():
-llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-load_format=RUNAI_STREAMER_LOAD_FORMAT)
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
 prompt1 = "Explain the concept of entropy."
 prompt2 = "Explain what among us is."
@@ -71,8 +65,7 @@ def test_multi_chat():
 [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
 llm = LLM(
-model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
-load_format=RUNAI_STREAMER_LOAD_FORMAT,
+model="microsoft/Phi-3.5-vision-instruct",
 dtype="bfloat16",
 max_model_len=4096,
 max_num_seqs=5,
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
 def echo_rank(self):
 return self.rank
 
-llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
 enforce_eager=True,
 load_format="dummy",
 tensor_parallel_size=tp_size,
@@ -6,10 +6,9 @@ from typing import List
 import pytest
 
 from vllm import LLM, PoolingParams, PoolingRequestOutput
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 
-MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"
+MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
 PROMPTS = [
 "Hello, my name is",
@@ -33,7 +32,6 @@ def llm():
 # pytest caches the fixture so we use weakref.proxy to
 # enable garbage collection
 llm = LLM(model=MODEL_NAME,
-load_format=LoadFormat.RUNAI_STREAMER,
 max_num_batched_tokens=32768,
 tensor_parallel_size=1,
 gpu_memory_utilization=0.75,
@@ -6,10 +6,9 @@ from typing import List
 import pytest
 
 from vllm import LLM, RequestOutput, SamplingParams
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 
-MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"
+MODEL_NAME = "distilbert/distilgpt2"
 
 PROMPTS = [
 "Hello, my name is",
@@ -31,7 +30,6 @@ def llm():
 # pytest caches the fixture so we use weakref.proxy to
 # enable garbage collection
 llm = LLM(model=MODEL_NAME,
-load_format=LoadFormat.RUNAI_STREAMER,
 max_num_batched_tokens=4096,
 tensor_parallel_size=1,
 gpu_memory_utilization=0.10,
@@ -7,11 +7,10 @@ import pytest
 from huggingface_hub import snapshot_download
 
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 
-MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 PROMPTS = [
 "Hello, my name is",
@@ -28,7 +27,6 @@ def llm():
 # pytest caches the fixture so we use weakref.proxy to
 # enable garbage collection
 llm = LLM(model=MODEL_NAME,
-load_format=LoadFormat.RUNAI_STREAMER,
 tensor_parallel_size=1,
 max_model_len=8192,
 enable_lora=True,
@@ -7,13 +7,12 @@ import weakref
 import jsonschema
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
-MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 
 
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 def llm():
 # pytest caches the fixture so we use weakref.proxy to
 # enable garbage collection
-llm = LLM(model=MODEL_NAME,
-load_format=LoadFormat.RUNAI_STREAMER,
-max_model_len=1024)
+llm = LLM(model=MODEL_NAME, max_model_len=1024)
 
 with llm.deprecate_legacy_api():
 yield weakref.proxy(llm)
@@ -6,7 +6,6 @@ from contextlib import nullcontext
 from vllm_test_utils import BlameResult, blame
 
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 
 
@@ -44,8 +43,7 @@ def run_normal():
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM without guided decoding as a baseline.
-llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-load_format=LoadFormat.RUNAI_STREAMER,
+llm = LLM(model="distilbert/distilgpt2",
 enforce_eager=True,
 gpu_memory_utilization=0.3)
 outputs = llm.generate(prompts, sampling_params)
@@ -61,8 +59,7 @@ def run_normal():
 
 def run_lmfe(sample_regex):
 # Create an LLM with guided decoding enabled.
-llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-load_format=LoadFormat.RUNAI_STREAMER,
+llm = LLM(model="distilbert/distilgpt2",
 enforce_eager=True,
 guided_decoding_backend="lm-format-enforcer",
 gpu_memory_utilization=0.3)
@@ -3,7 +3,6 @@
 import pytest
 
 from vllm import LLM
-from vllm.config import LoadFormat
 
 
 @pytest.fixture(autouse=True)
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
 
 
 def test_empty_prompt():
-llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
-load_format=LoadFormat.RUNAI_STREAMER,
-enforce_eager=True)
+llm = LLM(model="openai-community/gpt2", enforce_eager=True)
 with pytest.raises(ValueError, match='Prompt cannot be empty'):
 llm.generate([""])
 
 
 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
-llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
-load_format=LoadFormat.RUNAI_STREAMER,
-enforce_eager=True)
+llm = LLM(model="openai-community/gpt2", enforce_eager=True)
 with pytest.raises(ValueError, match='out of vocabulary'):
 llm.generate({"prompt_token_ids": [999999]})
|
@ -8,21 +8,17 @@ import ray
|
|||||||
from prometheus_client import REGISTRY
|
from prometheus_client import REGISTRY
|
||||||
|
|
||||||
from vllm import EngineArgs, LLMEngine
|
from vllm import EngineArgs, LLMEngine
|
||||||
from vllm.config import LoadFormat
|
|
||||||
from vllm.distributed import cleanup_dist_env_and_memory
|
from vllm.distributed import cleanup_dist_env_and_memory
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||||
from vllm.engine.metrics import RayPrometheusStatLogger
|
from vllm.engine.metrics import RayPrometheusStatLogger
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
|
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
|
||||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"distilbert/distilgpt2",
|
"distilbert/distilgpt2",
|
||||||
]
|
]
|
||||||
|
|
||||||
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("dtype", ["float"])
|
@pytest.mark.parametrize("dtype", ["float"])
|
||||||
@ -146,9 +142,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
|
|||||||
metrics_tag_content = stat_logger.labels["model_name"]
|
metrics_tag_content = stat_logger.labels["model_name"]
|
||||||
|
|
||||||
if served_model_name is None or served_model_name == []:
|
if served_model_name is None or served_model_name == []:
|
||||||
actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
|
assert metrics_tag_content == f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", (
|
||||||
assert metrics_tag_content == actual_model_name, (
|
f"Metrics tag model_name is wrong! expect: {model!r}\n"
|
||||||
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
|
|
||||||
f"actual: {metrics_tag_content!r}")
|
f"actual: {metrics_tag_content!r}")
|
||||||
else:
|
else:
|
||||||
assert metrics_tag_content == served_model_name[0], (
|
assert metrics_tag_content == served_model_name[0], (
|
||||||
@ -174,10 +169,11 @@ async def test_async_engine_log_metrics_regression(
|
|||||||
when disable_log_stats=False
|
when disable_log_stats=False
|
||||||
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
|
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
|
||||||
"""
|
"""
|
||||||
engine_args = AsyncEngineArgs(model=model,
|
engine_args = AsyncEngineArgs(
|
||||||
|
model=model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
disable_log_stats=disable_log_stats,
|
disable_log_stats=disable_log_stats,
|
||||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
)
|
||||||
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
|
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
|
||||||
for i, prompt in enumerate(example_prompts):
|
for i, prompt in enumerate(example_prompts):
|
||||||
results = async_engine.generate(
|
results = async_engine.generate(
|
||||||
@ -189,7 +185,7 @@ async def test_async_engine_log_metrics_regression(
|
|||||||
async for _ in results:
|
async for _ in results:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
assert_metrics(async_engine.engine, disable_log_stats,
|
assert_metrics(model, async_engine.engine, disable_log_stats,
|
||||||
len(example_prompts))
|
len(example_prompts))
|
||||||
|
|
||||||
|
|
||||||
@ -204,10 +200,11 @@ def test_engine_log_metrics_regression(
|
|||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
disable_log_stats: bool,
|
disable_log_stats: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
engine_args = EngineArgs(model=model,
|
engine_args = EngineArgs(
|
||||||
|
model=model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
disable_log_stats=disable_log_stats,
|
disable_log_stats=disable_log_stats,
|
||||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
)
|
||||||
engine = LLMEngine.from_engine_args(engine_args)
|
engine = LLMEngine.from_engine_args(engine_args)
|
||||||
for i, prompt in enumerate(example_prompts):
|
for i, prompt in enumerate(example_prompts):
|
||||||
engine.add_request(
|
engine.add_request(
|
||||||
@ -218,7 +215,8 @@ def test_engine_log_metrics_regression(
|
|||||||
while engine.has_unfinished_requests():
|
while engine.has_unfinished_requests():
|
||||||
engine.step()
|
engine.step()
|
||||||
|
|
||||||
assert_metrics(engine, disable_log_stats, len(example_prompts))
|
assert_metrics(f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", engine,
|
||||||
|
disable_log_stats, len(example_prompts))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@ -285,14 +283,15 @@ def test_metric_spec_decode_interval(
|
|||||||
) -> None:
|
) -> None:
|
||||||
k = 5
|
k = 5
|
||||||
|
|
||||||
engine_args = EngineArgs(model=model,
|
engine_args = EngineArgs(
|
||||||
|
model=model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
disable_log_stats=False,
|
disable_log_stats=False,
|
||||||
gpu_memory_utilization=0.4,
|
gpu_memory_utilization=0.4,
|
||||||
speculative_model=model,
|
speculative_model=model,
|
||||||
num_speculative_tokens=k,
|
num_speculative_tokens=k,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
)
|
||||||
|
|
||||||
engine = LLMEngine.from_engine_args(engine_args)
|
engine = LLMEngine.from_engine_args(engine_args)
|
||||||
|
|
||||||
@ -359,7 +358,7 @@ def test_metric_spec_decode_interval(
|
|||||||
cleanup_dist_env_and_memory()
|
cleanup_dist_env_and_memory()
|
||||||
|
|
||||||
|
|
||||||
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
|
def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
|
||||||
num_requests: int) -> None:
|
num_requests: int) -> None:
|
||||||
if disable_log_stats:
|
if disable_log_stats:
|
||||||
with pytest.raises(AttributeError):
|
with pytest.raises(AttributeError):
|
||||||
@ -370,7 +369,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
|
|||||||
# Ensure the count bucket of request-level histogram metrics matches
|
# Ensure the count bucket of request-level histogram metrics matches
|
||||||
# the number of requests as a simple sanity check to ensure metrics are
|
# the number of requests as a simple sanity check to ensure metrics are
|
||||||
# generated
|
# generated
|
||||||
labels = {'model_name': engine.model_config.model}
|
labels = {'model_name': model}
|
||||||
request_histogram_metrics = [
|
request_histogram_metrics = [
|
||||||
"vllm:e2e_request_latency_seconds",
|
"vllm:e2e_request_latency_seconds",
|
||||||
"vllm:request_prompt_tokens",
|
"vllm:request_prompt_tokens",
|
||||||
|
@@ -7,7 +7,6 @@ from transformers import PretrainedConfig
 
 from vllm import LLM
 
-from ..conftest import MODELS_ON_S3
 from .registry import HF_EXAMPLE_MODELS
 
 
@@ -43,11 +42,8 @@ def test_can_initialize(model_arch):
 
 with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
 _initialize_kv_caches):
-model_name = model_info.default
-if model_name in MODELS_ON_S3:
-model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}"
 LLM(
-model_name,
+model_info.default,
 tokenizer=model_info.tokenizer,
 tokenizer_mode=model_info.tokenizer_mode,
 speculative_model=model_info.speculative_model,
|
@ -10,8 +10,8 @@ import pytest
|
|||||||
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
|
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
|
|
||||||
MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
|
MODEL = "google/gemma-1.1-2b-it"
|
||||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer")
|
ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
|
||||||
RAISED_ERROR = KeyError
|
RAISED_ERROR = KeyError
|
||||||
RAISED_VALUE = "foo"
|
RAISED_VALUE = "foo"
|
||||||
EXPECTED_TOKENS = 250
|
EXPECTED_TOKENS = 250
|
||||||
|
@ -21,10 +21,8 @@ from vllm.lora.request import LoRARequest
|
|||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
|
MODEL = "google/gemma-1.1-2b-it"
|
||||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
|
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
|
||||||
load_format="runai_streamer",
|
|
||||||
enforce_eager=True)
|
|
||||||
RAISED_ERROR = KeyError
|
RAISED_ERROR = KeyError
|
||||||
RAISED_VALUE = "foo"
|
RAISED_VALUE = "foo"
|
||||||
|
|
||||||
|
@ -10,14 +10,12 @@ import pytest
|
|||||||
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
|
from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
|
|
||||||
MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it"
|
MODEL = "google/gemma-1.1-2b-it"
|
||||||
NUM_EXPECTED_TOKENS = 10
|
NUM_EXPECTED_TOKENS = 10
|
||||||
NUM_REQUESTS = 10000
|
NUM_REQUESTS = 10000
|
||||||
|
|
||||||
# Scenarios to test for num generated token.
|
# Scenarios to test for num generated token.
|
||||||
ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
|
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
|
||||||
load_format="runai_streamer",
|
|
||||||
disable_log_requests=True)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
|
@@ -553,8 +553,7 @@ def test_find_mm_placeholders(
 assert result == expected
 
 
-@pytest.mark.parametrize(
-"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
 ("limit", "num_supported", "is_valid"),
 [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
@@ -593,8 +592,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
 profiler.get_dummy_data(model_config.max_model_len)
 
 
-@pytest.mark.parametrize(
-"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize(
 ("num_images", "limit", "is_valid"),
 [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
|
@ -16,7 +16,7 @@ from vllm.engine.llm_engine import LLMEngine
|
|||||||
from ..models.utils import check_outputs_equal
|
from ..models.utils import check_outputs_equal
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"facebook/opt-125m",
|
"distilbert/distilgpt2",
|
||||||
]
|
]
|
||||||
|
|
||||||
UNSTABLE_PROMPT_SEQUENCE = [
|
UNSTABLE_PROMPT_SEQUENCE = [
|
||||||
|
@@ -8,20 +8,14 @@ from vllm.config import ModelConfig, PoolerConfig
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
 
-from .conftest import MODEL_WEIGHTS_S3_BUCKET
-
 
 @pytest.mark.parametrize(
 ("model_id", "expected_runner_type", "expected_task"),
 [
-(f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
-"generate"),
-(f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
-"pooling", "embed"),
-(f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
-"classify"),
-(f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
-"pooling", "score"),
+("distilbert/distilgpt2", "generate", "generate"),
+("intfloat/e5-mistral-7b-instruct", "pooling", "embed"),
+("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
+("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
 ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
 ("openai/whisper-small", "transcription", "transcription"),
 ],
|
@ -10,9 +10,6 @@ import gc
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.config import LoadFormat
|
|
||||||
|
|
||||||
from .conftest import MODEL_WEIGHTS_S3_BUCKET
|
|
||||||
|
|
||||||
|
|
||||||
def test_duplicated_ignored_sequence_group():
|
def test_duplicated_ignored_sequence_group():
|
||||||
@ -21,8 +18,7 @@ def test_duplicated_ignored_sequence_group():
|
|||||||
sampling_params = SamplingParams(temperature=0.01,
|
sampling_params = SamplingParams(temperature=0.01,
|
||||||
top_p=0.1,
|
top_p=0.1,
|
||||||
max_tokens=256)
|
max_tokens=256)
|
||||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
|
llm = LLM(model="distilbert/distilgpt2",
|
||||||
load_format=LoadFormat.RUNAI_STREAMER,
|
|
||||||
max_num_batched_tokens=4096,
|
max_num_batched_tokens=4096,
|
||||||
tensor_parallel_size=1)
|
tensor_parallel_size=1)
|
||||||
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
|
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
|
||||||
@ -35,8 +31,7 @@ def test_max_tokens_none():
|
|||||||
sampling_params = SamplingParams(temperature=0.01,
|
sampling_params = SamplingParams(temperature=0.01,
|
||||||
top_p=0.1,
|
top_p=0.1,
|
||||||
max_tokens=None)
|
max_tokens=None)
|
||||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
|
llm = LLM(model="distilbert/distilgpt2",
|
||||||
load_format=LoadFormat.RUNAI_STREAMER,
|
|
||||||
max_num_batched_tokens=4096,
|
max_num_batched_tokens=4096,
|
||||||
tensor_parallel_size=1)
|
tensor_parallel_size=1)
|
||||||
prompts = ["Just say hello!"]
|
prompts = ["Just say hello!"]
|
||||||
@ -46,9 +41,7 @@ def test_max_tokens_none():
|
|||||||
|
|
||||||
|
|
||||||
def test_gc():
|
def test_gc():
|
||||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
|
llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
|
||||||
load_format=LoadFormat.RUNAI_STREAMER,
|
|
||||||
enforce_eager=True)
|
|
||||||
del llm
|
del llm
|
||||||
|
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
@@ -10,7 +10,7 @@ from vllm.worker.worker import Worker
 
 def test_swap() -> None:
 # Configure the engine.
-engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2",
+engine_args = EngineArgs(model="distilbert/distilgpt2",
 dtype="half",
 load_format="dummy")
 engine_config = engine_args.create_engine_config()
@@ -22,6 +22,7 @@ from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.plugins import load_general_plugins
+from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
@@ -1141,6 +1142,14 @@ class EngineArgs:
 f", but got {self.cpu_offload_gb}")
 
 device_config = DeviceConfig(device=self.device)
+
+# NOTE: This is to allow model loading from S3 in CI
+if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+and self.model in MODELS_ON_S3
+and self.load_format == LoadFormat.AUTO): # noqa: E501
+self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+self.load_format = LoadFormat.RUNAI_STREAMER
+
 model_config = self.create_model_config()
 
 if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
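
For illustration, a hedged usage sketch of the branch added above (it assumes a working vLLM install and that `distilbert/distilgpt2` stays listed in `MODELS_ON_S3`; the exact method the branch lives in is the one containing the hunk, which builds the engine config):

# Hedged sketch, not part of the diff: with the CI flag set, EngineArgs for a
# model in MODELS_ON_S3 with the default load_format is expected to be
# rewritten to the S3 path and switched to the RunAI streamer.
import os

os.environ["VLLM_CI_USE_S3"] = "1"  # must be set before the check runs

from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(model="distilbert/distilgpt2")  # listed in MODELS_ON_S3
config = args.create_engine_config()
# Expected effect per the diff:
#   args.model       -> "s3://vllm-ci-model-weights/distilbert/distilgpt2"
#   args.load_format -> LoadFormat.RUNAI_STREAMER
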
@@ -618,6 +618,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
 # Port of the master node in the data parallel setting
 "VLLM_DP_MASTER_PORT":
 lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),
+
+# Whether to use S3 path for model loading in CI via RunAI Streamer
+"VLLM_CI_USE_S3":
+lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
 }
 
 # end-env-vars-definition
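
A small, hedged example of how the new flag surfaces through `vllm.envs` (the table maps the name to the lambda shown above, so the boolean is computed from `os.environ` when the attribute is looked up):

# Hedged sketch: reading the new CI flag through vllm.envs.
import os

os.environ["VLLM_CI_USE_S3"] = "1"

import vllm.envs as envs

print(envs.VLLM_CI_USE_S3)  # True; any value other than "1" reads as False
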
@@ -1394,7 +1394,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):
 
 def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
 """Get a model loader based on the load format."""
-
 if isinstance(load_config.load_format, type):
 return load_config.load_format(load_config)
 
vllm/test_utils.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: Apache-2.0
+MODELS_ON_S3 = [
+"adept/fuyu-8b",
+"ai21labs/AI21-Jamba-1.5-Mini",
+"ai21labs/Jamba-tiny-random",
+"ai21labs/Jamba-tiny-reward-dev",
+"allenai/Molmo-7B-D-0924",
+"allenai/OLMo-1B-hf",
+"allenai/OLMoE-1B-7B-0924-Instruct",
+"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+"AMead10/Llama-3.2-1B-Instruct-AWQ",
+"ArthurZ/Ilama-3.2-1B",
+"BAAI/bge-base-en-v1.5",
+"BAAI/bge-multilingual-gemma2",
+"BAAI/bge-reranker-v2-m3",
+"bigcode/starcoder2-3b",
+"cross-encoder/ms-marco-MiniLM-L-6-v2",
+"cross-encoder/quora-roberta-base",
+"deepseek-ai/deepseek-vl2-tiny",
+"distilbert/distilgpt2",
+"facebook/bart-base",
+"facebook/bart-large-cnn",
+# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+"google/gemma-1.1-2b-it",
+"google/gemma-2-2b-it",
+"google/paligemma-3b-pt-224",
+"h2oai/h2ovl-mississippi-800m",
+"HuggingFaceM4/Idefics3-8B-Llama3",
+"internlm/internlm2-1_8b-reward",
+"intfloat/e5-mistral-7b-instruct",
+"intfloat/multilingual-e5-large",
+"jason9693/Qwen2.5-1.5B-apeach",
+"llava-hf/llava-1.5-7b-hf",
+"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+"llava-hf/llava-v1.6-mistral-7b-hf",
+"llava-hf/LLaVA-NeXT-Video-7B-hf",
+# "meta-llama/Llama-2-7b-hf",
+"meta-llama/Llama-3.2-11B-Vision-Instruct",
+"meta-llama/Llama-3.2-1B",
+"meta-llama/Llama-3.2-1B-Instruct",
+"meta-llama/Meta-Llama-3-8B",
+"microsoft/phi-2",
+"microsoft/Phi-3-mini-4k-instruct",
+"microsoft/Phi-3-small-8k-instruct",
+"microsoft/Phi-3-vision-128k-instruct",
+"microsoft/Phi-3.5-MoE-instruct",
+"microsoft/Phi-3.5-vision-instruct",
+# "mistralai/Mistral-7B-Instruct-v0.1",
+"mistralai/Mixtral-8x7B-Instruct-v0.1",
+"mistralai/Pixtral-12B-2409",
+"mistral-community/Mixtral-8x22B-v0.1-AWQ",
+"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+"neuralmagic/Llama-3.2-1B-quantized.w8a8",
+"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
+"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+"nm-testing/Phi-3-mini-128k-instruct-FP8",
+"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+"nm-testing/tinyllama-oneshot-w8a16-per-channel",
+"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+"nvidia/NVLM-D-72B",
+"openai-community/gpt2",
+# "openai/whisper-large-v3",
+"openbmb/MiniCPM-o-2_6",
+"openbmb/MiniCPM-V-2_6",
+"OpenGVLab/InternVL2-1B",
+"parasail-ai/GritLM-7B-vllm",
+"Qwen/Qwen1.5-MoE-A2.7B-Chat",
+"Qwen/Qwen2-7B-Instruct",
+"Qwen/Qwen2-Audio-7B-Instruct",
+"Qwen/Qwen2-VL-2B-Instruct",
+"Qwen/Qwen2.5-1.5B-Instruct",
+"Qwen/Qwen2.5-Math-PRM-7B",
+"Qwen/Qwen2.5-Math-RM-72B",
+"Qwen/Qwen2.5-VL-3B-Instruct",
+"royokong/e5-v",
+"sentence-transformers/all-roberta-large-v1",
+"sentence-transformers/stsb-roberta-base-v2",
+"shanearora/OLMo-7B-1124-hf",
+"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+"ssmits/Qwen2-7B-Instruct-embed-base",
+"stabilityai/stablelm-3b-4e1t",
+"stabilityai/stablelm-zephyr-3b",
+"state-spaces/mamba-130m-hf",
+"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+"THUDM/glm-4v-9b",
+"TIGER-Lab/Mantis-8B-siglip-llama3",
+"TIGER-Lab/VLM2Vec-Full",
+"tiiuae/falcon-40b",
+"tiiuae/falcon-mamba-7b-instruct",
+"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+"upstage/solar-pro-preview-instruct",
+]
+
+MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"