[CI/Build] Update CPU tests to include all "standard" tests (#5481)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 208ce622c7
commit b489fc3c91

@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU are not supported
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# online inference
docker exec cpu-test bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \

@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

# offline inference
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
docker exec cpu-test-avx2 bash -c "
set -e
python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
set -e
pip install pytest pytest-asyncio \
decord einops librosa peft Pillow sentence-transformers soundfile \
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU are not supported
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

# Run compressed-tensor test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
docker exec cpu-test bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# online inference
docker exec cpu-test bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
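
For context: the online-inference blocks above start vLLM's OpenAI-compatible server on port 8000 and wait for /v1/models before benchmarking it. A rough sketch of querying such a server from Python once it is up (hypothetical prompt; assumes the requests package is available in the test image):

import requests

# Hypothetical smoke check against the server started by the script above.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"model": "facebook/opt-125m", "prompt": "Hello, my name is", "max_tokens": 8},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])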

@@ -269,7 +269,6 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: Quantization Test # 33min

@@ -331,7 +330,7 @@ steps:
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/

@@ -93,7 +93,8 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
"skip_global_cleanup",
"core_model: run this model test in each PR instead of just daily",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
"skip_v1: do not run this test with v1",
]
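
For context: with these markers registered, a model test opts into the per-PR and CPU suites by carrying the corresponding marks, and the CPU pipeline picks them up with "pytest -m cpu_model" as in the scripts above. A minimal sketch (hypothetical test, for illustration only):

import pytest

@pytest.mark.core_model  # run on every PR instead of only nightly
@pytest.mark.cpu_model   # also collected by the CPU pipeline via "pytest -m cpu_model"
def test_opt_125m_greedy_generation():
    # Hypothetical body: load facebook/opt-125m and compare vLLM vs HF outputs.
    ...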

@@ -12,9 +12,7 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
opencv-python # required for video tests
peft
requests
ray[adag]==2.35
sentence-transformers # required for embedding tests
soundfile # required for audio tests

@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

# Benchmarking
aiohttp

# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9

@@ -5,11 +5,11 @@ import pytest
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from tests.utils import RemoteOpenAIServer
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"

@@ -39,7 +39,10 @@ def audio(request):
return AudioAsset(request.param)


@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def server(request, audio_assets):
args = [
"--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",

@@ -185,7 +188,10 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
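
For context: the pattern above is standard pytest behavior, not vLLM-specific. Wrapping a single parametrize value in pytest.param lets a mark apply to that one case only, so the default-kwargs case runs when the CPU pipeline selects tests with -m cpu_model while the chunked-prefill case stays GPU-only. A minimal sketch (illustrative kwargs, not the real constants):

import pytest

# Illustrative value; the real CHUNKED_PREFILL_KWARGS live in the test module.
CHUNKED_PREFILL_KWARGS = {"enable_chunked_prefill": True}

@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),  # selected by "pytest -m cpu_model"
    pytest.param(CHUNKED_PREFILL_KWARGS),           # unmarked: skipped in CPU-only runs
])
def test_generation(vllm_kwargs: dict) -> None:
    ...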

@@ -14,7 +14,6 @@ models = [
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
"h2oai/h2ovl-mississippi-2b",
]
target_dtype = "bfloat16"


def run_preprocessing_test(

@@ -94,7 +94,7 @@ VLM_TEST_SETTINGS = {
),
limit_mm_per_prompt={"image": 4},
)],
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],

@@ -111,7 +111,8 @@ VLM_TEST_SETTINGS = {
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="half" if current_platform.is_rocm() else ("half", "float"),
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
else ("half", "float")),
marks=[pytest.mark.core_model],
),
"qwen2_vl": VLMTestInfo(

@@ -128,7 +129,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
#### Extended model tests

@@ -172,7 +173,6 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
dtype="bfloat16" if current_platform.is_cpu() else "half",
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"glm4": VLMTestInfo(

@@ -245,7 +245,6 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
dtype="half",
num_video_frames=16,
max_model_len=16384,
postprocess_inputs=model_utils.get_key_type_post_processor(

@@ -404,7 +403,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
dtype="bfloat16" if current_platform.is_cpu() else "half",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
custom_test_opts=[

@@ -419,7 +417,6 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384,
max_num_seqs=2,
dtype="half",
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),

@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


target_dtype = "half"
if current_platform.is_cpu():
target_dtype = "bfloat16"

# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime

@@ -5,7 +5,6 @@ import torch

from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.platforms import current_platform
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

TokensText = Tuple[List[int], str]

@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if current_platform.is_cpu() else "half"
dtype = "half"

model_config = ModelConfig(
model_name,

@@ -27,4 +27,4 @@ class ImageAsset:
"""
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
s3_prefix=VLM_IMAGES_DIR)
return torch.load(image_path)
return torch.load(image_path, map_location="cpu")
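
For context: torch.load deserializes tensors onto the device they were saved from by default, so map_location="cpu" forces the image assets onto the CPU and lets GPU-less CI hosts load them. A minimal sketch (hypothetical asset path, assuming the file holds a single tensor):

import torch

# Hypothetical path; the real one comes from get_vllm_public_assets(...).
image = torch.load("stop_sign.pt", map_location="cpu")
print(image.device)  # expected: cpu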

@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)

@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
try:
import librosa
import soundfile
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
"Please install vllm[audio] for audio support.") from exc
return librosa, soundfile

@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
try:
import cv2
import decord
except ImportError:
except ImportError as exc:
raise ImportError(
"Please install vllm[video] for video support.") from None
"Please install vllm[video] for video support.") from exc
return cv2, decord
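
For context: the three ImportError handlers above switch from "from None" to "from exc". This is standard Python exception chaining: "from None" suppresses the original traceback, while "from exc" keeps it attached as __cause__, so the user can see which optional package was actually missing alongside the install hint. A minimal sketch of the pattern:

def need_audio_deps():
    try:
        import librosa  # optional dependency
    except ImportError as exc:
        # "from exc" preserves the original ImportError as __cause__;
        # "from None" would hide it and show only the install hint.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc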

@@ -151,7 +151,11 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self.local_omp_cpuid = omp_cpuids.split("|")[rank]

ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
if self.model_config.is_encoder_decoder:
if self.model_config.task == "embedding":
raise NotImplementedError(
"Embedding models are not supported for CPU backend")
# ModelRunnerClass = CPUEmbeddingModelRunner
elif self.model_config.is_encoder_decoder:
ModelRunnerClass = CPUEncoderDecoderModelRunner
self.model_runner: CPUModelRunner = ModelRunnerClass(
vllm_config=vllm_config,