[CI/Build] Update CPU tests to include all "standard" tests (#5481)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 208ce622c7
commit b489fc3c91
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" \
-    --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_mamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
+  pytest -v -s tests/models/encoder_decoder/language
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # online inference
 docker exec cpu-test bash -c "
+  set -e
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
   timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
   python3 benchmarks/benchmark_serving.py \
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
   --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

 # offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "
+  set -e
+  python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/decoder_only/language/test_jamba.py \
-    --ignore=tests/models/decoder_only/language/test_mamba.py \
-    --ignore=tests/models/decoder_only/language/test_granitemoe.py \
-    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # Run compressed-tensor test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

 # Run AWQ test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
     tests/quantization/test_ipex_quant.py"

 # online inference
 docker exec cpu-test bash -c "
+  set -e
   export VLLM_CPU_KVCACHE_SPACE=10
   export VLLM_CPU_OMP_THREADS_BIND=48-92
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
@@ -269,7 +269,6 @@ steps:
   source_file_dependencies:
   - benchmarks/
   commands:
-  - pip install aiohttp
   - bash run-benchmarks.sh

- label: Quantization Test # 33min
@@ -331,7 +330,7 @@ steps:
   commands:
   - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

-- label: Decoder-only Multi-Modal Models Test (Standard)
+- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -93,7 +93,8 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "core_model: run this model test in each PR instead of just daily",
+    "core_model: enable this model test in each PR instead of only nightly",
+    "cpu_model: enable this model test in CPU tests",
     "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
     "skip_v1: do not run this test with v1",
 ]
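The cpu_model marker registered above is what the updated CPU CI scripts select with `pytest ... -m cpu_model`. A minimal sketch of how such a registered marker is applied, using a hypothetical test name that is not taken from the repository:

    import pytest

    # Registering "cpu_model" in pyproject.toml keeps pytest from warning about an
    # unknown marker; the decorator is all a test needs to opt into the CPU job.
    @pytest.mark.cpu_model
    def test_generates_text_on_cpu():
        # Collected by `pytest -m cpu_model`; excluded by `pytest -m "not cpu_model"`.
        assert True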
@@ -12,9 +12,7 @@ decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
-opencv-python # required for video tests
 peft
-requests
 ray[adag]==2.35
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

-# Benchmarking
-aiohttp
-
 # quantization
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.9
@@ -5,11 +5,11 @@ import pytest
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding

-from tests.utils import RemoteOpenAIServer
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import HfRunner, VllmRunner
+from ....utils import RemoteOpenAIServer
 from ...utils import check_logprobs_close

 MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
     return AudioAsset(request.param)


-@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+@pytest.fixture(params=[
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def server(request, audio_assets):
     args = [
         "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                 num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                      max_tokens: int, num_logprobs: int,
                                      vllm_kwargs: dict) -> None:
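The ultravox hunks above replace bare parameter values with pytest.param wrappers so a marker can attach to a single parametrization: only the default-kwargs case carries cpu_model and therefore runs in the CPU job, while the chunked-prefill case stays in the default GPU run. A self-contained sketch of that mechanism, with a placeholder value standing in for the real CHUNKED_PREFILL_KWARGS:

    import pytest

    CHUNKED_PREFILL_KWARGS = {"enable_chunked_prefill": True}  # placeholder for illustration

    @pytest.mark.parametrize("vllm_kwargs", [
        pytest.param({}, marks=pytest.mark.cpu_model),  # also collected under -m cpu_model
        pytest.param(CHUNKED_PREFILL_KWARGS),           # collected only in the default run
    ])
    def test_kwargs_are_dicts(vllm_kwargs: dict) -> None:
        assert isinstance(vllm_kwargs, dict)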
@@ -14,7 +14,6 @@ models = [
     "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
     "h2oai/h2ovl-mississippi-2b",
 ]
-target_dtype = "bfloat16"


 def run_preprocessing_test(
@@ -94,7 +94,7 @@ VLM_TEST_SETTINGS = {
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "paligemma": VLMTestInfo(
         models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@ VLM_TEST_SETTINGS = {
             "pixel_values"
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="half" if current_platform.is_rocm() else ("half", "float"),
+        dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
+               else ("half", "float")),
         marks=[pytest.mark.core_model],
     ),
     "qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     #### Extended model tests
@@ -172,7 +173,6 @@ VLM_TEST_SETTINGS = {
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     "glm4": VLMTestInfo(
@@ -245,7 +245,6 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-        dtype="half",
         num_video_frames=16,
         max_model_len=16384,
         postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
         custom_test_opts=[
@@ -419,7 +417,6 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        dtype="half",
         postprocess_inputs=model_utils.get_key_type_post_processor(
             "pixel_values"
         ),
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


 target_dtype = "half"
-if current_platform.is_cpu():
-    target_dtype = "bfloat16"

 # ROCm Triton FA can run into shared memory issues with these models,
 # use other backends in the meantime
|
@ -5,7 +5,6 @@ import torch
|
|||||||
|
|
||||||
from vllm.config import ModelConfig, TaskOption
|
from vllm.config import ModelConfig, TaskOption
|
||||||
from vllm.inputs import InputContext
|
from vllm.inputs import InputContext
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||||
|
|
||||||
TokensText = Tuple[List[int], str]
|
TokensText = Tuple[List[int], str]
|
||||||
@ -270,7 +269,7 @@ def build_model_context(model_name: str,
|
|||||||
if tokenizer_name is None:
|
if tokenizer_name is None:
|
||||||
tokenizer_name = model_name
|
tokenizer_name = model_name
|
||||||
if dtype is None:
|
if dtype is None:
|
||||||
dtype = "bfloat16" if current_platform.is_cpu() else "half"
|
dtype = "half"
|
||||||
|
|
||||||
model_config = ModelConfig(
|
model_config = ModelConfig(
|
||||||
model_name,
|
model_name,
|
||||||
|
@ -27,4 +27,4 @@ class ImageAsset:
|
|||||||
"""
|
"""
|
||||||
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
||||||
s3_prefix=VLM_IMAGES_DIR)
|
s3_prefix=VLM_IMAGES_DIR)
|
||||||
return torch.load(image_path)
|
return torch.load(image_path, map_location="cpu")
|
||||||
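Passing map_location="cpu" makes torch.load place the deserialized tensors on the CPU even when the asset was serialized from a CUDA tensor, which is what a CPU-only CI host needs. A small illustration with a hypothetical file name:

    import torch

    torch.save(torch.rand(2, 3), "example.pt")

    # Without map_location, a tensor saved from a GPU would try to deserialize back
    # onto that CUDA device and fail on a machine without one.
    restored = torch.load("example.pt", map_location="cpu")
    print(restored.device)  # cpu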
|
@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
|
|||||||
if sr != feature_extractor.sampling_rate:
|
if sr != feature_extractor.sampling_rate:
|
||||||
try:
|
try:
|
||||||
import librosa
|
import librosa
|
||||||
except ImportError:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install vllm[audio] for audio support.") from None
|
"Please install vllm[audio] for audio support.") from exc
|
||||||
audio = librosa.resample(audio,
|
audio = librosa.resample(audio,
|
||||||
orig_sr=sr,
|
orig_sr=sr,
|
||||||
target_sr=feature_extractor.sampling_rate)
|
target_sr=feature_extractor.sampling_rate)
|
||||||
|
@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
|
|||||||
try:
|
try:
|
||||||
import librosa
|
import librosa
|
||||||
import soundfile
|
import soundfile
|
||||||
except ImportError:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install vllm[audio] for audio support.") from None
|
"Please install vllm[audio] for audio support.") from exc
|
||||||
return librosa, soundfile
|
return librosa, soundfile
|
||||||
|
|
||||||
|
|
||||||
@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
|
|||||||
try:
|
try:
|
||||||
import cv2
|
import cv2
|
||||||
import decord
|
import decord
|
||||||
except ImportError:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install vllm[video] for video support.") from None
|
"Please install vllm[video] for video support.") from exc
|
||||||
return cv2, decord
|
return cv2, decord
|
||||||
|
|
||||||
|
|
||||||
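The three hunks above switch the optional-import handling from `raise ... from None` to `raise ... from exc`, so the original ImportError is chained as the new error's cause instead of being suppressed, and the traceback shows which optional dependency actually failed to import. A minimal sketch of the pattern:

    def require_audio_support():
        try:
            import librosa  # optional extra
        except ImportError as exc:
            # "from exc" keeps the original ImportError as __cause__ in the traceback;
            # "from None" would hide it.
            raise ImportError(
                "Please install vllm[audio] for audio support.") from exc
        return librosa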
|
@ -151,7 +151,11 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
|
|||||||
self.local_omp_cpuid = omp_cpuids.split("|")[rank]
|
self.local_omp_cpuid = omp_cpuids.split("|")[rank]
|
||||||
|
|
||||||
ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
|
ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
|
||||||
if self.model_config.is_encoder_decoder:
|
if self.model_config.task == "embedding":
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Embedding models are not supported for CPU backend")
|
||||||
|
# ModelRunnerClass = CPUEmbeddingModelRunner
|
||||||
|
elif self.model_config.is_encoder_decoder:
|
||||||
ModelRunnerClass = CPUEncoderDecoderModelRunner
|
ModelRunnerClass = CPUEncoderDecoderModelRunner
|
||||||
self.model_runner: CPUModelRunner = ModelRunnerClass(
|
self.model_runner: CPUModelRunner = ModelRunnerClass(
|
||||||
vllm_config=vllm_config,
|
vllm_config=vllm_config,
|
||||||
|
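The worker hunk above rejects embedding models up front and only then falls back to the encoder-decoder or default runner. A stripped-down sketch of that selection order, using a hypothetical config object rather than the actual vLLM classes:

    class FakeModelConfig:
        """Hypothetical stand-in exposing only the fields the selection reads."""

        def __init__(self, task: str, is_encoder_decoder: bool) -> None:
            self.task = task
            self.is_encoder_decoder = is_encoder_decoder


    def select_runner_name(config: FakeModelConfig) -> str:
        if config.task == "embedding":
            # Fail fast, mirroring the new guard in the CPU worker.
            raise NotImplementedError(
                "Embedding models are not supported for CPU backend")
        elif config.is_encoder_decoder:
            return "CPUEncoderDecoderModelRunner"
        return "CPUModelRunner"


    print(select_runner_name(FakeModelConfig("generate", is_encoder_decoder=True)))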