[CI/Build] Update CPU tests to include all "standard" tests (#5481)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 208ce622c7
commit b489fc3c91
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" \
-    --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_mamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
+  pytest -v -s tests/models/encoder_decoder/language
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # online inference
 docker exec cpu-test bash -c "
+  set -e
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
   timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
   python3 benchmarks/benchmark_serving.py \
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
   --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

 # offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "
+  set -e
+  python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  set -e
+  pip install pytest pytest-asyncio \
+    decord einops librosa peft Pillow sentence-transformers soundfile \
+    transformers_stream_generator matplotlib datamodel_code_generator
+  pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+  # Embedding models are not supported for CPU yet
+  # pytest -v -s tests/models/embedding/language
   pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/decoder_only/language/test_jamba.py \
-    --ignore=tests/models/decoder_only/language/test_mamba.py \
-    --ignore=tests/models/decoder_only/language/test_granitemoe.py \
-    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language/test_models.py
+  # Chunked prefill not supported for CPU yet
+  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

 # Run compressed-tensor test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

 # Run AWQ test
 docker exec cpu-test bash -c "
+  set -e
   pytest -s -v \
     tests/quantization/test_ipex_quant.py"

 # online inference
 docker exec cpu-test bash -c "
+  set -e
   export VLLM_CPU_KVCACHE_SPACE=10
   export VLLM_CPU_OMP_THREADS_BIND=48-92
   python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
@@ -269,7 +269,6 @@ steps:
   source_file_dependencies:
   - benchmarks/
   commands:
-  - pip install aiohttp
   - bash run-benchmarks.sh

- label: Quantization Test # 33min
@@ -331,7 +330,7 @@ steps:
   commands:
   - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

-- label: Decoder-only Multi-Modal Models Test (Standard)
+- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -93,7 +93,8 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "core_model: run this model test in each PR instead of just daily",
+    "core_model: enable this model test in each PR instead of only nightly",
+    "cpu_model: enable this model test in CPU tests",
     "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
     "skip_v1: do not run this test with v1",
 ]
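The cpu_model marker registered above is what the updated CPU CI scripts select with `pytest ... -m cpu_model`. A minimal sketch of how such a registered marker is applied, using a hypothetical test name that is not taken from the repository:

    import pytest

    # Registering "cpu_model" in pyproject.toml keeps pytest from warning about an
    # unknown marker; the decorator is all a test needs to opt into the CPU job.
    @pytest.mark.cpu_model
    def test_generates_text_on_cpu():
        # Collected by `pytest -m cpu_model`; excluded by `pytest -m "not cpu_model"`.
        assert True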
@@ -12,9 +12,7 @@ decord # required for video tests
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
-opencv-python # required for video tests
 peft
-requests
 ray[adag]==2.35
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

-# Benchmarking
-aiohttp
-
 # quantization
 bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.9
@@ -5,11 +5,11 @@ import pytest
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding

-from tests.utils import RemoteOpenAIServer
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import HfRunner, VllmRunner
+from ....utils import RemoteOpenAIServer
 from ...utils import check_logprobs_close

 MODEL_NAME = "fixie-ai/ultravox-v0_3"
@@ -39,7 +39,10 @@ def audio(request):
     return AudioAsset(request.param)


-@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS))
+@pytest.fixture(params=[
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def server(request, audio_assets):
     args = [
         "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager",
@@ -185,7 +188,10 @@ def run_multi_audio_test(
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                 num_logprobs: int, vllm_kwargs: dict) -> None:

@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS])
+@pytest.mark.parametrize("vllm_kwargs", [
+    pytest.param({}, marks=pytest.mark.cpu_model),
+    pytest.param(CHUNKED_PREFILL_KWARGS),
+])
 def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
                                      max_tokens: int, num_logprobs: int,
                                      vllm_kwargs: dict) -> None:
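The ultravox hunks above replace bare parameter values with pytest.param wrappers so a marker can attach to a single parametrization: only the default-kwargs case carries cpu_model and therefore runs in the CPU job, while the chunked-prefill case stays in the default GPU run. A self-contained sketch of that mechanism, with a placeholder value standing in for the real CHUNKED_PREFILL_KWARGS:

    import pytest

    CHUNKED_PREFILL_KWARGS = {"enable_chunked_prefill": True}  # placeholder for illustration

    @pytest.mark.parametrize("vllm_kwargs", [
        pytest.param({}, marks=pytest.mark.cpu_model),  # also collected under -m cpu_model
        pytest.param(CHUNKED_PREFILL_KWARGS),           # collected only in the default run
    ])
    def test_kwargs_are_dicts(vllm_kwargs: dict) -> None:
        assert isinstance(vllm_kwargs, dict)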
@@ -14,7 +14,6 @@ models = [
     "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
     "h2oai/h2ovl-mississippi-2b",
 ]
-target_dtype = "bfloat16"


 def run_preprocessing_test(
@@ -94,7 +94,7 @@ VLM_TEST_SETTINGS = {
             ),
             limit_mm_per_prompt={"image": 4},
         )],
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "paligemma": VLMTestInfo(
         models=["google/paligemma-3b-mix-224"],
@@ -111,7 +111,8 @@ VLM_TEST_SETTINGS = {
             "pixel_values"
         ),
         vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="half" if current_platform.is_rocm() else ("half", "float"),
+        dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm()
+               else ("half", "float")),
         marks=[pytest.mark.core_model],
     ),
     "qwen2_vl": VLMTestInfo(
@@ -128,7 +129,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        marks=[pytest.mark.core_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     #### Extended model tests
@@ -172,7 +173,6 @@ VLM_TEST_SETTINGS = {
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
     ),
     "glm4": VLMTestInfo(
@@ -245,7 +245,6 @@ VLM_TEST_SETTINGS = {
         models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
         test_type=VLMTestType.CUSTOM_INPUTS,
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-        dtype="half",
         num_video_frames=16,
         max_model_len=16384,
         postprocess_inputs=model_utils.get_key_type_post_processor(
@@ -404,7 +403,6 @@ VLM_TEST_SETTINGS = {
         prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
-        dtype="bfloat16" if current_platform.is_cpu() else "half",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
         custom_test_opts=[
@@ -419,7 +417,6 @@ VLM_TEST_SETTINGS = {
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        dtype="half",
         postprocess_inputs=model_utils.get_key_type_post_processor(
             "pixel_values"
         ),
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


 target_dtype = "half"
-if current_platform.is_cpu():
-    target_dtype = "bfloat16"

 # ROCm Triton FA can run into shared memory issues with these models,
 # use other backends in the meantime
|
@ -5,7 +5,6 @@ import torch
|
|||||||
|
|
||||||
from vllm.config import ModelConfig, TaskOption
|
from vllm.config import ModelConfig, TaskOption
|
||||||
from vllm.inputs import InputContext
|
from vllm.inputs import InputContext
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||||
|
|
||||||
TokensText = Tuple[List[int], str]
|
TokensText = Tuple[List[int], str]
|
||||||
@ -270,7 +269,7 @@ def build_model_context(model_name: str,
|
|||||||
if tokenizer_name is None:
|
if tokenizer_name is None:
|
||||||
tokenizer_name = model_name
|
tokenizer_name = model_name
|
||||||
if dtype is None:
|
if dtype is None:
|
||||||
dtype = "bfloat16" if current_platform.is_cpu() else "half"
|
dtype = "half"
|
||||||
|
|
||||||
model_config = ModelConfig(
|
model_config = ModelConfig(
|
||||||
model_name,
|
model_name,
|
||||||
|
@ -27,4 +27,4 @@ class ImageAsset:
|
|||||||
"""
|
"""
|
||||||
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
|
||||||
s3_prefix=VLM_IMAGES_DIR)
|
s3_prefix=VLM_IMAGES_DIR)
|
||||||
return torch.load(image_path)
|
return torch.load(image_path, map_location="cpu")
|
||||||
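Passing map_location="cpu" makes torch.load place the deserialized tensors on the CPU even when the asset was serialized from a CUDA tensor, which is what a CPU-only CI host needs. A small illustration with a hypothetical file name:

    import torch

    torch.save(torch.rand(2, 3), "example.pt")

    # Without map_location, a tensor saved from a GPU would try to deserialize back
    # onto that CUDA device and fail on a machine without one.
    restored = torch.load("example.pt", map_location="cpu")
    print(restored.device)  # cpu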
|
@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
|
|||||||
if sr != feature_extractor.sampling_rate:
|
if sr != feature_extractor.sampling_rate:
|
||||||
try:
|
try:
|
||||||
import librosa
|
import librosa
|
||||||
except ImportError:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install vllm[audio] for audio support.") from None
|
"Please install vllm[audio] for audio support.") from exc
|
||||||
audio = librosa.resample(audio,
|
audio = librosa.resample(audio,
|
||||||
orig_sr=sr,
|
orig_sr=sr,
|
||||||
target_sr=feature_extractor.sampling_rate)
|
target_sr=feature_extractor.sampling_rate)
|
||||||
|
@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
|
|||||||
try:
|
try:
|
||||||
import librosa
|
import librosa
|
||||||
import soundfile
|
import soundfile
|
||||||
except ImportError:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install vllm[audio] for audio support.") from None
|
"Please install vllm[audio] for audio support.") from exc
|
||||||
return librosa, soundfile
|
return librosa, soundfile
|
||||||
|
|
||||||
|
|
||||||
@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
|
|||||||
try:
|
try:
|
||||||
import cv2
|
import cv2
|
||||||
import decord
|
import decord
|
||||||
except ImportError:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Please install vllm[video] for video support.") from None
|
"Please install vllm[video] for video support.") from exc
|
||||||
return cv2, decord
|
return cv2, decord
|
||||||
|
|
||||||
|
|
||||||
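The three hunks above switch the optional-import handling from `raise ... from None` to `raise ... from exc`, so the original ImportError is chained as the new error's cause instead of being suppressed, and the traceback shows which optional dependency actually failed to import. A minimal sketch of the pattern:

    def require_audio_support():
        try:
            import librosa  # optional extra
        except ImportError as exc:
            # "from exc" keeps the original ImportError as __cause__ in the traceback;
            # "from None" would hide it.
            raise ImportError(
                "Please install vllm[audio] for audio support.") from exc
        return librosa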
|
@ -151,7 +151,11 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
|
|||||||
self.local_omp_cpuid = omp_cpuids.split("|")[rank]
|
self.local_omp_cpuid = omp_cpuids.split("|")[rank]
|
||||||
|
|
||||||
ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
|
ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
|
||||||
if self.model_config.is_encoder_decoder:
|
if self.model_config.task == "embedding":
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Embedding models are not supported for CPU backend")
|
||||||
|
# ModelRunnerClass = CPUEmbeddingModelRunner
|
||||||
|
elif self.model_config.is_encoder_decoder:
|
||||||
ModelRunnerClass = CPUEncoderDecoderModelRunner
|
ModelRunnerClass = CPUEncoderDecoderModelRunner
|
||||||
self.model_runner: CPUModelRunner = ModelRunnerClass(
|
self.model_runner: CPUModelRunner = ModelRunnerClass(
|
||||||
vllm_config=vllm_config,
|
vllm_config=vllm_config,
|
||||||
|
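The worker hunk above rejects embedding models up front and only then falls back to the encoder-decoder or default runner. A stripped-down sketch of that selection order, using a hypothetical config object rather than the actual vLLM classes:

    class FakeModelConfig:
        """Hypothetical stand-in exposing only the fields the selection reads."""

        def __init__(self, task: str, is_encoder_decoder: bool) -> None:
            self.task = task
            self.is_encoder_decoder = is_encoder_decoder


    def select_runner_name(config: FakeModelConfig) -> str:
        if config.task == "embedding":
            # Fail fast, mirroring the new guard in the CPU worker.
            raise NotImplementedError(
                "Embedding models are not supported for CPU backend")
        elif config.is_encoder_decoder:
            return "CPUEncoderDecoderModelRunner"
        return "CPUModelRunner"


    print(select_runner_name(FakeModelConfig("generate", is_encoder_decoder=True)))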