diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2eeba904..4feea786 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -24,28 +24,26 @@ steps:
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
-  command: pytest -v -s test_comm_ops.py
-  working_dir: "/vllm-workspace/tests/distributed"
+  command: pytest -v -s distributed/test_comm_ops.py
+  working_dir: "/vllm-workspace/tests"
   num_gpus: 2
 
 - label: Distributed Tests
-  working_dir: "/vllm-workspace/tests/distributed"
-
-  num_gpus: 2 # only support 1 or 2 for now.
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
   mirror_hardwares: [amd]
-
   commands:
-  - pytest -v -s test_pynccl_library.py
-  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
+  - pytest -v -s distributed/test_pynccl_library.py
+  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s distributed/test_chunked_prefill_distributed.py
 
 - label: Distributed Tests (Multiple Groups)
-  working_dir: "/vllm-workspace/tests/distributed"
+  working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
-  - pytest -v -s test_pynccl.py
+  - pytest -v -s distributed/test_pynccl.py
 
 - label: Engine Test
   #mirror_hardwares: [amd]
diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index 4b97af88..ace4c539 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -1,61 +1,16 @@
-# imports for guided decoding tests
-import os
-import subprocess
-import sys
-import time
-
 import openai  # use the official client for correctness check
 import pytest
 # using Ray for overall ease of process management, parallel requests,
 # and debugging.
 import ray
-import requests
 
-MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
+from ..utils import ServerRunner
+
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
-@ray.remote(num_gpus=1)
-class ServerRunner:
-
-    def __init__(self, args):
-        env = os.environ.copy()
-        env["PYTHONUNBUFFERED"] = "1"
-        self.proc = subprocess.Popen(
-            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
-        self._wait_for_server()
-
-    def ready(self):
-        return True
-
-    def _wait_for_server(self):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(
-                        "http://localhost:8000/health").status_code == 200:
-                    break
-            except Exception as err:
-                if self.proc.poll() is not None:
-                    raise RuntimeError("Server exited unexpectedly.") from err
-
-                time.sleep(0.5)
-                if time.time() - start > MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from err
-
-    def __del__(self):
-        if hasattr(self, "proc"):
-            self.proc.terminate()
-
-
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def server():
     ray.init()
     server_runner = ServerRunner.remote([
diff --git a/tests/basic_correctness/__init__.py b/tests/basic_correctness/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/core/block/e2e/__init__.py b/tests/core/block/e2e/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py
index 1d99cb5d..b0d62c89 100644
--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -1,9 +1,10 @@
 import pytest
 
-from tests.conftest import cleanup
 from vllm import LLM
 from vllm.model_executor.utils import set_random_seed
 
+from ....conftest import cleanup
+
 
 @pytest.fixture
 def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
diff --git a/tests/distributed/__init__.py b/tests/distributed/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 52745263..d63f0155 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -4,10 +4,12 @@ by one. The solution is to pass arguments (model name) by environment
 variables.
 Run:
 ```sh
+cd $VLLM_PATH/tests
+
 TEST_DIST_MODEL=facebook/opt-125m pytest \
-    test_basic_distributed_correctness.py
+    distributed/test_basic_distributed_correctness.py
 TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
-    test_basic_distributed_correctness.py
+    distributed/test_basic_distributed_correctness.py
 ```
 """
 import os
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index a4423bbf..53654dc4 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -11,8 +11,9 @@ import torch
 from vllm.distributed import (broadcast_tensor_dict,
                               tensor_model_parallel_all_gather,
                               tensor_model_parallel_all_reduce)
-from vllm.test_utils import (init_test_distributed_environment,
-                             multi_process_tensor_parallel)
+
+from ..utils import (init_test_distributed_environment,
+                     multi_process_tensor_parallel)
 
 
 @ray.remote(num_gpus=1, max_calls=1)
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index bdca031e..630b25a3 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -10,8 +10,9 @@ from vllm.distributed.communication_op import (  # noqa
     graph_capture, tensor_model_parallel_all_reduce)
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
                                              get_tp_ca_communicator)
-from vllm.test_utils import (init_test_distributed_environment,
-                             multi_process_tensor_parallel)
+
+from ..utils import (init_test_distributed_environment,
+                     multi_process_tensor_parallel)
 
 random.seed(42)
 test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
diff --git a/tests/engine/__init__.py b/tests/engine/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/engine/output_processor/__init__.py b/tests/engine/output_processor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py
index 2bf4bf69..4f32a622 100644
--- a/tests/engine/output_processor/test_multi_step.py
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -4,7 +4,6 @@ from unittest.mock import MagicMock
 import pytest
 from transformers import PreTrainedTokenizer
 
-from tests.core.utils import create_seq_group
 from vllm.core.scheduler import Scheduler
 from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
 from vllm.engine.output_processor.stop_checker import StopChecker
@@ -14,6 +13,8 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.utils import Counter
 
+from ...core.utils import create_seq_group
+
 
 @pytest.mark.parametrize("seq_output_len", [128])
 @pytest.mark.parametrize("num_new_tokens", [1, 12])
diff --git a/tests/entrypoints/__init__.py b/tests/entrypoints/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index c22ac450..ee2f034f 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -1,10 +1,6 @@
 # imports for guided decoding tests
 import json
-import os
 import re
-import subprocess
-import sys
-import time
 
 import jsonschema
 import openai  # use the official client for correctness check
@@ -12,7 +8,6 @@ import pytest
 # using Ray for overall ease of process management, parallel requests,
 # and debugging.
 import ray
-import requests
 import torch
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
@@ -20,7 +15,8 @@ from openai import BadRequestError
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
+from ..utils import ServerRunner
+
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
@@ -78,45 +74,6 @@ TEST_CHOICE = [
 pytestmark = pytest.mark.asyncio
 
 
-@ray.remote(num_gpus=1)
-class ServerRunner:
-
-    def __init__(self, args):
-        env = os.environ.copy()
-        env["PYTHONUNBUFFERED"] = "1"
-        self.proc = subprocess.Popen(
-            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
-        self._wait_for_server()
-
-    def ready(self):
-        return True
-
-    def _wait_for_server(self):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(
-                        "http://localhost:8000/health").status_code == 200:
-                    break
-            except Exception as err:
-                if self.proc.poll() is not None:
-                    raise RuntimeError("Server exited unexpectedly.") from err
-
-                time.sleep(0.5)
-                if time.time() - start > MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from err
-
-    def __del__(self):
-        if hasattr(self, "proc"):
-            self.proc.terminate()
-
-
 @pytest.fixture(scope="session")
 def zephyr_lora_files():
     return snapshot_download(repo_id=LORA_NAME)
diff --git a/tests/kernels/__init__.py b/tests/kernels/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index 86ecc641..a624c4ca 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -2,11 +2,12 @@ from typing import Type
 
 import pytest
 import torch
-from allclose_default import get_default_atol, get_default_rtol
 
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
                                                    NewGELU, SiluAndMul)
 
+from .allclose_default import get_default_atol, get_default_rtol
+
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
 D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index 28496f18..fdf31326 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -3,13 +3,14 @@ from typing import List, Optional, Tuple
 
 import pytest
 import torch
-from allclose_default import get_default_atol, get_default_rtol
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 
 from vllm import _custom_ops as ops
 from vllm.utils import get_max_shared_memory_bytes, is_hip
 
+from .allclose_default import get_default_atol, get_default_rtol
+
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index bf185697..18c8e351 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -3,10 +3,11 @@ from typing import List, Optional
 
 import pytest
 import torch
-from allclose_default import get_default_atol, get_default_rtol
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from .allclose_default import get_default_atol, get_default_rtol
+
 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/model_executor/__init__.py b/tests/model_executor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index b1c2b88b..db55d448 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -13,9 +13,10 @@ import os
 
 import pytest
 import torch
 
-from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
+from .utils import check_logprobs_close
+
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index fa846d43..37c1664a 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -15,9 +15,10 @@ from dataclasses import dataclass
 import pytest
 import torch
 
-from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
+from .utils import check_logprobs_close
+
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
 marlin_not_supported = (capability <
diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py
index 33d28da8..d0a5bfbf 100644
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
""" import pytest -from tests.models.utils import check_logprobs_close +from .utils import check_logprobs_close MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", diff --git a/tests/prefix_caching/__init__.py b/tests/prefix_caching/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/quantization/__init__.py b/tests/quantization/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/samplers/__init__.py b/tests/samplers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 57d6d2a4..40d054cd 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,9 +1,10 @@ import pytest import torch -from tests.conftest import VllmRunner from vllm import SamplingParams +from ..conftest import VllmRunner + MODELS = ["facebook/opt-125m"] diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index eda7293e..da8b9271 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -9,7 +9,6 @@ import torch from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit) -from tests.conftest import cleanup from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -21,6 +20,8 @@ from vllm.sequence import Logprob, MultiModalData from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, random_uuid +from ...conftest import cleanup + class AsyncLLM: """AsyncLLM diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index df1db4e6..ad4748c5 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -9,12 +9,13 @@ import pytest import ray import torch -from tests.entrypoints.test_openai_server import ServerRunner from vllm import SamplingParams from vllm.model_executor.model_loader.tensorizer import ( EncryptionParams, TensorizerConfig, TensorSerializer, is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream) +from ..utils import ServerRunner + prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/test_sequence.py b/tests/test_sequence.py index b8ea1f6b..31364025 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,9 +1,10 @@ import pytest -from tests.core.utils import create_dummy_prompt from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput, SequenceData, SequenceOutput) +from .core.utils import create_dummy_prompt + @pytest.fixture def sample_outputs(): diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..689d8c8c --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,89 @@ +import os +import subprocess +import sys +import time + +import ray +import requests + +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.utils import get_open_port + +# Path to root of repository so that utilities can be imported by ray workers +VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)) + + +@ray.remote(num_gpus=1) +class ServerRunner: + MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds + + def __init__(self, args): + env = os.environ.copy() + env["PYTHONUNBUFFERED"] = "1" + self.proc = subprocess.Popen( + ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args, + env=env, + stdout=sys.stdout, + 
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        return True
+
+    def _wait_for_server(self):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    break
+            except Exception as err:
+                if self.proc.poll() is not None:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > self.MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+
+    def __del__(self):
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+
+
+def init_test_distributed_environment(
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+    local_rank: int = -1,
+) -> None:
+    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+    init_distributed_environment(
+        world_size=pp_size * tp_size,
+        rank=rank,
+        distributed_init_method=distributed_init_method,
+        local_rank=local_rank)
+    ensure_model_parallel_initialized(tp_size, pp_size)
+
+
+def multi_process_tensor_parallel(
+    tp_size: int,
+    pp_size: int,
+    test_target,
+) -> None:
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+
+    distributed_init_port = get_open_port()
+    refs = []
+    for rank in range(tp_size * pp_size):
+        refs.append(
+            test_target.remote(tp_size, pp_size, rank, distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()
diff --git a/vllm/test_utils.py b/vllm/test_utils.py
deleted file mode 100644
index addd8ec1..00000000
--- a/vllm/test_utils.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import ray
-
-from vllm.distributed import (ensure_model_parallel_initialized,
-                              init_distributed_environment)
-from vllm.utils import get_open_port
-
-
-def init_test_distributed_environment(
-    tp_size: int,
-    pp_size: int,
-    rank: int,
-    distributed_init_port: str,
-    local_rank: int = -1,
-) -> None:
-    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    init_distributed_environment(
-        world_size=pp_size * tp_size,
-        rank=rank,
-        distributed_init_method=distributed_init_method,
-        local_rank=local_rank)
-    ensure_model_parallel_initialized(tp_size, pp_size)
-
-
-def multi_process_tensor_parallel(
-    tp_size: int,
-    pp_size: int,
-    test_target,
-) -> None:
-    # Using ray helps debugging the error when it failed
-    # as compared to multiprocessing.
-    ray.init()
-
-    distributed_init_port = get_open_port()
-    refs = []
-    for rank in range(tp_size * pp_size):
-        refs.append(
-            test_target.remote(tp_size, pp_size, rank, distributed_init_port))
-    ray.get(refs)
-
-    ray.shutdown()
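For reference, a minimal usage sketch of the shared helpers that now live in tests/utils.py. The file name and worker function below are hypothetical; the pattern follows distributed/test_comm_ops.py from this patch, and the relative import works because the test directories now ship __init__.py files.

```python
# Hypothetical file: tests/distributed/test_all_reduce_example.py
import ray
import torch

from vllm.distributed import tensor_model_parallel_all_reduce

from ..utils import (init_test_distributed_environment,
                     multi_process_tensor_parallel)


@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_worker(tp_size, pp_size, rank, distributed_init_port):
    # Each ray worker joins the same distributed group before running checks.
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
    t = torch.ones(16, dtype=torch.float32, device="cuda") * (rank + 1)
    out = tensor_model_parallel_all_reduce(t)
    # With tp_size ranks contributing (rank + 1), the reduced value is their sum.
    expected = torch.full_like(t, sum(r + 1 for r in range(tp_size)))
    assert torch.allclose(out, expected)


def test_all_reduce_two_gpus():
    # Spawns tp_size * pp_size ray workers, one GPU each, and waits for them.
    multi_process_tensor_parallel(tp_size=2, pp_size=1,
                                  test_target=all_reduce_worker)
```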