[CI] Add Buildkite (#2355)

Simon Mo 2024-01-14 12:37:58 -08:00 committed by GitHub
parent 9f659bf07f
commit 6e01e8c1c8
13 changed files with 192 additions and 37 deletions

View File

@@ -0,0 +1,24 @@
# This script is run by Buildkite to run the benchmarks and upload the results to Buildkite as an annotation
set -ex
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
# run benchmarks and capture their output
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
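
For readers who prefer Python to sed, the markdown assembly above amounts to taking the first and last line of each benchmark log. A rough, illustrative equivalent (not part of the commit; it assumes the two .txt logs already exist and are non-empty):

    # Illustrative sketch only: Python equivalent of the sed-based assembly above.
    from pathlib import Path

    sections = {
        "### Latency Benchmarks": "benchmark_latency.txt",
        "### Throughput Benchmarks": "benchmark_throughput.txt",
    }
    with open("benchmark_results.md", "a") as out:
        for heading, log in sections.items():
            lines = Path(log).read_text().splitlines()
            # mirror `sed -n '1p'` and `sed -n '$p'`: first line, blank line, last line
            out.write(f"{heading}\n{lines[0]}\n\n{lines[-1]}\n")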

View File

@@ -0,0 +1,41 @@
# In this file, you can add more tests to run either by adding a new step or
# by adding a new command to an existing step. See the existing steps below for
# examples of the available options. This file is fed into the Jinja template in
# `test-template.j2` to generate the final pipeline YAML file.
steps:
- label: Regression Test
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
  command: pytest -v -s engine

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true

- label: Samplers Test
  command: pytest -v -s samplers --forked

- label: Worker Test
  command: pytest -v -s worker

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh

View File

@@ -0,0 +1,46 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    commands:
      - "docker build --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
  - wait

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    plugins:
      - kubernetes:
          podSpec:
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - "-c"
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                env:
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
  {% endfor %}
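
To make the relationship between the two Buildkite files concrete, here is a minimal sketch (not part of the commit) of how the steps defined in `test-pipeline.yaml` could be rendered through `test-template.j2` with Jinja2. The file paths match the files above; everything else (how Buildkite actually invokes the render and uploads the result) is an assumption.

    # Illustrative sketch only: render the final pipeline YAML from the template above.
    # Assumes pyyaml and jinja2 are installed and both files live in .buildkite/.
    import yaml
    from jinja2 import Template

    with open(".buildkite/test-pipeline.yaml") as f:
        steps = yaml.safe_load(f)["steps"]  # the list of test steps defined above

    with open(".buildkite/test-template.j2") as f:
        template = Template(f.read())

    # Each step's label/command(s)/num_gpus/working_dir fills the podSpec in the template;
    # the defaults (1 GPU, /vllm-workspace/tests) apply when a step omits them.
    print(template.render(steps=steps))

Keeping the step definitions in plain YAML and the Kubernetes boilerplate in the template keeps each new test entry to a few lines.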

View File

@@ -1,7 +1,11 @@
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
-    && apt-get install -y python3-pip
+    && apt-get install -y python3-pip git

WORKDIR /workspace
@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
+#################### BASE BUILD IMAGE ####################

+#################### EXTENSION BUILD IMAGE ####################
# image to build pytorch extensions
FROM dev AS build
@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

+# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

RUN python3 setup.py build_ext --inplace
+#################### EXTENSION BUILD IMAGE ####################

+#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY tests tests
-COPY vllm vllm
+WORKDIR /vllm-workspace
+# ADD is used to preserve directory structure
+ADD . /vllm-workspace/
+COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+# ignore build dependencies installation because we are using pre-compiled extensions
+RUN rm pyproject.toml
+RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
+#################### TEST IMAGE ####################

+ENTRYPOINT ["python3", "-m", "pytest", "tests"]

+#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
+#################### RUNTIME BASE IMAGE ####################

-FROM vllm-base AS vllm
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
-EXPOSE 8000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

+#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server

@@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
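
For orientation, the CI template above builds the `test` stage of this Dockerfile. A hedged sketch of the equivalent local commands (the tag name and run flags here are assumptions, not taken from the commit):

    # Illustrative sketch only: build the `test` stage and run its pytest ENTRYPOINT locally.
    import subprocess

    tag = "vllm-test:local"  # hypothetical tag; CI pushes to the registry image set in test-template.j2
    subprocess.run(["docker", "build", "--tag", tag, "--target", "test", "."], check=True)
    # The test stage's ENTRYPOINT is `python3 -m pytest tests`, so running the image runs the suite.
    subprocess.run(["docker", "run", "--gpus", "all", "--shm-size", "8g", tag], check=True)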

View File

@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio
+httpx
+einops  # required for MPT
+flash_attn  # required for HuggingFace's llama implementation

View File

@@ -293,6 +293,11 @@ def get_requirements() -> List[str]:
    return requirements


+package_data = {"vllm": ["py.typed"]}
+if os.environ.get("VLLM_USE_PRECOMPILED"):
+    ext_modules = []
+    package_data["vllm"].append("*.so")
+
setuptools.setup(
    name="vllm",
    version=get_vllm_version(),

@@ -321,5 +326,5 @@ setuptools.setup(
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
-    package_data={"vllm": ["py.typed"]},
+    package_data=package_data,
)

View File

@@ -29,8 +29,13 @@ def api_server():
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    uvicorn_process = subprocess.Popen([
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m"
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
    ])
    yield
    uvicorn_process.terminate()
@@ -81,6 +86,9 @@ def test_api_server(api_server):
    pool.join()

    # check cancellation stats
+    # give it some time to update the stats
+    time.sleep(1)
+
    num_aborted_requests = requests.get(
        "http://localhost:8000/stats").json()["num_aborted_requests"]
    assert num_aborted_requests > 0
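
The fixed one-second sleep makes the stats check timing-dependent; a possible alternative (not in the commit) would be to poll the same /stats endpoint with a deadline:

    # Illustrative sketch only: poll the stats endpoint instead of sleeping a fixed second.
    import time
    import requests

    def wait_for_aborted_requests(url: str = "http://localhost:8000/stats",
                                  timeout: float = 10.0) -> bool:
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if requests.get(url).json()["num_aborted_requests"] > 0:
                return True
            time.sleep(0.2)
        return False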

View File

@@ -1,19 +1,24 @@
from argparse import Namespace
from dataclasses import dataclass
+import os
+import pathlib

import pytest
from fastapi.testclient import TestClient

from vllm.entrypoints.openai.api_server import *

+chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+    __file__))).parent.parent / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
    ("facebook/opt-125m", None, True,
     "Hello</s>Hi there!</s>What is the capital of</s>"),
    ("facebook/opt-125m", None, False,
     "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>

@@ -21,8 +26,7 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>

@@ -54,8 +58,7 @@ class MockTokenizer:
def test_load_chat_template():
    # Testing chatml template
-    template = "../../examples/template_chatml.jinja"
-    mock_args = Namespace(chat_template=template)
+    mock_args = Namespace(chat_template=chatml_jinja_path)
    tokenizer = MockTokenizer()

    # Call the function with the mocked args

View File

@@ -2,10 +2,9 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
-from multiprocessing import Process, set_start_method

import pytest
import torch
+import ray

from vllm.config import ParallelConfig
from vllm.utils import get_open_port
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
                                     tensor_parallel_size,
                                     worker_use_ray=True)
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    torch.cuda.set_device(rank)
    _init_distributed_environment(parallel_config, rank,
                                  distributed_init_method)


+@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
    init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
    assert torch.allclose(t, expected)


+@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
    init_test_distributed_environment(1, tensor_parallel_size, rank,

@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
                         [all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    set_start_method("spawn", force=True)
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    ray.init()

    distributed_init_port = get_open_port()
-    processes = []
+    refs = []
    for rank in range(tensor_parallel_size):
-        p = Process(target=test_target,
-                    args=(tensor_parallel_size, rank, distributed_init_port))
-        p.start()
-        processes.append(p)
-    for p in processes:
-        p.join()
-    assert all(p.exitcode == 0 for p in processes)
+        refs.append(
+            test_target.remote(tensor_parallel_size, rank,
+                               distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()

View File

@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-NUM_BLOCKS = 40000  # Arbitrary values for testing
+NUM_BLOCKS = 12000  # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float]

View File

@@ -6,12 +6,12 @@ import torch
from vllm._C import cache_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
-NUM_TOKENS = [83]  # Arbitrary values for testing
+NUM_TOKENS = [42]  # Arbitrary values for testing
NUM_LAYERS = [1]  # Arbitrary values for testing
NUM_HEADS = [8]  # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 36000]  # Arbitrary values for testing
+NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
NUM_MAPPINGS = [256]  # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]

View File

@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
        temperature=0.0)
    vllm_results = vllm_model.model.generate(
        example_prompts, sampling_params=vllm_sampling_params)
+    del vllm_model

    # Test whether logprobs are included in the results.
    for result in vllm_results:

View File

@@ -75,6 +75,8 @@ def test_sampler_all_greedy(seed: int):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == expected[i].item()

+    del model_runner


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_random(seed: int):

@@ -111,6 +113,8 @@ def test_sampler_all_random(seed: int):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == i

+    del model_runner


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_beam(seed: int):

@@ -144,6 +148,7 @@ def test_sampler_all_beam(seed: int):
    # the outputs are expected - in other words, this just tests
    # whether there are no exceptions in the sampler
    # when handling an all-beam search case.
+    del model_runner


@pytest.mark.parametrize("seed", RANDOM_SEEDS)

@@ -198,6 +203,8 @@ def test_sampler_mixed(seed: int):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token in expected_tokens

+    del model_runner


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_logits_processors(seed: int):

@@ -235,6 +242,8 @@ def test_sampler_logits_processors(seed: int):
        for idx, nth_output in enumerate(sequence_output.samples):
            assert nth_output.output_token == idx

+    del model_runner


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_top_k_top_p(seed: int):

@@ -296,3 +305,5 @@ def test_sampler_top_k_top_p(seed: int):
    hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
    assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
    assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+
+    del model_runner