[CI] Add Buildkite (#2355)
parent 9f659bf07f
commit 6e01e8c1c8
.buildkite/run-benchmarks.sh (new file, 24 lines)
@@ -0,0 +1,24 @@
# This script is run by buildkite to run the benchmarks and upload the results to buildkite

set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
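For reference, the sed calls above keep only the first and last line of each benchmark log before posting them as a Buildkite annotation. A rough Python equivalent of that summarization step (illustrative only; the file names match the script above, everything else is an assumption):

    # Sketch: rebuild benchmark_results.md the same way the echo/sed lines do.
    from pathlib import Path


    def first_and_last_line(log: str) -> str:
        # mirrors `sed -n '1p'` (first line) and `sed -n '$p'` (last line)
        lines = Path(log).read_text().splitlines()
        return f"{lines[0]}\n\n{lines[-1]}\n"


    md = "### Latency Benchmarks\n" + first_and_last_line("benchmark_latency.txt")
    md += "### Throughput Benchmarks\n" + first_and_last_line("benchmark_throughput.txt")
    Path("benchmark_results.md").write_text(md)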
.buildkite/test-pipeline.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This file will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
  command: pytest -v -s engine

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true

- label: Samplers Test
  command: pytest -v -s samplers --forked

- label: Worker Test
  command: pytest -v -s worker

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
.buildkite/test-template.j2 (new file, 46 lines)
@@ -0,0 +1,46 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: ":docker: build image"
  commands:
  - "docker build --tag {{ docker_image }} --target test --progress plain ."
  - "docker push {{ docker_image }}"
  env:
    DOCKER_BUILDKIT: "1"
- wait

{% for step in steps %}
- label: "{{ step.label }}"
  agents:
    queue: kubernetes
  soft_fail: {{ step.soft_fail or false }}
  plugins:
  - kubernetes:
      podSpec:
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - image: "{{ docker_image }}"
          command: ["bash"]
          args:
          - "-c"
          - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
          resources:
            requests:
              nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
            limits:
              nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
          env:
          - name: HF_TOKEN
            valueFrom:
              secretKeyRef:
                name: hf-token-secret
                key: token
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
{% endfor %}
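The commit itself does not include the glue that feeds `test-pipeline.yaml` through this template. A minimal sketch of that expansion, assuming a plain `jinja2` + `yaml` invocation (the real CI wiring may differ), would be:

    # Sketch: render test-template.j2 with the steps from test-pipeline.yaml
    # to produce the final Buildkite pipeline YAML.
    import jinja2
    import yaml

    with open(".buildkite/test-pipeline.yaml") as f:
        steps = yaml.safe_load(f)["steps"]

    with open(".buildkite/test-template.j2") as f:
        template = jinja2.Template(f.read())

    # Each step's label/command(s)/num_gpus/soft_fail fills one Kubernetes pod spec.
    print(template.render(steps=steps))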
Dockerfile (36 changed lines)
@@ -1,7 +1,11 @@
+# The vLLM Dockerfile is used to construct the vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+#################### BASE BUILD IMAGE ####################
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
 
 RUN apt-get update -y \
-    && apt-get install -y python3-pip
+    && apt-get install -y python3-pip git
 
 WORKDIR /workspace
 
@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
-
-# image to build pytorch extensions
+#################### BASE BUILD IMAGE ####################
+
+
+#################### EXTENSION BUILD IMAGE ####################
 FROM dev AS build
 
 # install build dependencies
@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm/__init__.py vllm/__init__.py
 
+# cuda arch list used by torch
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
@@ -40,18 +47,26 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
 
 RUN python3 setup.py build_ext --inplace
-
+#################### EXTENSION Build IMAGE ####################
+
+
+#################### TEST IMAGE ####################
 # image to run unit testing suite
 FROM dev AS test
 
 # copy pytorch extensions separately to avoid having to rebuild
 # when python code changes
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY tests tests
-COPY vllm vllm
-
-ENTRYPOINT ["python3", "-m", "pytest", "tests"]
-
+WORKDIR /vllm-workspace
+# ADD is used to preserve directory structure
+ADD . /vllm-workspace/
+COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+# ignore build dependencies installation because we are using pre-compiled extensions
+RUN rm pyproject.toml
+RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
+#################### TEST IMAGE ####################
+
+
+#################### RUNTIME BASE IMAGE ####################
 # use CUDA base as CUDA runtime dependencies are already installed via pip
 FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
 
@@ -63,14 +78,10 @@ WORKDIR /workspace
 COPY requirements.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
+#################### RUNTIME BASE IMAGE ####################
 
-FROM vllm-base AS vllm
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
 
-EXPOSE 8000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
-
+#################### OPENAI API SERVER ####################
 # openai api server alternative
 FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
@@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
 COPY vllm vllm
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
@@ -13,4 +13,6 @@ types-setuptools
 pytest
 pytest-forked
 pytest-asyncio
+httpx
+einops # required for MPT
+flash_attn # required for HuggingFace's llama implementation
setup.py (7 changed lines)
@@ -293,6 +293,11 @@ def get_requirements() -> List[str]:
     return requirements
 
 
+package_data = {"vllm": ["py.typed"]}
+if os.environ.get("VLLM_USE_PRECOMPILED"):
+    ext_modules = []
+    package_data["vllm"].append("*.so")
+
 setuptools.setup(
     name="vllm",
     version=get_vllm_version(),
@@ -321,5 +326,5 @@ setuptools.setup(
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     cmdclass={"build_ext": BuildExtension},
-    package_data={"vllm": ["py.typed"]},
+    package_data=package_data,
 )
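With this change, setting VLLM_USE_PRECOMPILED makes setup.py skip compiling the CUDA extensions and instead package the .so files already present in the source tree, which is how the new Docker test stage installs vLLM (VLLM_USE_PRECOMPILED=1 pip install . after copying the extensions from the build stage). A hypothetical sanity check for such an install (not part of the commit) could be:

    # Hypothetical check: the installed package should resolve vllm._C to a
    # prebuilt shared object rather than requiring a compile step.
    import importlib.util

    spec = importlib.util.find_spec("vllm._C")
    assert spec is not None and spec.origin.endswith(".so")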
@@ -29,8 +29,13 @@ def api_server():
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     uvicorn_process = subprocess.Popen([
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m"
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
     ])
     yield
     uvicorn_process.terminate()
@@ -81,6 +86,9 @@ def test_api_server(api_server):
     pool.join()
 
     # check cancellation stats
+    # give it some time to update the stats
+    time.sleep(1)
+
     num_aborted_requests = requests.get(
         "http://localhost:8000/stats").json()["num_aborted_requests"]
     assert num_aborted_requests > 0
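The explicit --host 127.0.0.1 keeps the fixture's server reachable at the http://localhost:8000 URLs the test queries, and the added time.sleep(1) gives the stats endpoint a moment to update. If a readiness wait were wanted as well, a hedged sketch (not in the commit; the helper name and endpoint choice are assumptions) might look like:

    # Hypothetical readiness helper: poll the just-launched server until it answers.
    import time

    import requests


    def wait_for_server(url: str = "http://localhost:8000/stats",
                        timeout: float = 30.0) -> None:
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                requests.get(url, timeout=1)
                return
            except requests.ConnectionError:
                time.sleep(0.5)
        raise RuntimeError(f"server at {url} did not come up within {timeout}s")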
@@ -1,19 +1,24 @@
 from argparse import Namespace
 from dataclasses import dataclass
+import os
+import pathlib
 
 import pytest
 from fastapi.testclient import TestClient
 
 from vllm.entrypoints.openai.api_server import *
 
+chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+    __file__))).parent.parent / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()
+
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
     ("facebook/opt-125m", None, True,
      "Hello</s>Hi there!</s>What is the capital of</s>"),
     ("facebook/opt-125m", None, False,
      "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -21,8 +26,7 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -54,8 +58,7 @@ class MockTokenizer:
 
 def test_load_chat_template():
     # Testing chatml template
-    template = "../../examples/template_chatml.jinja"
-    mock_args = Namespace(chat_template=template)
+    mock_args = Namespace(chat_template=chatml_jinja_path)
     tokenizer = MockTokenizer()
 
     # Call the function with the mocked args
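These hunks swap the cwd-relative "../../examples/template_chatml.jinja" string for an absolute path derived from the test module's own location, so the tests pass regardless of the directory pytest is invoked from (the CI image runs them from /vllm-workspace/tests). The same idiom in isolation, assuming a test file that sits two directories below the repository root:

    # Illustrative only: resolve a repo file relative to this module, not the CWD.
    import pathlib

    REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent
    chatml_jinja_path = REPO_ROOT / "examples/template_chatml.jinja"
    assert chatml_jinja_path.exists()  # fail fast if the repo layout changes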
@@ -2,10 +2,9 @@
 
 Run `pytest tests/distributed/test_comm_ops.py --forked`.
 """
-from multiprocessing import Process, set_start_method
-
 import pytest
 import torch
+import ray
 
 from vllm.config import ParallelConfig
 from vllm.utils import get_open_port
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
                                      tensor_parallel_size,
                                      worker_use_ray=True)
     distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    torch.cuda.set_device(rank)
     _init_distributed_environment(parallel_config, rank,
                                   distributed_init_method)
 
 
+@ray.remote(num_gpus=1, max_calls=1)
 def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                            distributed_init_port: str):
     init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
     assert torch.allclose(t, expected)
 
 
+@ray.remote(num_gpus=1, max_calls=1)
 def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                            distributed_init_port: str):
     init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
 @pytest.mark.parametrize("test_target",
                          [all_reduce_test_worker, all_gather_test_worker])
 def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    set_start_method("spawn", force=True)
+    # Using Ray makes it easier to debug errors when a worker fails
+    # than multiprocessing does.
+    ray.init()
+
     distributed_init_port = get_open_port()
-    processes = []
+    refs = []
     for rank in range(tensor_parallel_size):
-        p = Process(target=test_target,
-                    args=(tensor_parallel_size, rank, distributed_init_port))
-        p.start()
-        processes.append(p)
-    for p in processes:
-        p.join()
-    assert all(p.exitcode == 0 for p in processes)
+        refs.append(
+            test_target.remote(tensor_parallel_size, rank,
+                               distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()
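With the Ray version, each test worker is a @ray.remote task that reserves one GPU (num_gpus=1) and gets a fresh process per invocation (max_calls=1); worker failures surface as exceptions re-raised by ray.get rather than as non-zero exit codes to assert on. A minimal standalone sketch of that pattern (assuming a machine with two GPUs; the worker body is elided):

    # Sketch of the Ray task pattern adopted above.
    import ray


    @ray.remote(num_gpus=1, max_calls=1)  # one GPU per worker, fresh process each call
    def worker(rank: int) -> int:
        # ... run the per-rank collective-communication check here ...
        return rank


    ray.init()
    refs = [worker.remote(rank) for rank in range(2)]
    assert ray.get(refs) == [0, 1]  # ray.get re-raises any worker exception
    ray.shutdown()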
@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
 MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-NUM_BLOCKS = 40000  # Arbitrary values for testing
+NUM_BLOCKS = 12000  # Arbitrary values for testing
 PARTITION_SIZE = 512
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -6,12 +6,12 @@ import torch
 from vllm._C import cache_ops
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-NUM_TOKENS = [83]  # Arbitrary values for testing
+NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 36000]  # Arbitrary values for testing
+NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
 NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]
 DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
         temperature=0.0)
     vllm_results = vllm_model.model.generate(
         example_prompts, sampling_params=vllm_sampling_params)
+    del vllm_model
 
     # Test whether logprobs are included in the results.
     for result in vllm_results:
@@ -75,6 +75,8 @@ def test_sampler_all_greedy(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token == expected[i].item()
 
+    del model_runner
+
 
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_random(seed: int):
@@ -111,6 +113,8 @@ def test_sampler_all_random(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token == i
 
+    del model_runner
+
 
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_beam(seed: int):
@@ -144,6 +148,7 @@ def test_sampler_all_beam(seed: int):
     # the outputs are expected - in other words, this just tests
     # whether there are no exceptions in the sampler
     # when handling an all-beam search case.
+    del model_runner
 
 
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@@ -198,6 +203,8 @@ def test_sampler_mixed(seed: int):
         for nth_output in sequence_output.samples:
            assert nth_output.output_token in expected_tokens
 
+    del model_runner
+
 
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_logits_processors(seed: int):
@@ -235,6 +242,8 @@ def test_sampler_logits_processors(seed: int):
         for idx, nth_output in enumerate(sequence_output.samples):
             assert nth_output.output_token == idx
 
+    del model_runner
+
 
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_top_k_top_p(seed: int):
@@ -296,3 +305,5 @@ def test_sampler_top_k_top_p(seed: int):
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
     assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+
+    del model_runner
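The del model_runner / del vllm_model lines drop the last reference to the CUDA-backed objects at the end of each parametrized case; together with the smaller NUM_BLOCKS / NUM_TOKENS constants above, this presumably keeps the test image within the CI GPU's memory budget across the many seeds. A hedged sketch of the idea (the explicit gc / empty_cache calls are an assumption, not in the commit):

    # Illustrative: release GPU memory held by a per-case object before the next case.
    import gc

    import torch


    def run_case(make_model_runner):
        model_runner = make_model_runner()
        # ... exercise the sampler and make assertions ...
        del model_runner           # what the commit adds at the end of each test
        gc.collect()               # assumed follow-up, not in the commit
        torch.cuda.empty_cache()   # assumed follow-up, not in the commit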