
# SPDX-License-Identifier: Apache-2.0

import random

import pytest
import ray
import torch
import torch.distributed as dist

from vllm.distributed.communication_op import (  # noqa
    tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
                                             get_tp_group, graph_capture)

from ..utils import (ensure_model_parallel_initialized,
                     init_test_distributed_environment, multi_process_parallel)

random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
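# Trim each size to a multiple of 8 elements so the buffer byte sizes stay
# aligned for every dtype under test (fp32/fp16/bf16); the custom all-reduce
# kernels presumably require this alignment on their inputs.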
for i, v in enumerate(test_sizes):
    test_sizes[i] -= v % 8


@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
                                          distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
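        # device_group below is the underlying torch.distributed ProcessGroup
        # of the TP ranks (NCCL on GPU); it serves as the reference
        # implementation that the custom all-reduce is compared against.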
        group = get_tensor_model_parallel_group().device_group

        # A small all_reduce for warmup. This is needed because device
        # communicators might be created lazily (e.g. NCCL); the warmup
        # ensures the communicator is initialized before any communication
        # happens, so that this group can be used for graph capture
        # immediately.
        data = torch.zeros(1)
        data = data.to(device=device)
        torch.distributed.all_reduce(data, group=group)
        torch.cuda.synchronize()
        del data

        # We use the first group to communicate once, the second group to
        # communicate twice, and so on. This is used to demonstrate that
        # each group can communicate independently.
        num_communication = rank // tp_size + 1

        for sz in test_sizes:
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # use integers so the result matches NCCL exactly
                    inp1 = torch.randint(1,
                                         16, (sz, ),
                                         dtype=dtype,
                                         device=torch.cuda.current_device())
                    inp2 = torch.randint(1,
                                         16, (sz, ),
                                         dtype=dtype,
                                         device=torch.cuda.current_device())
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph,
                                          stream=graph_capture_context.stream):
                        for i in range(num_communication):
                            out1 = tensor_model_parallel_all_reduce(inp1)
                            # the input buffer is immediately modified to
                            # test synchronization
                            dist.all_reduce(inp1, group=group)
                            out2 = tensor_model_parallel_all_reduce(inp2)
                            dist.all_reduce(inp2, group=group)
                graph.replay()
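                # After replay, out1/out2 hold the tensor_model_parallel_all_reduce
                # results (the custom all-reduce path under test) and inp1/inp2
                # hold NCCL's results for the same inputs, so the two
                # implementations are compared for an exact match.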
                torch.testing.assert_close(out1, inp1)
                torch.testing.assert_close(out2, inp2)


@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
                                          distributed_init_port)

        # We use the first group to communicate once, the second group to
        # communicate twice, and so on. This is used to demonstrate that
        # each group can communicate independently.
        num_communication = rank // tp_size + 1
        sz = 1024
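        # ca_comm is the custom all-reduce communicator attached to the TP
        # group's device communicator; registered=False presumably stages the
        # input through the communicator's own pre-registered IPC buffer
        # rather than requiring the caller's tensor to be registered.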
        fa = get_tp_group().device_communicator.ca_comm
        inp = torch.ones(sz, dtype=torch.float32, device=device)
        out = inp
        for _ in range(num_communication):
            out = fa.all_reduce(out, registered=False)
        torch.testing.assert_close(out, inp * (tp_size**num_communication))

        inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
        out = inp
        for _ in range(num_communication):
            out = fa.all_reduce(out, registered=False)
        torch.testing.assert_close(out, inp * (tp_size**num_communication))


@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_custom_allreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pipeline_parallel_size,
    test_target,
):
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
                           test_target)
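

# To run these tests manually, assuming this file lives under vLLM's
# tests/distributed/ directory and at least two GPUs (four for
# pipeline_parallel_size=2) are visible:
#   pytest -s -x tests/distributed/test_custom_all_reduce.py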