[core][distributed] add pynccl broadcast (#10843)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Parent: a4cf256159
Commit: 21fe7b481a
Changes to the pynccl tests:

@@ -61,6 +61,7 @@ def worker_fn():
                        dtype=torch.float32).cuda(pynccl_comm.rank)
    with pynccl_comm.change_state(enable=True):
        tensor = pynccl_comm.all_reduce(tensor)
    torch.cuda.synchronize()
    result = tensor.mean().cpu().item()
    assert result == pynccl_comm.world_size

@@ -86,10 +87,12 @@ def multiple_allreduce_worker_fn():
    if torch.distributed.get_rank() in [0, 1]:
        tensor = pynccl_comm.all_reduce(tensor)
        tensor = pynccl_comm.all_reduce(tensor)
        torch.cuda.synchronize()
        result = tensor.mean().cpu().item()
        assert result == 4
    else:
        tensor = pynccl_comm.all_reduce(tensor)
        torch.cuda.synchronize()
        result = tensor.mean().cpu().item()
        assert result == 2

@@ -112,10 +115,12 @@ def multiple_allreduce_with_vllm_worker_fn():
    if torch.distributed.get_rank() in [0, 1]:
        tensor = tensor_model_parallel_all_reduce(tensor)
        tensor = tensor_model_parallel_all_reduce(tensor)
        torch.cuda.synchronize()
        result = tensor.mean().cpu().item()
        assert result == 4
    else:
        tensor = tensor_model_parallel_all_reduce(tensor)
        torch.cuda.synchronize()
        result = tensor.mean().cpu().item()
        assert result == 2

@@ -141,9 +146,9 @@ def worker_fn_with_cudagraph():
            graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
                enable=True):
        a_out = pynccl_comm.all_reduce(a)
    pynccl_comm.stream.synchronize()
    torch.cuda.synchronize()
    graph.replay()
    pynccl_comm.stream.synchronize()
    torch.cuda.synchronize()
    assert a_out.mean().cpu().item() == pynccl_comm.world_size**1

@@ -170,6 +175,7 @@ def all_gather_worker_fn():

    with pynccl_comm.change_state(enable=True):
        pynccl_comm.all_gather(result, tensor)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)

@@ -207,6 +213,7 @@ def reduce_scatter_worker_fn():

    with pynccl_comm.change_state(enable=True):
        pynccl_comm.reduce_scatter(result, tensor)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)

@@ -241,6 +248,7 @@ def send_recv_worker_fn():
        pynccl_comm.recv(tensor,
                         src=(pynccl_comm.rank - 1) %
                         pynccl_comm.world_size)
    torch.cuda.synchronize()
    result = tensor.mean().cpu().item()
    assert result == 1

@@ -280,6 +288,7 @@ def multiple_send_recv_worker_fn():
        pynccl_comm.recv(tensor,
                         src=(pynccl_comm.rank - 1) %
                         pynccl_comm.world_size)
    torch.cuda.synchronize()
    result = tensor.mean().cpu().item()
    if torch.distributed.get_rank() in [0, 2]:
        assert result == 1

@@ -293,6 +302,38 @@ def test_pynccl_multiple_send_recv():
    distributed_run(multiple_send_recv_worker_fn, 4)


@pytest.mark.skipif(torch.cuda.device_count() < 4,
                    reason="Need at least 4 GPUs to run the test.")
def test_pynccl_broadcast():
    distributed_run(broadcast_worker_fn, 4)


@worker_fn_wrapper
def broadcast_worker_fn():
    # Test broadcast for every root rank.
    # Essentially this is an all-gather operation.
    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
                                     device=get_world_group().device)
    recv_tensors = [
        torch.empty(16,
                    1024,
                    1024,
                    dtype=torch.float32,
                    device=pynccl_comm.device)
        for i in range(pynccl_comm.world_size)
    ]
    recv_tensors[pynccl_comm.rank] = torch.ones(
        16, 1024, 1024, dtype=torch.float32,
        device=pynccl_comm.device) * pynccl_comm.rank

    for i in range(pynccl_comm.world_size):
        pynccl_comm.broadcast(recv_tensors[i], src=i)
        # the broadcast op might be launched in a different stream
        # need to synchronize to make sure the tensor is ready
        torch.cuda.synchronize()
        assert torch.all(recv_tensors[i] == i).cpu().item()


def test_ncclGetUniqueId():
    lib = NCCLLibrary()
    unique_id = lib.ncclGetUniqueId()

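(Usage note, not part of the diff: on a node with at least 4 GPUs the new test can be run with pytest; the module path below is an assumption, adjust it to wherever this test file lives in the tree.)

    pytest -s -x tests/distributed/test_pynccl.py -k test_pynccl_broadcast
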
Changes to PyNcclCommunicator:

@@ -197,6 +197,25 @@ class PyNcclCommunicator:
                           ncclDataTypeEnum.from_torch(tensor.dtype), src,
                           self.comm, cudaStream_t(stream.cuda_stream))

    def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
        if self.disabled:
            return
        assert tensor.device == self.device, (
            f"this nccl communicator is created to work on {self.device}, "
            f"but the input tensor is on {tensor.device}")
        if stream is None:
            stream = self.stream
        if src == self.rank:
            sendbuff = buffer_type(tensor.data_ptr())
            # NCCL requires the sender also to have a receive buffer
            recvbuff = buffer_type(tensor.data_ptr())
        else:
            sendbuff = buffer_type()
            recvbuff = buffer_type(tensor.data_ptr())
        self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
                                ncclDataTypeEnum.from_torch(tensor.dtype), src,
                                self.comm, cudaStream_t(stream.cuda_stream))

    @contextmanager
    def change_state(self,
                     enable: Optional[bool] = None,

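For context, here is a minimal usage sketch of the new broadcast method (not taken from this commit). It assumes torch.distributed and the vLLM world group are already initialized, e.g. inside a worker launched the same way as the tests above, and the import paths are assumptions based on where these symbols appear to live in the tree. On the root rank the tensor doubles as both send and receive buffer, so the broadcast is effectively in place; non-root ranks receive into their buffer.

import torch

from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.parallel_state import get_world_group


def broadcast_from_rank0():
    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
                                     device=get_world_group().device)
    if pynccl_comm.rank == 0:
        # the root rank broadcasts its own data
        tensor = torch.arange(1024, dtype=torch.float32,
                              device=pynccl_comm.device)
    else:
        # non-root ranks receive into an (uninitialized) buffer of the
        # same shape and dtype
        tensor = torch.empty(1024, dtype=torch.float32,
                             device=pynccl_comm.device)
    pynccl_comm.broadcast(tensor, src=0)
    # the broadcast may run on the communicator's stream, so synchronize
    # before reading the result on the default stream
    torch.cuda.synchronize()
    return tensor
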
Changes to the NCCLLibrary ctypes wrapper:

@@ -189,6 +189,15 @@ class NCCLLibrary:
            ncclComm_t, cudaStream_t
        ]),

        # ncclResult_t ncclBroadcast(
        #   const void* sendbuff, void* recvbuff, size_t count,
        #   ncclDataType_t datatype, int root, ncclComm_t comm,
        #   cudaStream_t stream);
        Function("ncclBroadcast", ncclResult_t, [
            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
            ctypes.c_int, ncclComm_t, cudaStream_t
        ]),

        # be cautious! this is a collective call, it will block until all
        # processes in the communicator have called this function.
        # because Python object destruction can happen in random order,

@@ -312,6 +321,13 @@ class NCCLLibrary:
        self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
                                                comm, stream))

    def ncclBroadcast(self, sendbuff: buffer_type, recvbuff: buffer_type,
                      count: int, datatype: int, root: int, comm: ncclComm_t,
                      stream: cudaStream_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclBroadcast"](sendbuff, recvbuff, count,
                                                     datatype, root, comm,
                                                     stream))

    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))

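As background, the Function entry above declares the C signature of ncclBroadcast so that ctypes marshals arguments correctly. Below is a rough sketch of the plain-ctypes pattern such a function table maps onto; the library name and type aliases are illustrative assumptions, not code from this commit.

import ctypes

# assumed shared-library name; the wrapper resolves the real path itself
nccl_lib = ctypes.CDLL("libnccl.so.2")

# illustrative aliases: opaque handles are passed as void pointers,
# enums and results as plain ints
ncclResult_t = ctypes.c_int
ncclDataType_t = ctypes.c_int
ncclComm_t = ctypes.c_void_p
cudaStream_t = ctypes.c_void_p
buffer_type = ctypes.c_void_p

# equivalent of Function("ncclBroadcast", ncclResult_t, [...]) above:
# pin the return type and argument types on the loaded symbol
nccl_lib.ncclBroadcast.restype = ncclResult_t
nccl_lib.ncclBroadcast.argtypes = [
    buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
    ctypes.c_int, ncclComm_t, cudaStream_t
]
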