# vllm/tests/distributed/test_ca_buffer_sharing.py


# can only run on machines with p2p access across GPUs
# can only run with torchrun:
# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
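#
# what this exercises: create_shared_buffer allocates a device buffer on
# every rank and exchanges CUDA IPC handles (see the gloo group below), so
# each rank ends up with device pointers into all ranks' buffers; rank 0
# writes through those pointers and every rank reads them back to check
# that cross-GPU p2p access works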
import ctypes

import torch
import torch.distributed as dist

from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
from vllm.distributed.device_communicators.custom_all_reduce import (  # noqa
    CustomAllreduce)

# create a cpu process group for communicating metadata (ipc handle)
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
# every process sets its own device (differently)
lib = CudaRTLibrary()
lib.cudaSetDevice(rank)
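# note: rank doubles as the CUDA device index, which assumes one process
# per GPU on a single node (what the torchrun command above gives us)
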
buffer_size_in_bytes = 1024
byte_value = 2  # the value we write to the buffer for verification
pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
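# pointers holds one device pointer per rank: pointers[i] is this process's
# mapping of rank i's buffer (the virtual addresses may differ across
# processes, hence the per-rank print below)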
print(f"Rank {rank} has pointers {pointers}")
dist.barrier()
torch.cuda.synchronize()
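# barrier orders the CPU processes and synchronize drains local GPU work,
# so every buffer exists before anyone touches a peer's memory
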
if rank == 0:
    # the first rank tries to write to all buffers
    for p in pointers:
        pointer = ctypes.c_void_p(p)
        lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)

dist.barrier()
torch.cuda.synchronize()
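# intended to make rank 0's writes visible before the other ranks read
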
host_data = (ctypes.c_char * buffer_size_in_bytes)()
# all ranks read from all buffers, and check if the data is correct
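# (the wrapper's cudaMemcpy takes no direction flag; presumably it relies
# on unified addressing / cudaMemcpyDefault to infer device -> host)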
for p in pointers:
    pointer = ctypes.c_void_p(p)
    lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
    for i in range(buffer_size_in_bytes):
        assert ord(host_data[i]) == byte_value, (
            f"Rank {rank} failed"
            f" to verify buffer {p}. Expected {byte_value}, "
            f"got {ord(host_data[i])}")

print(f"Rank {rank} verified all buffers")
dist.barrier()
torch.cuda.synchronize()
CustomAllreduce.free_shared_buffer(pointers)
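
# free_shared_buffer undoes the setup, which should close the ipc mappings
# of peer buffers and free this rank's own allocation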