[Misc] Update disaggregation benchmark scripts and test logs (#11456)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
This commit is contained in:
parent 9832e5572a
commit fc601665eb
@@ -10,7 +10,8 @@ set -ex

 kill_gpu_processes() {
   # kill all processes on GPU.
-  pkill -f pt_main_thread
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
   sleep 10

   # remove vllm config file
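Note on the cleanup change in the hunk above: this script runs under set -ex (see the hunk header), and pkill exits non-zero when no matching process is left, which would abort the run; pgrep piped into xargs -r kill -9 is a harmless no-op in that case. A minimal sketch of the pattern, assuming a hypothetical process name my_worker:

set -ex
# pgrep may exit 1 when nothing matches, but without pipefail only the last
# command of the pipeline (xargs, which exits 0) decides the pipeline status,
# so the script keeps going; -r skips invoking kill when the input is empty.
pgrep my_worker | xargs -r kill -9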
@@ -54,7 +55,7 @@ benchmark() {

   CUDA_VISIBLE_DEVICES=0 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -64,7 +65,7 @@ benchmark() {

   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -87,7 +88,7 @@ benchmark() {
     --port 8100 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1.json \
     --request-rate "inf"


@@ -105,7 +106,7 @@ benchmark() {
     --port 8200 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1_overhead.json \
     --request-rate "$qps"
   kill_gpu_processes

@@ -118,7 +119,7 @@ main() {
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)

-  pip install quart httpx
+  pip install quart httpx datasets

   cd "$(dirname "$0")"

@@ -1,13 +1,12 @@
 #!/bin/bash

-# Requirement: 8x H100 GPUs.
+# Requirement: 2x GPUs.


-# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
-# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
-# Resource: 8x H100
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
+# Resource: 2x GPU
 # Approaches:
-# 1. Chunked prefill: 1 vllm instance with tp=8
 # 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
 # Prefilling instance: max_output_token=1
@@ -114,7 +113,6 @@ benchmark() {
     --request-rate "$qps"

   sleep 2
-
 }


@@ -123,8 +121,9 @@ main() {
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
+  (which lsof) || (apt-get -y install lsof)

-  pip install quart httpx matplotlib aiohttp
+  pip install quart httpx matplotlib aiohttp datasets

   cd "$(dirname "$0")"

@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
     assert buffer.buffer_size == 0
     assert len(buffer.buffer) == 0

-    print("Test run passed!")
+    print("My rank: %d, Test run passed!" % (my_rank))


 def stress_test(my_rank, buf, device):
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
     else:
         torch.distributed.send(torch.tensor([n]), 0)

-    print("Passed stress test!")
+    print("My rank: %d, Passed stress test!" % (my_rank))


 if __name__ == "__main__":
@@ -1,3 +1,8 @@
 #!/bin/bash
-RANK=0 python test_lookup_buffer.py &
-RANK=1 python test_lookup_buffer.py &
+RANK=0 python3 test_lookup_buffer.py &
+PID0=$!
+RANK=1 python3 test_lookup_buffer.py &
+PID1=$!
+
+wait $PID0
+wait $PID1
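The runner above now records each background rank's PID via $! and waits on both, so the script does not return until the two test processes have exited. A short sketch of the same pattern under an assumed script name my_test.py (not a file in this diff), with an extra exit-code check that the diff itself does not add:

#!/bin/bash
# Launch both ranks in the background and capture their PIDs.
RANK=0 python3 my_test.py &
PID0=$!
RANK=1 python3 my_test.py &
PID1=$!

# wait <pid> blocks until that child exits and returns its exit status,
# so a failure in either rank can be surfaced to the caller.
wait $PID0 || exit 1
wait $PID1 || exit 1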
@@ -10,39 +10,42 @@ from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe


 def test_run(my_rank, pipe):
+    print(f"rank {my_rank} test_run starts....")
     # test run
     x = torch.tensor([1]).to(pipe.device)
     y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
     if my_rank == 0:
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", x2)

     else:
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", x2)
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")

     assert torch.allclose(x, x2)
     assert torch.allclose(y, y2)

+    print(f"rank {my_rank} test_run passed!")


 def stress_test(my_rank, pipe):
-    torch.distributed.barrier()
+    print(f"rank {my_rank} stress_test starts....")

     tensors: List[torch.Tensor] = []

+    torch.distributed.barrier()
     torch.manual_seed(0)

     for i in tqdm(range(500)):
@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):


 def latency_test(my_rank, pipe, nelement, ntensor):
-
     latencies = []

     torch.distributed.barrier()
@@ -149,6 +151,7 @@ if __name__ == "__main__":
     )

     test_run(my_rank, pipe)
+
     stress_test(my_rank, pipe)

     # Use this function if you want to test the latency of pipe impl.
@@ -1,3 +1,9 @@
 #!/bin/bash
+
 RANK=0 python3 test_send_recv.py &
+PID0=$!
 RANK=1 python3 test_send_recv.py &
+PID1=$!
+
+wait $PID0
+wait $PID1