[ci][distributed] add tests for custom allreduce (#5689)
parent afed90a034
commit d571ca0108

@@ -182,7 +182,11 @@ steps:
     - pip install -r requirements-docs.txt
     - SPHINXOPTS=\"-W\" make html
 
-- label: A100 status
+- label: Distributed Tests (A100)
   gpu: a100
   commands:
-    - nvidia-smi
+    # NOTE: don't test llama model here, it seems hf implementation is buggy
+    # see https://github.com/vllm-project/vllm/pull/5689 for details
+    - pytest -v -s distributed/test_custom_all_reduce.py
+    - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+    - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
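
The new A100 step runs the custom all-reduce test directly and then drives the basic correctness test twice, selecting the model and executor backend through TEST_DIST_MODEL and DISTRIBUTED_EXECUTOR_BACKEND. As a rough illustration only, the sketch below shows one way an env-var-driven distributed smoke test can consume those variables; it is not the body of test_basic_distributed_correctness.py, and the vLLM constructor arguments shown (LLM, SamplingParams, distributed_executor_backend) are assumptions here.

# Hedged sketch: an env-var-driven distributed smoke test in the spirit of
# the CI commands above. Variable names match the pipeline; the vLLM API
# usage is assumed for illustration, not taken from the test file.
import os

import pytest

MODEL = os.environ.get("TEST_DIST_MODEL", "facebook/opt-125m")
BACKEND = os.environ.get("DISTRIBUTED_EXECUTOR_BACKEND", "mp")


@pytest.mark.parametrize("tp_size", [2])
def test_env_driven_generation(tp_size):
    # Import inside the test so collection works on machines without a GPU build.
    from vllm import LLM, SamplingParams

    llm = LLM(model=MODEL,
              tensor_parallel_size=tp_size,
              distributed_executor_backend=BACKEND)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=8))
    assert len(outputs) == 1 and outputs[0].outputs[0].text

Running this once with DISTRIBUTED_EXECUTOR_BACKEND=ray and once with mp, as the pipeline does, exercises both executor paths on the same A100 hardware.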

@@ -11,7 +11,8 @@ from vllm.distributed.communication_op import ( # noqa
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
                                              get_tp_group, graph_capture)
 
-from ..utils import (init_test_distributed_environment,
+from ..utils import (ensure_model_parallel_initialized,
+                     init_test_distributed_environment,
                      multi_process_tensor_parallel)
 
 random.seed(42)
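
ensure_model_parallel_initialized is pulled in because the test now sets up the tensor/pipeline model-parallel groups explicitly after the generic distributed environment is created (see the next hunk). A minimal stand-in for that kind of helper is sketched below, assuming the public initialize_model_parallel / model_parallel_is_initialized entry points; it is not the implementation in tests/utils.py.

# Illustrative stand-in (assumption, not the helper from tests/utils.py):
# create the TP/PP groups only if they do not exist yet, so the test can
# call this unconditionally after init_test_distributed_environment.
from vllm.distributed.parallel_state import (initialize_model_parallel,
                                             model_parallel_is_initialized)


def ensure_model_parallel_initialized(tp_size: int, pp_size: int) -> None:
    # Later calls become no-ops once the groups exist.
    if not model_parallel_is_initialized():
        initialize_model_parallel(tp_size, pp_size)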

@@ -27,8 +28,8 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
                                       distributed_init_port)
-    group = get_tensor_model_parallel_group()
+    ensure_model_parallel_initialized(tp_size, pp_size)
+    group = get_tensor_model_parallel_group().device_group
 
     # A small all_reduce for warmup.
     # this is needed because device communicators might be created lazily
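
The warmup comments above concern communicators (e.g. NCCL) that are created lazily on first use, which is why the test issues one tiny all_reduce over the raw process group before capturing CUDA graphs. A self-contained sketch of that pattern with plain torch.distributed follows; the torchrun launch and the WORLD group are illustrative assumptions, not part of the test.

# Minimal sketch of the warmup pattern: a tiny all_reduce over a raw
# torch.distributed ProcessGroup (what .device_group exposes) so that lazily
# created device communicators exist before CUDA graph capture.
# Assumed launch: torchrun --nproc_per_node=2 warmup_sketch.py
import os

import torch
import torch.distributed as dist


def warmup_allreduce(group, device: torch.device) -> None:
    # The first collective on a device triggers communicator creation, so
    # keep the tensor tiny and synchronize before moving on.
    data = torch.zeros(1, device=device)
    dist.all_reduce(data, group=group)
    torch.cuda.synchronize()


if __name__ == "__main__":
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    warmup_allreduce(dist.group.WORLD, device)
    dist.destroy_process_group()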