From d3d4956261e864b492150251dec2d3d062293537 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Thu, 13 Mar 2025 20:46:56 -0700 Subject: [PATCH] [Neuron] flatten test parameterization for neuron attention kernels (#14712) --- .buildkite/run-neuron-test.sh | 2 +- tests/neuron/{ => 1_core}/test_activation.py | 0 tests/neuron/{ => 1_core}/test_block_table.py | 0 tests/neuron/{ => 1_core}/test_cache.py | 0 tests/neuron/{ => 1_core}/test_layernorm.py | 0 .../{ => 1_core}/test_logits_processor.py | 0 .../{ => 1_core}/test_prefix_prefill.py | 46 ++++++++++--------- .../{ => 1_core}/test_rotary_embedding.py | 0 tests/neuron/{ => 2_core}/test_comm_ops.py | 0 9 files changed, 26 insertions(+), 22 deletions(-) rename tests/neuron/{ => 1_core}/test_activation.py (100%) rename tests/neuron/{ => 1_core}/test_block_table.py (100%) rename tests/neuron/{ => 1_core}/test_cache.py (100%) rename tests/neuron/{ => 1_core}/test_layernorm.py (100%) rename tests/neuron/{ => 1_core}/test_logits_processor.py (100%) rename tests/neuron/{ => 1_core}/test_prefix_prefill.py (92%) rename tests/neuron/{ => 1_core}/test_rotary_embedding.py (100%) rename tests/neuron/{ => 2_core}/test_comm_ops.py (100%) diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 55c374fc..06924fea 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" diff --git a/tests/neuron/test_activation.py b/tests/neuron/1_core/test_activation.py similarity index 100% rename from tests/neuron/test_activation.py rename to tests/neuron/1_core/test_activation.py diff --git a/tests/neuron/test_block_table.py b/tests/neuron/1_core/test_block_table.py similarity index 100% rename from tests/neuron/test_block_table.py rename to tests/neuron/1_core/test_block_table.py diff --git a/tests/neuron/test_cache.py b/tests/neuron/1_core/test_cache.py similarity index 100% rename from tests/neuron/test_cache.py rename to tests/neuron/1_core/test_cache.py diff --git a/tests/neuron/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py similarity index 100% rename from tests/neuron/test_layernorm.py rename to tests/neuron/1_core/test_layernorm.py diff --git a/tests/neuron/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py similarity index 100% rename from tests/neuron/test_logits_processor.py rename to tests/neuron/1_core/test_logits_processor.py diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py similarity index 92% rename from tests/neuron/test_prefix_prefill.py rename to tests/neuron/1_core/test_prefix_prefill.py index 2c6ac478..326a1f82 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -292,28 +292,32 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, @pytest.mark.parametrize( - "prefill_batch_size,decode_batch_size,block_size,large_tile_size", + "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision", [ - (1, 199, 1, 512), # 512 blocks - (4, 12, 256, 2048), # 128 blocks - (4, 12, 16, 2048), # 128 blocks - (4, 12, 4, 1024), # 256 blocks - (4, 12, 32, 2048), # 64 blocks - (4, 12, 32, 4096), # 128 blocks - (4, 12, 32, 8192), # 256 blocks - (4, 12, 64, 8192), # 128 blocks - ], -) -@pytest.mark.parametrize( - "num_heads,num_queries_per_kv,head_size", - [ - (4, 2, 8), - (32, 8, 64), - (4, 4, 128), - (8, 1, 32), - ], -) -@pytest.mark.parametrize("mixed_precision", [True, False]) + # Test minimal configurations (small block size) + (1, 199, 1, 512, 4, 2, 8, False + ), # minimal block size, small dimensions + (1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision + + # Test common/medium configurations + (4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads + (4, 12, 32, 2048, 16, 4, 32, + True), # medium size, mixed precision, grouped-query attention (GQA) + + # Test large configurations + (4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size + (4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads + + # Test asymmetric configurations + (2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes + (8, 8, 128, 2048, 24, 2, 48, True), # balanced batches + + # Test edge cases + (1, 128, 16, 1024, 4, 2, 16, False), # large decode batch + (16, 4, 8, 8192, 48, 1, 128, True), # large prefill batch + (4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA) + (4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA) + ]) @torch.inference_mode() def test_contexted_kv_attention( prefill_batch_size: int, diff --git a/tests/neuron/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py similarity index 100% rename from tests/neuron/test_rotary_embedding.py rename to tests/neuron/1_core/test_rotary_embedding.py diff --git a/tests/neuron/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py similarity index 100% rename from tests/neuron/test_comm_ops.py rename to tests/neuron/2_core/test_comm_ops.py