[BugFix] Fix fusion tests and add them to CI (#16287)

Signed-off-by: luka <luka@neuralmagic.com>
Luka Govedič 2025-04-09 02:46:45 -04:00 committed by GitHub
parent b1eb4ca152
commit 9cdde47289
3 changed files with 75 additions and 50 deletions

.buildkite/test-pipeline.yaml

@@ -292,6 +292,14 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4
 
+- label: PyTorch Compilation Unit Tests
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_pass_manager.py
+  - pytest -v -s compile/test_fusion.py
+
 - label: PyTorch Fullgraph Smoke Test # 9min
   source_file_dependencies:
   - vllm/
@@ -301,7 +309,6 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/test_pass_manager.py
 
 - label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:

tests/compile/test_full_graph.py

@@ -2,7 +2,7 @@
 from __future__ import annotations
 
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 import pytest
 import torch
@@ -15,7 +15,7 @@ from vllm.platforms import current_platform
 from ..utils import create_new_process_for_each_test
 
 
-def models_list(all: bool):
+def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
     TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
@@ -32,9 +32,7 @@ def models_list(all: bool):
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]
 
-    if not all:
-        return TEST_MODELS
-
-    if is_quant_method_supported("aqlm"):
-        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-            "quantization": "aqlm"
+    if all:
+        if is_quant_method_supported("aqlm"):
+            TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+                "quantization": "aqlm"
@@ -72,8 +70,13 @@ def models_list(all: bool):
-            "quantization": "AWQ"
-        }))
+                "quantization": "AWQ"
+            }))
 
-    return TEST_MODELS
+    if keywords is None:
+        return TEST_MODELS
+
+    # filter by keywords
+    pred = lambda model: any(keyword in model[0] for keyword in keywords)
+    return list(filter(pred, TEST_MODELS))
 
 
 @pytest.mark.parametrize(
     "optimization_level",
@@ -96,20 +99,30 @@ def test_full_graph(
     run_model(optimization_level, model, model_kwargs)
 
 
+PassConfig = CompilationConfig.PassConfig
+
+
 # TODO(luka) add other supported compilation config scenarios here
 @pytest.mark.parametrize(
-    "compilation_config",
-    # additional compile sizes
+    "compilation_config, model_info",
     [
-        CompilationConfig(level=CompilationLevel.PIECEWISE,
-                          compile_sizes=[1, 2])
+        # additional compile sizes, only some of the models
+        (CompilationConfig(level=CompilationLevel.PIECEWISE,
+                           compile_sizes=[1, 2]), model)
+        for model in models_list(all=False)
+    ] + [
+        # RMSNorm + quant fusion, only 8-bit quant models
+        (CompilationConfig(level=CompilationLevel.PIECEWISE,
+                           custom_ops=["+rms_norm"],
+                           pass_config=PassConfig(enable_fusion=True,
+                                                  enable_noop=True)), model)
+        for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
     ])
 # only test some of the models
-@pytest.mark.parametrize("model_info", models_list(all=False))
 @create_new_process_for_each_test()
 def test_custom_compile_config(
-    model_info: tuple[str, dict[str, Any]],
     compilation_config: CompilationConfig,
+    model_info: tuple[str, dict[str, Any]],
 ):
     model, model_kwargs = model_info
     print(f"MODEL={model}")

tests/compile/test_fusion.py

@@ -44,12 +44,17 @@ class TestModel(torch.nn.Module):
         resid = torch.sqrt(x)
         y = self.norm[0](x)
-        x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
+        x2 = self.fp8_linear.apply(y,
+                                   self.w[0],
+                                   self.wscale[0],
+                                   input_scale=self.scale[0])
         # make sure resid is used for replacement to work
         y2, resid = self.norm[1](x2, resid)
-        x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
-                                   self.scale[1])
+        x3 = self.fp8_linear.apply(y2,
+                                   self.w[1],
+                                   self.wscale[1],
+                                   input_scale=self.scale[1])
         y3, resid = self.norm[2](x3, resid)  # use resid here
         return y3
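Switching self.scale[i] from a positional to a keyword argument is presumably the test bug being fixed: if optional parameters sit between weight_scale and input_scale in the apply signature, a fourth positional argument silently binds to the wrong parameter. A minimal sketch of the pitfall; the signature below is hypothetical, for illustration only, and not the real Fp8LinearOp.apply:

# Hypothetical signature: an optional out_dtype precedes input_scale,
# so the fourth positional slot is NOT the input scale.
def apply(x, weight, weight_scale, out_dtype=None, input_scale=None):
    return out_dtype, input_scale

scale = 0.5
out_dtype, input_scale = apply(1, 2, 3, scale)
assert out_dtype == scale and input_scale is None  # silently misbound

out_dtype, input_scale = apply(1, 2, 3, input_scale=scale)
assert out_dtype is None and input_scale == scale  # bound as intended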