diff --git a/docs/source/design/v1/torch_compile.md b/docs/source/design/v1/torch_compile.md index 57dba680..83409380 100644 --- a/docs/source/design/v1/torch_compile.md +++ b/docs/source/design/v1/torch_compile.md @@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` +`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 92387b46..dcdcdd45 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -53,12 +53,20 @@ def test_compilation_config(): assert args.compilation_config.level == 3 # set to string form of a dict - args = parser.parse_args(["--compilation-config", "{'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config", + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) # set to string form of a dict - args = parser.parse_args(["--compilation-config={'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config=" + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) def test_prefix_cache_default(): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7c1bde0f..f98f9517 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -939,10 +939,11 @@ class EngineArgs: 'testing only. level 3 is the recommended level ' 'for production.\n' 'To specify the full compilation config, ' - 'use a JSON string.\n' + 'use a JSON string, e.g. ``{"level": 3, ' + '"cudagraph_capture_sizes": [1, 2, 4, 8]}``\n' 'Following the convention of traditional ' - 'compilers, using -O without space is also ' - 'supported. -O3 is equivalent to -O 3.') + 'compilers, using ``-O`` without space is also ' + 'supported. ``-O3`` is equivalent to ``-O 3``.') parser.add_argument('--kv-transfer-config', type=KVTransferConfig.from_cli,