[BugFix] Lazily import XgrammarBackend to avoid early cuda init (#15171)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill 2025-03-19 18:30:43 -07:00 committed by GitHub
parent cfbca8a2f2
commit c47aafa37c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -9,7 +9,6 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
StructuredOutputGrammar)
-from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
if TYPE_CHECKING:
import numpy as np
@@ -47,6 +46,9 @@ class StructuredOutputManager:
if self.backend is None:
backend_name = request.sampling_params.guided_decoding.backend_name
if backend_name == "xgrammar":
+from vllm.v1.structured_output.backend_xgrammar import (
+XgrammarBackend)
self.backend = XgrammarBackend(self.vllm_config)
else:
raise ValueError(