diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7bfdc6c..65d1ddbe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
 
 #
 # Try to find python package with an executable that exactly matches
diff --git a/Dockerfile b/Dockerfile
index ff4a0839..79bca1cf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
+        uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
     fi
 COPY examples examples
diff --git a/pyproject.toml b/pyproject.toml
index 836389bc..ee4e2ed0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging",
     "setuptools>=61",
     "setuptools-scm>=8.0",
-    "torch == 2.5.1",
+    "torch == 2.6.0",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements/build.txt b/requirements/build.txt
index fec01caa..364a16d8 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1
+torch==2.6.0
 wheel
 jinja2
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 46bb1736..702d4b0b 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -4,9 +4,9 @@ numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
 
 # Dependencies for NVIDIA GPUs
-ray[cgraph] >= 2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch == 2.5.1
-torchaudio==2.5.1
+ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
+torch==2.6.0
+torchaudio==2.6.0
 # These must be updated alongside torch
-torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
+torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0
diff --git a/requirements/test.in b/requirements/test.in
index de33f92b..cc89d518 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -21,8 +21,9 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
-torch==2.5.1
-torchaudio==2.5.1
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.0 # required for pixtral test
@@ -30,7 +31,7 @@ datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
 transformers==4.48.2
 # quantization
-bitsandbytes>=0.45.0
+bitsandbytes>=0.45.3
 buildkite-test-collector==0.1.9
 
 genai_perf==0.0.8
diff --git a/requirements/test.txt b/requirements/test.txt
index f1123207..a235c8b2 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -33,7 +33,7 @@ audioread==3.0.1
     # via librosa
 awscli==1.35.23
     # via -r requirements/test.in
-bitsandbytes==0.45.0
+bitsandbytes==0.45.3
     # via -r requirements/test.in
 black==24.10.0
     # via datamodel-code-generator
@@ -127,7 +127,6 @@ filelock==3.16.1
     #   ray
     #   torch
     #   transformers
-    #   triton
 fonttools==4.54.1
     # via matplotlib
 frozendict==2.4.6
@@ -320,6 +319,8 @@ nvidia-cusparse-cu12==12.3.1.170
     # via
     #   nvidia-cusolver-cu12
     #   torch
+nvidia-cusparselt-cu12==0.6.2
+    # via torch
 nvidia-nccl-cu12==2.21.5
     # via torch
 nvidia-nvjitlink-cu12==12.4.127
@@ -591,7 +592,7 @@ timm==1.0.11
     # via -r requirements/test.in
 tokenizers==0.21.0
     # via transformers
-torch==2.5.1
+torch==2.6.0
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -607,13 +608,15 @@ torch==2.5.1
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.5.1
+torchaudio==2.6.0
     # via
     #   -r requirements/test.in
     #   encodec
     #   vocos
-torchvision==0.20.1
-    # via timm
+torchvision==0.21.0
+    # via
+    #   -r requirements/test.in
+    #   timm
 tqdm==4.66.6
     # via
     #   datasets
@@ -638,7 +641,7 @@ transformers==4.48.2
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.1.0
+triton==3.2.0
     # via torch
 tritonclient==2.51.0
     # via
@@ -651,7 +654,6 @@ typepy==1.3.2
     #   tabledata
 typing-extensions==4.12.2
     # via
-    #   bitsandbytes
     #   huggingface-hub
     #   librosa
     #   mistral-common
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index 64416eb1..a21e8eca 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -6,6 +6,7 @@ from typing import Callable, Union
 from torch import fx
 
 from vllm.compilation.inductor_pass import InductorPass
+from vllm.config import get_current_vllm_config
 
 
 class TestBackend:
@@ -17,13 +18,14 @@ class TestBackend:
     Inductor config can be modified directly by editing the inductor_config
     property. This can be helpful for adding passes like the
     'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
+    Inductor config is default-initialized from VllmConfig.CompilationConfig.
     """

     def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
                                                               None]]):
         self.custom_passes = list(passes)
-        from torch._inductor import config
-        self.inductor_config = config.shallow_copy_dict()
+        compile_config = get_current_vllm_config().compilation_config
+        self.inductor_config = compile_config.inductor_compile_config
         self.inductor_config['force_disable_caches'] = True
         self.inductor_config['post_grad_custom_post_pass'] = self.post_pass
diff --git a/vllm/config.py b/vllm/config.py
index 429ec0dd..40ea50cb 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -52,6 +52,8 @@ if TYPE_CHECKING:
 else:
     QuantizationConfig = None
 
+from packaging.version import Version
+
 logger = init_logger(__name__)
 
 # This value is chosen to have a balance between ITL and TTFT. Note it is
@@ -3126,6 +3128,19 @@ class CompilationConfig(BaseModel):
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
 
+        # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
+        # 1. A bug in PyTorch, fixed in 2.7:
+        #    https://github.com/pytorch/pytorch/issues/147924
+        # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't
+        #    work with V2. Addressing this will take extra engineering effort
+        #    and it is not yet a priority. RFC here:
+        #    https://github.com/vllm-project/vllm/issues/14703
+
+        if Version(torch.__version__) >= Version("2.6"):
+            KEY = 'enable_auto_functionalized_v2'
+            if KEY not in self.inductor_compile_config:
+                self.inductor_compile_config[KEY] = False
+
         if self.splitting_ops is None:
             if envs.VLLM_USE_V1:
                 # v1 must split the graph on attention ops