# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1
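
# The CUDA version and target stage can be overridden at build time, e.g.
# (image tag below is illustrative):
# $ docker build --build-arg CUDA_VERSION=12.4.1 --target vllm-openai -t vllm-openai .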
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM

ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies

# The arm64 (GH200) build follows the "use existing pytorch" practice:
# we install torch and torchvision from the nightly builds first, so
# pytorch will not appear as a vLLM dependency in any of the steps
# after this one.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi
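
# arm64 images are typically produced via buildx, which sets TARGETPLATFORM
# automatically, e.g. (illustrative invocation):
# $ docker buildx build --platform linux/arm64 --target vllm-openai .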
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
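
# To target a single GPU generation, both arch lists can be narrowed at build
# time, e.g. for Ampere only (illustrative values):
# $ docker build --build-arg torch_cuda_arch_list='8.0' --build-arg vllm_fa_cmake_gpu_arches='80-real' .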
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM

# install build dependencies
COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}

# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
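
# Peak build memory scales roughly with MAX_JOBS x NVCC_THREADS, so both can
# be raised together on a large builder, e.g. (illustrative values):
# $ docker build --build-arg max_jobs=16 --build-arg nvcc_threads=4 .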
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
        && sccache --show-stats; \
    fi
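
# To build with sccache against your own bucket (bucket and region defaults
# are set above; values below are illustrative):
# $ docker build --build-arg USE_SCCACHE=1 --build-arg SCCACHE_BUCKET_NAME=my-bucket .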
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
        python3 check-wheel-size.py dist; \
    else \
        echo "Skipping wheel size check."; \
    fi
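
# The check can be skipped, e.g. when deliberately building an oversized
# debug wheel:
# $ docker build --build-arg RUN_WHEEL_CHECK=false .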

#################### WHEEL BUILD IMAGE ####################
#################### DEV IMAGE ####################
FROM base AS dev

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# The arm64 (GH200) build follows the "use existing pytorch" practice:
# we install torch and torchvision from the nightly builds first, so
# pytorch will not appear as a vLLM dependency in any of the steps
# after this one.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi
# Install the vllm wheel first, so that torch etc. will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system dist/*.whl --verbose
# If we need to build the FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ rm -rf build
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
        uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl; \
    fi
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
# Although we build FlashInfer with AOT mode, there are still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once the FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test
ADD . /vllm-workspace/
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt
# install the vllm_test_utils package (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
# docs require the source code
# we hide it inside `test_docs/`, so that this source code
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base

# TARGETPLATFORM does not persist across build stages; re-declare it here so
# the arm64 branch below sees its value
ARG TARGETPLATFORM
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi
ENV VLLM_USAGE_SOURCE=production-docker-image

# define sagemaker first, so it is not the default target for `docker build`
FROM vllm-openai-base AS vllm-sagemaker
COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT [ "./sagemaker-entrypoint.sh" ]
FROM vllm-openai-base AS vllm-openai
ENTRYPOINT [ "python3" , "-m" , "vllm.entrypoints.openai.api_server" ]
#################### OPENAI API SERVER ####################
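
# Example of serving a model with the resulting image (model name is
# illustrative; assumes the image was tagged vllm-openai):
# $ docker run --gpus all -p 8000:8000 vllm-openai --model facebook/opt-125m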