# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
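#
# The cache mounts used below require BuildKit; an illustrative build command
# (the image tag here is an assumption, not something this file defines):
#   DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm/vllm-openai .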
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cuda.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
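# illustrative override to build for a single architecture only (e.g. A100):
#   docker build --build-arg torch_cuda_arch_list='8.0' --target vllm-openai .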
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt
# install compiler cache to speed up compilation by leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache
# files and directories needed to build the wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
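# illustrative override for hosts with more CPU cores and RAM (higher values
# speed up the extension build but raise peak memory usage during compilation):
#   docker build --build-arg max_jobs=8 --build-arg nvcc_threads=4 --target vllm-openai .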
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist
# the `vllm_nccl` package must be installed from its source distribution;
# pip caches the wheel it builds locally, and other CI jobs would then
# install that cached wheel directly, which is not what we want,
# so we remove it from the cache manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### WHEEL BUILD IMAGE ####################
#################### FLASH_ATTENTION BUILD IMAGE ####################
FROM dev AS flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}
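# illustrative override to pin a different version (the tag is a placeholder and
# must correspond to a published flash-attn release):
#   docker build --build-arg flash_attn_version=<tag> --target vllm-openai .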
WORKDIR /usr/src/flash-attention-v2
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir
#################### FLASH_ATTENTION BUILD IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
RUN apt-get update -y \
&& apt-get install -y python3-pip git vim
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
# install the vllm wheel first, so that torch and the other dependencies are installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test
ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# the docs build requires the source code,
# so we hide it inside `test_docs/` to keep this source code
# from being imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope
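# note: hf_transfer only takes effect when huggingface_hub sees
# HF_HUB_ENABLE_HF_TRANSFER=1 in the environment (e.g. passed with
# `docker run -e HF_HUB_ENABLE_HF_TRANSFER=1 ...`); this image does not set it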
ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
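# illustrative run command (image tag and model id are placeholders; the server
# listens on port 8000 by default and trailing arguments are passed to it):
#   docker run --gpus all -p 8000:8000 vllm/vllm-openai --model <model-id>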
#################### OPENAI API SERVER ####################