# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
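#
# Example build invocation (illustrative; the image tag is arbitrary):
#   docker build --target vllm-openai -t vllm-openai .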
#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
    && apt-get install -y python3-pip git
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt
# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
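# The arch list can be overridden at build time to shorten compilation, e.g.
# (illustrative invocation): docker build --build-arg torch_cuda_arch_list='8.0' .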
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
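# Both values trade build speed for memory use and can be raised on larger
# build machines, e.g. (illustrative invocation):
#   docker build --build-arg max_jobs=8 --build-arg nvcc_threads=4 .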
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN python3 setup.py build_ext --inplace
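# Only the compiled extension libraries (vllm/*.so) produced by this stage are
# reused later; see the COPY --from=build lines in the stages below.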
#################### EXTENSION BUILD IMAGE ####################
#################### FLASH_ATTENTION BUILD IMAGE ####################
FROM dev AS flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR /usr/src/flash-attention-v2
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir
#################### FLASH_ATTENTION BUILD IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
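# This stage can be built and entered on its own to run the tests, e.g.
# (illustrative invocation):
#   docker build --target test -t vllm-test .
#   docker run --gpus all -it vllm-test bash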
#################### TEST IMAGE ####################
#################### RUNTIME BASE IMAGE ####################
# We originally used the base CUDA image because PyTorch installs its own CUDA libraries.
# However, cupy depends on the CUDA libraries, so we had to switch to the runtime image.
# In the future it would be nice to get a container with PyTorch and CUDA without duplicating CUDA.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
# libnccl required for ray
RUN apt-get update -y \
    && apt-get install -y python3-pip
WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### RUNTIME BASE IMAGE ####################
2023-10-31 12:36:47 -07:00
2024-01-14 12:37:58 -08:00
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
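# Example run (illustrative; the image tag and model name are placeholders):
#   docker run --gpus all -p 8000:8000 vllm-openai --model <model>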
#################### OPENAI API SERVER ####################