# The vLLM Dockerfile is used to construct the vLLM image that can be directly used
# to run the OpenAI-compatible server.
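#
# Example usage (a sketch, not part of the build; the image tag and model name are
# placeholders -- substitute your own):
#   DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm/vllm-openai .
#   docker run --gpus all -p 8000:8000 vllm/vllm-openai --model <model-name>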

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
    && apt-get install -y python3-pip git

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
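# (For context: the ldconfig call below adds the driver compatibility libraries shipped
# in /usr/local/cuda-12.1/compat/ -- e.g. libcuda.so -- to the dynamic linker cache so
# that Triton/PyTorch can locate them inside the container.)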
RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# install build and runtime dependencies
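# (--mount=type=cache keeps pip's cache in a BuildKit cache volume, so repeated builds
# do not re-download wheels; building with BuildKit enabled is assumed.)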
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################

#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
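# (Presumably only vllm/__init__.py is copied here, rather than the whole package,
# because setup.py reads the version string from it; copying less keeps the build
# cache stable when unrelated Python files change.)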

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
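# (These build args can be raised at build time if the build machine has more
# memory/CPUs, e.g.: docker build --build-arg max_jobs=8 --build-arg nvcc_threads=8 ...)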
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN python3 setup.py build_ext --inplace
#################### EXTENSION BUILD IMAGE ####################

#################### FLASH_ATTENTION BUILD IMAGE ####################
FROM dev AS flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}

WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir
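# (The wheel produced here is consumed by the test and runtime stages below via a
# bind mount of this directory.)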

#################### FLASH_ATTENTION BUILD IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
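# (VLLM_USE_PRECOMPILED=1 is expected to make setup.py skip compiling the CUDA
# extensions and reuse the .so files copied in from the build stage above.)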
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################

#################### RUNTIME BASE IMAGE ####################
# We used the base CUDA image because PyTorch installs its own CUDA libraries.
# However, cupy depends on the CUDA libraries, so we had to switch to the runtime image.
# In the future it would be nice to get a container with PyTorch and CUDA without duplicating CUDA.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

# libnccl required for ray
RUN apt-get update -y \
    && apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

#################### RUNTIME BASE IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
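# (Because an ENTRYPOINT is used, any arguments appended to `docker run ... <image>`
# are forwarded to the api_server, e.g. --model or --tensor-parallel-size.)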
#################### OPENAI API SERVER ####################