[ci][gh200] dockerfile clean up (#11351)
Signed-off-by: drikster80 <ed.sealing@gmail.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: drikster80 <ed.sealing@gmail.com>
Co-authored-by: cenzhiyao <2523403608@qq.com>
commit 7801f56ed7
parent 48edab8041
.buildkite/run-gh200-test.sh

@@ -4,6 +4,9 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
   --target vllm-openai \
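The use_existing_torch.py step above is what lets the arm64 image keep its pre-installed nightly torch. A minimal sketch of the idea (an illustration, not vLLM's actual script): strip the torch pins from the requirements files so that pip resolves against whatever torch is already in the image.

    # Illustrative sketch only -- the real logic lives in use_existing_torch.py.
    # Drop torch/torchvision pins so `pip install -r ...` keeps the torch build
    # that is already installed in the image.
    for f in requirements*.txt; do
        grep -viE '^torch' "$f" > "$f.tmp" && mv "$f.tmp" "$f"
    done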
Dockerfile (39 changed lines)
@@ -45,17 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace
 
 # install build and runtime dependencies
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-cuda.txt
-
+
+# arm64 (GH200) build follows the practice of "use existing pytorch" build,
+# we need to install torch and torchvision from the nightly builds first,
+# pytorch will not appear as a vLLM dependency in all of the following steps
+# after this step
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
     fi
 
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-cuda.txt
+
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
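With torch installed from the nightly index before any requirements file is processed, the nightly build should survive into the final image. A quick hedged check (illustrative, not part of the commit; the tag matches the arm64 build example in the docs below):

    # Override the server entrypoint and print the torch version baked in.
    docker run --rm --platform "linux/arm64" --entrypoint python3 \
        vllm/vllm-gh200-openai:latest -c "import torch; print(torch.__version__)"
    # Expected on arm64: 2.6.0.dev20241210+cu124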
@@ -77,11 +81,6 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-build.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
-
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
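The arm64 branches that remain all key off TARGETPLATFORM, which BuildKit populates automatically from the requested platform. To exercise the arm64 path from an x86_64 host (a sketch; assumes QEMU binfmt handlers are registered, e.g. via tonistiigi/binfmt):

    # One-time setup: register arm64 emulation, then cross-build the stage.
    docker run --privileged --rm tonistiigi/binfmt --install arm64
    docker buildx build --platform linux/arm64 --target build .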
@@ -157,8 +156,6 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 
@@ -183,17 +180,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
+# arm64 (GH200) build follows the practice of "use existing pytorch" build,
+# we need to install torch and torchvision from the nightly builds first,
+# pytorch will not appear as a vLLM dependency in all of the following steps
+# after this step
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
+    fi
+
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip uninstall -y torch && \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
-
 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
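The ordering here is the heart of the cleanup: the wheel used to pull in stable torch, which then had to be uninstalled and replaced (the block removed above). Because use_existing_torch.py strips the torch pins before the wheel is built, installing the wheel now leaves the pre-installed nightly untouched. An illustrative check inside the arm64 image:

    # After `pip install dist/*.whl`, torch should still be the nightly build.
    python3 -m pip show torch | head -n 2
    # Name: torch
    # Version: 2.6.0.dev20241210+cu124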
@@ -244,6 +244,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     else \
         pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
     fi
 
 ENV VLLM_USAGE_SOURCE production-docker-image
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
docs/source/serving/deploying_with_docker.rst

@@ -3,6 +3,9 @@
 Deploying with Docker
 ============================
 
+Use vLLM's Official Docker Image
+--------------------------------
+
 vLLM offers an official Docker image for deployment.
 The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
 
@@ -24,12 +27,15 @@ The image can be used to run OpenAI compatible server and is available on Docker
 memory to share data between processes under the hood, particularly for tensor parallel inference.
 
+
+Building vLLM's Docker Image from Source
+----------------------------------------
+
 You can build and run vLLM from source via the provided `Dockerfile <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_. To build vLLM:
 
 .. code-block:: console
 
-    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+    $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 
 .. note::
 
@@ -41,18 +47,19 @@ Building for Arm64/aarch64
 --------------------------
 
 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
-of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64.
 
 .. note::
 
-    Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
-    flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
+    Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=``
+    flags to speed up build process. However, ensure your ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefits.
     Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
 
 .. code-block:: console
 
-    # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
-    $ DOCKER_BUILDKIT=1 sudo docker build . \
+    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
+    $ python3 use_existing_torch.py
+    $ DOCKER_BUILDKIT=1 docker build . \
     --target vllm-openai \
     --platform "linux/arm64" \
     -t vllm/vllm-gh200-openai:latest \
@@ -61,7 +68,10 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
     --build-arg torch_cuda_arch_list="9.0+PTX" \
     --build-arg vllm_fa_cmake_gpu_arches="90-real"
 
-To run vLLM:
+Use the custom-built vLLM Docker image
+--------------------------------------
+
+To run vLLM with the custom-built Docker image:
 
 .. code-block:: console
 
@@ -71,6 +81,8 @@ To run vLLM:
         --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
         vllm/vllm-openai <args...>
 
+The argument ``vllm/vllm-openai`` specifies the image to run, and should be replaced with the name of the custom-built image (the ``-t`` tag from the build command).
+
 .. note::
 
-    **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` .
+    **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` .
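To make the new docs section concrete, running the custom-built GH200 image might look like this (illustrative; the model name is an assumption, and the GPU/cache flags mirror the docs' standard run example):

    docker run --runtime nvidia --gpus all \
        -v ~/.cache/huggingface:/root/.cache/huggingface \
        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
        vllm/vllm-gh200-openai:latest \
        --model meta-llama/Llama-3.2-1B-Instruct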
requirements-build.txt

@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1; platform_machine != 'aarch64'
+torch==2.5.1
 wheel
 jinja2
requirements-common.txt

@@ -19,7 +19,7 @@ pillow # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0 # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.11
+outlines == 0.1.11 # Requires pytorch
 lark == 1.2.2
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
@@ -34,5 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors
-depyf==0.18.0 # required for profiling and debugging torch.compile
+compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
+depyf==0.18.0 # required for profiling and debugging with compilation config
+cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
requirements-cuda-arm64.txt (deleted)

@@ -1,3 +0,0 @@
---index-url https://download.pytorch.org/whl/nightly/cu124
-torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
-torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
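With this file gone, the nightly pins live only in the Dockerfile's arm64 branches shown above. For a bare-metal GH200 setup, the rough equivalent (a sketch mirroring the pins from this commit) would be:

    python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 \
        "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"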
requirements-cuda.txt

@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.5.1; platform_machine != 'aarch64'
+torch == 2.5.1
 # These must be updated alongside torch
-torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
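Dropping the platform_machine != 'aarch64' markers is safe because arm64 builds now run use_existing_torch.py, which removes these torch lines entirely before anything is installed. A quick check of the value the old markers keyed on (illustrative):

    python3 -c "import platform; print(platform.machine())"
    # x86_64 on ordinary hosts; aarch64 on GH200, where the pins used to be skipped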