[CPU][CI] Improve CPU Dockerfile (#15690)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-03-28 16:36:31 +08:00 · 2025-03-28 16:36:31 +08:00 · 280d074103
commit 280d074103
parent 32b14baf8a
5 changed files with 151 additions and 59 deletions
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -82,7 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -8,15 +8,19 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { 
    # Cleanup helper: force-removes this build's two test containers (default
    # and -avx2), then deletes the two matching test images.
    # Every removal is followed by `|| true`, so a container or image that
    # does not exist does not make the helper fail.
    set -e; 
    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -36,8 +40,6 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pip install -r vllm/requirements/test.txt
    pip install -r vllm/requirements/cpu.txt
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -1,69 +1,138 @@
 # This vLLM Dockerfile is used to construct an image that can build and run vLLM on x86 CPU platforms.
 #
 # Build targets:
 #   vllm-openai (default): used for serving deployment
 #   vllm-test: used for CI tests
 #   vllm-dev: used for development
 #
 # Build arguments:
 #   PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
 #   VLLM_CPU_DISABLE_AVX512=false (default)|true
 #
-FROM ubuntu:22.04 AS cpu-test-1
+######################### BASE IMAGE #########################
 FROM ubuntu:22.04 AS base
 WORKDIR /workspace/
 ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 # Install minimal dependencies and uv
 # System packages: compiler toolchain (gcc/g++ 12, made the default via
 # update-alternatives), tcmalloc, NUMA headers and media libraries.
 # The apt directories are cache mounts, so package archives and index files
 # are kept on the build host instead of being stored in an image layer.
 #
 # The uv installer is saved to a file and then executed rather than piped
 # into sh: RUN uses /bin/sh without pipefail, so with a pipe a failed
 # download would be hidden behind sh's exit status and the build would only
 # break later, when `uv` is first invoked.
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh \
    && sh /tmp/uv-install.sh \
    && rm -f /tmp/uv-install.sh
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-RUN --mount=type=cache,target=/var/cache/apt \
+ENV PATH="/root/.local/bin:$PATH"
-    apt-get update -y \
+ENV VIRTUAL_ENV="/opt/venv"
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+ENV UV_HTTP_TIMEOUT=500
 # intel-openmp provides additional performance improvement vs. openmp
 # tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly used objects.
 RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp==2025.0.1
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+# Install Python dependencies 
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE="copy"
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    uv pip install --upgrade pip && \
    uv pip install -r requirements/cpu.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
 RUN echo 'ulimit -c 0' >> ~/.bashrc
-RUN pip install intel_extension_for_pytorch==2.6.0
+######################### BUILD IMAGE #########################
 FROM base AS vllm-build
 WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
    pip install --upgrade pip && \
    pip install -r requirements/build.txt
 FROM cpu-test-1 AS build
 WORKDIR /workspace/vllm
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    pip install -v -r requirements/cpu.txt
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-RUN --mount=type=cache,target=/root/.cache/pip \
+WORKDIR /workspace/vllm
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
    uv pip install -r requirements/build.txt
 COPY . .
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 
-    pip install dist/*.whl && \
+
-    rm -rf dist
+######################### DEV IMAGE #########################
 # Development image: starts from the vllm-build stage and adds editor/NUMA
 # tools, the test utilities, an in-place (develop) vLLM install and the dev
 # requirements. The container starts in an interactive bash shell.
 FROM vllm-build AS vllm-dev
 WORKDIR /workspace/vllm
 # Refresh the package index in the same RUN as the install. Without it the
 # install only works while the shared /var/lib/apt cache mount still holds
 # index files from an earlier `apt-get update`; on a cold or pruned cache
 # the packages cannot be located.
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
    && apt-get install -y --no-install-recommends vim numactl
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e tests/vllm_test_utils
 # Build vLLM for the CPU target in develop mode; ccache is mounted so that
 # compiler output can be reused between builds, and .git is bind-mounted
 # for the duration of the build step only.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 # Install the dev requirements and register the pre-commit and commit-msg
 # git hooks.
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -r requirements/dev.txt && \
    pre-commit install --hook-type pre-commit --hook-type commit-msg
 ENTRYPOINT ["bash"]
 ######################### TEST IMAGE #########################
 FROM base AS vllm-test
 WORKDIR /workspace/
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
    uv pip install -r requirements/test.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
    uv pip install dist/*.whl
 ADD ./tests/ ./tests/
 ADD ./examples/ ./examples/
 ADD ./benchmarks/ ./benchmarks/
 # install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=/root/.cache/uv \
-    pip install -e tests/vllm_test_utils
+    uv pip install -e tests/vllm_test_utils 
 ENTRYPOINT ["bash"]
 ######################### RELEASE IMAGE #########################
 # Serving image: built on the base stage, with the vLLM wheel installed and
 # the OpenAI-compatible API server as the container entrypoint.
 FROM base AS vllm-openai
 WORKDIR /workspace/
 # The wheel directory is bind-mounted from the vllm-build stage for this
 # step only, so the .whl file itself is not copied into an image layer.
 RUN --mount=type=bind,from=vllm-build,source=/workspace/vllm/dist,target=dist \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install dist/*.whl
 # Arguments passed to `docker run <image> ...` are appended to this command.
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@ -159,18 +159,37 @@ Currently, there are no pre-built CPU wheels.
 ### Pre-built images
-Currently, there are no pre-build CPU images.
+:::::{tab-set}
 :sync-group: device
 ::::{tab-item} Intel/AMD x86
 :sync: x86
 :::{include} cpu/x86.inc.md
 :start-after: "### Pre-built images"
 :end-before: "### Build image from source"
 :::
 ::::
 :::::
 ### Build image from source
 ```console
-$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+$ docker build -f Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
-$ docker run -it \
+
-             --rm \
+# Launching OpenAI server 
-             --network=host \
+$ docker run --rm \
-             --cpuset-cpus=<cpu-id-list, optional> \
+             --privileged=true \
-             --cpuset-mems=<memory-node, optional> \
+             --shm-size=4g \
-             vllm-cpu-env
+             -p 8000:8000 \
             -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
             -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
             vllm-cpu-env \
             --model=meta-llama/Llama-3.2-1B-Instruct \
             --dtype=bfloat16 \
             other vLLM OpenAI server arguments
 ```
 ::::{tip}
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@ -34,6 +34,8 @@ There are no pre-built wheels or images for this device, so you must build vLLM
 ### Pre-built images
 See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
 ### Build image from source
 ## Extra information