vllm/.buildkite/run-cpu-test.sh

#!/bin/bash

# This script builds the CPU docker images and runs inference and model tests inside the containers.
# It serves as a sanity check for compilation and basic model usage.
# -e: exit as soon as a command fails; -x: print each command before running it.
set -ex

# Try building the docker images: the default build (tagged cpu-test) and a
# variant built with VLLM_CPU_DISABLE_AVX512="true" (tagged cpu-test-avx2).
# numactl binds each build to CPUs 48-95 and NUMA node 1.
numactl --physcpubind=48-95 --cpunodebind=1 \
  docker build --tag cpu-test --file Dockerfile.cpu .
numactl --physcpubind=48-95 --cpunodebind=1 \
  docker build --tag cpu-test-avx2 --file Dockerfile.cpu \
    --build-arg VLLM_CPU_DISABLE_AVX512="true" .

# Setup cleanup
# Force-removes the cpu-test and cpu-test-avx2 containers. Removal is
# best-effort: a failing `docker rm` is ignored, so this always returns 0.
remove_docker_container() {
  docker rm --force cpu-test cpu-test-avx2 || true
}

# Run the cleanup whenever the script exits, and once right now so that the
# container names are free before new containers are started.
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
# One detached container is started per image and named after it. bash is the
# entrypoint, which keeps the container available for later `docker exec` calls.
for image in cpu-test cpu-test-avx2; do
  docker run --interactive --tty --detach \
    --name "$image" \
    --entrypoint /bin/bash \
    --cpuset-cpus=48-95 --cpuset-mems=1 \
    --shm-size=4g \
    --privileged=true \
    --network host \
    --volume ~/.cache/huggingface:/root/.cache/huggingface \
    --env HF_TOKEN \
    --env VLLM_CPU_KVCACHE_SPACE=4 \
    "$image"
done

# offline inference
# Runs the offline inference example inside the cpu-test-avx2 container.
# bash -e makes the exec fail if the example fails.
docker exec cpu-test-avx2 bash -e -c 'python3 examples/offline_inference.py'

# Run basic model test
# Installs the test dependencies inside the cpu-test container, then runs the
# model test suites listed below. bash -e stops at the first failing command.
docker exec cpu-test bash -e -c '
  pip install \
    pytest pytest-asyncio \
    decord einops librosa peft Pillow sentence-transformers soundfile \
    transformers_stream_generator matplotlib datamodel_code_generator
  pip install torchvision --index-url https://download.pytorch.org/whl/cpu

  # Embedding models are not supported for CPU yet
  # pytest -v -s tests/models/embedding/language
  pytest -v -s tests/models/encoder_decoder/language
  pytest -v -s tests/models/decoder_only/language/test_models.py
  # Chunked prefill not supported for CPU yet
  # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
  pytest -v -s tests/models/decoder_only/vision_language -m cpu_model
'

# Run compressed-tensor test
# Only the two W8A8 test cases named below are selected from the test module.
docker exec cpu-test bash -e -c '
  pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token
'

# Run AWQ test
# Runs the whole test_ipex_quant.py module inside the cpu-test container.
docker exec cpu-test bash -e -c 'pytest -s -v tests/quantization/test_ipex_quant.py'

# online inference
# Starts the OpenAI-compatible API server in the background inside the cpu-test
# container, polls /v1/models with curl until a request succeeds (giving up
# after 600 seconds), then runs the serving benchmark with 20 random prompts.
docker exec cpu-test bash -c "
  set -e
  export VLLM_CPU_KVCACHE_SPACE=10 VLLM_CPU_OMP_THREADS_BIND=48-92
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  python3 benchmarks/benchmark_serving.py \
    --backend vllm --endpoint /v1/completions \
    --model facebook/opt-125m --tokenizer facebook/opt-125m \
    --dataset-name random --num-prompts 20"
[CI/Build] Add shell script linting using shellcheck (#7925) Signed-off-by: Russell Bryant <rbryant@redhat.com> 2024-11-07 13:17:29 -05:00			`#!/bin/bash`

[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00			`# This script build the CPU docker image and run the offline inference inside the container.`
			`# It serves a sanity check for compilation and basic model usage.`
			`set -ex`

			`# Try building the docker image`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .`
			`numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00
			`# Setup cleanup`
[CI/BUILD] Support non-AVX512 vLLM building and testing (#5574) 2024-06-18 02:36:10 +08:00			`remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 \|\| true; }`
[Hardware][Intel] Add CPU inference backend (#3634) Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Yuan Zhou <yuan.zhou@intel.com> 2024-04-02 13:07:30 +08:00			`trap remove_docker_container EXIT`
			`remove_docker_container`

[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`# Run the image, setting --shm-size=4g for tensor parallel.`
[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008) Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> 2024-07-05 06:22:12 +08:00			`docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test`
[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008) Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> 2024-07-05 06:22:12 +08:00			`docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2`
[CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-04 01:39:50 +08:00
			`# offline inference`
[CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-08 23:30:04 +08:00			`docker exec cpu-test-avx2 bash -c "`
			`set -e`
			`python3 examples/offline_inference.py"`
[CI/BUILD] enable intel queue for longer CPU tests (#4113) 2024-06-04 01:39:50 +08:00
			`# Run basic model test`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`docker exec cpu-test bash -c "`
[CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-08 23:30:04 +08:00			`set -e`
			`pip install pytest pytest-asyncio \`
			`decord einops librosa peft Pillow sentence-transformers soundfile \`
			`transformers_stream_generator matplotlib datamodel_code_generator`
			`pip install torchvision --index-url https://download.pytorch.org/whl/cpu`
			`# Embedding models are not supported for CPU yet`
			`# pytest -v -s tests/models/embedding/language`
[Hardware][CPU] Cross-attention and Encoder-Decoder models support on CPU backend (#9089) 2024-10-07 14:50:35 +08:00			`pytest -v -s tests/models/encoder_decoder/language`
[CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-08 23:30:04 +08:00			`pytest -v -s tests/models/decoder_only/language/test_models.py`
			`# Chunked prefill not supported for CPU yet`
			`# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model`
			`pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`# Run compressed-tensor test`
[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support (#9344) 2024-10-18 00:21:04 +08:00			`docker exec cpu-test bash -c "`
[CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-08 23:30:04 +08:00			`set -e`
[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support (#9344) 2024-10-18 00:21:04 +08:00			`pytest -s -v \`
			`tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \`
			`tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"`
[Hardware][CPU] Support AWQ for CPU backend (#7515) 2024-10-10 00:28:08 +08:00
			`# Run AWQ test`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`docker exec cpu-test bash -c "`
[CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-08 23:30:04 +08:00			`set -e`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00			`pytest -s -v \`
[Hardware][CPU] Support AWQ for CPU backend (#7515) 2024-10-10 00:28:08 +08:00			`tests/quantization/test_ipex_quant.py"`
[Hardware][Intel] Support compressed-tensor W8A8 for CPU backend (#7257) 2024-09-12 00:46:46 +08:00
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`# online inference`
			`docker exec cpu-test bash -c "`
[CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-08 23:30:04 +08:00			`set -e`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`export VLLM_CPU_KVCACHE_SPACE=10`
			`export VLLM_CPU_OMP_THREADS_BIND=48-92`
[Hardware][CPU] Update torch 2.5 (#9911) Signed-off-by: jiang1.li <jiang1.li@intel.com> 2024-11-07 12:43:08 +08:00			`python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &`
[Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-27 04:50:10 +08:00			`timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' \|\| exit 1`
			`python3 benchmarks/benchmark_serving.py \`
			`--backend vllm \`
			`--dataset-name random \`
			`--model facebook/opt-125m \`
			`--num-prompts 20 \`
			`--endpoint /v1/completions \`
			`--tokenizer facebook/opt-125m"`