[Doc] Move examples into categories (#11840)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Parent: 2a0596bc48
Commit: aba8d6ee00
@@ -30,7 +30,7 @@ function cpu_tests() {
 # offline inference
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 set -e
-python3 examples/offline_inference.py"
+python3 examples/offline_inference/offline_inference.py"

 # Run basic model test
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "

@@ -24,5 +24,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-python3 examples/offline_inference.py
+python3 examples/offline_inference/offline_inference.py
 '

@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py

@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
 -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 --name "${container_name}" \
 ${image_name} \
-/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
+/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"

@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py

@@ -14,4 +14,4 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"

@@ -14,6 +14,6 @@ remove_docker_container

 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-python3 examples/offline_inference.py
-python3 examples/offline_inference_cli.py -tp 2
+python3 examples/offline_inference/offline_inference.py
+python3 examples/offline_inference/offline_inference_cli.py -tp 2
 '

@@ -187,19 +187,19 @@ steps:
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-- python3 offline_inference.py
-- python3 cpu_offload.py
-- python3 offline_inference_chat.py
-- python3 offline_inference_with_prefix.py
-- python3 llm_engine_example.py
-- python3 offline_inference_vision_language.py
-- python3 offline_inference_vision_language_multi_image.py
-- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-- python3 offline_inference_encoder_decoder.py
-- python3 offline_inference_classification.py
-- python3 offline_inference_embedding.py
-- python3 offline_inference_scoring.py
-- python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+- python3 offline_inference/offline_inference.py
+- python3 offline_inference/cpu_offload.py
+- python3 offline_inference/offline_inference_chat.py
+- python3 offline_inference/offline_inference_with_prefix.py
+- python3 offline_inference/llm_engine_example.py
+- python3 offline_inference/offline_inference_vision_language.py
+- python3 offline_inference/offline_inference_vision_language_multi_image.py
+- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+- python3 offline_inference/offline_inference_encoder_decoder.py
+- python3 offline_inference/offline_inference_classification.py
+- python3 offline_inference/offline_inference_embedding.py
+- python3 offline_inference/offline_inference_scoring.py
+- python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
 mirror_hardwares: [amd]
.github/workflows/lint-and-deploy.yaml (vendored, 4 changes)
@@ -27,7 +27,7 @@ jobs:
 version: v3.10.1

 - name: Run chart-testing (lint)
-run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm

 - name: Setup minio
 run: |

@@ -64,7 +64,7 @@ jobs:
 run: |
 export AWS_ACCESS_KEY_ID=minioadmin
 export AWS_SECRET_ACCESS_KEY=minioadmin
-helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

 - name: curl test
 run: |
@@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
 # define sagemaker first, so it is not default from `docker build`
 FROM vllm-openai-base AS vllm-sagemaker

-COPY examples/sagemaker-entrypoint.sh .
+COPY examples/online_serving/sagemaker-entrypoint.sh .
 RUN chmod +x sagemaker-entrypoint.sh
 ENTRYPOINT ["./sagemaker-entrypoint.sh"]

@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve

 ### Offline Inference

-Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.

 ### OpenAI Server

@@ -61,7 +61,7 @@ run: |

 echo 'Starting gradio server...'
 git clone https://github.com/vllm-project/vllm.git || true
-python vllm/examples/gradio_openai_chatbot_webserver.py \
+python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
 -m $MODEL_NAME \
 --port 8811 \
 --model-url http://localhost:8081/v1 \

@@ -321,7 +321,7 @@ run: |

 echo 'Starting gradio server...'
 git clone https://github.com/vllm-project/vllm.git || true
-python vllm/examples/gradio_openai_chatbot_webserver.py \
+python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
 -m $MODEL_NAME \
 --port 8811 \
 --model-url http://$ENDPOINT/v1 \
@@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput.

 ## Usage example

-Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
+Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.

 ## Benchmarks

@@ -47,7 +47,7 @@ outputs = llm.generate(
 )
 ```

-Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
+Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.

 ## Serving LoRA Adapters

@@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:

 ```console
-$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
 ```

 AWQ models are also supported directly through the LLM entrypoint:

@@ -28,7 +28,7 @@ Here is an example of how to enable this feature:

 ```python
 # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
-# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
+# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.

 from vllm import LLM, SamplingParams
 sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
@@ -131,7 +131,7 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```

-Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>

 ## Experimental Automatic Parsing (OpenAI API)


@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```

-Full example: <gh-file:examples/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
@@ -12,6 +12,7 @@ EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
 def fix_case(text: str) -> str:
     subs = {
         "api": "API",
+        "Cli": "CLI",
         "cpu": "CPU",
         "llm": "LLM",
         "tpu": "TPU",
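The hunk above only adds a `Cli` → `CLI` entry to the substitution table used when turning example file names into page titles. A minimal sketch of how such a table might be applied is below; the `re.sub`-based body is an assumption for illustration, not the actual implementation in `docs/source/generate_examples.py`.

```python
import re

# Substitution table mirroring the diff above; the new entry maps "Cli" to "CLI".
subs = {
    "api": "API",
    "Cli": "CLI",
    "cpu": "CPU",
    "llm": "LLM",
    "tpu": "TPU",
}


def fix_case(text: str) -> str:
    # Assumed behaviour: replace each key as a whole word, case-insensitively,
    # with its canonical spelling.
    for pattern, replacement in subs.items():
        text = re.sub(rf"\b{pattern}\b", replacement, text, flags=re.IGNORECASE)
    return text


print(fix_case("Offline Inference Cli"))  # -> "Offline Inference CLI"
```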
@@ -58,7 +59,7 @@ class Index:
         content = f"# {self.title}\n\n{self.description}\n\n"
         content += "```{toctree}\n"
         content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
-        content += "\n".join(sorted(self.documents)) + "\n```\n"
+        content += "\n".join(self.documents) + "\n```\n"
         return content


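For reference, `Index.generate` builds a MyST `toctree` block; with `sorted(...)` removed, the listed documents now keep their insertion order, which the category handling further down relies on. A rough sketch of the string it produces, using hypothetical values:

```python
# Hypothetical index values, for illustration only.
title = "Examples"
description = "A collection of examples demonstrating usage of vLLM."
caption = "Examples"
maxdepth = 2
documents = ["examples_offline_inference_index", "examples_online_serving_index"]

content = f"# {title}\n\n{description}\n\n"
content += "```{toctree}\n"
content += f":caption: {caption}\n:maxdepth: {maxdepth}\n"
content += "\n".join(documents) + "\n```\n"  # insertion order, no sorting
print(content)
```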
@@ -131,11 +132,14 @@ class Example:
                                  ROOT_DIR)

         content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
-        if self.main_file.suffix == ".py":
-            content += f"# {self.title}\n\n"
         include = "include" if self.main_file.suffix == ".md" else \
             "literalinclude"
-        content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
+        if include == "literalinclude":
+            content += f"# {self.title}\n\n"
+        content += f":::{{{include}}} {make_relative(self.main_file)}\n"
+        if include == "literalinclude":
+            content += f":language: {self.main_file.suffix[1:]}\n"
+        content += ":::\n\n"

         if not self.other_files:
             return content
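The reworked branch above emits the title heading and a `:language:` option only when the file is pulled in verbatim via `literalinclude` (Python or shell examples); Markdown examples use `include` and keep their own heading. A simplified sketch of the string this produces for a hypothetical Python example, using the path directly instead of `make_relative`:

```python
from pathlib import Path

# Hypothetical example file, for illustration only.
main_file = Path("examples/offline_inference/offline_inference.py")
title = "Offline Inference"

content = f"Source <gh-file:{main_file}>.\n\n"
include = "include" if main_file.suffix == ".md" else "literalinclude"
if include == "literalinclude":
    content += f"# {title}\n\n"
content += f":::{{{include}}} {main_file}\n"
if include == "literalinclude":
    content += f":language: {main_file.suffix[1:]}\n"
content += ":::\n\n"
print(content)
# Source <gh-file:examples/offline_inference/offline_inference.py>.
#
# # Offline Inference
#
# :::{literalinclude} examples/offline_inference/offline_inference.py
# :language: py
# :::
```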
@@ -163,14 +167,16 @@ def generate_examples():
         description=
         "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.", # noqa: E501
         caption="Examples",
-        maxdepth=1) # TODO change to 2 when examples start being categorised
+        maxdepth=2)
+    # Category indices stored in reverse order because they are inserted into
+    # examples_index.documents at index 0 in order
     category_indices = {
-        "offline_inference":
+        "other":
         Index(
-            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
-            title="Offline Inference",
+            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
+            title="Other",
             description=
-            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
+            "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501
             caption="Examples",
         ),
         "online_serving":
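The "reverse order" comment above can be illustrated with a toy list: each category index is inserted at position 0 of `examples_index.documents`, so the last one inserted ends up first. Listing `other` first in the dict therefore makes it appear last in the generated index. The values below are illustrative only:

```python
documents = ["some_example"]  # stand-in for the existing toctree contents
for category_index in ["other", "online_serving", "offline_inference"]:
    documents.insert(0, category_index)
print(documents)
# ['offline_inference', 'online_serving', 'other', 'some_example']
```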
@@ -181,31 +187,30 @@ def generate_examples():
             "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501
             caption="Examples",
         ),
-        "other":
+        "offline_inference":
         Index(
-            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
-            title="Other",
+            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
+            title="Offline Inference",
             description=
-            "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501
+            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
             caption="Examples",
         ),
     }

     examples = []
+    glob_patterns = ["*.py", "*.md", "*.sh"]
     # Find categorised examples
     for category in category_indices:
         category_dir = EXAMPLE_DIR / category
-        py = category_dir.glob("*.py")
-        md = category_dir.glob("*.md")
-        for path in itertools.chain(py, md):
+        globs = [category_dir.glob(pattern) for pattern in glob_patterns]
+        for path in itertools.chain(*globs):
             examples.append(Example(path, category))
         # Find examples in subdirectories
         for path in category_dir.glob("*/*.md"):
             examples.append(Example(path.parent, category))
     # Find uncategorised examples
-    py = EXAMPLE_DIR.glob("*.py")
-    md = EXAMPLE_DIR.glob("*.md")
-    for path in itertools.chain(py, md):
+    globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
+    for path in itertools.chain(*globs):
         examples.append(Example(path))
     # Find examples in subdirectories
     for path in EXAMPLE_DIR.glob("*/*.md"):
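With the change above, example discovery is driven by a single list of glob patterns, so `*.sh` scripts are now picked up alongside `*.py` and `*.md` files. A condensed, self-contained sketch of the same pattern, assuming `EXAMPLE_DIR` points at the repository's examples folder:

```python
import itertools
from pathlib import Path

EXAMPLE_DIR = Path("examples")  # assumed repository-relative location
glob_patterns = ["*.py", "*.md", "*.sh"]

# One generator per pattern, chained into a single stream of example paths.
globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs):
    print(path)
```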
@@ -215,7 +220,7 @@ def generate_examples():
             examples.append(Example(path.parent))

     # Generate the example documentation
-    for example in examples:
+    for example in sorted(examples, key=lambda e: e.path.stem):
         doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
         with open(doc_path, "w+") as f:
             f.write(example.generate())
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference.py # run vLLM
+$ python examples/offline_inference/offline_inference.py # run vLLM
 ```

 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:

@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ

 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference.py
+$ python examples/offline_inference/offline_inference.py
 ```

 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \
 $ -tp=8
 ```

-By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script.
+By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
@@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in

 ## Offline Batched Inference

-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>

 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:


@@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
 print("Completion result:", completion)
 ```

-A more detailed client example can be found here: <gh-file:examples/openai_completion_client.py>
+A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>

 ### OpenAI Chat Completions API with vLLM

@@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form

 ## Model is too large

-If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.

 ## Enable more logging

@@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor

 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).

 ```{note}
 Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
@@ -46,7 +46,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```

-A code example can be found here: <gh-file:examples/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>

 ### `LLM.beam_search`


@@ -103,7 +103,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```

-A code example can be found here: <gh-file:examples/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>

 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:

@@ -65,7 +65,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```

-A code example can be found here: <gh-file:examples/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>

 ### `LLM.classify`


@@ -80,7 +80,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```

-A code example can be found here: <gh-file:examples/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>

 ### `LLM.score`


@@ -102,7 +102,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```

-A code example can be found here: <gh-file:examples/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>

 ## Online Inference

@@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2

 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration.

-The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
+The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/online_serving/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.

 Pick a node as the head node, and run the following command:

@@ -60,7 +60,7 @@ for o in outputs:
 print(generated_text)
 ```

-Full example: <gh-file:examples/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>

 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:


@@ -91,7 +91,7 @@ for o in outputs:
 print(generated_text)
 ```

-Full example: <gh-file:examples/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>

 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:


@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.

-Full example: <gh-file:examples/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>

 ### Audio

 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.

-Full example: <gh-file:examples/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>

 ### Embedding

@@ -271,7 +271,7 @@ chat_response = client.chat.completions.create(
 print("Chat completion output:", chat_response.choices[0].message.content)
 ```

-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

 ```{tip}
 Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,

@@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content
 print("Chat completion output from image url:", result)
 ```

-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

 ````{note}
 By default, the timeout for fetching videos through HTTP URL is `30` seconds.

@@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content
 print("Chat completion output from audio url:", result)
 ```

-Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>

 ````{note}
 By default, the timeout for fetching audios through HTTP URL is `10` seconds.
@@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th
 example below for details.
 ```

-Full example: <gh-file:examples/openai_chat_embedding_client_for_multimodal.py>
+Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
@@ -191,7 +191,7 @@ The order of priorities is `command line > config file values > defaults`.
 Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.

-Code example: <gh-file:examples/openai_completion_client.py>
+Code example: <gh-file:examples/online_serving/openai_completion_client.py>

 #### Extra parameters


@@ -222,7 +222,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
 see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
 - *Note: `image_url.detail` parameter is not supported.*

-Code example: <gh-file:examples/openai_chat_completion_client.py>
+Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>

 #### Extra parameters


@@ -255,7 +255,7 @@ which will be treated as a single prompt to the model.
 This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
 ```

-Code example: <gh-file:examples/openai_embedding_client.py>
+Code example: <gh-file:examples/online_serving/openai_embedding_client.py>

 #### Extra parameters


@@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_

 The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.

-Code example: <gh-file:examples/openai_pooling_client.py>
+Code example: <gh-file:examples/online_serving/openai_pooling_client.py>

 (score-api)=
 ### Score API

@@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent

 You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).

-Code example: <gh-file:examples/openai_cross_encoder_score.py>
+Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>

 #### Single inference

@@ -3,7 +3,8 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference_vision_language.py after porting vision backbone
+# Move to offline_inference/offline_inference_vision_language.py
+# after porting vision backbone
 from vllm import LLM, SamplingParams

 dtype = "float"
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format

 The OpenAI batch file format consists of a series of json objects on new lines.

-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)

 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.


@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.

 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
 ```

 Once you've created your batch file it should look like this

 ```
-$ cat openai_example_batch.jsonl
+$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```

@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`

 ```
-python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```

 ### Step 3: Check your results

@@ -66,10 +66,10 @@ $ cat results.jsonl

 The batch runner supports remote input and output urls that are accessible via http/https.

-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run

 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```

 ## Example 3: Integrating with AWS S3

@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.

 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
 ```

 Once you've created your batch file it should look like this

 ```
-$ cat openai_example_batch.jsonl
+$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```

@@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.

 ```
-aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```

 ### Step 2: Generate your presigned urls
@@ -363,7 +363,7 @@ Profile a model

 example:
 ```
-python examples/offline_profile.py \\
+python examples/offline_inference/offline_profile.py \\
 --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
 --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
 --enforce-eager run_num_steps -n 2
examples/online_serving/chart-helm/README.md (new file, 21 lines)
@@ -0,0 +1,21 @@
+# Helm Charts
+
+This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more.
+
+## Files
+
+- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
+- ct.yaml: Configuration for chart testing.
+- lintconf.yaml: Linting rules for YAML files.
+- values.schema.json: JSON schema for validating values.yaml.
+- values.yaml: Default values for the Helm chart.
+- templates/_helpers.tpl: Helper templates for defining common configurations.
+- templates/configmap.yaml: Template for creating ConfigMaps.
+- templates/custom-objects.yaml: Template for custom Kubernetes objects.
+- templates/deployment.yaml: Template for creating Deployments.
+- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
+- templates/job.yaml: Template for Kubernetes Jobs.
+- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
+- templates/pvc.yaml: Template for Persistent Volume Claims.
+- templates/secrets.yaml: Template for Kubernetes Secrets.
+- templates/service.yaml: Template for creating Services.