#!/bin/bash
# Launch a Hugging Face text-generation-inference (TGI) server in Docker.
# Usage: HF_TOKEN=<token> ./<this-script> <model-id> <max-batch-total-tokens>

PORT=8000    # host port; the container serves on port 80 internally
MODEL=$1     # Hugging Face model id to serve
TOKENS=$2    # cap on the total tokens in a batch across all requests

# --shm-size 1g gives the container enough shared memory for token streaming;
# mounting ./data at /data caches downloaded model weights across runs.
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p "$PORT:80" \
    -v "$PWD/data:/data" \
    ghcr.io/huggingface/text-generation-inference:2.2.0 \
    --model-id "$MODEL" \
    --sharded false \
    --max-input-length 1024 \
    --max-total-tokens 2048 \
    --max-best-of 5 \
    --max-concurrent-requests 5000 \
    --max-batch-total-tokens "$TOKENS"
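
# Once the container reports ready, the server can be smoke-tested over HTTP.
# A minimal sketch using TGI's documented /generate endpoint; the prompt,
# max_new_tokens value, and port 8000 (matching $PORT above) are illustrative:
#
#   curl "http://127.0.0.1:8000/generate" \
#     -X POST \
#     -H 'Content-Type: application/json' \
#     -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}'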