diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 82c6b426..e2f712df 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -7,9 +7,6 @@ On the server side, run one of the following commands:
         --swap-space 16 \
         --disable-log-requests
 
-    (TGI backend)
-    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
-
 On the client side, run:
     python benchmarks/benchmark_serving.py \
         --backend <backend> \
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index c79a93fa..71cb420a 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -5,9 +5,6 @@ On the server side, run one of the following commands:
     (vLLM OpenAI API server)
     vllm serve <your_model> --disable-log-requests
 
-    (TGI backend)
-    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
-
 On the client side, run:
     python benchmarks/benchmark_serving_structured_output.py \
         --backend <backend> \
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
deleted file mode 100755
index ba7383d8..00000000
--- a/benchmarks/launch_tgi_server.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-PORT=8000
-MODEL=$1
-TOKENS=$2
-
-docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
-           -v "$PWD/data:/data" \
-           ghcr.io/huggingface/text-generation-inference:2.2.0 \
-           --model-id "$MODEL" \
-           --sharded false \
-           --max-input-length 1024 \
-           --max-total-tokens 2048 \
-           --max-best-of 5 \
-           --max-concurrent-requests 5000 \
-           --max-batch-total-tokens "$TOKENS"