diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index 4a8f17ba..956219d3 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -14,10 +14,7 @@ answers = [
 ]
 N = 1
 # Currently, top-p sampling is disabled. `top_p` should be 1.0.
-sampling_params = SamplingParams(temperature=0.7,
-                                 top_p=1.0,
-                                 n=N,
-                                 max_tokens=16)
+sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
 
 # Set `enforce_eager=True` to avoid ahead-of-time compilation.
 # In real workloads, `enforace_eager` should be `False`.
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index cf5c56b9..65a4048a 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -88,7 +88,7 @@ class TPUModelRunner:
         self.max_model_len = model_config.max_model_len
         self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
-        self.max_num_reqs = scheduler_config.max_num_seqs
+        self.max_num_reqs = max(scheduler_config.max_num_seqs, MIN_NUM_SEQS)
 
         # Model-related.
         self.num_attn_layers = model_config.get_num_layers_by_block_type(
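For context on the second hunk, here is a minimal sketch of the clamping behavior it introduces, assuming `MIN_NUM_SEQS` is a module-level constant in `tpu_model_runner.py`; the value 8 below is an assumption for illustration only, not the actual constant.

```python
# Sketch only: MIN_NUM_SEQS = 8 is an assumed value for illustration,
# not necessarily the constant defined in vllm/v1/worker/tpu_model_runner.py.
MIN_NUM_SEQS = 8


def clamp_max_num_reqs(max_num_seqs: int) -> int:
    # Keep the request-batch size at a fixed floor so a small
    # `max_num_seqs` setting cannot shrink it below MIN_NUM_SEQS.
    return max(max_num_seqs, MIN_NUM_SEQS)


print(clamp_max_num_reqs(1))    # 8   -> padded up to the floor
print(clamp_max_num_reqs(256))  # 256 -> unchanged
```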