From cf069aa8aa38a9003c254f8434a29ec6a3070b08 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Mon, 3 Mar 2025 01:34:51 +0000
Subject: [PATCH] Update deprecated Python 3.8 typing (#13971)

---
 benchmarks/backend_request_func.py | 6 +-
 benchmarks/benchmark_guided.py | 17 +-
 benchmarks/benchmark_latency.py | 6 +-
 benchmarks/benchmark_prefix_caching.py | 16 +-
 benchmarks/benchmark_prioritization.py | 8 +-
 benchmarks/benchmark_serving.py | 77 +++----
 benchmarks/benchmark_serving_guided.py | 57 ++---
 benchmarks/benchmark_throughput.py | 38 ++--
 benchmarks/benchmark_utils.py | 8 +-
 .../cutlass_benchmarks/sparse_benchmarks.py | 9 +-
 benchmarks/cutlass_benchmarks/utils.py | 8 +-
 .../cutlass_benchmarks/w8a8_benchmarks.py | 17 +-
 .../fused_kernels/layernorm_rms_benchmarks.py | 5 +-
 benchmarks/kernels/benchmark_lora.py | 60 ++---
 benchmarks/kernels/benchmark_machete.py | 25 ++-
 benchmarks/kernels/benchmark_marlin.py | 6 +-
 benchmarks/kernels/benchmark_moe.py | 18 +-
 .../kernels/benchmark_paged_attention.py | 4 +-
 benchmarks/kernels/benchmark_rmsnorm.py | 4 +-
 benchmarks/kernels/benchmark_rope.py | 4 +-
 benchmarks/kernels/graph_machete_bench.py | 3 +-
 benchmarks/kernels/utils.py | 3 +-
 .../vllm_cutlass_library_extension.py | 14 +-
 csrc/quantization/machete/generate.py | 20 +-
 docs/source/conf.py | 3 +-
 docs/source/features/reasoning_outputs.md | 4 +-
 docs/source/features/structured_outputs.md | 2 +-
 docs/source/generate_examples.py | 2 +-
 examples/offline_inference/distributed.py | 10 +-
 .../offline_inference/llm_engine_example.py | 7 +-
 .../lora_with_quantization_inference.py | 8 +-
 examples/offline_inference/mlpspeculator.py | 3 +-
 .../offline_inference/multilora_inference.py | 8 +-
 .../prithvi_geospatial_mae.py | 8 +-
 examples/offline_inference/profiling.py | 15 +-
 .../profiling_tpu/profiling.py | 3 +-
 .../vision_language_multi_image.py | 34 +--
 examples/online_serving/api_client.py | 6 +-
 .../online_serving/openai_embedding_client.py | 2 +-
 pyproject.toml | 28 ++-
 setup.py | 7 +-
 tests/async_engine/api_server_async_engine.py | 5 +-
 tests/async_engine/test_async_llm_engine.py | 4 +-
 tests/compile/piecewise/test_toy_llama.py | 6 +-
 tests/compile/test_basic_correctness.py | 8 +-
 tests/conftest.py | 159 +++++------
 tests/core/block/e2e/conftest.py | 3 +-
 .../e2e/test_correctness_sliding_window.py | 11 +-
 tests/core/block/test_block_table.py | 8 +-
 tests/core/block/test_naive_block.py | 4 +-
 tests/core/block/test_prefix_caching_block.py | 16 +-
 tests/core/test_chunked_prefill_scheduler.py | 25 +--
 tests/core/test_scheduler.py | 19 +-
 tests/core/test_scheduler_encoder_decoder.py | 4 +-
 tests/core/utils.py | 21 +-
 tests/distributed/test_expert_parallel.py | 6 +-
 tests/distributed/test_pipeline_parallel.py | 8 +-
 tests/distributed/test_pynccl.py | 5 +-
 tests/distributed/test_shm_broadcast.py | 3 +-
 tests/encoder_decoder/test_e2e_correctness.py | 4 +-
 tests/engine/test_executor.py | 6 +-
 tests/engine/test_multiproc_workers.py | 6 +-
 tests/engine/test_stop_strings.py | 6 +-
 tests/entrypoints/llm/test_chat.py | 4 +-
 tests/entrypoints/llm/test_encode.py | 5 +-
 tests/entrypoints/llm/test_generate.py | 3 +-
 .../test_transcription_api_correctness.py | 3 +-
 .../test_deepseekr1_reasoning_parser.py | 4 +-
 .../openai/reasoning_parsers/utils.py | 14 +-
 tests/entrypoints/openai/test_audio.py | 16 +-
 tests/entrypoints/openai/test_basic.py | 3 +-
 tests/entrypoints/openai/test_chat.py | 8 +-
 tests/entrypoints/openai/test_completion.py | 8 +-
 tests/entrypoints/openai/test_embedding.py | 4 +-
 tests/entrypoints/openai/test_pooling.py | 4 +-
 tests/entrypoints/openai/test_root_path.py | 4 +-
 tests/entrypoints/openai/test_video.py | 12 +-
 tests/entrypoints/openai/test_vision.py | 12 +-
 .../openai/test_vision_embedding.py | 4 +-
 .../tool_parsers/test_pythonic_tool_parser.py | 3 +-
 .../entrypoints/openai/tool_parsers/utils.py | 9 +-
 tests/kernels/quant_utils.py | 6 +-
 tests/kernels/test_activation.py | 3 +-
 tests/kernels/test_attention.py | 16 +-
 tests/kernels/test_blocksparse_attention.py | 12 +-
 tests/kernels/test_cache.py | 5 +-
 tests/kernels/test_cascade_flash_attn.py | 8 +-
 tests/kernels/test_cutlass.py | 11 +-
 tests/kernels/test_cutlass_2of4_sparse.py | 5 +-
 tests/kernels/test_encoder_decoder_attn.py | 4 +-
 tests/kernels/test_flash_attn.py | 16 +-
 tests/kernels/test_flashinfer.py | 22 +-
 tests/kernels/test_fused_quant_layernorm.py | 12 +-
 tests/kernels/test_gguf.py | 3 +-
 tests/kernels/test_machete_mm.py | 14 +-
 tests/kernels/test_mamba_mixer2.py | 3 +-
 tests/kernels/test_mamba_ssm_ssd.py | 8 +-
 tests/kernels/test_pos_encoding.py | 6 +-
 tests/kernels/test_triton_scaled_mm.py | 4 +-
 tests/kernels/utils.py | 68 +++---
 tests/kv_transfer/test_send_recv.py | 3 +-
 tests/lora/conftest.py | 6 +-
 tests/lora/data/long_context_test_data.py | 4 +-
 tests/lora/test_add_lora.py | 9 +-
 tests/lora/test_baichuan.py | 6 +-
 tests/lora/test_chatglm3_tp.py | 6 +-
 tests/lora/test_gemma.py | 6 +-
 tests/lora/test_jamba.py | 6 +-
 tests/lora/test_layers.py | 48 ++--
 tests/lora/test_llama_tp.py | 6 +-
 tests/lora/test_long_context.py | 16 +-
 tests/lora/test_lora_bias_e2e.py | 6 +-
 tests/lora/test_lora_checkpoints.py | 6 +-
 tests/lora/test_lora_functions.py | 5 +-
 tests/lora/test_lora_huggingface.py | 4 +-
 tests/lora/test_lora_manager.py | 7 +-
 tests/lora/test_minicpmv_tp.py | 6 +-
 tests/lora/test_mixtral.py | 6 +-
 tests/lora/test_phi.py | 6 +-
 tests/lora/test_punica_ops.py | 5 +-
 tests/lora/test_quant_model.py | 7 +-
 tests/lora/test_qwen2vl.py | 10 +-
 tests/lora/test_transfomers_model.py | 6 +-
 tests/lora/test_ultravox.py | 7 +-
 tests/lora/utils.py | 14 +-
 tests/metrics/test_metrics.py | 3 +-
 tests/mistral_tool_use/utils.py | 8 +-
 .../model_executor/test_enabled_custom_ops.py | 4 +-
 .../audio_language/test_ultravox.py | 16 +-
 .../models/decoder_only/language/test_gguf.py | 6 +-
 .../decoder_only/language/test_modelopt.py | 3 +-
 .../decoder_only/vision_language/test_awq.py | 6 +-
 .../vision_language/test_models.py | 39 ++--
 .../vision_language/test_phi3v.py | 10 +-
 .../vision_language/test_pixtral.py | 12 +-
 .../vision_language/test_qwen2_vl.py | 46 ++--
 .../vision_language/vlm_utils/builders.py | 7 +-
 .../vlm_utils/case_filtering.py | 10 +-
 .../vision_language/vlm_utils/core.py | 22 +-
 .../vision_language/vlm_utils/model_utils.py | 16 +-
 .../vision_language/vlm_utils/runners.py | 21 +-
 .../vision_language/vlm_utils/types.py | 36 +--
 .../models/embedding/language/test_gritlm.py | 11 +-
 tests/models/embedding/utils.py | 6 +-
 .../vision_language/test_dse_qwen2_vl.py | 12 +-
 .../vision_language/test_llava_next.py | 8 +-
 .../embedding/vision_language/test_phi3v.py | 8 +-
 .../encoder_decoder/language/test_bart.py | 10 +-
 .../vision_language/test_florence2.py | 8 +-
 .../vision_language/test_mllama.py | 36 +--
 .../multimodal/processing/test_h2ovl.py | 3 +-
 .../multimodal/processing/test_internvl.py | 3 +-
 tests/models/registry.py | 5 +-
 tests/models/test_transformers.py | 15 +-
 tests/models/utils.py | 21 +-
 tests/mq_llm_engine/utils.py | 4 +-
 .../multi_step/test_correctness_async_llm.py | 4 +-
 tests/multimodal/test_utils.py | 8 +-
 tests/neuron/test_logits_processor.py | 3 +-
 .../my_gemma_embedding.py | 5 +-
 tests/quantization/test_configs.py | 3 +-
 .../test_register_quantization_config.py | 8 +-
 tests/samplers/test_logprobs.py | 4 +-
 tests/samplers/test_no_bad_words.py | 16 +-
 tests/samplers/test_rejection_sampler.py | 11 +-
 tests/samplers/test_sampler.py | 44 ++--
 tests/spec_decode/e2e/conftest.py | 9 +-
 tests/spec_decode/test_batch_expansion.py | 4 +-
 tests/spec_decode/test_multi_step_worker.py | 15 +-
 tests/spec_decode/test_scorer.py | 3 +-
 tests/spec_decode/test_spec_decode_worker.py | 11 +-
 tests/spec_decode/utils.py | 33 ++-
 tests/test_cache_block_hashing.py | 6 +-
 tests/test_inputs.py | 4 +-
 tests/test_logger.py | 2 +-
 tests/test_logits_processor.py | 3 +-
 tests/test_utils.py | 4 +-
 tests/tokenization/test_detokenize.py | 23 +-
 tests/tokenization/test_tokenizer_group.py | 4 +-
 tests/tokenization/test_tokenizer_registry.py | 32 +--
 tests/tool_use/test_chat_completions.py | 6 +-
 tests/tool_use/test_jamba_tool_parser.py | 13 +-
 tests/tool_use/test_parallel_tool_calls.py | 10 +-
 tests/tool_use/test_tool_calls.py | 10 +-
 tests/tool_use/utils.py | 26 +--
 tests/tracing/test_tracing.py | 5 +-
 tests/utils.py | 42 ++--
 tests/v1/core/test_prefix_caching.py | 3 +-
 tests/v1/core/test_scheduler.py | 6 +-
 tests/v1/engine/conftest.py | 6 +-
 tests/v1/engine/test_async_llm.py | 10 +-
 tests/v1/engine/test_engine_core.py | 3 +-
 tests/v1/engine/test_engine_core_client.py | 10 +-
 tests/v1/engine/test_llm_engine.py | 8 +-
 tests/v1/engine/test_output_processor.py | 12 +-
 tests/v1/engine/utils.py | 50 ++---
 .../v1/entrypoints/openai/test_completion.py | 12 +-
 tests/v1/sample/test_logprobs.py | 9 +-
 tests/v1/sample/test_rejection_sampler.py | 7 +-
 tests/v1/sample/test_sampler.py | 26 +--
 tests/v1/sample/utils.py | 5 +-
 tests/v1/test_utils.py | 6 +-
 tests/v1/worker/test_gpu_input_batch.py | 22 +-
 .../vllm_test_utils/vllm_test_utils/blame.py | 3 +-
 .../vllm_test_utils/monitor.py | 3 +-
 .../test_encoder_decoder_model_runner.py | 21 +-
 tests/worker/test_model_input.py | 11 +-
 tests/worker/test_model_runner.py | 20 +-
 tools/profiler/print_layerwise_table.py | 3 +-
 tools/profiler/visualize_layerwise_profile.py | 14 +-
 vllm/_custom_ops.py | 56 ++---
 vllm/_ipex_ops.py | 8 +-
 vllm/beam_search.py | 18 +-
 vllm/config.py | 141 ++++++------
 vllm/connections.py | 3 +-
 vllm/entrypoints/api_server.py | 3 +-
 vllm/entrypoints/chat_utils.py | 49 ++--
 vllm/entrypoints/cli/openai.py | 10 +-
 vllm/entrypoints/cli/serve.py | 3 +-
 vllm/entrypoints/llm.py | 210 +++++++++---------
 vllm/entrypoints/logger.py | 4 +-
 vllm/entrypoints/openai/api_server.py | 9 +-
 vllm/entrypoints/openai/cli_args.py | 7 +-
 vllm/entrypoints/openai/logits_processors.py | 23 +-
 vllm/entrypoints/openai/protocol.py | 128 +++++------
 .../abs_reasoning_parsers.py | 21 +-
 .../deepseek_r1_reasoning_parser.py | 5 +-
 vllm/entrypoints/openai/run_batch.py | 9 +-
 vllm/entrypoints/openai/serving_chat.py | 31 ++-
 vllm/entrypoints/openai/serving_completion.py | 32 +--
 vllm/entrypoints/openai/serving_embedding.py | 15 +-
 vllm/entrypoints/openai/serving_engine.py | 43 ++--
 vllm/entrypoints/openai/serving_models.py | 10 +-
 vllm/entrypoints/openai/serving_pooling.py | 15 +-
 vllm/entrypoints/openai/serving_score.py | 49 ++--
 .../openai/serving_tokenization.py | 4 +-
 .../openai/serving_transcription.py | 3 +-
 .../tool_parsers/abstract_tool_parser.py | 21 +-
 .../granite_20b_fc_tool_parser.py | 5 +-
.../tool_parsers/granite_tool_parser.py | 5 +- .../openai/tool_parsers/hermes_tool_parser.py | 7 +- .../tool_parsers/internlm2_tool_parser.py | 5 +- .../openai/tool_parsers/jamba_tool_parser.py | 11 +- .../openai/tool_parsers/llama_tool_parser.py | 11 +- .../tool_parsers/mistral_tool_parser.py | 13 +- .../tool_parsers/pythonic_tool_parser.py | 5 +- vllm/entrypoints/openai/tool_parsers/utils.py | 6 +- vllm/entrypoints/score_utils.py | 14 +- vllm/envs.py | 8 +- vllm/forward_context.py | 6 +- vllm/logger.py | 2 +- vllm/logits_process.py | 16 +- vllm/outputs.py | 24 +- vllm/sampling_params.py | 53 +++-- vllm/sequence.py | 132 +++++------ vllm/tracing.py | 3 +- vllm/utils.py | 76 +++---- vllm/v1/attention/backends/flash_attn.py | 18 +- vllm/v1/attention/backends/mla/common.py | 21 +- vllm/v1/attention/backends/mla/flashmla.py | 14 +- vllm/v1/attention/backends/mla/triton_mla.py | 8 +- vllm/v1/attention/backends/pallas.py | 16 +- vllm/v1/attention/backends/rocm_attn.py | 16 +- vllm/v1/core/block_pool.py | 17 +- vllm/v1/core/encoder_cache_manager.py | 16 +- vllm/v1/core/kv_cache_manager.py | 19 +- vllm/v1/core/kv_cache_utils.py | 32 +-- vllm/v1/core/scheduler.py | 45 ++-- vllm/v1/core/scheduler_output.py | 42 ++-- vllm/v1/engine/__init__.py | 16 +- vllm/v1/engine/async_llm.py | 9 +- vllm/v1/engine/core.py | 16 +- vllm/v1/engine/core_client.py | 34 +-- vllm/v1/engine/detokenizer.py | 12 +- vllm/v1/engine/llm_engine.py | 17 +- vllm/v1/engine/logprobs.py | 12 +- vllm/v1/engine/mm_input_cache.py | 18 +- vllm/v1/engine/output_processor.py | 20 +- vllm/v1/engine/parallel_sampling.py | 16 +- vllm/v1/engine/processor.py | 3 +- vllm/v1/executor/abstract.py | 10 +- vllm/v1/executor/multiproc_executor.py | 10 +- vllm/v1/kv_cache_interface.py | 7 +- vllm/v1/metrics/loggers.py | 18 +- vllm/v1/metrics/stats.py | 20 +- vllm/v1/outputs.py | 20 +- vllm/v1/request.py | 24 +- vllm/v1/sample/metadata.py | 10 +- vllm/v1/sample/ops/penalties.py | 12 +- vllm/v1/sample/ops/topk_topp_sampler.py | 10 +- vllm/v1/sample/rejection_sampler.py | 7 +- vllm/v1/stats/common.py | 18 +- vllm/v1/utils.py | 20 +- vllm/v1/worker/block_table.py | 6 +- vllm/v1/worker/gpu_input_batch.py | 62 +++--- vllm/v1/worker/gpu_model_runner.py | 34 +-- vllm/v1/worker/gpu_worker.py | 4 +- vllm/v1/worker/lora_model_runner_mixin.py | 17 +- vllm/v1/worker/tpu_model_runner.py | 24 +- vllm/v1/worker/tpu_worker.py | 6 +- 300 files changed, 2294 insertions(+), 2347 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index e43549c1..15870576 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -6,7 +6,7 @@ import sys import time import traceback from dataclasses import dataclass, field -from typing import List, Optional, Union +from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -41,8 +41,8 @@ class RequestFuncOutput: latency: float = 0.0 output_tokens: int = 0 ttft: float = 0.0 # Time to first token - itl: List[float] = field( - default_factory=list) # List of inter-token latencies + itl: list[float] = field( + default_factory=list) # list of inter-token latencies tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py index dc2bf0e7..2e0f6c6b 100644 --- a/benchmarks/benchmark_guided.py +++ b/benchmarks/benchmark_guided.py @@ -6,7 +6,6 @@ import json import os import random import time -from typing import List import datasets import pandas 
as pd @@ -39,7 +38,7 @@ class SampleRequest: completion: str = None -def run_vllm(requests: List[SampleRequest], +def run_vllm(requests: list[SampleRequest], engine_args: EngineArgs, n: int, guided_decoding_rate: float = 1.0, @@ -54,8 +53,8 @@ def run_vllm(requests: List[SampleRequest], " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[str] = [] + sampling_params: list[SamplingParams] = [] # create a list containing random selected true or false guided_decoding_req_idx = random.sample( range(len(requests)), int(len(requests) * guided_decoding_rate)) @@ -110,7 +109,7 @@ def run_vllm(requests: List[SampleRequest], async def run_vllm_async( - requests: List[SampleRequest], + requests: list[SampleRequest], engine_args: AsyncEngineArgs, n: int, guided_decoding_rate: float = 1.0, @@ -129,8 +128,8 @@ async def run_vllm_async( " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[str] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[str] = [] + sampling_params: list[SamplingParams] = [] guided_decoding_req_idx = random.sample( range(len(requests)), int(len(requests) * guided_decoding_rate)) @@ -203,7 +202,7 @@ async def run_vllm_async( def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: + args: argparse.Namespace) -> list[SampleRequest]: if args.dataset == 'json': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -287,7 +286,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, elif args.dataset == "xgrammar_bench": args.warmup = False - requests: List[SampleRequest] = [] + requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") print(f"dataset has {len(dataset)} entries") diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index c82358d1..d7f39f50 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -7,7 +7,7 @@ import json import os import time from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Optional import numpy as np import torch @@ -22,7 +22,7 @@ from vllm.utils import FlexibleArgumentParser def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any]) -> None: + results: dict[str, Any]) -> None: pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={"latency": results["latencies"]}, @@ -57,7 +57,7 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompts: List[PromptType] = [{ + dummy_prompts: list[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 23822856..fba32520 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -31,7 +31,7 @@ import dataclasses import json import random import time -from typing import List, Optional, Tuple +from typing import Optional from transformers import PreTrainedTokenizerBase @@ -77,9 +77,9 @@ def sample_requests_from_dataset( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - input_length_range: Tuple[int, int], + input_length_range: tuple[int, int], 
fixed_output_len: Optional[int], -) -> List[Request]: +) -> list[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -99,7 +99,7 @@ def sample_requests_from_dataset( assert min_len >= 0 and max_len >= min_len, "input_length_range too small" # Filter out sequences that are too long or too short - filtered_requests: List[Request] = [] + filtered_requests: list[Request] = [] for i in range(len(dataset)): if len(filtered_requests) == num_requests: @@ -122,10 +122,10 @@ def sample_requests_from_dataset( def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, - input_length_range: Tuple[int, int], + input_length_range: tuple[int, int], fixed_output_len: Optional[int], prefix_len: int, -) -> List[Request]: +) -> list[Request]: requests = [] prefix_token_ids = sample_tokens(tokenizer, prefix_len) @@ -144,9 +144,9 @@ def sample_requests_from_random( return requests -def repeat_and_sort_requests(requests: List[Request], +def repeat_and_sort_requests(requests: list[Request], repeat_count: int, - sort: bool = False) -> List[str]: + sort: bool = False) -> list[str]: repeated_requests = requests * repeat_count if sort: repeated_requests.sort(key=lambda x: x[1]) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 24014e5b..43b2c1b0 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -5,7 +5,7 @@ import dataclasses import json import random import time -from typing import List, Optional, Tuple +from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -23,7 +23,7 @@ def sample_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -40,7 +40,7 @@ def sample_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -68,7 +68,7 @@ def sample_requests( def run_vllm( - requests: List[Tuple[str, int, int]], + requests: list[tuple[str, int, int]], n: int, engine_args: EngineArgs, ) -> float: diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1bb83b08..16ec0a48 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -33,9 +33,10 @@ import os import random import time import warnings +from collections.abc import AsyncGenerator, Collection from dataclasses import dataclass from datetime import datetime -from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +from typing import Any, Optional import numpy as np import pandas as pd @@ -73,22 +74,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float, float]] + percentiles_ttft_ms: list[tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float, float]] + percentiles_tpot_ms: list[tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float, float]] + percentiles_itl_ms: list[tuple[float, float]] # E2EL stands for end-to-end latency per request. 
# It is the time taken on the client side from sending # a request to receiving a complete response. mean_e2el_ms: float median_e2el_ms: float std_e2el_ms: float - percentiles_e2el_ms: List[Tuple[float, float]] + percentiles_e2el_ms: list[tuple[float, float]] def sample_sharegpt_requests( @@ -96,7 +97,7 @@ def sample_sharegpt_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, int, int, None]]: +) -> list[tuple[str, int, int, None]]: # Load the dataset. with open(dataset_path, encoding='utf-8') as f: dataset = json.load(f) @@ -110,7 +111,7 @@ def sample_sharegpt_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -139,7 +140,7 @@ def sample_burstgpt_requests( num_requests: int, random_seed: int, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, int, int, None]]: +) -> list[tuple[str, int, int, None]]: df = pd.read_csv(dataset_path) gpt4_df = df[df["Model"] == "GPT-4"] # Remove the failed requests (i.e., response length is 0) @@ -170,7 +171,7 @@ def sample_sonnet_requests( output_len: int, prefix_len: int, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, str, int, int, None]]: +) -> list[tuple[str, str, int, int, None]]: assert ( input_len > prefix_len ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." @@ -211,7 +212,7 @@ def sample_sonnet_requests( prefix_lines = poem_lines[:num_prefix_lines] # Sample the rest of lines per request. - sampled_requests: List[Tuple[str, int, int]] = [] + sampled_requests: list[tuple[str, int, int]] = [] for _ in range(num_requests): num_lines_needed = num_input_lines - num_prefix_lines sampled_lines = "".join(prefix_lines + @@ -238,8 +239,8 @@ def sample_vision_arena_requests( num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - sampled_requests: List[Tuple[str, int, int, Dict[str, +) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: + sampled_requests: list[tuple[str, int, int, dict[str, Collection[str]]]] = [] for data in dataset: if len(sampled_requests) == num_requests: @@ -285,7 +286,7 @@ def sample_hf_requests( tokenizer: PreTrainedTokenizerBase, random_seed: int, fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: +) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]: # Special case for vision_arena dataset if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ @@ -307,7 +308,7 @@ def sample_hf_requests( "HF Dataset must have 'conversations' column.") filter_func = lambda x: len(x["conversations"]) >= 2 filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - sampled_requests: List[Tuple[str, int, int, Dict[str, + sampled_requests: list[tuple[str, int, int, dict[str, Collection[str]]]] = [] for data in filtered_dataset: if len(sampled_requests) == num_requests: @@ -370,7 +371,7 @@ def sample_random_requests( num_prompts: int, range_ratio: float, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int]]: prefix_token_ids = np.random.randint(0, tokenizer.vocab_size, size=prefix_len).tolist() @@ -399,10 +400,10 @@ def sample_random_requests( async def 
get_request( - input_requests: List[Tuple[str, int, int]], + input_requests: list[tuple[str, int, int]], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[Tuple[str, int, int], None]: +) -> AsyncGenerator[tuple[str, int, int], None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. @@ -443,23 +444,23 @@ async def get_request( def calculate_metrics( - input_requests: List[Tuple[str, int, int]], - outputs: List[RequestFuncOutput], + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: List[str], - selected_percentiles: List[float], - goodput_config_dict: Dict[str, float], -) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens: List[int] = [] + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] total_input = 0 completed = 0 good_completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - all_tpots: List[float] = [] - ttfts: List[float] = [] - e2els: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] for i in range(len(outputs)): if outputs[i].success: output_len = outputs[i].output_tokens @@ -557,19 +558,19 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[Tuple[str, int, int]], + input_requests: list[tuple[str, int, int]], logprobs: Optional[int], best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, - selected_percentile_metrics: List[str], - selected_percentiles: List[str], + selected_percentile_metrics: list[str], + selected_percentiles: list[str], ignore_eos: bool, - goodput_config_dict: Dict[str, float], + goodput_config_dict: dict[str, float], max_concurrency: Optional[int], - lora_modules: Optional[List[str]], + lora_modules: Optional[list[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -652,7 +653,7 @@ async def benchmark( pbar=pbar) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request req_model_id, req_model_name = model_id, model_name @@ -674,7 +675,7 @@ async def benchmark( asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") @@ -820,7 +821,7 @@ def parse_goodput(slo_pairs): def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any], + results: dict[str, Any], file_name: str) -> None: metrics = [ "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", @@ -974,7 +975,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json: Dict[str, Any] = {} + result_json: dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py index 05eadff7..6c132d05 100644 --- a/benchmarks/benchmark_serving_guided.py +++ 
b/benchmarks/benchmark_serving_guided.py @@ -30,8 +30,9 @@ import os import random import time import warnings +from collections.abc import AsyncGenerator from dataclasses import dataclass -from typing import AsyncGenerator, Dict, List, Optional, Tuple +from typing import Optional import datasets import numpy as np @@ -66,22 +67,22 @@ class BenchmarkMetrics: mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float - percentiles_ttft_ms: List[Tuple[float, float]] + percentiles_ttft_ms: list[tuple[float, float]] mean_tpot_ms: float median_tpot_ms: float std_tpot_ms: float - percentiles_tpot_ms: List[Tuple[float, float]] + percentiles_tpot_ms: list[tuple[float, float]] mean_itl_ms: float median_itl_ms: float std_itl_ms: float - percentiles_itl_ms: List[Tuple[float, float]] + percentiles_itl_ms: list[tuple[float, float]] # E2EL stands for end-to-end latency per request. # It is the time taken on the client side from sending # a request to receiving a complete response. mean_e2el_ms: float median_e2el_ms: float std_e2el_ms: float - percentiles_e2el_ms: List[Tuple[float, float]] + percentiles_e2el_ms: list[tuple[float, float]] @dataclasses.dataclass @@ -104,7 +105,7 @@ class SampleRequest: def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: + args: argparse.Namespace) -> list[SampleRequest]: if args.dataset == 'json': if args.json_schema_path is None: dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -187,7 +188,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ] elif args.dataset == "xgrammar_bench": - requests: List[SampleRequest] = [] + requests: list[SampleRequest] = [] dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train") print(f"dataset has {len(dataset)} entries") @@ -214,10 +215,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, async def get_request( - input_requests: List[SampleRequest], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[Tuple[int, SampleRequest], None]: +) -> AsyncGenerator[tuple[int, SampleRequest], None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. 
@@ -258,23 +259,23 @@ async def get_request( def calculate_metrics( - input_requests: List[Tuple[str, int, int]], - outputs: List[RequestFuncOutput], + input_requests: list[tuple[str, int, int]], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: List[str], - selected_percentiles: List[float], - goodput_config_dict: Optional[Dict[str, float]] = None, -) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens: List[int] = [] + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + goodput_config_dict: Optional[dict[str, float]] = None, +) -> tuple[BenchmarkMetrics, list[int]]: + actual_output_lens: list[int] = [] total_input = 0 completed = 0 good_completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - all_tpots: List[float] = [] - ttfts: List[float] = [] - e2els: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -368,18 +369,18 @@ async def benchmark( base_url: str, model_id: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[SampleRequest], + input_requests: list[SampleRequest], request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, - selected_percentile_metrics: List[str], - selected_percentiles: List[str], + selected_percentile_metrics: list[str], + selected_percentiles: list[str], ignore_eos: bool, max_concurrency: Optional[int], guided_decoding_ratio: float, guided_decoding_backend: str, - goodput_config_dict: Optional[Dict[str, float]] = None, + goodput_config_dict: Optional[dict[str, float]] = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -459,8 +460,8 @@ async def benchmark( pbar=pbar) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] - expected: List[str] = [] + tasks: list[asyncio.Task] = [] + expected: list[str] = [] async for i, request in get_request(input_requests, request_rate, burstiness): extra_body = prepare_extra_body( @@ -479,7 +480,7 @@ async def benchmark( asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: print("Stopping profiler...") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 04de08fa..aabce64f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -7,7 +7,7 @@ import os import random import time from functools import cache -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Optional import torch import uvloop @@ -74,12 +74,12 @@ def lora_path_on_disk(lora_path: str) -> str: return get_adapter_absolute_path(lora_path) -lora_tokenizer_cache: Dict[int, AnyTokenizer] = {} +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} def get_random_lora_request( args: argparse.Namespace -) -> Tuple[LoRARequest, Optional[AnyTokenizer]]: +) -> tuple[LoRARequest, Optional[AnyTokenizer]]: global lora_tokenizer_cache lora_id = random.randint(1, args.max_loras) lora_request = LoRARequest(lora_name=str(lora_id), @@ -91,7 +91,7 @@ def get_random_lora_request( def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: + 
args: argparse.Namespace) -> list[SampleRequest]: dataset_path: str = args.dataset num_requests: int = args.num_prompts @@ -109,7 +109,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[SampleRequest] = [] + filtered_dataset: list[SampleRequest] = [] for data in tqdm(dataset, total=len(filtered_dataset), desc="sampling requests"): @@ -165,7 +165,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, def run_vllm( - requests: List[SampleRequest], + requests: list[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -178,8 +178,8 @@ def run_vllm( "Please ensure that max_model_len is greater than the sum of" " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[TextPrompt] = [] - sampling_params: List[SamplingParams] = [] + prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] for request in requests: prompts.append( TextPrompt(prompt=request.prompt, @@ -192,7 +192,7 @@ def run_vllm( ignore_eos=True, max_tokens=request.expected_output_len, )) - lora_requests: Optional[List[LoRARequest]] = None + lora_requests: Optional[list[LoRARequest]] = None if engine_args.enable_lora: lora_requests = [request.lora_request for request in requests] @@ -225,7 +225,7 @@ def run_vllm( async def run_vllm_async( - requests: List[SampleRequest], + requests: list[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -242,9 +242,9 @@ async def run_vllm_async( " prompt_len and expected_output_len for all requests.") # Add the requests to the engine. - prompts: List[TextPrompt] = [] - sampling_params: List[SamplingParams] = [] - lora_requests: List[Optional[LoRARequest]] = [] + prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] for request in requests: prompts.append( TextPrompt(prompt=request.prompt, @@ -276,7 +276,7 @@ async def run_vllm_async( def run_hf( - requests: List[SampleRequest], + requests: list[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -292,7 +292,7 @@ def run_hf( pbar = tqdm(total=len(requests)) start = time.perf_counter() - batch: List[str] = [] + batch: list[str] = [] max_prompt_len = 0 max_output_len = 0 for i in range(len(requests)): @@ -334,7 +334,7 @@ def run_hf( def run_mii( - requests: List[SampleRequest], + requests: list[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, @@ -352,7 +352,7 @@ def run_mii( def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: Dict[str, Any]) -> None: + results: dict[str, Any]) -> None: pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={ @@ -479,8 +479,8 @@ if __name__ == "__main__": type=str, default=None, help="Path to the dataset. 
The dataset is expected to " - "be a json in form of List[Dict[..., conversations: " - "List[Dict[..., value: ]]]]") + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index ac0688ca..45a0ddbd 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -4,12 +4,12 @@ import argparse import json import math import os -from typing import Any, Dict, List +from typing import Any def convert_to_pytorch_benchmark_format(args: argparse.Namespace, - metrics: Dict[str, List], - extra_info: Dict[str, Any]) -> List: + metrics: dict[str, list], + extra_info: dict[str, Any]) -> list: """ Save the benchmark results in the format used by PyTorch OSS benchmark with on metric per record @@ -64,6 +64,6 @@ class InfEncoder(json.JSONEncoder): return super().iterencode(self.clear_inf(o), *args, **kwargs) -def write_to_json(filename: str, records: List) -> None: +def write_to_json(filename: str, records: list) -> None: with open(filename, "w") as f: json.dump(records, f, cls=InfEncoder) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 468a1b28..9e36b0a9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -5,7 +5,8 @@ import copy import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Tuple +from collections.abc import Iterable +from typing import Callable import torch import torch.utils.benchmark as TBenchmark @@ -228,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", @@ -241,7 +242,7 @@ def run(dtype: torch.dtype, # output makers def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None): print(f"== All Results {base_description} ====") @@ -282,7 +283,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index bab37780..fe4d8fdf 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Cutlass bench utils -from typing import Iterable, Tuple +from collections.abc import Iterable import torch @@ -27,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor: def make_rand_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: + k: int) -> tuple[torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -63,7 +63,7 @@ def prune_to_2_4(tensor): def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, - k: int) -> Tuple[torch.Tensor, torch.Tensor]: + k: int) -> tuple[torch.Tensor, 
torch.Tensor]: a = torch.randn((m, k), device='cuda') * 5 b = torch.randn((n, k), device='cuda').t() * 5 @@ -88,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int, def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int) -> \ - Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: + tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: ABs = [] for _ in range(num_tensors): b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 6552b62d..e7b742d8 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -5,7 +5,8 @@ import copy import itertools import pickle as pkl import time -from typing import Callable, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -49,7 +50,7 @@ def bench_int8( n: int, label: str, sub_label: str, - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 a, b = make_rand_tensors(torch.int8, m, n, k) @@ -101,7 +102,7 @@ def bench_fp8( n: int, label: str, sub_label: str, - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k) @@ -180,7 +181,7 @@ def bench(dtype: torch.dtype, n: int, label: str, sub_label: str, - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) if dtype == torch.float8_e4m3fn: @@ -195,8 +196,8 @@ def print_timers(timers: Iterable[TMeasurement]): def run(dtype: torch.dtype, - MKNs: Iterable[Tuple[int, int, int]], - bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]: + MKNs: Iterable[tuple[int, int, int]], + bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: timers = bench(dtype, @@ -212,7 +213,7 @@ def run(dtype: torch.dtype, def make_output(data: Iterable[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None): print(f"== All Results {base_description} ====") @@ -248,7 +249,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index c56cc743..3da583a3 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -2,9 +2,10 @@ import pickle as pkl import time +from collections.abc import Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Iterable, List, Optional +from 
typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -29,7 +30,7 @@ class bench_params_t: f'x DT {self.dtype}') -def get_bench_params() -> List[bench_params_t]: +def get_bench_params() -> list[bench_params_t]: ## Test Fixtures NUM_TOKENS = [2**x for x in range(11)] HIDDEN_SIZES = list(range(1024, 8129, 1024)) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 1deb0026..5eaeec01 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -61,15 +61,15 @@ def make_rand_lora_weight_tensor(k: int, def make_rand_tensors( - a_shape: Tuple[int], - b_shape: Tuple[int], - c_shape: Tuple[int], + a_shape: tuple[int], + b_shape: tuple[int], + c_shape: tuple[int], a_dtype: torch.dtype, b_dtype: torch.dtype, c_dtype: torch.dtype, num_slices: int, device: str = "cuda", -) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: +) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]: """ Make LoRA input/output matrices. """ @@ -135,7 +135,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int, def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, - lora_weights: List[torch.Tensor], + lora_weights: list[torch.Tensor], seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, add_inputs: Optional[bool]): @@ -204,7 +204,7 @@ class OpType(Enum): def is_expand_slice_fn(self) -> bool: return self in [OpType.BGMV_EXPAND_SLICE] - def num_slices(self) -> List[int]: + def num_slices(self) -> list[int]: if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: # SGMV kernels supports slices return [1, 2, 3] @@ -215,7 +215,7 @@ class OpType(Enum): raise ValueError(f"Unrecognized OpType {self}") def mkn(self, batch_size: int, seq_length: int, hidden_size: int, - lora_rank: int) -> Tuple[int, int, int]: + lora_rank: int) -> tuple[int, int, int]: num_tokens = batch_size * seq_length if self.is_shrink_fn(): m = num_tokens @@ -230,7 +230,7 @@ class OpType(Enum): def matmul_dtypes( self, op_dtype: torch.dtype - ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + ) -> tuple[torch.dtype, torch.dtype, torch.dtype]: """ return a type, b type and c type for A x B = C """ @@ -243,7 +243,7 @@ class OpType(Enum): def matmul_shapes( self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int, num_loras: int, - num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]: """ Given num_slices, return the shapes of the A, B, and C matrices in A x B = C, for the op_type @@ -268,7 +268,7 @@ class OpType(Enum): def bench_fn(self) -> Callable: - def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): + def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]): for x in kwargs_list: bgmv_expand_slice(**x) @@ -285,7 +285,7 @@ class OpType(Enum): raise ValueError(f"Unrecognized optype {self}") def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, - lora_weights: List[torch.Tensor], + lora_weights: list[torch.Tensor], **kwargs) -> Callable: """Each benchmark operation expected the input, lora_weights and outputs in a slightly different format. Refer to self.matmul_shapes(). 
@@ -384,7 +384,7 @@ class BenchmarkTensors: """ # matmul tensors input: torch.Tensor - lora_weights_lst: List[torch.Tensor] + lora_weights_lst: list[torch.Tensor] output: torch.Tensor # metadata tensors seq_lens: torch.Tensor @@ -469,7 +469,7 @@ class BenchmarkTensors: for i in range(len(self.lora_weights_lst)): self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) - def metadata(self) -> Tuple[int, int, int]: + def metadata(self) -> tuple[int, int, int]: """ Return num_seqs, num_tokens and max_seq_len """ @@ -505,7 +505,7 @@ class BenchmarkTensors: self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) - def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: + def as_sgmv_shrink_kwargs(self) -> dict[str, Any]: self.convert_to_sgmv_benchmark_tensors() self.sanity_check() self.to_device(self.input.device) @@ -540,7 +540,7 @@ class BenchmarkTensors: 'scaling': 1.0, } - def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: self.convert_to_sgmv_benchmark_tensors() self.sanity_check() @@ -578,7 +578,7 @@ class BenchmarkTensors: 'add_inputs': add_inputs, } - def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: + def as_bgmv_shrink_kwargs(self) -> dict[str, Any]: assert len(self.lora_weights_lst) == 1 self.to_device(self.input.device) @@ -634,7 +634,7 @@ class BenchmarkTensors: 'add_inputs': add_inputs } - def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]: _, num_tokens, _, num_slices = self.metadata() # Sanity check shapes @@ -670,7 +670,7 @@ class BenchmarkTensors: def bench_fn_kwargs(self, op_type: OpType, - add_inputs: Optional[bool] = None) -> Dict[str, Any]: + add_inputs: Optional[bool] = None) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None else: @@ -734,7 +734,7 @@ def bench_optype(ctx: BenchmarkContext, assert expand_fn_add_inputs is not None # BenchmarkContext -> BenchmarkTensors - bench_tensors : List[BenchmarkTensors] = \ + bench_tensors : list[BenchmarkTensors] = \ [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] for bt in bench_tensors: bt.sanity_check() @@ -746,7 +746,7 @@ def bench_optype(ctx: BenchmarkContext, for bt in bench_tensors ]) - # BenchmarkTensors -> Dict (kwargs) + # BenchmarkTensors -> dict (kwargs) kwargs_list = [ bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) for bt in bench_tensors @@ -841,7 +841,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: List[TMeasurement], +def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): compare = TBenchmark.Compare(timers) compare.print() @@ -861,7 +861,7 @@ def print_timers(timers: List[TMeasurement], "small num_loras the goal should be to match the torch.mm numbers.") -def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): +def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): if args.cuda_graph_nops is not None: assert args.cuda_graph_nops > 0 @@ -873,7 +873,7 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): timers = [] for bench_ctx in bench_ctxs: for seq_len in args.seq_lengths: - bench_ops: List[OpType] = [] + bench_ops: list[OpType] = [] if seq_len == 1: # bench all decode ops bench_ops = [op for op in args.op_types if op.is_decode_op()] @@ -921,10 +921,10 @@ def run(args: argparse.Namespace, 
bench_ctxs: List[BenchmarkContext]): pickle.dump(timers, f) -def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], - args: argparse.Namespace) -> List[BenchmarkContext]: +def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int], + args: argparse.Namespace) -> list[BenchmarkContext]: - ctxs: List[BenchmarkContext] = [] + ctxs: list[BenchmarkContext] = [] for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, args.sort_by_lora_id): @@ -954,7 +954,7 @@ def run_list_bench(args: argparse.Namespace): f" LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) run(args, bench_contexts) @@ -975,7 +975,7 @@ def run_range_bench(args: argparse.Namespace): f" LoRA Ranks {lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) run(args, bench_contexts) @@ -1002,7 +1002,7 @@ def run_model_bench(args: argparse.Namespace): f" LoRA Ranks {args.lora_ranks}") # Get all benchmarking contexts - bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + bench_contexts: list[BenchmarkContext] = as_benchmark_contexts( hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) run(args, bench_contexts) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 0301fee1..3fa57bd7 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -7,9 +7,10 @@ import math import os import pickle as pkl import time +from collections.abc import Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Iterable, List, Optional, Tuple +from typing import Callable, Optional import pandas as pd import torch @@ -102,8 +103,8 @@ def quantize_and_pack(atype: torch.dtype, return w_ref, w_q, w_s, w_zp -def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, - group_size: Optional[int]) -> List[BenchmarkTensors]: +def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig, + group_size: Optional[int]) -> list[BenchmarkTensors]: m, n, k = shape # we want to make sure that weights don't fit into L2 cache between runs so @@ -114,7 +115,7 @@ def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, a = rand_data((m, k), types.act_type, scale=5) - benchmark_tensors: List[BenchmarkTensors] = [] + benchmark_tensors: list[BenchmarkTensors] = [] for _ in range(num_weights): w = rand_data((k, n), types.act_type, scale=5) @@ -276,7 +277,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors, def bench_fns(label: str, sub_label: str, description: str, - fns: List[Callable]): + fns: list[Callable]): min_run_time = 1 if not NVTX_PROFILE else 0.1 res = TBenchmark.Timer( @@ -311,7 +312,7 @@ def bench(types: TypeConfig, n: int, label: str, sub_label: str, - sweep_schedules: bool = True) -> List[TMeasurement]: + sweep_schedules: bool = True) -> list[TMeasurement]: benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) sub_label += f", L={len(benchmark_tensors)}" @@ -414,12 +415,12 @@ def bench(types: TypeConfig, # runner -def 
print_timers(timers: List[TMeasurement]): +def print_timers(timers: list[TMeasurement]): compare = TBenchmark.Compare(timers) compare.print() -def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: +def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]: types = TypeConfig( act_type=args.act_type, weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ @@ -431,7 +432,7 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: token_scale_type=args.token_scale_type, ) - results: List[TMeasurement] = [] + results: list[TMeasurement] = [] for m, k, n in MKNs: timers = bench(types, args.group_size, @@ -449,8 +450,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: # output makers def make_output( - data: List[TMeasurement], - MKNs: Iterable[Tuple[int, int, int]], + data: list[TMeasurement], + MKNs: Iterable[tuple[int, int, int]], base_description: str, timestamp=None, ): @@ -497,7 +498,7 @@ def run_model_bench(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: + def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]: KNs = [] for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]): KN[tp_split_dim] = KN[tp_split_dim] // tp_size diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 21ef4912..1e785ac8 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import torch import torch.utils.benchmark as benchmark from benchmark_shapes import WEIGHT_SHAPES @@ -31,7 +29,7 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] -def bench_run(results: List[benchmark.Measurement], model: str, +def bench_run(results: list[benchmark.Measurement], model: str, act_order: bool, is_k_full: bool, quant_type: ScalarType, group_size: int, size_m: int, size_k: int, size_n: int): label = "Quant Matmul" @@ -221,7 +219,7 @@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results: List[benchmark.Measurement] = [] + results: list[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 41075068..c862dec8 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -4,7 +4,7 @@ import argparse import time from datetime import datetime from itertools import product -from typing import Any, Dict, List, Tuple, TypedDict +from typing import Any, TypedDict import ray import torch @@ -132,7 +132,7 @@ def benchmark_config( start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies: List[float] = [] + latencies: list[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -175,8 +175,8 @@ def get_rocm_tuning_space(use_fp16): return param_ranges -def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: - configs: List[BenchmarkConfig] = [] +def get_configs_compute_bound(use_fp16) -> list[dict[str, int]]: + configs: list[BenchmarkConfig] = [] if current_platform.is_rocm(): param_ranges = get_rocm_tuning_space(use_fp16) @@ -335,7 +335,7 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - ) -> Tuple[Dict[str, int], 
float]: + ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, @@ -371,8 +371,8 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: list[dict[str, int]], + ) -> dict[str, int]: best_config = None best_time = float("inf") if current_platform.is_rocm(): @@ -434,7 +434,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: } -def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, +def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool) -> None: @@ -498,7 +498,7 @@ def main(args: argparse.Namespace): num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] - def _distribute(method: str, inputs: List[Any]) -> List[Any]: + def _distribute(method: str, inputs: list[Any]) -> list[Any]: outputs = [] worker_idx = 0 for input_args in inputs: diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index daedaadb..d00e8482 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -2,7 +2,7 @@ import random import time -from typing import List, Optional +from typing import Optional import torch @@ -54,7 +54,7 @@ def main( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables_lst: List[List[int]] = [] + block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index dba15374..010a38b7 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import triton @@ -22,7 +22,7 @@ class HuggingFaceRMSNorm(nn.Module): self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 8ee0212a..05d24fc4 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from itertools import accumulate -from typing import List, Optional +from typing import Optional import nvtx import torch @@ -39,7 +39,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes: List[RotaryEmbedding] = [] + non_batched_ropes: list[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 01d97d63..bd62173a 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ 
-4,7 +4,6 @@ import math import pickle import re from collections import defaultdict -from typing import List import matplotlib.pyplot as plt import pandas as pd @@ -23,7 +22,7 @@ if __name__ == "__main__": with open(args.filename, 'rb') as f: data = pickle.load(f) - raw_results: List[TMeasurement] = data["results"] + raw_results: list[TMeasurement] = data["results"] results = defaultdict(lambda: list()) for v in raw_results: diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 72817074..ac64f786 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Any, Callable, Iterable, Optional +from collections.abc import Iterable +from typing import Any, Callable, Optional import torch import torch.utils.benchmark as TBenchmark diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d5a5e2ef..d64f0d0a 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import enum -from typing import Dict, Union +from typing import Union from cutlass_library import * @@ -21,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum): TmaWarpSpecializedCooperative = enum_auto() -VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { **DataTypeNames, # type: ignore **{ VLLMDataType.u4b8: "u4b8", @@ -29,7 +29,7 @@ VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { } } -VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { **DataTypeTag, # type: ignore **{ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", @@ -37,7 +37,7 @@ VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { } } -VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { +VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { **DataTypeSize, # type: ignore **{ VLLMDataType.u4b8: 4, @@ -45,7 +45,7 @@ VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { } } -VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u8b128: "vllm::kU8B128", DataType.u4: "vllm::kU4", @@ -56,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { DataType.bf16: "vllm::kBfloat16", } -VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.u8: "at::ScalarType::Byte", DataType.s8: "at::ScalarType::Char", DataType.e4m3: "at::ScalarType::Float8_e4m3fn", @@ -66,7 +66,7 @@ VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { DataType.f32: "at::ScalarType::Float", } -VLLMKernelScheduleTag: Dict[Union[ +VLLMKernelScheduleTag: dict[Union[ MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore **{ diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 02e59fe2..3114e14b 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -8,7 +8,7 @@ from collections.abc import Iterable from copy import deepcopy from dataclasses import dataclass, fields 
from functools import reduce -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import jinja2 # yapf conflicts with isort for this block @@ -247,8 +247,8 @@ TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative @dataclass(frozen=True) class ScheduleConfig: - tile_shape_mn: Tuple[int, int] - cluster_shape_mnk: Tuple[int, int, int] + tile_shape_mn: tuple[int, int] + cluster_shape_mnk: tuple[int, int, int] kernel_schedule: MixedInputKernelScheduleType epilogue_schedule: EpilogueScheduleType tile_scheduler: TileSchedulerType @@ -277,8 +277,8 @@ class PrepackTypeConfig: @dataclass class ImplConfig: types: TypeConfig - schedules: List[ScheduleConfig] - heuristic: List[Tuple[Optional[str], ScheduleConfig]] + schedules: list[ScheduleConfig] + heuristic: list[tuple[Optional[str], ScheduleConfig]] def generate_sch_sig(schedule_config: ScheduleConfig) -> str: @@ -333,7 +333,7 @@ def is_power_of_two(n): return (n != 0) and (n & (n - 1) == 0) -def to_cute_constant(value: List[int]): +def to_cute_constant(value: list[int]): def _to_cute_constant(value: int): if is_power_of_two(value): @@ -347,7 +347,7 @@ def to_cute_constant(value: List[int]): return _to_cute_constant(value) -def unique_schedules(impl_configs: List[ImplConfig]): +def unique_schedules(impl_configs: list[ImplConfig]): return list( set(sch for impl_config in impl_configs for sch in impl_config.schedules)) @@ -391,7 +391,7 @@ mm_impl_template = create_template(IMPL_TEMPLATE) prepack_dispatch_template = create_template(PREPACK_TEMPLATE) -def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): +def create_sources(impl_configs: list[ImplConfig], num_impl_files=8): sources = [] sources.append(( @@ -435,7 +435,7 @@ def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0) num_impls_per_file = math.ceil(num_impls / num_impl_files) - files_impls: List[List[ImplConfig]] = [[]] + files_impls: list[list[ImplConfig]] = [[]] curr_num_impls_assigned = 0 curr_impl_in_file = 0 @@ -515,7 +515,7 @@ def generate(): for cond, tile_config in default_tile_heuristic_config.items() ] - def get_unique_schedules(heuristic: Dict[str, ScheduleConfig]): + def get_unique_schedules(heuristic: dict[str, ScheduleConfig]): # Do not use schedules = list(set(...)) because we need to make sure # the output list is deterministic; otherwise the generated kernel file # will be non-deterministic and causes ccache miss. diff --git a/docs/source/conf.py b/docs/source/conf.py index 97bec81b..b72faef9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,6 @@ import inspect import logging import os import sys -from typing import List import requests from sphinx.ext import autodoc @@ -58,7 +57,7 @@ templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"] +exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 5c0c1762..230e461f 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -123,7 +123,7 @@ class ExampleParser(ReasoningParser): def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from a complete model-generated string. @@ -138,7 +138,7 @@ class ExampleParser(ReasoningParser): The request object that was used to generate the model_output. Returns: - Tuple[Optional[str], Optional[str]] + tuple[Optional[str], Optional[str]] A tuple containing the reasoning content and the content. """ ``` diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 1d5aa07a..de3c5bf5 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -193,7 +193,7 @@ class Step(BaseModel): class MathResponse(BaseModel): - steps: List[Step] + steps: list[Step] final_answer: str diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index c5f75953..c51ca186 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -74,7 +74,7 @@ class Example: path (Path): The path to the main directory or file. category (str): The category of the document. main_file (Path): The main file in the directory. - other_files (list[Path]): List of other files in the directory. + other_files (list[Path]): list of other files in the directory. title (str): The title of the document. Methods: diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py index a2df41d4..e890c6da 100644 --- a/examples/offline_inference/distributed.py +++ b/examples/offline_inference/distributed.py @@ -6,7 +6,7 @@ distributively on a multi-nodes cluster. Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from typing import Any, Dict, List +from typing import Any import numpy as np import ray @@ -36,13 +36,13 @@ class LLMPredictor: self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", tensor_parallel_size=tensor_parallel_size) - def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: + def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]: # Generate texts from the prompts. # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. outputs = self.llm.generate(batch["text"], sampling_params) - prompt: List[str] = [] - generated_text: List[str] = [] + prompt: list[str] = [] + generated_text: list[str] = [] for output in outputs: prompt.append(output.prompt) generated_text.append(' '.join([o.text for o in output.outputs])) @@ -72,7 +72,7 @@ def scheduling_strategy_fn(): pg, placement_group_capture_child_tasks=True)) -resources_kwarg: Dict[str, Any] = {} +resources_kwarg: dict[str, Any] = {} if tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. 
resources_kwarg["num_gpus"] = 1 diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index 501034c1..f7741a37 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,13 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import argparse -from typing import List, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm.utils import FlexibleArgumentParser -def create_test_prompts() -> List[Tuple[str, SamplingParams]]: +def create_test_prompts() -> list[tuple[str, SamplingParams]]: """Create a list of test prompts with their sampling parameters.""" return [ ("A robot may not injure a human being", @@ -24,7 +23,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams]]): + test_prompts: list[tuple[str, SamplingParams]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -34,7 +33,7 @@ def process_requests(engine: LLMEngine, engine.add_request(str(request_id), prompt, sampling_params) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: list[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index de0734c1..a4097350 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -7,7 +7,7 @@ Requires HuggingFace credentials for access. """ import gc -from typing import List, Optional, Tuple +from typing import Optional import torch from huggingface_hub import snapshot_download @@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( lora_path: str -) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: return [ # this is an example of using quantization without LoRA ("My name is", @@ -49,7 +49,7 @@ def create_test_prompts( def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -63,7 +63,7 @@ def process_requests(engine: LLMEngine, lora_request=lora_request) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: list[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: print("----------------------------------------------------") diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index f227e71b..61641245 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -2,12 +2,11 @@ import gc import time -from typing import List from vllm import LLM, SamplingParams -def time_generation(llm: LLM, prompts: List[str], +def time_generation(llm: LLM, prompts: list[str], sampling_params: SamplingParams): # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. 
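The example-script hunks above and below all apply the same mechanical rewrite: container annotations imported from `typing` (`List`, `Tuple`, `Dict`) become the PEP 585 built-in generics, abstract types such as `Iterable` and `Generator` move to `collections.abc`, while `Optional`, `Union`, and `Callable` are still imported from `typing` (the `X | Y` spelling of PEP 604 is deferred until the minimum version reaches 3.10, as the retained `UP007` ignore later in this patch notes). A minimal sketch of the before/after shape, using hypothetical function and variable names rather than ones taken from this patch:

```python
# Runs on Python >= 3.9, where built-in generics (PEP 585) are subscriptable.
from collections.abc import Iterable  # previously: from typing import Iterable
from typing import Optional           # Optional/Union still come from typing here

# Previously: def prompt_lengths(prompts: List[Tuple[str, Optional[int]]]) -> Dict[str, int]:
def prompt_lengths(prompts: list[tuple[str, Optional[int]]]) -> dict[str, int]:
    """Map each prompt to its token budget, falling back to the text length."""
    return {text: budget if budget is not None else len(text)
            for text, budget in prompts}

def total_budget(budgets: Iterable[int]) -> int:
    """Sum an iterable of per-prompt budgets."""
    return sum(budgets)

if __name__ == "__main__":
    lengths = prompt_lengths([("A robot may not injure a human being", None),
                              ("What is the meaning of life?", 8)])
    print(lengths, total_budget(lengths.values()))
```

The runtime behaviour is unchanged; only the annotations and import locations differ, which is why the hunks in this patch are one-for-one substitutions.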
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 630fd1bf..4b0d115e 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -6,7 +6,7 @@ for offline inference. Requires HuggingFace credentials for access to Llama2. """ -from typing import List, Optional, Tuple +from typing import Optional from huggingface_hub import snapshot_download @@ -16,7 +16,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( lora_path: str -) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: """Create a list of test prompts with their sampling parameters. 2 requests for base model, 4 requests for the LoRA. We define 2 @@ -56,7 +56,7 @@ def create_test_prompts( def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -70,7 +70,7 @@ def process_requests(engine: LLMEngine, lora_request=lora_request) request_id += 1 - request_outputs: List[RequestOutput] = engine.step() + request_outputs: list[RequestOutput] = engine.step() for request_output in request_outputs: if request_output.finished: diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 298f0801..3ae507ca 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -21,7 +21,7 @@ import argparse import datetime import os import re -from typing import List, Union +from typing import Union import albumentations import numpy as np @@ -260,9 +260,9 @@ def _convert_np_uint8(float_image: torch.Tensor): def load_example( - file_paths: List[str], - mean: List[float] = None, - std: List[float] = None, + file_paths: list[str], + mean: list[float] = None, + std: list[float] = None, indices: Union[list[int], None] = None, ): """Build an input example by loading images in *file_paths*. diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index c2e072fd..ffa76b4e 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -5,8 +5,9 @@ import json import os import sys from argparse import RawTextHelpFormatter +from collections.abc import Generator from dataclasses import asdict, dataclass -from typing import Any, Dict, Generator, List, Optional, TypeAlias +from typing import Any, Optional, TypeAlias import torch import tqdm @@ -42,8 +43,8 @@ def get_dtype(dtype: str): return dtype -OutputLen_NumReqs_Map: TypeAlias = Dict[int, int] -def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ +OutputLen_NumReqs_Map: TypeAlias = dict[int, int] +def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \ -> OutputLen_NumReqs_Map: """ Given the number of requests, batch_size, and the number of requests @@ -63,7 +64,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ Args: batch_size (int): Number of requests submitted for profile. This is args.batch_size. - step_requests (List[int]): step_requests[i] is the number of requests + step_requests (list[int]): step_requests[i] is the number of requests that the ith engine step should process. 
Returns: @@ -114,7 +115,7 @@ def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ return ol_nr -def determine_requests_per_step(context: ProfileContext) -> List[int]: +def determine_requests_per_step(context: ProfileContext) -> list[int]: """ Determine number of requests each engine step should process. If context.num_steps is set, then all engine steps process the @@ -130,7 +131,7 @@ def determine_requests_per_step(context: ProfileContext) -> List[int]: context: ProfileContext object. Returns: - List[int]: Number of requests to process for all engine-steps. + list[int]: Number of requests to process for all engine-steps. output[i], contains the number of requests that the ith step should process. """ @@ -170,7 +171,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], for key, value in asdict(context).items(): print(f" {key} = {value}") - requests_per_step: List[int] = determine_requests_per_step(context) + requests_per_step: list[int] = determine_requests_per_step(context) ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( context.batch_size, requests_per_step) diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index d54117d6..61da4705 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -4,7 +4,6 @@ import argparse import dataclasses import os import time -from typing import List import numpy as np import torch_xla.debug.profiler as xp @@ -35,7 +34,7 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompts: List[PromptType] = [{ + dummy_prompts: list[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 872c9481..b1aec33c 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -5,7 +5,7 @@ multi-image input on vision language models for text generation, using the chat template defined by the model. """ from argparse import Namespace -from typing import List, NamedTuple, Optional +from typing import NamedTuple, Optional from PIL.Image import Image from transformers import AutoProcessor, AutoTokenizer @@ -24,8 +24,8 @@ IMAGE_URLS = [ class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[int]] - image_data: List[Image] + stop_token_ids: Optional[list[int]] + image_data: list[Image] chat_template: Optional[str] @@ -34,7 +34,7 @@ class ModelRequestData(NamedTuple): # Unless specified, these settings have been tested to work on a single L4. 
-def load_aria(question, image_urls: List[str]) -> ModelRequestData: +def load_aria(question, image_urls: list[str]) -> ModelRequestData: model_name = "rhymes-ai/Aria" llm = LLM(model=model_name, tokenizer_mode="slow", @@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: ) -def load_deepseek_vl2(question: str, image_urls: List[str]): +def load_deepseek_vl2(question: str, image_urls: list[str]): model_name = "deepseek-ai/deepseek-vl2-tiny" llm = LLM(model=model_name, @@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ) -def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: +def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "h2oai/h2ovl-mississippi-800m" llm = LLM( @@ -111,7 +111,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: +def load_idefics3(question, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceM4/Idefics3-8B-Llama3" # The configuration below has been confirmed to launch on a single L40 GPU. @@ -142,7 +142,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: ) -def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: +def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" llm = LLM( @@ -179,7 +179,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_mllama(question, image_urls: List[str]) -> ModelRequestData: +def load_mllama(question, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" # The configuration below has been confirmed to launch on a single L40 GPU. @@ -201,7 +201,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) -def load_nvlm_d(question: str, image_urls: List[str]): +def load_nvlm_d(question: str, image_urls: list[str]): model_name = "nvidia/NVLM-D-72B" # Adjust this as necessary to fit in GPU @@ -234,7 +234,7 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) -def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: +def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistral-community/pixtral-12b" # Adjust this as necessary to fit in GPU @@ -259,7 +259,7 @@ def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData: ) -def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: +def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData: # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended # to use 16 for single frame scenarios, and 4 for multi-frame. 
@@ -295,7 +295,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: def load_qwen_vl_chat(question: str, - image_urls: List[str]) -> ModelRequestData: + image_urls: list[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, @@ -336,7 +336,7 @@ def load_qwen_vl_chat(question: str, ) -def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: +def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: @@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) -def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: +def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: @@ -466,7 +466,7 @@ model_example_map = { } -def run_generate(model, question: str, image_urls: List[str]): +def run_generate(model, question: str, image_urls: list[str]): req_data = model_example_map[model](question, image_urls) sampling_params = SamplingParams(temperature=0.0, @@ -487,7 +487,7 @@ def run_generate(model, question: str, image_urls: List[str]): print(generated_text) -def run_chat(model: str, question: str, image_urls: List[str]): +def run_chat(model: str, question: str, image_urls: list[str]): req_data = model_example_map[model](question, image_urls) sampling_params = SamplingParams(temperature=0.0, diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 623e0d59..22bb1a87 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -7,7 +7,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API. import argparse import json -from typing import Iterable, List +from collections.abc import Iterable import requests @@ -39,7 +39,7 @@ def post_http_request(prompt: str, return response -def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: +def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): @@ -49,7 +49,7 @@ def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: yield output -def get_response(response: requests.Response) -> List[str]: +def get_response(response: requests.Response) -> list[str]: data = json.loads(response.content) output = data["text"] return output diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index cb110997..b7c5651e 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -24,4 +24,4 @@ responses = client.embeddings.create( ) for data in responses.data: - print(data.embedding) # list of float of len 4096 + print(data.embedding) # List of float of len 4096 diff --git a/pyproject.toml b/pyproject.toml index 1c03e9e1..04e0c9e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,32 @@ exclude = [ [tool.ruff.lint.per-file-ignores] "vllm/version.py" = ["F401"] "vllm/_version.py" = ["ALL"] +# Python 3.8 typing. 
TODO: Remove these excludes after v1.0.0 +"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"] +"vllm/attention/**/*.py" = ["UP006", "UP035"] +"vllm/compilation/**/*.py" = ["UP006", "UP035"] +"vllm/core/**/*.py" = ["UP006", "UP035"] +"vllm/device_allocator/**/*.py" = ["UP006", "UP035"] +"vllm/distributed/**/*.py" = ["UP006", "UP035"] +"vllm/engine/**/*.py" = ["UP006", "UP035"] +"vllm/executor/**/*.py" = ["UP006", "UP035"] +"vllm/inputs/**/*.py" = ["UP006", "UP035"] +"vllm/logging_utils/**/*.py" = ["UP006", "UP035"] +"vllm/lora/**/*.py" = ["UP006", "UP035"] +"vllm/model_executor/**/*.py" = ["UP006", "UP035"] +"vllm/multimodal/**/*.py" = ["UP006", "UP035"] +"vllm/platforms/**/*.py" = ["UP006", "UP035"] +"vllm/plugins/**/*.py" = ["UP006", "UP035"] +"vllm/profiler/**/*.py" = ["UP006", "UP035"] +"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] +"vllm/spec_decode/**/*.py" = ["UP006", "UP035"] +"vllm/third_party/**/*.py" = ["UP006", "UP035"] +"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"] +"vllm/triton_utils/**/*.py" = ["UP006", "UP035"] +"vllm/usage/**/*.py" = ["UP006", "UP035"] +"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"] +"vllm/assets/**/*.py" = ["UP006", "UP035"] +"vllm/worker/**/*.py" = ["UP006", "UP035"] [tool.ruff.lint] select = [ @@ -91,8 +117,6 @@ ignore = [ "B007", # f-string format "UP032", - # Python 3.8 typing - "UP006", "UP035", # Can remove once 3.10+ is the minimum Python version "UP007", ] diff --git a/setup.py b/setup.py index 6fe43351..cd17709b 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,6 @@ import subprocess import sys from pathlib import Path from shutil import which -from typing import Dict, List import torch from packaging.version import Version, parse @@ -78,7 +77,7 @@ class CMakeExtension(Extension): class cmake_build_ext(build_ext): # A dict of extension directories that have been configured. - did_config: Dict[str, bool] = {} + did_config: dict[str, bool] = {} # # Determine number of compilation jobs and optionally nvcc compile threads. 
@@ -548,10 +547,10 @@ def get_vllm_version() -> str: return version -def get_requirements() -> List[str]: +def get_requirements() -> list[str]: """Get Python package dependencies from requirements.txt.""" - def _read_requirements(filename: str) -> List[str]: + def _read_requirements(filename: str) -> list[str]: with open(get_path(filename)) as f: requirements = f.read().strip().split("\n") resolved_requirements = [] diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index d9ac6116..1e3c2d1a 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """vllm.entrypoints.api_server with some extra logging for testing.""" -from typing import Any, Dict, Iterable +from collections.abc import Iterable +from typing import Any import uvicorn from fastapi.responses import JSONResponse, Response @@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine): self._num_aborts += len(ids) await super()._engine_abort(ids) - def testing_stats(self) -> Dict[str, Any]: + def testing_stats(self) -> dict[str, Any]: return {"num_aborted_requests": self._num_aborts} diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index ca29abc9..6307bd7d 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -6,7 +6,7 @@ import uuid from asyncio import CancelledError from copy import copy from dataclasses import dataclass -from typing import List, Optional +from typing import Optional import pytest import pytest_asyncio @@ -254,7 +254,7 @@ async def test_output_kinds(async_engine, stop): params.output_kind = RequestOutputKind.DELTA prompt_tokens = None - output_tokens: List[int] = [] + output_tokens: list[int] = [] output_text = "" output_count = 0 final_output = None diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 021bd4cc..7307f44b 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. 
""" from dataclasses import dataclass -from typing import Any, List, Optional, Tuple +from typing import Any, Optional import torch from torch import nn @@ -56,7 +56,7 @@ class LlamaConfig: random_seed: int = 0 def compute_hash(self) -> str: - factors: List[Any] = [] + factors: list[Any] = [] for k, v in self.__dict__.items(): if k == "random_seed": continue @@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: """ For tractable computation: - if residual is None, the outputs are: diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 587c0a60..48323b21 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import Dict, List, Optional +from typing import Optional import pytest @@ -14,7 +14,7 @@ from ..utils import compare_all_settings @dataclasses.dataclass class TestSetting: model: str - model_args: List[str] + model_args: list[str] pp_size: int tp_size: int attn_backend: str @@ -108,8 +108,8 @@ def test_compile_correctness(test_setting: TestSetting): final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ ["-tp", str(tp_size)] - all_args: List[List[str]] = [] - all_envs: List[Optional[Dict[str, str]]] = [] + all_args: list[list[str]] = [] + all_envs: list[Optional[dict[str, str]]] = [] for level in [ CompilationLevel.NO_COMPILATION, diff --git a/tests/conftest.py b/tests/conftest.py index 871f0b62..57a33ad0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,7 @@ import os import tempfile from collections import UserList from enum import Enum -from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, - TypedDict, TypeVar, Union) +from typing import Any, Callable, Optional, TypedDict, TypeVar, Union import numpy as np import pytest @@ -47,14 +46,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") -_PromptMultiModalInput = Union[List[_M], List[List[_M]]] +_PromptMultiModalInput = Union[list[_M], list[list[_M]]] PromptImageInput = _PromptMultiModalInput[Image.Image] -PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]] PromptVideoInput = _PromptMultiModalInput[np.ndarray] -def _read_prompts(filename: str) -> List[str]: +def _read_prompts(filename: str) -> list[str]: with open(filename) as f: prompts = f.readlines() return prompts @@ -77,7 +76,7 @@ class _ImageAssets(_ImageAssetsBase): ImageAsset("cherry_blossom"), ]) - def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: + def prompts(self, prompts: _ImageAssetPrompts) -> list[str]: """ Convenience method to define the prompt for each test image. 
@@ -102,7 +101,7 @@ class _VideoAssets(_VideoAssetsBase): VideoAsset("sample_demo_1.mp4"), ]) - def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: + def prompts(self, prompts: _VideoAssetPrompts) -> list[str]: return [prompts["sample_demo_1"]] @@ -175,7 +174,7 @@ def dynamo_reset(): @pytest.fixture -def example_prompts() -> List[str]: +def example_prompts() -> list[str]: prompts = [] for filename in _TEST_PROMPTS: prompts += _read_prompts(filename) @@ -197,7 +196,7 @@ class DecoderPromptType(Enum): @pytest.fixture def example_encoder_decoder_prompts( -) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]: +) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]: ''' Returns an encoder prompt list and a decoder prompt list, wherein each pair of same-index entries in both lists corresponds to an (encoder prompt, @@ -229,7 +228,7 @@ def example_encoder_decoder_prompts( @pytest.fixture -def example_long_prompts() -> List[str]: +def example_long_prompts() -> list[str]: prompts = [] for filename in _LONG_PROMPTS: prompts += _read_prompts(filename) @@ -273,11 +272,11 @@ class HfRunner: model_name: str, dtype: str = "half", *, - model_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[dict[str, Any]] = None, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, - auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, postprocess_inputs: Callable[..., BatchEncoding] = identity, ) -> None: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -334,11 +333,11 @@ class HfRunner: def get_inputs( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> List[BatchEncoding]: + ) -> list[BatchEncoding]: if images is not None: assert len(prompts) == len(images) @@ -348,9 +347,9 @@ class HfRunner: if audios is not None: assert len(prompts) == len(audios) - all_inputs: List[BatchEncoding] = [] + all_inputs: list[BatchEncoding] = [] for i, prompt in enumerate(prompts): - processor_kwargs: Dict[str, Any] = { + processor_kwargs: dict[str, Any] = { "text": prompt, "return_tensors": "pt", } @@ -370,7 +369,7 @@ class HfRunner: return all_inputs - def classify(self, prompts: List[str]) -> List[str]: + def classify(self, prompts: list[str]) -> list[str]: # output is final logits all_inputs = self.get_inputs(prompts) outputs = [] @@ -383,18 +382,18 @@ class HfRunner: def generate( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: all_inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - outputs: List[Tuple[List[List[int]], List[str]]] = [] + outputs: list[tuple[list[list[int]], list[str]]] = [] for inputs in all_inputs: output_ids = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), @@ -412,13 +411,13 @@ class HfRunner: def generate_greedy( self, - prompts: List[str], + prompts: list[str], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[int], str]]: + ) -> list[tuple[list[int], str]]: outputs = 
self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, @@ -432,10 +431,10 @@ class HfRunner: def generate_beam_search( self, - prompts: List[str], + prompts: list[str], beam_width: int, max_tokens: int, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, @@ -453,19 +452,19 @@ class HfRunner: def generate_greedy_logprobs( self, - prompts: List[str], + prompts: list[str], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[List[torch.Tensor]]: + ) -> list[list[torch.Tensor]]: all_inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - all_logprobs: List[List[torch.Tensor]] = [] + all_logprobs: list[list[torch.Tensor]] = [] for inputs in all_inputs: output = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), @@ -483,11 +482,11 @@ class HfRunner: def _hidden_states_to_seq_logprobs( self, - hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], - ) -> List[torch.Tensor]: + hidden_states: tuple[tuple[torch.Tensor, ...], ...], + ) -> list[torch.Tensor]: output_embeddings = self.model.get_output_embeddings() - seq_logprobs: List[torch.Tensor] = [] + seq_logprobs: list[torch.Tensor] = [] for _, hidden_state in enumerate(hidden_states): last_hidden_states = hidden_state[-1][0] logits = torch.matmul( @@ -503,14 +502,14 @@ class HfRunner: def _hidden_states_to_logprobs( self, - hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], + hidden_states: tuple[tuple[torch.Tensor, ...], ...], num_logprobs: int, - ) -> Tuple[List[Dict[int, float]], int]: + ) -> tuple[list[dict[int, float]], int]: seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) output_len = len(hidden_states) # convert to dict - seq_logprobs_lst: List[Dict[int, float]] = [] + seq_logprobs_lst: list[dict[int, float]] = [] for tok_idx, tok_logprobs in enumerate(seq_logprobs): # drop prompt logprobs if tok_idx == 0: @@ -530,22 +529,22 @@ class HfRunner: def generate_greedy_logprobs_limit( self, - prompts: List[str], + prompts: list[str], max_tokens: int, num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, **kwargs: Any, - ) -> List[TokensTextLogprobs]: + ) -> list[TokensTextLogprobs]: all_inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - all_logprobs: List[List[Dict[int, float]]] = [] - all_output_ids: List[List[int]] = [] - all_output_strs: List[str] = [] + all_logprobs: list[list[dict[int, float]]] = [] + all_output_ids: list[list[int]] = [] + all_output_strs: list[str] = [] for inputs in all_inputs: output = self.model.generate( @@ -577,23 +576,23 @@ class HfRunner: def generate_encoder_decoder_greedy_logprobs_limit( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, images: Optional[PromptImageInput] = None, **kwargs: Any, - ) -> List[TokensTextLogprobs]: + ) -> list[TokensTextLogprobs]: ''' Greedy logprobs generation for vLLM encoder/decoder models ''' - all_logprobs: List[List[Dict[int, float]]] = [] - all_output_ids: List[List[int]] = [] - all_output_strs: List[str] = [] + all_logprobs: list[list[dict[int, float]]] = [] + all_output_ids: list[list[int]] = [] + 
all_output_strs: list[str] = [] for i, (encoder_prompt, decoder_prompt) in enumerate( to_enc_dec_tuple_list(encoder_decoder_prompts)): - processor_kwargs: Dict[str, Any] = { + processor_kwargs: dict[str, Any] = { "text": encoder_prompt, "return_tensors": "pt", } @@ -641,10 +640,10 @@ class HfRunner: return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] - def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: + def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]: return self.model.encode(prompts) - def predict(self, prompts: List[List[str]]) -> torch.Tensor: + def predict(self, prompts: list[list[str]]) -> torch.Tensor: return self.model.predict(prompts, convert_to_tensor=True) def __enter__(self): @@ -699,11 +698,11 @@ class VllmRunner: def get_inputs( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> List[TextPrompt]: + ) -> list[TextPrompt]: if images is not None: assert len(prompts) == len(images) @@ -733,13 +732,13 @@ class VllmRunner: def generate( self, - prompts: List[str], + prompts: list[str], sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, @@ -749,12 +748,12 @@ class VllmRunner: sampling_params=sampling_params, **kwargs) - outputs: List[Tuple[List[List[int]], List[str]]] = [] + outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: prompt_str = req_output.prompt prompt_ids = req_output.prompt_token_ids - req_sample_output_ids: List[List[int]] = [] - req_sample_output_strs: List[str] = [] + req_sample_output_ids: list[list[int]] = [] + req_sample_output_strs: list[str] = [] for sample in req_output.outputs: output_str = sample.text output_ids = list(sample.token_ids) @@ -765,9 +764,9 @@ class VllmRunner: @staticmethod def _final_steps_generate_w_logprobs( - req_outputs: List[RequestOutput], - ) -> List[TokensTextLogprobsPromptLogprobs]: - outputs: List[TokensTextLogprobsPromptLogprobs] = [] + req_outputs: list[RequestOutput], + ) -> list[TokensTextLogprobsPromptLogprobs]: + outputs: list[TokensTextLogprobsPromptLogprobs] = [] for req_output in req_outputs: assert len(req_output.outputs) > 0 for sample in req_output.outputs: @@ -780,14 +779,14 @@ class VllmRunner: def generate_w_logprobs( self, - prompts: List[str], + prompts: list[str], sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, **kwargs: Any, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: inputs = self.get_inputs(prompts, images=images, videos=videos, @@ -806,10 +805,10 @@ class VllmRunner: def generate_encoder_decoder_w_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], sampling_params: SamplingParams, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: ''' 
Logprobs generation for vLLM encoder/decoder models ''' @@ -826,13 +825,13 @@ class VllmRunner: def generate_greedy( self, - prompts: List[str], + prompts: list[str], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, - ) -> List[Tuple[List[int], str]]: + ) -> list[tuple[list[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, @@ -845,18 +844,18 @@ class VllmRunner: def generate_greedy_logprobs( self, - prompts: List[str], + prompts: list[str], max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, - stop_token_ids: Optional[List[int]] = None, - stop: Optional[List[str]] = None, + stop_token_ids: Optional[list[int]] = None, + stop: Optional[list[str]] = None, **kwargs: Any, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, @@ -874,12 +873,12 @@ class VllmRunner: def generate_encoder_decoder_greedy_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, - ) -> Union[List[TokensTextLogprobs], - List[TokensTextLogprobsPromptLogprobs]]: + ) -> Union[list[TokensTextLogprobs], + list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, @@ -895,10 +894,10 @@ class VllmRunner: def generate_beam_search( self, - prompts: Union[List[str], List[List[int]]], + prompts: Union[list[str], list[list[int]]], beam_width: int, max_tokens: int, - ) -> List[Tuple[List[List[int]], List[str]]]: + ) -> list[tuple[list[list[int]], list[str]]]: if is_list_of(prompts, str, check="all"): prompts = [TextPrompt(prompt=prompt) for prompt in prompts] else: @@ -915,17 +914,17 @@ class VllmRunner: returned_outputs.append((token_ids, texts)) return returned_outputs - def classify(self, prompts: List[str]) -> List[List[float]]: + def classify(self, prompts: list[str]) -> list[list[float]]: req_outputs = self.model.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] def encode( self, - prompts: List[str], + prompts: list[str], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, - ) -> List[List[float]]: + ) -> list[list[float]]: inputs = self.get_inputs(prompts, images=images, videos=videos, @@ -936,9 +935,9 @@ class VllmRunner: def score( self, - text_1: Union[str, List[str]], - text_2: Union[str, List[str]], - ) -> List[float]: + text_1: Union[str, list[str]], + text_2: Union[str, list[str]], + ) -> list[float]: req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index 7d3ccaad..83259b69 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Iterable, Optional +from collections.abc import Iterable 
+from typing import Callable, Optional import pytest diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index a7dafcf8..e23b8718 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List import pytest @@ -137,9 +136,9 @@ def prep_prompts(batch_size: int): The prompt is just under 10k tokens; sliding window is 4k so the answer is outside sliding window, but should still be correct. """ - prompts: List[str] = [] - answer: List[int] = [] - indices: List[int] = [] + prompts: list[str] = [] + answer: list[int] = [] + indices: list[int] = [] random.seed(1) for _ in range(batch_size): idx = random.randint(30, 90) @@ -158,7 +157,7 @@ def prep_prompts(batch_size: int): return prompts, answer, indices -def check_answers(indices: List[int], answer: List[int], outputs: List[str]): +def check_answers(indices: list[int], answer: list[int], outputs: list[str]): answer2 = [int(text[0:2].strip()) for text in outputs] print(list(zip(indices, zip(answer, answer2)))) numok = 0 @@ -170,7 +169,7 @@ def check_answers(indices: List[int], answer: List[int], outputs: List[str]): assert frac_ok > 0.7 -def check_window(prompts: List[str]): +def check_window(prompts: list[str]): def inner(llm: LLM): sliding_window = llm.llm_engine.model_config.get_sliding_window() diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index d8cf0bec..250c9a74 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.core.block.block_table import BlockTable @@ -32,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - block_tables: List[BlockTable] = [] + block_tables: list[BlockTable] = [] for i in range(5): assert allocator.get_num_free_blocks( device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc @@ -77,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): num_immutable_blocks_per_alloc = len( chunked_tokens) - num_mutable_blocks_per_alloc - block_tables: List[BlockTable] = [] + block_tables: list[BlockTable] = [] for alloc_i in range(1, 6): block_tables.append( @@ -272,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - appended_so_far: List[int] = [] + appended_so_far: list[int] = [] for append in chunk_list(token_ids_to_append, append_size): block_table.append_token_ids(append) appended_so_far.extend(append) diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 0ca2a0b8..4b9454c8 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional import pytest @@ -14,7 +14,7 @@ class TestNaiveBlockAllocator: def create_allocate_lambda(allocate_type: str, allocator: NaiveBlockAllocator, prev_block: Optional[Block], - token_ids: List[int]): + token_ids: list[int]): if allocate_type == "immutable": allocate_block = lambda: allocator.allocate_immutable_block( prev_block=prev_block, 
token_ids=token_ids) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bf40b334..50233624 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -2,7 +2,7 @@ import math import random -from typing import List, Optional +from typing import Optional from unittest.mock import MagicMock import pytest @@ -123,11 +123,11 @@ class TestPrefixCachingBlock: @staticmethod def create_chain(block_size: int, - token_ids: List[int], - num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]: + token_ids: list[int], + num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks: List[PrefixCachingBlock] = [] + blocks: list[PrefixCachingBlock] = [] num_blocks = math.ceil( len(token_ids) / block_size) + num_empty_trailing_blocks @@ -161,7 +161,7 @@ class TestPrefixCachingBlockAllocator: @staticmethod def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator, prev_block: Optional[Block], - token_ids: List[int]): + token_ids: list[int]): if allocate_type == "immutable": allocate_block = lambda: allocator.allocate_immutable_block( prev_block=prev_block, token_ids=token_ids) @@ -839,13 +839,13 @@ class TestPrefixCachingBlockAllocator: @staticmethod def create_immutable_chain( block_size: int, - token_ids: List[int], + token_ids: list[int], allocator: PrefixCachingBlockAllocator, extra_hash: Optional[int] = None, - ) -> List[PrefixCachingBlock]: + ) -> list[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks: List[Block] = [] + blocks: list[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) if num_blocks == 0: diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 8e0b9e63..161b32f0 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List from unittest.mock import MagicMock import pytest # noqa @@ -46,7 +45,7 @@ def test_simple(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(num_seq_group): @@ -93,7 +92,7 @@ def test_chunk(): cache_config.num_cpu_blocks = 32 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -145,7 +144,7 @@ def test_concurrent_chunking(): cache_config.num_cpu_blocks = 32 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -226,8 +225,8 @@ def test_short_prompts_jump_long_prompts_in_queue(): cache_config.num_cpu_blocks = 3200 # large KV cache size for large requests cache_config.num_gpu_blocks = 3200 scheduler = Scheduler(scheduler_config, cache_config, None) - long_seqs: List[SequenceGroup] = [] - short_seqs: List[SequenceGroup] = [] + long_seqs: list[SequenceGroup] = [] + short_seqs: list[SequenceGroup] = [] # Add 2 large seq groups to scheduler. 
for i in range(2): @@ -368,7 +367,7 @@ def test_complex(): cache_config.num_cpu_blocks = 64 cache_config.num_gpu_blocks = 64 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -439,7 +438,7 @@ def test_maximal_decoding(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -533,7 +532,7 @@ def test_prompt_limit(): cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=48, @@ -565,7 +564,7 @@ def test_prompt_limit_exceed(): cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] _, seq_group = create_dummy_prompt("2", prompt_length=48, block_size=block_size) @@ -699,7 +698,7 @@ def test_chunked_prefill_max_seqs(): cache_config.num_cpu_blocks = 128 cache_config.num_gpu_blocks = 128 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=65, @@ -758,7 +757,7 @@ def test_prefix_caching(): cache_config.num_cpu_blocks = 0 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): @@ -800,7 +799,7 @@ def test_prefix_caching_with_concurrent_partial_prefills(): cache_config.num_cpu_blocks = 0 cache_config.num_gpu_blocks = 32 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(2): diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 66bc5257..9e461d4e 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -2,7 +2,6 @@ import time from collections import deque -from typing import List, Set, Tuple from unittest.mock import MagicMock import pytest # noqa @@ -57,7 +56,7 @@ def test_scheduler_abort_seq_group(): # Add multiple seq groups to scheduler. num_seq_group = 4 - request_ids: Set[str] = set() + request_ids: set[str] = set() for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), block_size) scheduler.add_seq_group(seq_group) @@ -83,7 +82,7 @@ def test_scheduler_schedule_simple(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. for i in range(num_seq_group): @@ -221,7 +220,7 @@ def test_scheduler_max_seqs(): cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - all_seq_groups: List[SequenceGroup] = [] + all_seq_groups: list[SequenceGroup] = [] # Add seq groups to scheduler. 
for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), @@ -480,7 +479,7 @@ def test_prefill_schedule_max_lora(): num_cpu_blocks=64, num_gpu_blocks=64) budget = create_token_budget(token_budget=120) - curr_loras: Set[int] = set() + curr_loras: set[int] = set() for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -651,8 +650,8 @@ def test_schedule_swapped_max_loras(): block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32) - curr_loras: Set[int] = set() - blocks_to_swap_out: List[Tuple[int, int]] = [] + curr_loras: set[int] = set() + blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -683,7 +682,7 @@ def test_schedule_swapped_cannot_swap_in(): num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -714,7 +713,7 @@ def test_infeasible_swap(): num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -752,7 +751,7 @@ def test_schedule_swapped_blocks_to_copy(): block_size=block_size) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out: List[Tuple[int, int]] = [] + blocks_to_swap_out: list[tuple[int, int]] = [] scheduler._swap_out(seq_group, blocks_to_swap_out) scheduler._add_seq_group_to_swapped(seq_group) diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index a4e3c73a..c6049b26 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest # noqa from vllm.config import CacheConfig, SchedulerConfig @@ -48,7 +46,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group scheduler = Scheduler(scheduler_config, cache_config, None) - running: List[SequenceGroup] = [] + running: list[SequenceGroup] = [] # Add seq groups to scheduler. 
req_id_list = [] diff --git a/tests/core/utils.py b/tests/core/utils.py index fb77dccc..ba4265e3 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -2,9 +2,8 @@ import time from collections import defaultdict -from typing import Any, Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple +from collections.abc import Sequence as GenericSequence +from typing import Any, Optional from vllm import SamplingParams from vllm.core.scheduler import Scheduler, SchedulerOutputs @@ -20,10 +19,10 @@ def create_dummy_prompt( block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, best_of: int = 1, - prompt_tokens: Optional[List[int]] = None, + prompt_tokens: Optional[list[int]] = None, min_tokens: int = 0, max_tokens: int = 16, -) -> Tuple[Sequence, SequenceGroup]: +) -> tuple[Sequence, SequenceGroup]: if not block_size: block_size = prompt_length @@ -48,7 +47,7 @@ def create_dummy_prompt( return prompt, seq_group -def create_dummy_lora_sequence(request_id: int, token_ids: List[int], +def create_dummy_lora_sequence(request_id: int, token_ids: list[int], block_size: int, lora_int_id: int) -> Sequence: return Sequence(seq_id=request_id, inputs=token_inputs(token_ids), @@ -58,7 +57,7 @@ def create_dummy_lora_sequence(request_id: int, token_ids: List[int], lora_int_id=lora_int_id)) -def create_dummy_sequence(request_id: int, token_ids: List[int], +def create_dummy_sequence(request_id: int, token_ids: list[int], block_size: int) -> Sequence: return Sequence( seq_id=request_id, @@ -74,7 +73,7 @@ def create_dummy_prompt_encoder_decoder( block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, best_of: int = 1, -) -> Tuple[Sequence, Sequence, SequenceGroup]: +) -> tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -125,7 +124,7 @@ def create_seq_group( prompt_token_ids = [0] * seq_prompt_len - seqs: List[Sequence] = [] + seqs: list[Sequence] = [] for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, @@ -241,7 +240,7 @@ class SchedulerProxy: def __init__(self, scheduler: Scheduler): self.scheduler_ = scheduler - self.call_history: Dict[str, List[Any]] = defaultdict(list) + self.call_history: dict[str, list[Any]] = defaultdict(list) def __getattr__(self, name: str) -> Any: @@ -253,6 +252,6 @@ class SchedulerProxy: return wrapper def last_schedule_ret( - self, ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, Any]: + self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: _, _, ret = self.call_history["schedule"][-1] return ret diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index bc577064..2e575f95 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import List, Literal, NamedTuple, Optional +from typing import Literal, NamedTuple, Optional import pytest @@ -28,8 +28,8 @@ class EPTestOptions(NamedTuple): @dataclass class EPTestSettings: - parallel_setups: List[ParallelSetup] - distributed_backends: List[str] + parallel_setups: list[ParallelSetup] + distributed_backends: list[str] task: TaskOption test_options: EPTestOptions diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 390ed91c..5562b368 100644 --- 
a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -9,7 +9,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import List, Literal, NamedTuple, Optional +from typing import Literal, NamedTuple, Optional import pytest @@ -38,14 +38,14 @@ class PPTestOptions(NamedTuple): @dataclass class PPTestSettings: - parallel_setups: List[ParallelSetup] + parallel_setups: list[ParallelSetup] # NOTE: the length of distributed_backends and # vllm_major_versions should be the same, and they # are first zipped together to iterate over all # test settings. - distributed_backends: List[str] + distributed_backends: list[str] # vllm major version: "0" for V0, "1" for V1 - vllm_major_versions: List[str] + vllm_major_versions: list[str] task: TaskOption test_options: PPTestOptions diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 4c42a0ed..2c323edf 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -2,7 +2,6 @@ import multiprocessing import os -from typing import Dict, List import pytest import torch @@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables def distributed_run(fn, world_size): number_of_processes = world_size - processes: List[multiprocessing.Process] = [] + processes: list[multiprocessing.Process] = [] for i in range(number_of_processes): - env: Dict[str, str] = {} + env: dict[str, str] = {} env['RANK'] = str(i) env['LOCAL_RANK'] = str(i) env['WORLD_SIZE'] = str(number_of_processes) diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index 59fa7cc9..711c2441 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -3,7 +3,6 @@ import multiprocessing import random import time -from typing import List import numpy as np import torch.distributed as dist @@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup from vllm.utils import get_ip, get_open_port, update_environment_variables -def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: +def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]: np.random.seed(seed) sizes = np.random.randint(1, 10_000, n) # on average, each array will have 5k elements diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index d0e4f862..cb772fc7 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -3,7 +3,7 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. 
""" -from typing import List, Optional, Tuple +from typing import Optional import pytest from transformers import AutoModelForSeq2SeqLM @@ -22,7 +22,7 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [ def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], decoder_prompt_type: DecoderPromptType, ): """Sanitize vllm output to be comparable with hf output.""" diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index c0a339e4..91c9ba4a 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import pytest @@ -22,8 +22,8 @@ class CustomUniExecutor(UniProcExecutor): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict] = None) -> list[Any]: # Drop marker to show that this was ran with open(".marker", "w"): ... diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index f1fe58e3..9b2f45de 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -4,7 +4,7 @@ import asyncio from concurrent.futures import ThreadPoolExecutor from functools import partial from time import sleep -from typing import Any, List, Tuple +from typing import Any import pytest @@ -17,7 +17,7 @@ from vllm.worker.worker_base import WorkerWrapperBase class DummyWorkerWrapper(WorkerWrapperBase): """Dummy version of vllm.worker.worker.Worker""" - def worker_method(self, worker_input: Any) -> Tuple[int, Any]: + def worker_method(self, worker_input: Any) -> tuple[int, Any]: sleep(0.05) if isinstance(worker_input, Exception): @@ -27,7 +27,7 @@ class DummyWorkerWrapper(WorkerWrapperBase): return self.rpc_rank, input -def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: +def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: result_handler = ResultHandler() vllm_config = VllmConfig() workers = [ diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 0f633bb2..62d167aa 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, List, Optional +from typing import Any, Optional import pytest @@ -21,8 +21,8 @@ def vllm_model(vllm_runner): def _test_stopping(llm_engine: LLMEngine, expected_output: str, expected_reason: Any, - stop: Optional[List[str]] = None, - stop_token_ids: Optional[List[int]] = None, + stop: Optional[list[str]] = None, + stop_token_ids: Optional[list[int]] = None, include_in_output: bool = False, use_async_output_proc: bool = False) -> None: llm_engine.add_request( diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 77c80b2f..710bad4e 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm import LLM @@ -63,7 +61,7 @@ def test_multi_chat(): @pytest.mark.parametrize("image_urls", [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: List[str]): +def test_chat_multi_image(image_urls: list[str]): llm = LLM( 
model="microsoft/Phi-3.5-vision-instruct", dtype="bfloat16", diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index a65235cc..6438743b 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import List import pytest @@ -45,8 +44,8 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[PoolingRequestOutput], - o2: List[PoolingRequestOutput]): +def assert_outputs_equal(o1: list[PoolingRequestOutput], + o2: list[PoolingRequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 910e1a45..9a895c92 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import weakref -from typing import List import pytest @@ -43,7 +42,7 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): +def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 19d4735b..eca5d184 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -10,7 +10,6 @@ import asyncio import io import time from statistics import mean, median -from typing import List import librosa import pytest @@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request): audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] _ = await bound_transcribe(model, sem, client, (audio, sr), "") - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] for sample in data: audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] task = asyncio.create_task( diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py index ea504f3d..5ce5d928 100644 --- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py +++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from transformers import AutoTokenizer @@ -180,7 +178,7 @@ def test_reasoning( ): output = tokenizer.tokenize(param_dict["output"]) # decode everything to tokens - output_tokens: List[str] = [ + output_tokens: list[str] = [ tokenizer.convert_tokens_to_string([token]) for token in output ] parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py index 2157e059..01e43130 100644 --- a/tests/entrypoints/openai/reasoning_parsers/utils.py +++ b/tests/entrypoints/openai/reasoning_parsers/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Union +from typing import Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) @@ -33,10 +33,10 @@ class 
StreamingReasoningReconstructor: def run_reasoning_extraction( reasoning_parser: ReasoningParser, - model_output: List[str], + model_output: list[str], request: Union[ChatCompletionRequest, None] = None, streaming: bool = False, -) -> Tuple[Optional[str], Optional[str]]: +) -> tuple[Optional[str], Optional[str]]: if streaming: reconstructor = run_reasoning_extraction_streaming( reasoning_parser, @@ -55,9 +55,9 @@ def run_reasoning_extraction( def run_reasoning_extraction_nonstreaming( reasoning_parser: ReasoningParser, - model_output: List[str], + model_output: list[str], request: Union[ChatCompletionRequest, None] = None, -) -> Tuple[Optional[str], Optional[str]]: +) -> tuple[Optional[str], Optional[str]]: request = request or ChatCompletionRequest(messages=[], model="test-model") return reasoning_parser.extract_reasoning_content( model_output=''.join(model_output), request=request) @@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming( def run_reasoning_extraction_streaming( reasoning_parser: ReasoningParser, - model_deltas: List[str], + model_deltas: list[str], request: Union[ChatCompletionRequest, None] = None, ) -> StreamingReasoningReconstructor: request = request or ChatCompletionRequest(messages=[], model="test-model") reconstructor = StreamingReasoningReconstructor() previous_text = "" - previous_tokens: List[int] = [] + previous_tokens: list[int] = [] for delta in model_deltas: token_delta = [ reasoning_parser.vocab.get(token) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 7e08fdaf..56fb2932 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List - import openai import pytest import pytest_asyncio @@ -41,7 +39,7 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_audio() -> Dict[str, str]: +def base64_encoded_audio() -> dict[str, str]: return { audio_url: encode_audio_base64(*fetch_audio(audio_url)) for audio_url in TEST_AUDIO_URLS @@ -107,7 +105,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, str]): + base64_encoded_audio: dict[str, str]): messages = [{ "role": @@ -165,7 +163,7 @@ async def test_single_chat_session_audio_base64encoded( @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_single_chat_session_input_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, str]): + base64_encoded_audio: dict[str, str]): messages = [{ "role": "user", @@ -255,7 +253,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -277,7 +275,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, + base64_encoded_audio: dict[str, str]): messages = [{ "role": @@ -315,7 +313,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: 
list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -337,7 +335,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: Dict[str, str]): + base64_encoded_audio: dict[str, str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index a970981b..e7bf974f 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -2,7 +2,6 @@ import asyncio from http import HTTPStatus -from typing import List import openai import pytest @@ -17,7 +16,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @pytest.fixture(scope='module') -def server_args(request: pytest.FixtureRequest) -> List[str]: +def server_args(request: pytest.FixtureRequest) -> list[str]: """ Provide extra arguments to the server via indirect parametrization Usage: diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d7ed4afa..25e4595c 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -3,7 +3,7 @@ # imports for guided decoding tests import json import re -from typing import Dict, List, Optional +from typing import Optional import jsonschema import openai # use the official client for correctness check @@ -190,7 +190,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]): - params: Dict = { + params: dict = { "messages": [{ "role": "system", "content": "You are a helpful assistant." @@ -232,7 +232,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI, ) async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - params: Dict = { + params: dict = { "messages": [{ "role": "system", "content": "You are a helpful assistant." 
@@ -343,7 +343,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 28671cc2..1d9aa497 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -5,7 +5,7 @@ import json import re import shutil from tempfile import TemporaryDirectory -from typing import Dict, List, Optional +from typing import Optional import jsonschema import openai # use the official client for correctness check @@ -287,7 +287,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]): - params: Dict = { + params: dict = { "prompt": ["A robot may not injure another robot", "My name is"], "model": model_name, } @@ -331,7 +331,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -364,7 +364,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): max_tokens=max_tokens, n=n, stream=True) - chunks: List[List[str]] = [[] for i in range(n)] + chunks: list[list[str]] = [[] for i in range(n)] finish_reason_count = 0 async for chunk in stream: index = chunk.choices[0].index diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index a37169f5..0d1c936d 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -86,7 +86,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): - # test List[str] + # test list[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." @@ -106,7 +106,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.usage.prompt_tokens == 33 assert embeddings.usage.total_tokens == 33 - # test List[List[int]] + # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] embedding_response = await client.embeddings.create( diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index 11d3bfaf..72ab12c5 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): - # test List[str] + # test list[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str): assert poolings.usage.prompt_tokens == 25 assert poolings.usage.total_tokens == 25 - # test List[List[int]] + # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] response = requests.post( diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index ad8159af..c9fa192f 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -2,7 +2,7 @@ import contextlib import os -from typing import Any, List, NamedTuple +from typing import Any, NamedTuple import openai # use the official client for correctness check import pytest @@ -40,7 +40,7 @@ def server(): class TestCase(NamedTuple): model_name: str - base_url: List[str] + base_url: list[str] api_key: str expected_error: Any diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index ab928540..36d62224 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List - import openai import pytest import pytest_asyncio @@ -49,7 +47,7 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_video() -> Dict[str, str]: +def base64_encoded_video() -> dict[str, str]: return { video_url: encode_video_base64(fetch_video(video_url)) for video_url in TEST_VIDEO_URLS @@ -151,7 +149,7 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video_base64encoded( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: Dict[str, str]): + base64_encoded_video: dict[str, str]): messages = [{ "role": @@ -209,7 +207,7 @@ async def test_single_chat_session_video_base64encoded( @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) async def test_single_chat_session_video_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: Dict[str, str]): + base64_encoded_video: dict[str, str]): messages = [{ "role": @@ -279,7 +277,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -302,7 +300,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, - video_urls: List[str]): + video_urls: list[str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index c954fca6..d605394f 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List - import openai import pytest import pytest_asyncio @@ -50,7 +48,7 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_image() -> Dict[str, str]: +def base64_encoded_image() -> dict[str, str]: return { image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS @@ -152,7 +150,7 @@ async def test_single_chat_session_image_beamsearch(client: 
openai.AsyncOpenAI, @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: Dict[str, str]): + base64_encoded_image: dict[str, str]): messages = [{ "role": @@ -210,7 +208,7 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: Dict[str, str]): + base64_encoded_image: dict[str, str]): messages = [{ "role": @@ -280,7 +278,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, temperature=0.0, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta @@ -303,7 +301,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, "image_urls", [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, - image_urls: List[str]): + image_urls: list[str]): messages = [{ "role": diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index cee52745..100aca6f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict - import pytest import requests @@ -49,7 +47,7 @@ def server(): @pytest.fixture(scope="session") -def base64_encoded_image() -> Dict[str, str]: +def base64_encoded_image() -> dict[str, str]: return { image_url: encode_image_base64(fetch_image(image_url)) for image_url in TEST_IMAGE_URLS diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index 788efa86..fbbbc1fb 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List from unittest.mock import MagicMock import pytest @@ -125,7 +124,7 @@ TEST_CASES = [ @pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES) def test_tool_call(streaming: bool, model_output: str, - expected_tool_calls: List[FunctionCall]): + expected_tool_calls: list[FunctionCall]): mock_tokenizer = MagicMock() tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( mock_tokenizer) diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index 57ec9865..6ad5aa26 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, List, Tuple, Union +from collections.abc import Iterable +from typing import Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, @@ -12,7 +13,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser class StreamingToolReconstructor: def __init__(self, assert_one_tool_per_delta: bool = True): - self.tool_calls: List[ToolCall] = [] + self.tool_calls: list[ToolCall] = [] self.other_content: str = "" self._assert_one_tool_per_delta = assert_one_tool_per_delta @@ -72,7 +73,7 @@ def 
run_tool_extraction( request: Union[ChatCompletionRequest, None] = None, streaming: bool = False, assert_one_tool_per_delta: bool = True, -) -> Tuple[Union[str, None], List[ToolCall]]: +) -> tuple[Union[str, None], list[ToolCall]]: if streaming: reconstructor = run_tool_extraction_streaming( tool_parser, @@ -106,7 +107,7 @@ def run_tool_extraction_streaming( reconstructor = StreamingToolReconstructor( assert_one_tool_per_delta=assert_one_tool_per_delta) previous_text = "" - previous_tokens: List[int] = [] + previous_tokens: list[int] = [] for delta in model_deltas: token_delta = [ tool_parser.vocab.get(token) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 34dcf91c..a21d642b 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch @@ -19,7 +19,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: def ref_dynamic_per_token_quant(x: torch.tensor, quant_dtype: torch.dtype, scale_ub: Optional[torch.tensor] = None) \ - -> Tuple[torch.tensor, torch.tensor]: + -> tuple[torch.tensor, torch.tensor]: assert quant_dtype in [torch.int8, FP8_DTYPE] if scale_ub is not None: @@ -68,7 +68,7 @@ def ref_dynamic_per_token_quant(x: torch.tensor, # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant # kernel def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ - -> Tuple[torch.tensor, torch.tensor]: + -> tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 2e70b1db..cf0f21ce 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Type import pytest import torch @@ -86,7 +85,7 @@ def test_act_and_mul( @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_activation( - activation: Type[torch.nn.Module], + activation: type[torch.nn.Module], num_tokens: int, d: int, dtype: torch.dtype, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index b667d8d9..0fe10d76 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -85,8 +85,8 @@ def ref_single_query_cached_kv_attention( block_table = block_tables_lst[i] seq_len = int(seq_lens_lst[i]) - keys_lst: List[torch.Tensor] = [] - values_lst: List[torch.Tensor] = [] + keys_lst: list[torch.Tensor] = [] + values_lst: list[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size @@ -133,7 +133,7 @@ def test_paged_attention( kv_cache_factory, version: str, num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, use_alibi: bool, block_size: int, @@ -166,7 +166,7 @@ def test_paged_attention( # Create the block tables. 
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables_lst: List[List[int]] = [] + block_tables_lst: list[list[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) @@ -334,7 +334,7 @@ def test_paged_attention( def ref_multi_query_kv_attention( - cu_seq_lens: List[int], + cu_seq_lens: list[int], query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -342,7 +342,7 @@ def ref_multi_query_kv_attention( dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 - ref_outputs: List[torch.Tensor] = [] + ref_outputs: list[torch.Tensor] = [] for i in range(num_seqs): start_idx = cu_seq_lens[i] end_idx = cu_seq_lens[i + 1] @@ -378,7 +378,7 @@ def ref_multi_query_kv_attention( @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, seed: int, diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index e653d34d..3025ae0f 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -87,8 +87,8 @@ def ref_single_query_cached_kv_attention( block_table = block_tables_lst[i] seq_len = int(seq_lens_lst[i]) - keys_lst: List[torch.Tensor] = [] - values_lst: List[torch.Tensor] = [] + keys_lst: list[torch.Tensor] = [] + values_lst: list[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size @@ -162,7 +162,7 @@ def test_paged_attention( kv_cache_factory, version: str, num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, use_alibi: bool, block_size: int, @@ -331,7 +331,7 @@ def test_paged_attention( def ref_multi_query_kv_attention( - cu_seq_lens: List[int], + cu_seq_lens: list[int], query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -376,7 +376,7 @@ def ref_multi_query_kv_attention( @torch.inference_mode() def test_varlen_blocksparse_attention_prefill( num_seqs: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, blocksparse_local_blocks: int, blocksparse_vert_stride: int, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index fb368874..b55ebd96 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List, Tuple import pytest import torch @@ -74,7 +73,7 @@ def test_copy_blocks( src_blocks = random.sample(range(num_blocks), num_mappings) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping: List[Tuple[int, int]] = [] + block_mapping: list[tuple[int, int]] = [] for i in range(num_mappings): src = src_blocks[i] dst1 = dst_blocks[2 * i] @@ -342,7 +341,7 @@ def test_reshape_and_cache_flash( @torch.inference_mode() def test_swap_blocks( kv_cache_factory, - direction: Tuple[str, str], + direction: tuple[str, str], num_mappings: int, num_heads: int, head_size: int, diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py index 8cc1a6a1..d6570e63 100755 --- a/tests/kernels/test_cascade_flash_attn.py +++ b/tests/kernels/test_cascade_flash_attn.py @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -25,7 +25,7 @@ DTYPES = [torch.float16, torch.bfloat16] @torch.inference_mode() def test_merge_kernel( num_tokens: int, - num_heads: Tuple[int, int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, ): @@ -85,8 +85,8 @@ CASES = [ @pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_cascade( - seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], - num_heads: Tuple[int, int], + seq_lens_and_common_prefix: tuple[list[tuple[int, int]], int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 49fd8ed6..72fc660a 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_cutlass.py`. """ -from typing import Type import pytest import torch @@ -71,7 +70,7 @@ def cutlass_fp8_gemm_helper(m: int, a_scale_group_shape: tuple, b_scale_group_shape: tuple, use_bias: bool, - out_dtype: Type[torch.dtype] = torch.bfloat16, + out_dtype: type[torch.dtype] = torch.bfloat16, device: str = "cuda"): # Test for a cutlass kernel with per-token activation quantization # and per-output channel weight quantization. @@ -109,7 +108,7 @@ def cutlass_int8_gemm_helper(m: int, a_scale_group_shape: tuple, b_scale_group_shape: tuple, use_bias: bool, - out_dtype: Type[torch.dtype] = torch.bfloat16, + out_dtype: type[torch.dtype] = torch.bfloat16, device: str = "cuda"): # Test for a cutlass kernel with per-token activation quantization # and per-output channel weight quantization. @@ -187,7 +186,7 @@ def test_cutlass_int8_gemm(m: int, n: int, k: int, a_scale_group_shape, @pytest.mark.parametrize("use_bias", [True, False]) def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape, b_scale_group_shape, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], use_bias: bool): cutlass_int8_gemm_helper(512, 512, @@ -208,7 +207,7 @@ def test_cutlass_int8_gemm_output_dtype(a_scale_group_shape, reason="FP8 is not supported on this GPU type.") def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape, b_scale_group_shape, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], use_bias: bool): cutlass_fp8_gemm_helper(512, 512, @@ -227,7 +226,7 @@ def test_cutlass_fp8_gemm_output_dtype(a_scale_group_shape, reason="FP8 blockwise is not supported on this GPU type.") def test_cutlass_fp8_blockwise_scale_gemm_dtype(a_scale_group_shape, b_scale_group_shape, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], use_bias: bool): cutlass_fp8_gemm_helper(512, 512, diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py index b0c58047..2890e15d 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -3,7 +3,6 @@ Run `pytest tests/kernels/test_semi_structured.py`. 
""" -from typing import Tuple, Type import pytest import torch @@ -79,7 +78,7 @@ def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor, def make_rand_sparse_tensors( dtype: torch.dtype, m: int, n: int, k: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: a = torch.randn((m, k), device='cuda') b = torch.randn((n, k), device='cuda').t() @@ -167,7 +166,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("m, n, k", MNK_FACTORS) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("use_bias", [True, False]) -def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype], +def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: type[torch.dtype], use_bias: bool): # Create tensors diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 0a93f7ce..547a6349 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -243,7 +243,7 @@ def _decoder_attn_setup( test_pt: TestPoint, test_rsrcs: TestResources, block_base_addr: int = 0, -) -> Tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: +) -> tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]: ''' Set up test vectors & data structures for self-attention test. @@ -421,7 +421,7 @@ def _enc_dec_cross_attn_setup_reuses_query( test_pt: TestPoint, test_rsrcs: TestResources, block_base_addr: int = 0, -) -> Tuple[PhaseTestParameters, PhaseTestParameters]: +) -> tuple[PhaseTestParameters, PhaseTestParameters]: ''' Set up test vectors & data structures for cross-attention test. diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index b8af89b6..95424e25 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -24,8 +24,8 @@ def ref_paged_attn( query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, - query_lens: List[int], - kv_lens: List[int], + query_lens: list[int], + kv_lens: list[int], block_tables: torch.Tensor, scale: float, sliding_window: Optional[int] = None, @@ -35,7 +35,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs: List[torch.Tensor] = [] + outputs: list[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -88,8 +88,8 @@ def ref_paged_attn( @torch.inference_mode() def test_flash_attn_with_paged_kv( use_out: bool, - kv_lens: List[int], - num_heads: Tuple[int, int], + kv_lens: list[int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, @@ -174,8 +174,8 @@ def test_flash_attn_with_paged_kv( @torch.inference_mode() def test_varlen_with_paged_kv( use_out: bool, - seq_lens: List[Tuple[int, int]], - num_heads: Tuple[int, int], + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], head_size: int, sliding_window: Optional[int], dtype: torch.dtype, diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index f623b001..5ad1137a 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import flashinfer 
import pytest @@ -19,8 +19,8 @@ def ref_paged_attn( query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, - query_lens: List[int], - kv_lens: List[int], + query_lens: list[int], + kv_lens: list[int], block_tables: torch.Tensor, scale: float, sliding_window: Optional[int] = None, @@ -30,7 +30,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs: List[torch.Tensor] = [] + outputs: list[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -78,8 +78,8 @@ def ref_paged_attn( @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode def test_flashinfer_decode_with_paged_kv( - kv_lens: List[int], - num_heads: Tuple[int, int], + kv_lens: list[int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, @@ -168,8 +168,8 @@ def test_flashinfer_decode_with_paged_kv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode -def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], - num_heads: Tuple[int, int], +def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: @@ -270,7 +270,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) def test_flashinfer_prefill_with_paged_fp8_kv( - seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int], + seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: pytest.skip("TODO: fix the accuracy issue") @@ -378,8 +378,8 @@ def test_flashinfer_prefill_with_paged_fp8_kv( @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) @torch.inference_mode def test_flashinfer_decode_with_paged_fp8_kv( - kv_lens: List[int], - num_heads: Tuple[int, int], + kv_lens: list[int], + num_heads: tuple[int, int], head_size: int, dtype: torch.dtype, block_size: int, diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/test_fused_quant_layernorm.py index d4b674b2..7a591f53 100644 --- a/tests/kernels/test_fused_quant_layernorm.py +++ b/tests/kernels/test_fused_quant_layernorm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import pytest import torch @@ -39,7 +39,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: def ref_rms_norm(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, Optional[torch.Tensor]]: if residual is not None: residual = residual.clone() out, residual = rms_norm_layer.forward_native(x, residual) @@ -54,7 +54,7 @@ def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if scale_ub is not None: assert quant_dtype == torch.float8_e4m3fn @@ -78,7 +78,7 @@ def ref_impl(rms_norm_layer: RMSNorm, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, 
torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype, residual, scale_ub) @@ -88,7 +88,7 @@ def ops_dynamic_per_token_quant(weight: torch.Tensor, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if residual is not None: residual = residual.clone() out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS, @@ -102,7 +102,7 @@ def ops_impl(weight: torch.Tensor, quant_dtype: torch.dtype, residual: Optional[torch.Tensor], scale_ub: Optional[torch.Tensor]) \ - -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub) diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 847ca9f4..aa666a46 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path -from typing import List import pytest import torch @@ -16,7 +15,7 @@ GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") def get_gguf_sample_tensors( hidden_size: int, - quant_type: GGMLQuantizationType) -> List[ReaderTensor]: + quant_type: GGMLQuantizationType) -> list[ReaderTensor]: sample_dir = GGUF_SAMPLE filename = f"Quant_{quant_type.name}_{hidden_size}.gguf" sample_file = Path(sample_dir) / filename diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/test_machete_mm.py index bd60526e..5aeaaa65 100644 --- a/tests/kernels/test_machete_mm.py +++ b/tests/kernels/test_machete_mm.py @@ -6,7 +6,7 @@ Run `pytest tests/kernels/test_machete_mm.py`. 
import math from dataclasses import dataclass, fields -from typing import List, Optional, Tuple +from typing import Optional import pytest import torch @@ -45,7 +45,7 @@ MNK_SHAPES = [ (1024, 8192, 4096), ] -GROUP_SIZES_TO_TEST: List[Optional[int]] = [128, -1] +GROUP_SIZES_TO_TEST: list[Optional[int]] = [128, -1] @dataclass @@ -75,7 +75,7 @@ class Tensors: # Ch Scales Type, Tok Scales Type) # NOTE: None "Scale Type" means the act type is floating point # None "Output Type" means the output type is the same as the act type -TestTypeTuple = Tuple[List[torch.dtype], ScalarType, Optional[torch.dtype], +TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool] TEST_TYPES = [ # GPTQ style @@ -136,7 +136,7 @@ def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): return zps if zps is None else -1 * s * (zps.to(s.dtype)) -def group_size_valid(shape: Tuple[int, int, int], +def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool: return group_size is None or group_size == -1 or group_size % shape[2] == 0 @@ -166,7 +166,7 @@ def machete_quantize_and_pack(atype: torch.dtype, return w_ref, w_q_machete, w_s, w_zp -def create_test_tensors(shape: Tuple[int, int, int], +def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int], subset_stride_factor: Optional[int] = None) -> Tensors: @@ -265,7 +265,7 @@ def machete_mm_test_helper(types: TypeConfig, @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_all_schedules(shape, types: TypeConfig): - group_sizes: List[Optional[int]] = [] + group_sizes: list[Optional[int]] = [] if types.group_scale_type is None: group_sizes = [None] else: @@ -294,7 +294,7 @@ def test_machete_all_schedules(shape, types: TypeConfig): ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_heuristic(shape, types: TypeConfig): - group_sizes: List[Optional[int]] = [] + group_sizes: list[Optional[int]] = [] if types.group_scale_type is None: group_sizes = [None] else: diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/test_mamba_mixer2.py index 8c441fcb..abcf3888 100644 --- a/tests/kernels/test_mamba_mixer2.py +++ b/tests/kernels/test_mamba_mixer2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import unittest -from typing import Tuple import pytest import torch @@ -29,7 +28,7 @@ from vllm.utils import update_environment_variables def test_mixer2_gated_norm_multi_gpu( batch_size: int, seq_len: int, - hidden_size_n_groups: Tuple[int, int], + hidden_size_n_groups: tuple[int, int], dtype: torch.dtype, device: str = 'cuda', ): diff --git a/tests/kernels/test_mamba_ssm_ssd.py b/tests/kernels/test_mamba_ssm_ssd.py index 88251311..8f23a9b2 100644 --- a/tests/kernels/test_mamba_ssm_ssd.py +++ b/tests/kernels/test_mamba_ssm_ssd.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Tuple - import pytest import torch import torch.nn.functional as F @@ -134,7 +132,7 @@ def generate_continous_batched_examples(example_lens_by_batch, # given a tuple of lengths for each example in the batch # e.g., example_lens=(8, 4) means take 8 samples from first eg, # 4 examples from second eg, etc - def get_continuous_batch(example_lens: Tuple[int, ...]): + def get_continuous_batch(example_lens: tuple[int, ...]): indices = [] for i, x in enumerate(example_lens): @@ -264,8 +262,8 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, # hold state 
during the cutting process so we know if an # example has been exhausted and needs to cycle - last_taken: Dict = {} # map: eg -> pointer to last taken sample - exhausted: Dict = {} # map: eg -> boolean indicating example is exhausted + last_taken: dict = {} # map: eg -> pointer to last taken sample + exhausted: dict = {} # map: eg -> boolean indicating example is exhausted states = None for Y_min, cu_seqlens, sed_idx, (A, dt, X, B, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index bff7f8e5..eb83b4d6 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from itertools import accumulate, product -from typing import Callable, Dict, List, Optional +from typing import Callable, Optional import pytest import torch @@ -179,7 +179,7 @@ def test_batched_rotary_embedding_multi_lora( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - scaling_factors: List[int] = [1, 2, 4] + scaling_factors: list[int] = [1, 2, 4] rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { "rope_type": "linear", "factor": tuple(scaling_factors) @@ -234,7 +234,7 @@ def test_rope_module_cache(): }) settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, ROPE_SCALINGS, DTYPES) - rope_setting_id_map: Dict[str, int] = {} + rope_setting_id_map: dict[str, int] = {} for setting in product(*settings): head_size, rotary_dim, max_position, base, \ is_neox_stype, rope_scaling, dtype = setting diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index d878ed6f..bbff3e0a 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -4,7 +4,7 @@ Run `pytest tests/kernels/test_triton_scaled_mm.py`. """ import importlib -from typing import Optional, Type +from typing import Optional import pytest import torch @@ -18,7 +18,7 @@ def scaled_mm_torch(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], bias: Optional[torch.Tensor] = None) -> torch.Tensor: out = torch.mm(a.to(torch.float32), b.to(torch.float32)) out = scale_a * out diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 1ee3a332..01097407 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -4,9 +4,9 @@ import itertools import random import unittest +from collections.abc import Sequence from numbers import Number -from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, - Type, Union) +from typing import Any, NamedTuple, Optional, Union import pytest import torch @@ -20,13 +20,13 @@ from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. -DEFAULT_OPCHECK_TEST_UTILS: Tuple[str, ...] = ( +DEFAULT_OPCHECK_TEST_UTILS: tuple[str, ...] = ( "test_schema", "test_autograd_registration", "test_faketensor", ) -ALL_OPCHECK_TEST_UTILS: Tuple[str, ...] = ( +ALL_OPCHECK_TEST_UTILS: tuple[str, ...] 
= ( "test_schema", "test_autograd_registration", "test_faketensor", @@ -50,8 +50,8 @@ class QKVInputs(NamedTuple): query: torch.Tensor key: torch.Tensor value: torch.Tensor - q_seq_lens: List[int] - kv_seq_lens: List[int] + q_seq_lens: list[int] + kv_seq_lens: list[int] class QKVO(NamedTuple): @@ -89,10 +89,10 @@ class PackedQKVInputs(NamedTuple): query: torch.Tensor key: torch.Tensor value: torch.Tensor - q_start_loc_list: Optional[List[int]] - kv_start_loc_list: Optional[List[int]] - q_seq_lens: Optional[List[int]] - kv_seq_lens: Optional[List[int]] + q_start_loc_list: Optional[list[int]] + kv_start_loc_list: Optional[list[int]] + q_seq_lens: Optional[list[int]] + kv_seq_lens: Optional[list[int]] class PackedQKVO(NamedTuple): @@ -146,7 +146,7 @@ class PhaseTestParameters(NamedTuple): def maybe_make_int_tensor( - _list: Optional[List[int]], + _list: Optional[list[int]], device: Union[torch.device, str], ) -> torch.Tensor: ''' @@ -162,7 +162,7 @@ def maybe_make_int_tensor( def maybe_make_long_tensor( - _list: Optional[List[int]], + _list: Optional[list[int]], device: Union[torch.device, str], ) -> torch.Tensor: ''' @@ -177,7 +177,7 @@ def maybe_make_long_tensor( _list, dtype=torch.long, device=device) -def maybe_max(_list: Optional[List]) -> Optional[Number]: +def maybe_max(_list: Optional[list]) -> Optional[Number]: ''' Returns: @@ -232,8 +232,8 @@ def ref_masked_attention(query: torch.Tensor, value: torch.Tensor, scale: float, custom_mask: Optional[torch.Tensor] = None, - q_seq_lens: Optional[List] = None, - kv_seq_lens: Optional[List] = None) -> torch.Tensor: + q_seq_lens: Optional[list] = None, + kv_seq_lens: Optional[list] = None) -> torch.Tensor: ''' "Golden" masked attention reference. Supports two types of masking: @@ -295,10 +295,10 @@ def make_qkv( num_heads: int, head_size: int, device: Union[torch.device, str], - force_kv_seq_lens: Optional[List[int]] = None, + force_kv_seq_lens: Optional[list[int]] = None, attn_type: AttentionType = AttentionType.ENCODER_DECODER, force_max_len: bool = False, -) -> Tuple[QKVInputs, QKVInputs, QKVInputs]: +) -> tuple[QKVInputs, QKVInputs, QKVInputs]: ''' Construct QKV test tensors for self- and cross-attention. @@ -429,8 +429,8 @@ def make_qkv( def pack_tensor( - unpacked_tensor: torch.Tensor, seq_lens: List[int], - device: Union[torch.device, str]) -> Tuple[torch.Tensor, List[int]]: + unpacked_tensor: torch.Tensor, seq_lens: list[int], + device: Union[torch.device, str]) -> tuple[torch.Tensor, list[int]]: ''' Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an unpadded number_of_tokens x num_heads x head_size tensor, where @@ -537,11 +537,11 @@ def make_backend(backend_name: str) -> AttentionBackend: def _make_metadata_tensors( - seq_lens: Optional[List[int]], - context_lens: Optional[List[int]], - encoder_seq_lens: Optional[List[int]], + seq_lens: Optional[list[int]], + context_lens: Optional[list[int]], + encoder_seq_lens: Optional[list[int]], device: Union[torch.device, str], -) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], torch.Tensor, torch.Tensor, Optional[int]]: ''' Build scalar & tensor values required to build attention metadata structure. 
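# Illustrative sketch (not part of the patch): the hunks in this file, like the rest of
# the series, apply the PEP 585 typing cleanup -- builtin generics such as list[int],
# dict[str, Any], tuple[str, ...] and type[X] replace the deprecated
# typing.List/Dict/Tuple/Type aliases, and abstract collection types such as Sequence
# are imported from collections.abc instead of typing. A minimal before/after example,
# assuming Python 3.9+; the function name `summarize` is hypothetical and exists only
# for illustration.
#
# Before (Python 3.8 style):
#     from typing import Dict, List, Optional, Sequence, Tuple
#
#     def summarize(xs: Sequence[int]) -> Tuple[List[int], Dict[str, int], Optional[int]]:
#         return list(xs), {"n": len(xs)}, (max(xs) if xs else None)
#
# After (the style this patch adopts):
from collections.abc import Sequence
from typing import Optional


def summarize(xs: Sequence[int]) -> tuple[list[int], dict[str, int], Optional[int]]:
    # Builtin generics are subscriptable at runtime from Python 3.9 onwards, so the
    # typing.List/Dict/Tuple imports are no longer needed; Optional still comes
    # from typing.
    return list(xs), {"n": len(xs)}, (max(xs) if xs else None)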
@@ -654,7 +654,7 @@ def make_empty_block_tables_tensor(device: Union[torch.device, str]): return torch.tensor([], device=device) -def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int], +def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], device: Union[torch.device, str]): ''' Split a slot mapping into valid prefill- and decode-phase slot mappings. @@ -682,9 +682,9 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int], Arguments: - * slot_mapping_list: Length-P 1D slot mapping (as List) reflecting all N + * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N post-decode sequences - * seq_lens: List of N post-decode sequence lengths (K_i + 1 in the + * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the description above) * device: cuda, cpu, etc. @@ -712,9 +712,9 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: List[int], def make_block_tables_slot_mapping( block_size: int, - seq_lens: List[int], + seq_lens: list[int], device: Union[torch.device, str], - block_base_addr: int = 0) -> Tuple[torch.Tensor, List[int], int]: + block_base_addr: int = 0) -> tuple[torch.Tensor, list[int], int]: ''' Construct fake block tables & slot mappings. @@ -794,7 +794,7 @@ def make_block_tables_slot_mapping( def make_test_metadata( attn_backend: _Backend, is_prompt: bool, - seq_lens: Optional[List[int]], + seq_lens: Optional[list[int]], decoder_test_params: Optional[PhaseTestParameters], device: Union[torch.device, str], encoder_test_params: Optional[PhaseTestParameters] = None, @@ -1043,7 +1043,7 @@ def fp8_allclose( # Marlin MoE test utils -def stack_and_dev(tensors: List[torch.Tensor]): +def stack_and_dev(tensors: list[torch.Tensor]): dev = tensors[0].device return torch.stack(tensors, dim=0).to(dev) @@ -1090,12 +1090,12 @@ def torch_moe_single(a, w, score, topk): # and a patched version of allclose that supports fp8 types. 
def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, torch._library.custom_ops.CustomOpDef], - args: Tuple[Any, ...], - kwargs: Optional[Dict[str, Any]] = None, + args: tuple[Any, ...], + kwargs: Optional[dict[str, Any]] = None, *, test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, raise_exception: bool = True, - cond: bool = True) -> Dict[str, str]: + cond: bool = True) -> dict[str, str]: with unittest.mock.patch('torch.allclose', new=fp8_allclose): return torch.library.opcheck( op, @@ -1120,7 +1120,7 @@ def baseline_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], + out_dtype: type[torch.dtype], bias: Optional[torch.Tensor] = None) -> torch.Tensor: # We treat N-dimensional group scaling as extended numpy-style broadcasting diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 181a5ac2..3dd923d2 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -2,7 +2,6 @@ import os import time -from typing import List import torch from tqdm import tqdm @@ -45,7 +44,7 @@ def test_run(my_rank, pipe): def stress_test(my_rank, pipe): print(f"rank {my_rank} stress_test starts....") - tensors: List[torch.Tensor] = [] + tensors: list[torch.Tensor] = [] torch.distributed.barrier() torch.manual_seed(0) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 59c1570b..dd14abff 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,7 +2,7 @@ import tempfile from collections import OrderedDict -from typing import Dict, List, TypedDict +from typing import TypedDict from unittest.mock import MagicMock, patch import pytest @@ -37,7 +37,7 @@ class ContextInfo(TypedDict): context_length: str -LONG_LORA_INFOS: List[ContextIDInfo] = [{ +LONG_LORA_INFOS: list[ContextIDInfo] = [{ "lora_id": 1, "context_length": "16k", }, { @@ -290,7 +290,7 @@ def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): cleanup_dist_env_and_memory(shutdown_ray=True) - infos: Dict[int, ContextInfo] = {} + infos: dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] if lora_id == 1: diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 2d33f738..fd0470a3 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -3,7 +3,7 @@ # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" -from typing import Dict, List, TypedDict +from typing import TypedDict class DateJSON(TypedDict): @@ -25,7 +25,7 @@ class PromptResponse(TypedDict): golden_answer: AnswerJSON -prompts_and_responses: Dict[str, List[PromptResponse]] = { +prompts_and_responses: dict[str, list[PromptResponse]] = { "16k": [{ "prompt": "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . 
trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . 
oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . 
he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . 
stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . 
though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . 
prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . 
he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . 
she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . 
he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . 
he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . 
she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . 
other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. [/INST]", diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 70b058b2..644a075b 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -2,7 +2,6 @@ import asyncio import time from pathlib import Path -from typing import List import pytest from huggingface_hub import snapshot_download @@ -53,8 +52,8 @@ def v1(run_with_both_engines_lora): pass -def get_lora_requests() -> List[LoRARequest]: - lora_requests: List[LoRARequest] = [ +def get_lora_requests() -> list[LoRARequest]: + lora_requests: list[LoRARequest] = [ LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=LORA_MODULE_DOWNLOAD_PATH) @@ -64,7 +63,7 @@ def get_lora_requests() -> List[LoRARequest]: async def requests_processing_time(llm, - lora_requests: List[LoRARequest]) -> float: + lora_requests: list[LoRARequest]) -> float: sampling_params = SamplingParams(n=1, temperature=0.0, @@ -107,7 +106,7 @@ async def test_add_lora(): download_and_prepare_lora_module() - lora_requests: List[LoRARequest] = get_lora_requests() + lora_requests: list[LoRARequest] = get_lora_requests() max_loras = len(set([lr.lora_int_id for lr in lora_requests])) # Create engine in eager-mode. Due to high max_loras, the CI can diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index d3992594..9103ba42 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -13,7 +11,7 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -33,7 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index ee09afe8..fc0434e7 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -21,7 +19,7 @@ EXPECTED_LORA_OUTPUT = [ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -40,7 +38,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index bbdfbe37..8f07e39d 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -11,7 +9,7 @@ from vllm.platforms import current_platform MODEL_PATH = "google/gemma-7b" -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", @@ -24,7 +22,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py index c0417466..88585188 100644 --- a/tests/lora/test_jamba.py +++ b/tests/lora/test_jamba.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -14,7 +12,7 @@ MAX_TOKENS = 40 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - prompts: List[str]) -> List[str]: + prompts: list[str]) -> list[str]: sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS) outputs = llm.generate( @@ -23,7 +21,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 61699e70..3507d012 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -3,7 +3,7 @@ import random from copy import deepcopy from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional from unittest.mock import patch import pytest @@ -66,7 +66,7 @@ STAGES = [True, False] def get_random_id_to_index(num_loras: int, num_slots: int, - log: bool = True) -> List[Optional[int]]: + log: bool = True) -> list[Optional[int]]: """Creates a random lora_id_to_index mapping. Args: @@ -81,7 +81,7 @@ def get_random_id_to_index(num_loras: int, f"num_loras is higher than num_slots: {num_loras} > {num_slots}. " "num_loras must be less than or equal to num_slots.") - slots: List[Optional[int]] = [None] * num_slots + slots: list[Optional[int]] = [None] * num_slots random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() for lora_id, slot_idx in enumerate(random_slot_selections, start=1): slots[slot_idx] = lora_id @@ -93,12 +93,12 @@ def get_random_id_to_index(num_loras: int, def populate_loras( - id_to_index: List[Optional[int]], + id_to_index: list[Optional[int]], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, generate_embeddings_tensor: int = 0, repeats: int = 1, -) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]: +) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]: """This method populates the lora layers with lora weights. Args: @@ -117,15 +117,15 @@ def populate_loras( # Dictionary that maps the lora ID to the # corresponding lora weights. - lora_dict: Dict[int, LoRALayerWeights] = dict() + lora_dict: dict[int, LoRALayerWeights] = dict() # Dictionary that maps the lora ID to the # corresponding subloras. - sublora_dict: Dict[int, List[LoRALayerWeights]] = dict() + sublora_dict: dict[int, list[LoRALayerWeights]] = dict() for slot_idx, lora_id in enumerate(id_to_index): if lora_id is not None: - subloras: List[LoRALayerWeights] = [] + subloras: list[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): sublora = DummyLoRAManager( @@ -156,13 +156,13 @@ def populate_loras( def create_random_inputs( - active_lora_ids: List[int], + active_lora_ids: list[int], num_inputs: int, - input_size: Tuple[int, ...], - input_range: Tuple[float, float], + input_size: tuple[int, ...], + input_range: tuple[float, float], input_type: torch.dtype = torch.int, device: torch.device = "cuda" -) -> Tuple[List[torch.Tensor], List[int], List[int]]: +) -> tuple[list[torch.Tensor], list[int], list[int]]: """Creates random inputs. 
Args: @@ -176,9 +176,9 @@ def create_random_inputs( low, high = input_range - inputs: List[torch.Tensor] = [] - index_mapping: List[int] = [] - prompt_mapping: List[int] = [] + inputs: list[torch.Tensor] = [] + index_mapping: list[int] = [] + prompt_mapping: list[int] = [] for _ in range(num_inputs): if input_type == torch.int: @@ -268,7 +268,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: lora_result = lora_embedding(torch.cat(inputs)) - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = embedding(input_) @@ -408,7 +408,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, lora_result = lora_embedding(torch.cat(original_inputs)) - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, original_input_, lora_id in zip(inputs, original_inputs, prompt_mapping): lora = lora_dict[lora_id] @@ -538,7 +538,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, logits_processor.org_vocab_size = (vocab_size + lora_config.lora_extra_vocab_size) - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits(hidden_states=input_, @@ -659,7 +659,7 @@ def test_linear_replicated(dist_init, num_loras, device, stage, lora_result = lora_linear(torch.cat(inputs))[0] - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -784,7 +784,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_result = lora_linear(torch.cat(inputs))[0] - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -933,7 +933,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, lora_result = lora_linear(torch.cat(inputs))[0] - expected_results: List[torch.Tensor] = [] + expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): result = linear(input_)[0] subloras = sublora_dict[lora_id] @@ -1093,9 +1093,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): computed_added_vocab_size = 0 vocab_size_padded = -1 - all_org_tokens: List[int] = [] - all_added_tokens: List[int] = [] - token_ids: List[int] = [] + all_org_tokens: list[int] = [] + all_added_tokens: list[int] = [] + token_ids: list[int] = [] for tp_rank in range(tp_size): with patch( diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 564818f2..e84ff30b 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import ray @@ -31,7 +29,7 @@ EXPECTED_LORA_OUTPUT = [ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to 
answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -49,7 +47,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 0a94298c..f577f39b 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import ast -from typing import List, Optional, Tuple +from typing import Optional import numpy as np import pytest @@ -86,7 +86,7 @@ def evaluate_json_response(model_response, golden_response): def generate( llm: vllm.LLM, - inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], + inputs: tuple[str, SamplingParams, Optional[LoRARequest]], ): prompts, sampling_param, lora_request = inputs outputs = llm.generate(prompts, sampling_param, lora_request=lora_request) @@ -95,7 +95,7 @@ def generate( def batched_generate( llm: vllm.LLM, - inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]], + inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]], ): for input in inputs: prompt, sampling_param, lora_req = input @@ -164,7 +164,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): non-batched generation. """ # Create non batched results first to compare against batched results - non_batched_results: List[str] = [] + non_batched_results: list[str] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -177,7 +177,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): # Create batched results # Each element of the batch must be # (prompt, prompt_sampling_params, prompt_lora_request) - batched_prompts: List[Tuple[str, SamplingParams, + batched_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -202,7 +202,7 @@ def test_self_consistency(lora_llm, long_context_infos): num_loras = len(long_context_infos) # Create results in order of long_context_infos - batched_prompts: List[Tuple[str, SamplingParams, + batched_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -251,7 +251,7 @@ def test_quality(lora_llm, long_context_infos): The test is expected to run for about 1 minute on a p4de.24xlarge instance. 
""" - scores: List[float] = [] + scores: list[float] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] for prompt_and_response in prompts_and_responses[context_len]: @@ -284,7 +284,7 @@ def test_max_len(lora_llm, long_context_infos): generate(lora_llm, (bad_prompt, sampling_params, lora_request)) # Also test batched - batched_prompts: List[Tuple[str, SamplingParams, + batched_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = [] for lora_id_with_bad_inputs in long_context_infos: for lora_id, info in long_context_infos.items(): diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py index 3a7b3916..d4245a89 100644 --- a/tests/lora/test_lora_bias_e2e.py +++ b/tests/lora/test_lora_bias_e2e.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -10,7 +8,7 @@ from vllm.lora.request import LoRARequest MODEL_PATH = "ibm-granite/granite-3b-code-base" -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 @@ -23,7 +21,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: sampling_params, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: generated_text = output.outputs[0].text generated_texts.append(generated_text) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index e2c3d20d..02f2339b 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.lora.models import LoRAModel @@ -31,7 +29,7 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) @@ -99,7 +97,7 @@ def test_lora_weights_mapping(baichuan_lora_files): packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in BAICHUAN_LORA_MODULES: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 13098488..b279566c 100644 --- a/tests/lora/test_lora_functions.py +++ 
b/tests/lora/test_lora_functions.py @@ -4,7 +4,6 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions. """ import os -from typing import List import pytest @@ -46,7 +45,7 @@ def test_lora_functions_sync(): llm = LLM.get_engine_class().from_engine_args(engine_args) - def run_check(fn, args, expected: List): + def run_check(fn, args, expected: list): fn(args) assert set(llm.list_loras()) == set(expected) @@ -105,7 +104,7 @@ async def test_lora_functions_async(): gpu_memory_utilization=0.8, enforce_eager=True) - async def run_check(fn, args, expected: List): + async def run_check(fn, args, expected: list): await fn(args) assert set(await llm.list_loras()) == set(expected) diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 44d11173..0875128c 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.lora.models import LoRAModel @@ -23,7 +21,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping embedding_modules = LlamaForCausalLM.embedding_modules embed_padding_modules = LlamaForCausalLM.embedding_padding_modules - expected_lora_modules: List[str] = [] + expected_lora_modules: list[str] = [] for module in LLAMA_LORA_MODULES: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 7ab46b7f..8d258331 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, List import pytest import torch @@ -72,9 +71,9 @@ def test_from_lora_tensors(sql_lora_files, device): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], +def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device) -> LoRAModel: - loras: Dict[str, LoRALayerWeights] = {} + loras: dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight loras[name] = LoRALayerWeights( @@ -96,7 +95,7 @@ def create_packed_lora( empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight - loras: Dict[str, LoRALayerWeights] = {} + loras: dict[str, LoRALayerWeights] = {} for replaced_module_name in replaced_module_names: if replaced_module_name == empty_replaced_module_name: continue diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 2e81bb32..f596651b 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -27,7 +25,7 @@ EXPECTED_OUTPUT = [ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: sampling_params = vllm.SamplingParams( temperature=0, max_tokens=5, @@ -48,7 +46,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: if lora_id else None, ) # Print the outputs. 
- generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 90cf8fd3..caa65f2d 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -13,7 +11,7 @@ MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - prompts: List[str]) -> List[str]: + prompts: list[str]) -> list[str]: sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -22,7 +20,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 8999e0cf..8596d399 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -12,7 +10,7 @@ MODEL_PATH = "microsoft/phi-2" PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format( sql_prompt= @@ -41,7 +39,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: if lora_id else None, ) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 032e2047..c75e8661 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 from threading import Lock -from typing import List import pytest import torch @@ -20,7 +19,7 @@ from .utils import (PunicaTensors, assert_close, generate_data, # Utility shrink and expand operations used as reference implementations. 
def sgmv_shrink_for_nslices( nslices: int, inputs_tensor: torch.Tensor, - lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor, + lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int, num_tokens: int, scaling: float): @@ -44,7 +43,7 @@ def sgmv_shrink_for_nslices( def sgmv_expand_for_nslices(nslices: int, hidden_size: int, inputs_tensor: torch.Tensor, - lora_weights_lst: List[torch.Tensor], + lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor, b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 7f687f56..b4f3d8dc 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -3,7 +3,6 @@ # Adapted from # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py from dataclasses import dataclass -from typing import List import pytest @@ -19,7 +18,7 @@ class ModelWithQuantization: quantization: str -MODELS: List[ModelWithQuantization] +MODELS: list[ModelWithQuantization] #AWQ quantization is currently not supported in ROCm. if current_platform.is_rocm(): MODELS = [ @@ -41,7 +40,7 @@ else: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, - max_tokens: int = 256) -> List[str]: + max_tokens: int = 256) -> list[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -61,7 +60,7 @@ def do_sample(llm: vllm.LLM, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 1cf1534e..24eff013 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Optional import pytest from packaging.version import Version @@ -20,7 +20,7 @@ class TestConfig: max_loras: int = 2 max_lora_rank: int = 16 max_model_len: int = 4096 - mm_processor_kwargs: Optional[Dict[str, int]] = None + mm_processor_kwargs: Optional[dict[str, int]] = None def __post_init__(self): if self.mm_processor_kwargs is None: @@ -57,11 +57,11 @@ class Qwen2VLTester: ) def run_test(self, - images: List[ImageAsset], - expected_outputs: List[str], + images: list[ImageAsset], + expected_outputs: list[str], lora_id: Optional[int] = None, temperature: float = 0, - max_tokens: int = 5) -> List[str]: + max_tokens: int = 5) -> list[str]: sampling_params = vllm.SamplingParams( temperature=temperature, diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py index 07af1e9f..ff3bfcac 100644 --- a/tests/lora/test_transfomers_model.py +++ b/tests/lora/test_transfomers_model.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import vllm @@ -21,7 +19,7 @@ EXPECTED_LORA_OUTPUT = [ ] -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -40,7 +38,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> 
List[str]: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts: List[str] = [] + generated_texts: list[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py index 703f92ce..6d2833bd 100644 --- a/tests/lora/test_ultravox.py +++ b/tests/lora/test_ultravox.py @@ -3,7 +3,6 @@ import shutil from os import path from tempfile import TemporaryDirectory -from typing import List, Tuple import torch from huggingface_hub import snapshot_download @@ -86,8 +85,8 @@ def test_ultravox_lora(vllm_runner): dtype="bfloat16", max_model_len=1024, ) as vllm_model: - ultravox_outputs: List[Tuple[ - List[int], str]] = vllm_model.generate_greedy( + ultravox_outputs: list[tuple[ + list[int], str]] = vllm_model.generate_greedy( [ _get_prompt(0, PROMPT, VLLM_PLACEHOLDER, ULTRAVOX_MODEL_NAME) @@ -108,7 +107,7 @@ def test_ultravox_lora(vllm_runner): dtype="bfloat16", max_model_len=1024, ) as vllm_model: - llama_outputs: List[Tuple[List[int], str]] = ( + llama_outputs: list[tuple[list[int], str]] = ( vllm_model.generate_greedy( [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)], 256, diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 1e163fbf..59a0e742 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import torch @@ -12,7 +12,7 @@ class DummyLoRAManager: def __init__(self, device: torch.device = "cuda:0"): super().__init__() - self._loras: Dict[str, LoRALayerWeights] = {} + self._loras: dict[str, LoRALayerWeights] = {} self._device = device def set_module_lora(self, module_name: str, lora: LoRALayerWeights): @@ -77,11 +77,11 @@ class DummyLoRAManager: self, module_name: str, input_dim: int, - output_dims: List[int], - noop_lora_index: Optional[List[int]] = None, + output_dims: list[int], + noop_lora_index: Optional[list[int]] = None, rank: int = 8, ): - base_loras: List[LoRALayerWeights] = [] + base_loras: list[LoRALayerWeights] = [] noop_lora_index_set = set(noop_lora_index or []) for i, out_dim in enumerate(output_dims): @@ -110,7 +110,7 @@ def assert_close(a, b): @dataclass class PunicaTensors: inputs_tensor: torch.Tensor - lora_weights: Union[torch.Tensor, List[torch.Tensor]] + lora_weights: Union[torch.Tensor, list[torch.Tensor]] our_out_tensor: torch.Tensor ref_out_tensor: torch.Tensor b_seq_start_loc: torch.Tensor @@ -118,7 +118,7 @@ class PunicaTensors: seq_len_tensor: torch.Tensor token_lora_mapping: torch.Tensor - def meta(self) -> Tuple[int, int]: + def meta(self) -> tuple[int, int]: """ Infer max_seq_length and token_nums from the tensors and return them. 
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index b276d9d9..e23ff43e 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import time -from typing import List import pytest import ray @@ -133,7 +132,7 @@ def test_metric_counter_generation_tokens_multi_step( "served_model_name", [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: List[str]) -> None: + served_model_name: list[str]) -> None: with vllm_runner(model, dtype=dtype, disable_log_stats=False, diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py index 971ed55c..1d809a05 100644 --- a/tests/mistral_tool_use/utils.py +++ b/tests/mistral_tool_use/utils.py @@ -1,21 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional +from typing import Optional from typing_extensions import TypedDict class ServerConfig(TypedDict, total=False): model: str - arguments: List[str] + arguments: list[str] system_prompt: Optional[str] supports_parallel: Optional[bool] supports_rocm: Optional[bool] -ARGS: List[str] = ["--max-model-len", "1024"] +ARGS: list[str] = ["--max-model-len", "1024"] -CONFIGS: Dict[str, ServerConfig] = { +CONFIGS: dict[str, ServerConfig] = { "mistral": { "model": "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 2c678084..4a6a766b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config @@ -51,7 +49,7 @@ class Relu3(ReLUSquaredActivation): # All but RMSNorm ("all,-rms_norm", 4, [0, 1, 1, 1], True), ]) -def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], +def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int], default_on: bool): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=torch_level, custom_ops=env.split(","))) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 0ea17247..13433b04 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Type +from typing import Optional import numpy as np import pytest @@ -17,7 +17,7 @@ from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_4" -AudioTuple = Tuple[np.ndarray, int] +AudioTuple = tuple[np.ndarray, int] VLLM_PLACEHOLDER = "<|audio|>" HF_PLACEHOLDER = "<|audio|>" @@ -78,7 +78,7 @@ def _get_prompt(audio_count, question, placeholder): add_generation_prompt=True) -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, +def vllm_to_hf_output(vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str): """Sanitize vllm output to be comparable with hf output.""" @@ -96,9 +96,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - prompts_and_audios: List[Tuple[str, str, AudioTuple]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + 
prompts_and_audios: list[tuple[str, str, AudioTuple]], model: str, *, dtype: str, @@ -158,8 +158,8 @@ def run_test( def run_multi_audio_test( - vllm_runner: Type[VllmRunner], - prompts_and_audios: List[Tuple[str, List[AudioTuple]]], + vllm_runner: type[VllmRunner], + prompts_and_audios: list[tuple[str, list[AudioTuple]]], model: str, *, dtype: str, diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 57fe1d5b..804df4c4 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used """ import os -from typing import List, NamedTuple, Type +from typing import NamedTuple import pytest from huggingface_hub import hf_hub_download @@ -90,8 +90,8 @@ MODELS = [ @pytest.mark.parametrize("tp_size", [1, 2]) def test_models( num_gpus_available: int, - vllm_runner: Type[VllmRunner], - example_prompts: List[str], + vllm_runner: type[VllmRunner], + example_prompts: list[str], model: GGUFTestConfig, dtype: str, max_tokens: int, diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py index 66dd9795..a997b9e6 100644 --- a/tests/models/decoder_only/language/test_modelopt.py +++ b/tests/models/decoder_only/language/test_modelopt.py @@ -5,7 +5,6 @@ Note: these tests will only pass on H100 """ import os -from typing import List import pytest from transformers import AutoTokenizer @@ -65,7 +64,7 @@ def test_models(example_prompts, model_name) -> None: for prompt in example_prompts ] params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] + generations: list[str] = [] # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. 
for prompt in formatted_prompts: diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index 31a5cd26..f4a6dd0f 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Type +from typing import Optional import pytest import torch @@ -19,12 +19,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ def run_awq_test( - vllm_runner: Type[VllmRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, source_model: str, quant_model: str, *, - size_factors: List[float], + size_factors: list[float], dtype: str, max_tokens: int, num_logprobs: int, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 2c66edb5..3f7a7c01 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -6,7 +6,6 @@ import math import os from collections import defaultdict from pathlib import PosixPath -from typing import Type import pytest from packaging.version import Version @@ -562,8 +561,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( @@ -585,8 +584,8 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( @@ -608,8 +607,8 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( @@ -629,7 +628,7 @@ def test_image_embedding_models(model_type: str, fork_new_process_for_each_test=False, )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], video_assets: _VideoAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( @@ -651,8 +650,8 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], ): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( @@ -674,8 +673,8 @@ def test_custom_inputs_models( @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: 
type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_single_image_test( @@ -698,8 +697,8 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_multi_image_test( @@ -722,8 +721,8 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_embedding_test( @@ -743,8 +742,8 @@ def test_image_embedding_models_heavy(model_type: str, fork_new_process_for_each_test=True, )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], video_assets: _VideoAssets): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_video_test( @@ -767,8 +766,8 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_custom_inputs_models_heavy( model_type: str, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], ): model_test_info = VLM_TEST_SETTINGS[model_type] runners.run_custom_inputs_test( diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index dd68fe4c..53b183b2 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -2,7 +2,7 @@ import os import re -from typing import List, Optional, Tuple, Type +from typing import Optional import pytest from transformers import AutoTokenizer @@ -25,7 +25,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these models = ["microsoft/Phi-3.5-vision-instruct"] -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, +def vllm_to_hf_output(vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str): """Sanitize vllm output to be comparable with hf output.""" @@ -55,9 +55,9 @@ if current_platform.is_rocm(): def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput]], model: str, *, dtype: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 602da2b5..d51dabc2 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -6,7 +6,7 @@ Run `pytest tests/models/test_mistral.py`. 
import json import uuid from dataclasses import asdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Optional import pytest from mistral_common.multimodal import download_image @@ -38,7 +38,7 @@ IMG_URLS = [ PROMPT = "Describe each image in one short sentence." -def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: +def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]: return [{ "role": "user", @@ -54,7 +54,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: }] -def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: +def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]: return [{ "role": "user", @@ -68,7 +68,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: }] -def _create_engine_inputs(urls: List[str]) -> TokensPrompt: +def _create_engine_inputs(urls: list[str]) -> TokensPrompt: msg = _create_msg_format(urls) tokenizer = MistralTokenizer.from_model("pixtral") @@ -89,7 +89,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt: return engine_inputs -def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: +def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt: msg = _create_msg_format_hf(urls) tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b") @@ -128,7 +128,7 @@ assert FIXTURES_PATH.exists() FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json" FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json" -OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]] +OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]] # For the test author to store golden output in JSON diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index de240a90..af494eb2 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, List, Optional, Tuple, Type, TypedDict, Union +from typing import Any, Optional, TypedDict, Union import numpy.typing as npt import pytest @@ -69,21 +69,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( - image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: + image_batches: list[Union[Image.Image, list[Image.Image]]], processor, + llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, and split the result according to input batches. 
image_batches: - - Single-image batches: `List[Image.Image]` - - Multiple-image batches: `List[List[Image.Image]]]` + - Single-image batches: `list[Image.Image]` + - Multiple-image batches: `list[list[Image.Image]]]` - returns: `List[Qwen2VLPromptImageEmbeddingInput]` + returns: `list[Qwen2VLPromptImageEmbeddingInput]` """ - image_batches_: List[Any] = image_batches[:] + image_batches_: list[Any] = image_batches[:] # convert single-image batches to multiple-image batches for idx in range(len(image_batches_)): @@ -93,7 +93,7 @@ def batch_make_image_embeddings( assert isinstance(image_batches_[idx], list) # append all images into a list (as a batch) - images: List[Image.Image] = [] + images: list[Image.Image] = [] for image_batch in image_batches_: images += image_batch @@ -121,7 +121,7 @@ def batch_make_image_embeddings( image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches - result: List[Qwen2VLPromptImageEmbeddingInput] = [] + result: list[Qwen2VLPromptImageEmbeddingInput] = [] image_counter = 0 embed_counter = 0 for image_batch in image_batches_: @@ -153,7 +153,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. @@ -162,21 +162,21 @@ def batch_make_video_embeddings( and split the result according to input batches. video_batches: - - Single-video batches: `List[NDArray]` - - Multiple-video batches: `List[List[NDArray]]` + - Single-video batches: `list[NDArray]` + - Multiple-video batches: `list[list[NDArray]]` """ - video_batches_: List[Any] = video_batches[:] + video_batches_: list[Any] = video_batches[:] for idx in range(len(video_batches_)): if not isinstance(video_batches_[idx], list): - single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + single_video_batch: list[npt.NDArray] = [video_batches_[idx]] video_batches_[idx] = single_video_batch assert isinstance(video_batches_[idx], list) # append all videos into a list (as a batch) - videos: List[npt.NDArray] = [] + videos: list[npt.NDArray] = [] for video_batch in video_batches_: videos += video_batch @@ -204,7 +204,7 @@ def batch_make_video_embeddings( video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches - result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + result: list[Qwen2VLPromptVideoEmbeddingInput] = [] video_counter = 0 embed_counter = 0 for video_batch in video_batches_: @@ -235,8 +235,8 @@ def batch_make_video_embeddings( def run_embedding_input_test( - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]], model: str, *, dtype: str, @@ -323,8 +323,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, num_logprobs: int) -> None: images = [asset.pil_image for asset in image_assets] - inputs_per_case: List[Tuple[ - List[str], PromptImageInput, PromptVideoInput]] = [( + inputs_per_case: list[tuple[ + list[str], PromptImageInput, PromptVideoInput]] = [( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], [], @@ -365,7 +365,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, num_logprobs: int) -> None: images = 
[asset.pil_image for asset in image_assets] - inputs_per_case: List[Tuple[List[str], PromptImageInput, + inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [( [MULTIIMAGE_PROMPT for _ in size_factors], [[ @@ -413,8 +413,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, for asset in video_assets ] - inputs_per_case: List[Tuple[ - List[str], PromptImageInput, PromptVideoInput]] = [( + inputs_per_case: list[tuple[ + list[str], PromptImageInput, PromptVideoInput]] = [( [prompt for _ in size_factors], [], [rescale_video_size(video, factor) for factor in size_factors], diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 539410d1..bf5f87eb 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """Helpers for building inputs that can be leveraged for different test types. """ +from collections.abc import Iterable from pathlib import PosixPath -from typing import Callable, Iterable, List, Optional, Tuple, Union +from typing import Callable, Optional, Union import torch @@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], def get_model_prompts(base_prompts: Iterable[str], img_idx_to_prompt: Optional[Callable[[int], str]], video_idx_to_prompt: Optional[Callable[[int], str]], - prompt_formatter: Callable[[str], str]) -> List[str]: + prompt_formatter: Callable[[str], str]) -> list[str]: """Given a model-agnostic base prompt and test configuration for a model(s) to be tested, update the media placeholders and apply the prompt formatting to get the test prompt string for this model. @@ -218,7 +219,7 @@ def build_video_inputs_from_test_info( ) for video, prompt in zip(sampled_vids, model_prompts)] -def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]], +def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], size_type: SizeType): """Applies a size scaler to one image; this can be a an image size factor, which scales the image while maintaining the aspect ratio""" diff --git a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py index ca4ec214..c189e5a7 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py @@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on. """ import itertools from collections import OrderedDict -from typing import Dict, Iterable, Tuple +from collections.abc import Iterable import pytest @@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs, ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType) -def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo], +def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo], test_type: VLMTestType, - fork_per_test: bool) -> Dict[str, VLMTestInfo]: + fork_per_test: bool) -> dict[str, VLMTestInfo]: """Given the dict of potential test settings to run, return a subdict of tests who have the current test type enabled with the matching val for fork_per_test. 
@@ -49,7 +49,7 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo], return matching_tests -def get_parametrized_options(test_settings: Dict[str, VLMTestInfo], +def get_parametrized_options(test_settings: dict[str, VLMTestInfo], test_type: VLMTestType, fork_new_process_for_each_test: bool): """Converts all of our VLMTestInfo into an expanded list of parameters. @@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo], def get_wrapped_test_sizes( test_info: VLMTestInfo, - test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]: + test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]: """Given a test info which may have size factors or fixed sizes, wrap them and combine them into an iterable, each of which will be used in parameter expansion. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index f2260f56..aaad584c 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Core test implementation to be shared across modalities.""" -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Optional, Union import torch from PIL.Image import Image @@ -17,9 +17,9 @@ from .types import RunnerOutput def run_test( *, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], list[Union[list[Image], Image]]]], model: str, dtype: str, max_tokens: int, @@ -29,15 +29,15 @@ def run_test( max_num_seqs: int, hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], - auto_cls: Type[_BaseAutoModelClass], + auto_cls: type[_BaseAutoModelClass], use_tokenizer_eos: bool, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], - stop_str: Optional[List[str]], - limit_mm_per_prompt: Dict[str, int], - vllm_runner_kwargs: Optional[Dict[str, Any]], - hf_model_kwargs: Optional[Dict[str, Any]], + stop_str: Optional[list[str]], + limit_mm_per_prompt: dict[str, int], + vllm_runner_kwargs: Optional[dict[str, Any]], + hf_model_kwargs: Optional[dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], task: TaskOption = "auto", runner_mm_key: str = "images", @@ -61,7 +61,7 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- vllm_runner_kwargs_: Dict[str, Any] = {} + vllm_runner_kwargs_: dict[str, Any] = {} if model_info.tokenizer: vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer if model_info.tokenizer_mode: @@ -84,7 +84,7 @@ def run_test( **vllm_runner_kwargs_) as vllm_model: tokenizer = vllm_model.model.get_tokenizer() - vllm_kwargs: Dict[str, Any] = {} + vllm_kwargs: dict[str, Any] = {} if get_stop_token_ids is not None: vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) if stop_str: diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 408ce9cf..66410f66 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -6,7 +6,7 @@ typically specific to a small subset of models. import re import types from pathlib import PosixPath -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Optional, Union import torch from PIL.Image import Image @@ -49,7 +49,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, def qwen_vllm_to_hf_output( vllm_output: RunnerOutput, - model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [qwen models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -60,7 +60,7 @@ def qwen_vllm_to_hf_output( def qwen2_vllm_to_hf_output( vllm_output: RunnerOutput, - model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -78,7 +78,7 @@ def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput, def llava_video_vllm_to_hf_output( vllm_output: RunnerOutput, - model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]: config = AutoConfig.from_pretrained(model) mm_token_id = config.video_token_index return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id) @@ -247,7 +247,7 @@ def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str): ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( - tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], + tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], _ImageAssets]) -> str: """Given a temporary dir path, export one or more image assets into the tempdir & replace its contents with the local path to the string so that @@ -257,7 +257,7 @@ def qwen_prompt_path_encoder( Args: tmp_path: Tempdir for test under consideration. prompt: Prompt with image placeholders. - assets: List of image assets whose len equals the num placeholders. + assets: list of image assets whose len equals the num placeholders. """ # Ensure that the number of placeholders matches the number of assets; # If this is not true, the test is probably written incorrectly. 
@@ -350,7 +350,7 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, List[Image]], + def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): # yapf: disable from vllm.model_executor.models.h2ovl import ( @@ -410,7 +410,7 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, List[Image]], + def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): from vllm.model_executor.models.internvl import ( IMG_CONTEXT, IMG_END, IMG_START, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py index fb9df37c..023df5f1 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py @@ -3,7 +3,6 @@ types / modalities. """ from pathlib import PosixPath -from typing import Type from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets from . import builders, core @@ -13,8 +12,8 @@ from .types import ExpandableVLMTestArgs, VLMTestInfo ####### Entrypoints for running different test types def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): assert test_case.size_wrapper is not None inputs = builders.build_single_image_inputs_from_test_info( @@ -36,8 +35,8 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): assert test_case.size_wrapper is not None inputs = builders.build_multi_image_inputs_from_test_info( @@ -59,8 +58,8 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, def run_embedding_test(*, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets): assert test_case.size_wrapper is not None inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info( @@ -85,8 +84,8 @@ def run_video_test( *, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], video_assets: _VideoAssets, ): assert test_case.size_wrapper is not None @@ -111,8 +110,8 @@ def run_video_test( def run_custom_inputs_test(*, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner]): + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner]): # Custom test cases can provide inputs directly, but they need to # explicitly provided a CustomTestConfig, which wraps the inputs and # the limit_mm_per_prompt diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py 
b/tests/models/decoder_only/vision_language/vlm_utils/types.py index ecb86609..bdbdbc7e 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """Types for writing multimodal model tests.""" +from collections.abc import Iterable from enum import Enum from pathlib import PosixPath -from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, - Tuple, Type, Union) +from typing import Any, Callable, NamedTuple, Optional, Union import torch from PIL.Image import Image @@ -35,7 +35,7 @@ VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?" IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)] -RunnerOutput = Tuple[List[int], str, Optional[SampleLogprobs]] +RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]] # yapf: enable @@ -53,8 +53,8 @@ class SizeType(Enum): class CustomTestOptions(NamedTuple): - inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]] - limit_mm_per_prompt: Dict[str, int] + inputs: list[tuple[list[str], list[Union[list[Image], Image]]]] + limit_mm_per_prompt: dict[str, int] # kwarg to pass multimodal data in as to vllm/hf runner instances. runner_mm_key: str = "images" @@ -63,13 +63,13 @@ class ImageSizeWrapper(NamedTuple): type: SizeType # A size factor is a wrapper of 0+ floats, # while a fixed size contains an iterable of integer pairs - data: Union[Iterable[float], Iterable[Tuple[int, int]]] + data: Union[Iterable[float], Iterable[tuple[int, int]]] class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" - models: List[str] + models: list[str] test_type: Union[VLMTestType, Iterable[VLMTestType]] # Should be None only if this is a CUSTOM_INPUTS test @@ -97,19 +97,19 @@ class VLMTestInfo(NamedTuple): max_num_seqs: int = 256 task: TaskOption = "auto" tensor_parallel_size: int = 1 - vllm_runner_kwargs: Optional[Dict[str, Any]] = None + vllm_runner_kwargs: Optional[dict[str, Any]] = None # Optional callable which gets a list of token IDs from the model tokenizer get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None # Optional list of strings to stop generation, useful when stop tokens are # not special tokens in the tokenizer - stop_str: Optional[List[str]] = None + stop_str: Optional[list[str]] = None # Exposed options for HF runner - hf_model_kwargs: Optional[Dict[str, Any]] = None + hf_model_kwargs: Optional[dict[str, Any]] = None # Indicates we should explicitly pass the EOS from the tokenizer use_tokenizer_eos: bool = False - auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM # Callable to pass to the HF runner to run on inputs; for now, we also pass # the data type to input post processing, because almost all of the uses of # postprocess_inputs are to fix the data types of BatchEncoding values. 
@@ -128,12 +128,12 @@ class VLMTestInfo(NamedTuple): # Default expandable params per test; these defaults can be overridden in # instances of this object; the complete set of test cases for the model # is all combinations of .models + all fields below - max_tokens: Union[int, Tuple[int]] = 128 - num_logprobs: Union[int, Tuple[int]] = 5 + max_tokens: Union[int, tuple[int]] = 128 + num_logprobs: Union[int, tuple[int]] = 5 dtype: Union[str, Iterable[str]] = "half" distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None # Only expanded in video tests - num_video_frames: Union[int, Tuple[int]] = 16 + num_video_frames: Union[int, tuple[int]] = 16 # Fixed image sizes / image size factors; most tests use image_size_factors # The values provided for these two fields will be stacked and expanded @@ -141,19 +141,19 @@ class VLMTestInfo(NamedTuple): # once per tests (much like concatenating and wrapping in one parametrize # call) image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS - image_sizes: Optional[Iterable[Iterable[Tuple[int, int]]]] = None + image_sizes: Optional[Iterable[Iterable[tuple[int, int]]]] = None # Hack for updating a prompt to take into a local path; currently only used # for Qwen-VL, which requires encoding the image path / url into the prompt # for HF runner prompt_path_encoder: Optional[ - Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]], + Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]], str]] = None # noqa: E501 # Allows configuring a test to run with custom inputs - custom_test_opts: Optional[List[CustomTestOptions]] = None + custom_test_opts: Optional[list[CustomTestOptions]] = None - marks: Optional[List[MarkDecorator]] = None + marks: Optional[list[MarkDecorator]] = None def get_non_parametrized_runner_kwargs(self): """Returns a dictionary of expandable kwargs for items that are used diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index 7ed2fb8a..470dc041 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -3,7 +3,6 @@ import importlib.util import math from array import array -from typing import List import openai import pytest @@ -81,14 +80,14 @@ async def client_generate(server_generate: RemoteOpenAIServer): yield async_client -def run_llm_encode(llm: vllm.LLM, queries: List[str], - instruction: str) -> List[float]: +def run_llm_encode(llm: vllm.LLM, queries: list[str], + instruction: str) -> list[float]: outputs = llm.encode([instruction + q for q in queries], ) return [output.outputs.embedding for output in outputs] -async def run_client_embeddings(client: vllm.LLM, queries: List[str], - instruction: str) -> List[float]: +async def run_client_embeddings(client: vllm.LLM, queries: list[str], + instruction: str) -> list[float]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -123,7 +122,7 @@ def get_test_data(): return queries, q_instruction, documents, d_instruction -def validate_embed_output(q_rep: List[float], d_rep: List[float]): +def validate_embed_output(q_rep: list[float], d_rep: list[float]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index 567aa509..bef85eaf 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: Apache-2.0 -from typing import List, Sequence +from collections.abc import Sequence import torch import torch.nn.functional as F @@ -8,8 +8,8 @@ import torch.nn.functional as F def check_embeddings_close( *, - embeddings_0_lst: Sequence[List[float]], - embeddings_1_lst: Sequence[List[float]], + embeddings_0_lst: Sequence[list[float]], + embeddings_1_lst: Sequence[list[float]], name_0: str, name_1: str, tol: float = 1e-3, diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 82f2bf53..7391df6e 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import partial -from typing import Callable, Dict, List, Type +from typing import Callable import pytest import torch @@ -67,7 +67,7 @@ def get_messages(image: Image.Image, text: str, embed_text: bool): def apply_chat_template_and_add_eos( - messages: List[Dict], + messages: list[dict], apply_chat_template_fn: Callable, ): prompt = apply_chat_template_fn( @@ -80,11 +80,11 @@ def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs): def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - input_texts: List[str], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], input_images: PromptImageInput, - embed_texts: List[bool], + embed_texts: list[bool], model: str, *, dtype: str, diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 990c6c15..4c2fbd52 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Type - import pytest import torch.nn.functional as F from transformers import AutoModelForVision2Seq @@ -35,9 +33,9 @@ MODELS = ["royokong/e5-v"] def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - input_texts: List[str], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], input_images: PromptImageInput, model: str, *, diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 0cb94874..3226138a 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Type - import pytest import torch.nn.functional as F @@ -29,9 +27,9 @@ MODELS = ["TIGER-Lab/VLM2Vec-Full"] def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - input_texts: List[str], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + input_texts: list[str], input_images: PromptImageInput, model: str, *, diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 81b629fd..e8070d28 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -3,7 +3,7 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`. 
""" -from typing import List, Optional, Tuple, Type +from typing import Optional import pytest from transformers import AutoModelForSeq2SeqLM @@ -17,7 +17,7 @@ from ...utils import check_logprobs_close def vllm_to_hf_output( - vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], decoder_prompt_type: DecoderPromptType, ): """Sanitize vllm output to be comparable with hf output.""" @@ -31,9 +31,9 @@ def vllm_to_hf_output( def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + prompts: list[ExplicitEncoderDecoderPrompt[str, str]], decoder_prompt_type: DecoderPromptType, model: str, *, diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py index de18deab..a6ec333e 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Type +from typing import Optional import pytest from PIL import Image @@ -51,8 +51,8 @@ def hf_to_vllm_output(hf_output: tuple[list[int], str, def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], inputs: list[list[ExplicitEncoderDecoderPrompt]], model: str, *, @@ -114,7 +114,7 @@ def run_test( @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], +def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, size_factors: list[int], dtype: str, max_tokens: int, num_logprobs: int) -> None: diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 4fee04fd..1e202907 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple, Type, overload +from typing import Optional, overload import pytest import torch @@ -64,7 +64,7 @@ prompt_data = { } -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, +def vllm_to_hf_output(vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str): """Sanitize vllm output to be comparable with hf output.""" @@ -91,9 +91,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def _get_inputs( image_assets: _ImageAssets, *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, -) -> List[Tuple[List[str], PromptImageInput]]: + size_factors: Optional[list[float]] = None, + sizes: Optional[list[tuple[int, int]]] = None, +) -> list[tuple[list[str], PromptImageInput]]: images = [asset.pil_image for asset in image_assets] if size_factors is not None: @@ -123,12 +123,12 @@ def _get_inputs( @overload def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, *, - size_factors: List[float], + size_factors: list[float], dtype: str, max_tokens: int, num_logprobs: int, @@ -140,12 
+140,12 @@ def run_test( @overload def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, *, - sizes: List[Tuple[int, int]], + sizes: list[tuple[int, int]], dtype: str, max_tokens: int, num_logprobs: int, @@ -156,13 +156,13 @@ def run_test( def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], image_assets: _ImageAssets, model: str, *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, + size_factors: Optional[list[float]] = None, + sizes: Optional[list[tuple[int, int]]] = None, dtype: str, max_tokens: int, num_logprobs: int, @@ -183,9 +183,9 @@ def run_test( def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: list[tuple[list[str], PromptImageInput]], model: str, *, dtype: str, diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 5c43e4ee..84471c92 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for H2OVL's multimodal preprocessing kwargs.""" -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional import pytest from PIL import Image diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index cc777fdf..adbc4f5b 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for InternVL's multimodal preprocessing kwargs.""" -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional import pytest from PIL import Image diff --git a/tests/models/registry.py b/tests/models/registry.py index 78a65b93..b5ded20c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Mapping, Set from dataclasses import dataclass, field -from typing import AbstractSet, Any, Literal, Mapping, Optional +from typing import Any, Literal, Optional import pytest from packaging.version import Version @@ -324,7 +325,7 @@ class HfExampleModels: self.hf_models = hf_models - def get_supported_archs(self) -> AbstractSet[str]: + def get_supported_archs(self) -> Set[str]: return self.hf_models.keys() def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 31e3c1f7..243cb92a 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -4,7 +4,6 @@ Run `pytest tests/models/test_transformers.py`. 
""" from contextlib import nullcontext -from typing import Type import pytest @@ -14,8 +13,8 @@ from .utils import check_logprobs_close def check_implementation( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], example_prompts: list[str], model: str, **kwargs, @@ -47,8 +46,8 @@ def check_implementation( ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE ]) # trust_remote_code=True by default def test_models( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], example_prompts: list[str], model: str, model_impl: str, @@ -71,8 +70,8 @@ def test_models( @multi_gpu_test(num_gpus=2) def test_distributed( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], example_prompts, ): kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2} @@ -92,7 +91,7 @@ def test_distributed( @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) def test_quantization( - vllm_runner: Type[VllmRunner], + vllm_runner: type[VllmRunner], example_prompts: list[str], model: str, quantization_kwargs: dict[str, str], diff --git a/tests/models/utils.py b/tests/models/utils.py index a90efb17..b0182d54 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import warnings -from typing import Dict, List, Optional, Sequence, Tuple, Union +from collections.abc import Sequence +from typing import Optional, Union import torch @@ -9,7 +10,7 @@ from vllm.config import ModelConfig, TaskOption from vllm.inputs import InputContext from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs -TokensText = Tuple[List[int], str] +TokensText = tuple[list[int], str] def check_outputs_equal( @@ -46,7 +47,7 @@ def check_outputs_equal( # * List of top sample logprobs for each sampled token # # Assumes prompt logprobs were not requested. -TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, +TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]]] @@ -57,8 +58,8 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, # * Optional list of top sample logprobs for each sampled token # # Assumes prompt logprobs were not requested. -TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], - List[Dict[str, +TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]], + list[dict[str, Logprob]]]]] # Representation of generated sequence as a tuple of @@ -68,9 +69,9 @@ TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], # * Optional list of top prompt logprobs for each prompt token # # Allows prompt logprobs to be requested. 
-TokensTextLogprobsPromptLogprobs = Tuple[ - List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], - Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] +TokensTextLogprobsPromptLogprobs = tuple[ + list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]], + Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]] def check_logprobs_close( @@ -254,8 +255,8 @@ def build_model_context( tokenizer_name: Optional[str] = None, trust_remote_code: bool = False, dtype: Optional[Union[str, torch.dtype]] = None, - mm_processor_kwargs: Optional[Dict] = None, - limit_mm_per_prompt: Optional[Dict] = None, + mm_processor_kwargs: Optional[dict] = None, + limit_mm_per_prompt: Optional[dict] = None, disable_mm_preprocessor_cache: bool = True, ): """Creates an InputContext for a given model. diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index 11e44f12..64559609 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -2,7 +2,7 @@ import asyncio import multiprocessing -from typing import Callable, Tuple, Union +from typing import Callable, Union from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs @@ -16,7 +16,7 @@ async def generate( client: MQLLMEngineClient, request_id: str, num_tokens: int, - return_output: bool = False) -> Union[RequestOutput, Tuple[int, str]]: + return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]: final_output = None count = 0 diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 9822cee1..f925e42f 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Test the AsyncLLMEngine with multi-step-decoding -from typing import List, Optional +from typing import Optional import pytest @@ -17,7 +17,7 @@ MODELS = [ NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps NUM_PROMPTS = [10] -DEFAULT_SERVER_ARGS: List[str] = [ +DEFAULT_SERVER_ARGS: list[str] = [ "--distributed-executor-backend", "ray", "--gpu-memory-utilization", diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index f9e0f507..8f76d895 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -4,7 +4,7 @@ import base64 import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple +from typing import TYPE_CHECKING, NamedTuple, Optional import numpy as np import pytest @@ -30,7 +30,7 @@ TEST_IMAGE_URLS = [ @pytest.fixture(scope="module") -def url_images() -> Dict[str, Image.Image]: +def url_images() -> dict[str, Image.Image]: connector = MediaConnector() return { @@ -39,7 +39,7 @@ def url_images() -> Dict[str, Image.Image]: } -def get_supported_suffixes() -> Tuple[str, ...]: +def get_supported_suffixes() -> tuple[str, ...]: # We should at least test the file types mentioned in GPT-4 with Vision OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif') @@ -66,7 +66,7 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.asyncio @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("suffix", get_supported_suffixes()) -async def test_fetch_image_base64(url_images: Dict[str, Image.Image], +async def test_fetch_image_base64(url_images: dict[str, Image.Image], image_url: str, suffix: str): connector = MediaConnector() 
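Note: the `tests/models/utils.py` hunks above rewrite module-level type aliases such as `TokensText` and `TokensTextLogprobs` with built-in generics. Unlike annotations, these aliases are evaluated at import time, so `list[int]`-style subscription genuinely requires Python 3.9+ and cannot be deferred with `from __future__ import annotations`. A small self-contained sketch with hypothetical alias names:

from typing import Optional

# Hypothetical aliases; evaluated at import time, hence the Python 3.9+ requirement.
TokensAndText = tuple[list[int], str]                 # was Tuple[List[int], str]
TokenLogprobs = Optional[list[dict[int, float]]]      # was Optional[List[Dict[int, float]]]


def summarize(sample: TokensAndText) -> str:
    token_ids, text = sample
    return f"{len(token_ids)} tokens: {text!r}"


print(summarize(([1, 2, 3], "hello world")))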
url_image = url_images[image_url] diff --git a/tests/neuron/test_logits_processor.py b/tests/neuron/test_logits_processor.py index 37d59c9e..6d151408 100644 --- a/tests/neuron/test_logits_processor.py +++ b/tests/neuron/test_logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Tuple from unittest.mock import patch import pytest @@ -33,7 +32,7 @@ class MockLogitsProcessor(LogitsProcessor): def _prepare_test( batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: +) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: vocab_size = 32000 input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, vocab_size), diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index a376d2cb..bc4a41cd 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -59,7 +60,7 @@ class MyGemma2Embedding(nn.Module): ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 0abbd8eb..e3016684 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -5,7 +5,6 @@ Run `pytest tests/quantization/test_configs.py --forked`. """ from dataclasses import dataclass -from typing import Tuple import pytest @@ -53,7 +52,7 @@ MODEL_ARG_EXPTYPES = [ @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) -def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: +def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype try: diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index da59dc75..f64dca6e 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -5,7 +5,7 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details. Run `pytest tests/quantization/test_register_quantization_config.py`. 
""" -from typing import Any, Dict, List, Optional +from typing import Any, Optional import pytest import torch @@ -58,7 +58,7 @@ class CustomQuantConfig(QuantizationConfig): """Name of the quantization method.""" return "custom_quant" - def get_supported_act_dtypes(self) -> List["torch.dtype"]: + def get_supported_act_dtypes(self) -> list["torch.dtype"]: """List of supported activation dtypes.""" return [torch.float16, torch.bfloat16] @@ -68,12 +68,12 @@ class CustomQuantConfig(QuantizationConfig): return -1 @staticmethod - def get_config_filenames() -> List[str]: + def get_config_filenames() -> list[str]: """List of filenames to search for in the model directory.""" return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig": + def from_config(cls, config: dict[str, Any]) -> "CustomQuantConfig": """Create a config class from the model's quantization config.""" return CustomQuantConfig(num_bits=config.get("num_bits", 8)) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 78bdd9b0..58c7c256 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -70,7 +68,7 @@ def test_get_prompt_logprobs( assert (len(logprobs) == num_top_logprobs or len(logprobs) == num_top_logprobs + 1) output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: List[str] = [] + output_string_from_most_likely_tokens_lst: list[str] = [] for top_logprobs in result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) output_string_from_most_likely_tokens_lst.append( diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 143f5299..29e73eb1 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -4,7 +4,7 @@ Run `pytest tests/samplers/test_no_bad_words.py`. 
""" -from typing import List, Optional +from typing import Optional from transformers import AutoTokenizer @@ -16,8 +16,8 @@ def _generate( prompt: str, num_prompt_tokens: int, temperature: float = 0, - bad_words: Optional[List[str]] = None, -) -> List[int]: + bad_words: Optional[list[str]] = None, +) -> list[int]: sampling_params = SamplingParams( temperature=temperature, bad_words=bad_words, @@ -59,7 +59,7 @@ class TestOneTokenBadWord: def _generate(self, model: LLM, - bad_words: Optional[List[str]] = None) -> List[int]: + bad_words: Optional[list[str]] = None) -> list[int]: return _generate( model=model, prompt=self.PROMPT, @@ -69,7 +69,7 @@ class TestOneTokenBadWord: def _encode(self, prompt: str, - add_special_tokens: bool = True) -> List[int]: + add_special_tokens: bool = True) -> list[int]: return self.tokenizer(prompt, add_special_tokens=add_special_tokens).input_ids @@ -149,7 +149,7 @@ class TestTwoTokenBadWord: def _generate(self, model: LLM, - bad_words: Optional[List[str]] = None) -> List[int]: + bad_words: Optional[list[str]] = None) -> list[int]: return _generate( model=model, prompt=self.PROMPT, @@ -158,7 +158,7 @@ class TestTwoTokenBadWord: ) @staticmethod - def _contains(sequence: List[int], subsequence: List[int]) -> bool: + def _contains(sequence: list[int], subsequence: list[int]) -> bool: searched = False for start in range(len(sequence)): @@ -181,6 +181,6 @@ class TestTwoTokenBadWord: def _encode(self, prompt: str, - add_special_tokens: bool = True) -> List[int]: + add_special_tokens: bool = True) -> list[int]: return self.tokenizer(prompt, add_special_tokens=add_special_tokens).input_ids diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index cc199bf6..2b86dcac 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Tests for rejection sampling.""" -from typing import List, Tuple import pytest import torch @@ -416,8 +415,8 @@ def test_rejection_sampling_approximates_target_distribution( draft_and_target_probs_equal) sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference: List[float] = [] - distance_wrt_target: List[float] = [] + distance_wrt_reference: list[float] = [] + distance_wrt_target: list[float] = [] for num_samples in sample_sizes: (reference_vs_rejsample_dist, @@ -452,7 +451,7 @@ def test_rejection_sampling_approximates_target_distribution( expected_improvement_multiplier) -def get_ratio_first_to_last(elements: List[float]) -> float: +def get_ratio_first_to_last(elements: list[float]) -> float: return elements[0] / elements[-1] @@ -477,7 +476,7 @@ class _CorrectnessTestHelper: def generate_probs_for_test( self, draft_and_target_probs_equal: bool - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: draft_probs, target_probs = (F.softmax( torch.rand(self.vocab_size, dtype=torch.float32), dim=-1, @@ -499,7 +498,7 @@ class _CorrectnessTestHelper: def run_and_compare_distributions(self, draft_probs: torch.Tensor, target_probs: torch.Tensor, reference_probs: torch.Tensor, - num_samples: int) -> Tuple[float, float]: + num_samples: int) -> tuple[float, float]: # Sample using rejection sampling. 
rej_sample_probs = self._estimate_rejection_sampling_pdf( draft_probs, target_probs, num_samples) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index ca09e536..68944ac7 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -3,7 +3,7 @@ import itertools import random from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional from unittest.mock import Mock, patch import pytest @@ -30,7 +30,7 @@ class MockLogitsSampler(Sampler): def _prepare_test( batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: +) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, VOCAB_SIZE), 1e-2, @@ -53,8 +53,8 @@ def _do_sample( sampling_params: SamplingParams, device: str, ): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -171,7 +171,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): def create_sampling_params(min_tokens, eos_token_id=0, *, - stop_token_ids: Optional[List[int]] = None, + stop_token_ids: Optional[list[int]] = None, prompt_logprobs: Optional[int] = None): sampling_params = SamplingParams( min_tokens=min_tokens, @@ -196,7 +196,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): batch_size = random.randint(1, 128) expected_penalization = [] - sequence_metadata_list: List[SequenceGroupMetadata] = [] + sequence_metadata_list: list[SequenceGroupMetadata] = [] # 20% chance to generate seq group metadata list with all prompts is_prompt = random.random() < 0.2 while batch_size > 0: @@ -216,8 +216,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): eos_token_id=eos_token_id, stop_token_ids=stop_token_ids) - seq_data: Dict[int, SequenceData] = {} - seq_group_penalization: List[bool] = [] + seq_data: dict[int, SequenceData] = {} + seq_group_penalization: list[bool] = [] for _ in range(num_seqs): num_input = random.randint(1, 100) num_generated = 0 if is_prompt else random.randint(1, 100) @@ -376,16 +376,16 @@ def test_sampler_min_tokens_penalty(seed: int, device: str): else: test_cases = [generate_test_case()] - def run_test_case(*, expected_penalization: List[bool], - seq_group_metadata_list: List[SequenceGroupMetadata]): + def run_test_case(*, expected_penalization: list[bool], + seq_group_metadata_list: list[SequenceGroupMetadata]): assert expected_penalization, \ "Invalid test case, need expected_penalization" assert seq_group_metadata_list, \ "Invalid test case, need seq_group_metadata_list" batch_size = 0 - seq_lens: List[int] = [] - sampling_params_per_row: List[SamplingParams] = [] + seq_lens: list[int] = [] + sampling_params_per_row: list[SamplingParams] = [] for sgm in seq_group_metadata_list: sampling_params = sgm.sampling_params @@ -456,11 +456,11 @@ def test_sampler_mixed(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler = _prepare_test(batch_size) - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - expected_tokens: List[Optional[List[int]]] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + expected_tokens: list[Optional[list[int]]] = [] + seq_lens: list[int] = [] for i in range(batch_size): - expected: 
Optional[List[int]] = None + expected: Optional[list[int]] = None sampling_type = random.randint(0, 2) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) @@ -492,7 +492,7 @@ def test_sampler_mixed(seed: int, device: str): )) seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - generators: Dict[str, torch.Generator] = {} + generators: dict[str, torch.Generator] = {} def test_sampling(): sampling_metadata = SamplingMetadata.prepare( @@ -587,8 +587,8 @@ def test_sampler_top_k_top_p(seed: int, device: str): device=device) assert len(processors) == 2 # top_p and top_k - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -669,10 +669,10 @@ def test_sampler_repetition_penalty_mixed(device: str): vocab_size = 8 - def test_sampling_params(sampling_params: List[SamplingParams]): + def test_sampling_params(sampling_params: list[SamplingParams]): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] for i in range(2): seq_group_metadata_list.append( SequenceGroupMetadata( diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 53c88881..fe4a1c13 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence from itertools import cycle -from typing import List, Optional, Sequence, Tuple, Union +from typing import Optional, Union import pytest import torch @@ -64,9 +65,9 @@ def maybe_assert_ngram_worker(llm): def get_output_from_llm_generator( llm_generator, prompts, - sampling_params) -> Tuple[List[str], List[List[int]], float]: - tokens: List[str] = [] - token_ids: List[List[int]] = [] + sampling_params) -> tuple[list[str], list[list[int]], float]: + tokens: list[str] = [] + token_ids: list[list[int]] = [] acceptance_rate: float = -1.0 for llm in llm_generator(): maybe_assert_ngram_worker(llm) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index fe95ff9b..9edd8bd4 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -42,7 +40,7 @@ def test_get_token_ids_to_score(k: int): device='cuda', ) - expected_output: List[List[int]] = [ + expected_output: list[list[int]] = [ [], ] for i in range(proposal_token_ids.shape[0]): diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 2bf40161..ca37c9a6 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -221,7 +220,7 @@ def test_same_output_for_multi_step(): # Run single-step repeatedly. 
zero_kv_cache(worker.cache_engine) - single_step_output: List[SamplerOutput] = [] + single_step_output: list[SamplerOutput] = [] continuations = [[1] for _ in prompts] set_random_seed(seed) @@ -243,15 +242,15 @@ def test_same_output_for_multi_step(): continuations[i].append(seq_group_output.samples[0].output_token) # Get token ids and logprobs for comparison. - multi_step_output_logprobs: List[List[Dict[int, + multi_step_output_logprobs: list[list[dict[int, Logprob]]] = [[] for _ in prompts] - single_step_output_logprobs: List[List[Dict[int, + single_step_output_logprobs: list[list[dict[int, Logprob]]] = [[] for _ in prompts] - multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts] - single_step_output_token_ids: List[List[int]] = [[] for _ in prompts] + multi_step_output_token_ids: list[list[int]] = [[] for _ in prompts] + single_step_output_token_ids: list[list[int]] = [[] for _ in prompts] for i, _ in enumerate(prompts): for multi_step, single_step in zip(multi_step_output, single_step_output): @@ -336,7 +335,7 @@ def test_multi_step_with_batch_expansion_correct_output(): # will simulate the bonus token case with the second token # being the bonus token. zero_kv_cache(worker.cache_engine) - single_step_output: List[SamplerOutput] = [] + single_step_output: list[SamplerOutput] = [] set_random_seed(seed) for _ in range(num_steps): seq_group_metadata_list = create_seq_group_metadata_from_prompts( @@ -430,7 +429,7 @@ def test_multi_step_with_batch_expansion_incorrect_output(): # will simulate the bonus token case with the second token # being the bonus token. zero_kv_cache(worker.cache_engine) - single_step_output: List[SamplerOutput] = [] + single_step_output: list[SamplerOutput] = [] set_random_seed(seed) for _ in range(num_steps): seq_group_metadata_list = create_seq_group_metadata_from_prompts( diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 7bbbb023..161cc9fb 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import List import pytest import torch @@ -15,7 +14,7 @@ from vllm.worker.worker import Worker from .utils import create_batch, create_worker -def create_proposal(propose_lens: List[int], vocab_size: int, +def create_proposal(propose_lens: list[int], vocab_size: int, device: str) -> SpeculativeProposals: batch_size = len(propose_lens) max_propose_len = max(propose_lens) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e4b1a178..f7ef9786 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -3,7 +3,6 @@ import random from collections import defaultdict from types import SimpleNamespace -from typing import Dict, List, Set from unittest.mock import MagicMock import pytest @@ -123,7 +122,7 @@ def test_batch_expansion_correctly_calls_target_model( seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)) - seen_contexts: List[List[int]] = [] + seen_contexts: list[list[int]] = [] call_args_list = target_worker.execute_model.call_args_list assert len(call_args_list) == 1 @@ -136,7 +135,7 @@ def test_batch_expansion_correctly_calls_target_model( for seq_data in seq_group_metadata.seq_data.values(): seen_contexts.append(seq_data.get_token_ids()) - expected_seen_contexts: List[List[int]] = [] + expected_seen_contexts: list[list[int]] = [] for prompt, prev_generated, draft_tokens in zip( 
prompts, prev_output_tokens, proposal_token_ids.tolist()): @@ -338,11 +337,11 @@ def test_correctly_formats_output(k: int, batch_size: int, next(iter(seq_group_metadata.seq_data.keys())) for seq_group_metadata in seq_group_metadata_list ] - actual_output_by_seq: Dict[int, List[SequenceOutput]] = { + actual_output_by_seq: dict[int, list[SequenceOutput]] = { seq_id: [] for seq_id in seq_ids } - expected_output_by_seq: Dict[int, List[SequenceOutput]] = { + expected_output_by_seq: dict[int, list[SequenceOutput]] = { seq_id: [] for seq_id in seq_ids } @@ -728,7 +727,7 @@ def test_populate_seq_ids_with_bonus_tokens(): size=(batch_size, (k + 1)), dtype=torch.int64, device='cuda') - expected_request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set) + expected_request_id_seq_ids_mapping: dict[str, set[int]] = defaultdict(set) for seq_group_metadata in seq_group_metadata_list: for seq_id in seq_group_metadata.seq_data: expected_request_id_seq_ids_mapping[ diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 38f57e99..d303b7f1 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence as GenericSequence from itertools import count -from typing import Callable, Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import TypeVar, Union +from typing import Callable, Optional, TypeVar, Union from unittest.mock import MagicMock import torch @@ -44,7 +43,7 @@ def mock_worker(cls=None, return worker -def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): +def patch_execute_model_with_seeds(worker: Worker, rand_seeds: list[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -56,7 +55,7 @@ def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): return new_execute_model -def zero_kv_cache(cache_engine: List[CacheEngine]): +def zero_kv_cache(cache_engine: list[CacheEngine]): assert cache_engine[0].gpu_cache for key_blocks, value_blocks in cache_engine[0].gpu_cache: key_blocks.zero_() @@ -106,13 +105,13 @@ def create_worker(cls: Callable[..., T], def create_seq_group_metadata_from_prompts( - prompts: List[List[int]], + prompts: list[list[int]], num_gpu_blocks: int, block_size: int, - final_prompt_lens: List[int], - continuations: Optional[List[List[int]]] = None, - seq_ids: Optional[List[int]] = None, -) -> List[SequenceGroupMetadata]: + final_prompt_lens: list[int], + continuations: Optional[list[list[int]]] = None, + seq_ids: Optional[list[int]] = None, +) -> list[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] @@ -149,11 +148,11 @@ def create_seq_group_metadata_from_prompts( def create_chunked_seq_group_metadata_from_prompt( - prompt: List[int], + prompt: list[int], num_gpu_blocks: int, chunk_size: int, block_size: int, - seq_id: Optional[int] = None) -> List[SequenceGroupMetadata]: + seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: if seq_id is None: seq_id = 0 @@ -184,8 +183,8 @@ def create_chunked_seq_group_metadata_from_prompt( def assert_logprobs_dict_allclose( - actual_logprobs: List[Dict[int, Logprob]], - expected_logprobs: List[Dict[int, Logprob]]) -> None: + actual_logprobs: list[dict[int, Logprob]], + expected_logprobs: list[dict[int, Logprob]]) -> None: for single_step_actual_logprobs, single_step_expected_logprobs in zip( actual_logprobs, expected_logprobs): assert set(single_step_actual_logprobs.keys()) == set( 
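Note: the `tests/spec_decode/utils.py` hunk above keeps the existing `GenericSequence` alias and only changes its source module from `typing` to `collections.abc`; the alias presumably avoids confusion with vLLM's own `Sequence`-named classes used in these tests (an inference, not stated in the patch). A minimal sketch of the aliased-import style, with hypothetical names:

from collections.abc import Sequence as GenericSequence


def total_tokens(token_ids_per_step: GenericSequence[list[int]]) -> int:
    # Accepts any sequence type (list, tuple, ...) of per-step token-id lists.
    return sum(len(step) for step in token_ids_per_step)


print(total_tokens([[1, 2], [3]]))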
@@ -202,7 +201,7 @@ def create_sampler_output_list( token_ids: torch.Tensor, probs: GenericSequence[Optional[torch.Tensor]], logprobs: GenericSequence[Optional[torch.Tensor]], - seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + seq_ids: Optional[list[int]] = None) -> list[SamplerOutput]: num_steps, batch_size = token_ids.shape token_ids_by_step = token_ids.tolist() @@ -231,9 +230,9 @@ def create_sampler_output_list( def create_batch(batch_size, k, - prompt_len: Union[int, List[int]] = 10, + prompt_len: Union[int, list[int]] = 10, prev_output_token_len: int = 10, - seq_ids: Optional[List[int]] = None, + seq_ids: Optional[list[int]] = None, num_gpu_blocks: Optional[int] = None, block_size: Optional[int] = None, prefill_chunk_size: Optional[int] = None): diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 17c128a1..05d2c624 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -3,7 +3,7 @@ Run `pytest tests/test_cache_block_hashing.py`. """ -from typing import List, Optional +from typing import Optional import pytest @@ -44,7 +44,7 @@ def flatten_2d(li): @pytest.mark.parametrize("concurrent_lora_int_ids", [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, - concurrent_lora_int_ids: List[Optional[int]]): + concurrent_lora_int_ids: list[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -53,7 +53,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, max_input_length=None, ) - hashes: List[List[List[int]]] = [] + hashes: list[list[list[int]]] = [] for prefix in prefixes: for lora_int_id in concurrent_lora_int_ids: diff --git a/tests/test_inputs.py b/tests/test_inputs.py index fff90915..d361808e 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest from vllm.inputs import zip_enc_dec_prompts @@ -45,7 +43,7 @@ def test_parse_single_batch_string_consistent(string_input: str): @pytest.mark.parametrize('token_input', TOKEN_INPUTS) -def test_parse_single_batch_token_consistent(token_input: List[int]): +def test_parse_single_batch_token_consistent(token_input: list[int]): assert parse_and_batch_prompt(token_input) \ == parse_and_batch_prompt([token_input]) diff --git a/tests/test_logger.py b/tests/test_logger.py index 993822e9..11deae30 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -155,7 +155,7 @@ def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json( with pytest.raises(ValueError) as ex_info: _configure_vllm_root_logger() assert ex_info.type == ValueError # noqa: E721 - assert "Invalid logging config. Expected Dict, got" in str(ex_info) + assert "Invalid logging config. 
Expected dict, got" in str(ex_info) @patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 487fbb8f..8301c645 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Tuple from unittest.mock import patch import pytest @@ -33,7 +32,7 @@ class MockLogitsProcessor(LogitsProcessor): def _prepare_test( batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: +) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: vocab_size = 32000 input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) fake_logits = torch.full((batch_size, vocab_size), diff --git a/tests/test_utils.py b/tests/test_utils.py index 5b69ffd1..8b67e92f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,7 +3,7 @@ import asyncio import os import socket -from typing import AsyncIterator, Tuple +from collections.abc import AsyncIterator from unittest.mock import patch import pytest @@ -33,7 +33,7 @@ async def test_merge_async_iterators(): iterators = [mock_async_iterator(i) for i in range(3)] merged_iterator = merge_async_iterators(*iterators) - async def stream_output(generator: AsyncIterator[Tuple[int, str]]): + async def stream_output(generator: AsyncIterator[tuple[int, str]]): async for idx, output in generator: print(f"idx: {idx}, output: {output}") diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 851c79d2..9aa2eea3 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Generator, List, Optional +from collections.abc import Generator +from typing import Any, Optional import pytest from transformers import AutoTokenizer @@ -163,7 +164,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, - tokenizer) -> List[int]: + tokenizer) -> list[int]: complete_sequence_token_ids = tokenizer(complete_sequence).input_ids return complete_sequence_token_ids @@ -178,7 +179,7 @@ def create_sequence(prompt_token_ids=None): def create_dummy_logprobs( - complete_sequence_token_ids: List[int]) -> List[Dict[int, Logprob]]: + complete_sequence_token_ids: list[int]) -> list[dict[int, Logprob]]: return [{ token_id: Logprob(logprob=0.0), token_id + 1: Logprob(logprob=0.1) @@ -186,10 +187,10 @@ def create_dummy_logprobs( def create_dummy_prompt_logprobs( - complete_sequence_token_ids: List[int] -) -> List[Optional[Dict[int, Any]]]: + complete_sequence_token_ids: list[int] +) -> list[Optional[dict[int, Any]]]: # logprob for the first prompt token is None. 
- logprobs: List[Optional[Dict[int, Any]]] = [None] + logprobs: list[Optional[dict[int, Any]]] = [None] logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:]) return logprobs @@ -198,7 +199,7 @@ def create_dummy_prompt_logprobs( @pytest.mark.parametrize("tokenizer_name", TOKENIZERS) @pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True) def test_decode_sequence_logprobs(complete_sequence: str, - complete_sequence_token_ids: List[int], + complete_sequence_token_ids: list[int], detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" @@ -208,8 +209,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, # Run sequentially. seq = create_sequence() dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token: List[str] = [] - sequential_logprobs_text_other_token: List[str] = [] + sequential_logprobs_text_chosen_token: list[str] = [] + sequential_logprobs_text_other_token: list[str] = [] for new_token, logprobs in zip(complete_sequence_token_ids, dummy_logprobs): seq.append_token_id(new_token, logprobs) @@ -232,7 +233,7 @@ def test_decode_sequence_logprobs(complete_sequence: str, @pytest.mark.parametrize("complete_sequence", TRUTH) @pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], +def test_decode_prompt_logprobs(complete_sequence_token_ids: list[int], detokenizer: Detokenizer): """Verify Detokenizer decodes prompt logprobs correctly.""" sampling_params = SamplingParams(skip_special_tokens=True, @@ -249,7 +250,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], dummy_logprobs, position_offset=0) # First logprob is None. - decoded_prompt_logprobs: List[Dict[int, Any]] = dummy_logprobs[ + decoded_prompt_logprobs: list[dict[int, Any]] = dummy_logprobs[ 1:] # type: ignore # decoded_prompt_logprobs doesn't contain the first token. 
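Note: several hunks in this region (e.g. `tests/tokenization/test_detokenize.py` above and `tests/tool_use/test_jamba_tool_parser.py` below) move `Generator` from `typing` to `collections.abc`, where the generic ABCs now live; `typing.Generator` has been a deprecated alias since Python 3.9. A minimal sketch of the import style, with hypothetical names:

from collections.abc import Generator


def dummy_logprobs(token_ids: list[int]) -> Generator[dict[int, float], None, None]:
    # Yields one {token_id: logprob} mapping per input token.
    for token_id in token_ids:
        yield {token_id: 0.0}


print(list(dummy_logprobs([10, 11, 12])))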
diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 8e99f869..d1873823 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -3,7 +3,7 @@ import asyncio import os import sys -from typing import List, Optional +from typing import Optional from unittest.mock import patch import pytest @@ -129,7 +129,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): def __init__(self, *args, - fail_at: Optional[List[int]] = None, + fail_at: Optional[list[int]] = None, **kwargs): super().__init__(*args, **kwargs) self.i = 0 diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py index 793d38f9..772eeb34 100644 --- a/tests/tokenization/test_tokenizer_registry.py +++ b/tests/tokenization/test_tokenizer_registry.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer_base import (TokenizerBase, @@ -17,15 +17,15 @@ class TestTokenizer(TokenizerBase): return TestTokenizer() @property - def all_special_tokens_extended(self) -> List[str]: + def all_special_tokens_extended(self) -> list[str]: raise NotImplementedError() @property - def all_special_tokens(self) -> List[str]: + def all_special_tokens(self) -> list[str]: raise NotImplementedError() @property - def all_special_ids(self) -> List[int]: + def all_special_ids(self) -> list[int]: raise NotImplementedError() @property @@ -58,7 +58,7 @@ class TestTokenizer(TokenizerBase): def __call__( self, - text: Union[str, List[str], List[int]], + text: Union[str, list[str], list[int]], text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, @@ -66,10 +66,10 @@ class TestTokenizer(TokenizerBase): ): raise NotImplementedError() - def get_vocab(self) -> Dict[str, int]: + def get_vocab(self) -> dict[str, int]: raise NotImplementedError() - def get_added_vocab(self) -> Dict[str, int]: + def get_added_vocab(self) -> dict[str, int]: raise NotImplementedError() def encode_one( @@ -77,33 +77,33 @@ class TestTokenizer(TokenizerBase): text: str, truncation: bool = False, max_length: Optional[int] = None, - ) -> List[int]: + ) -> list[int]: raise NotImplementedError() def encode(self, text: str, - add_special_tokens: Optional[bool] = None) -> List[int]: + add_special_tokens: Optional[bool] = None) -> list[int]: raise NotImplementedError() def apply_chat_template(self, - messages: List["ChatCompletionMessageParam"], - tools: Optional[List[Dict[str, Any]]] = None, - **kwargs) -> List[int]: + messages: list["ChatCompletionMessageParam"], + tools: Optional[list[dict[str, Any]]] = None, + **kwargs) -> list[int]: raise NotImplementedError() - def convert_tokens_to_string(self, tokens: List[str]) -> str: + def convert_tokens_to_string(self, tokens: list[str]) -> str: raise NotImplementedError() def decode(self, - ids: Union[List[int], int], + ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: raise NotImplementedError() def convert_ids_to_tokens( self, - ids: List[int], + ids: list[int], skip_special_tokens: bool = True, - ) -> List[str]: + ) -> list[str]: raise NotImplementedError() diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index da033fa1..448347be 100644 --- 
a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import openai import pytest @@ -45,7 +43,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, logprobs=False, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False @@ -116,7 +114,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, stream=True, ) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 7e349c51..a4067574 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Generator, List, Optional +from collections.abc import Generator +from typing import Optional import partial_json_parser import pytest @@ -26,8 +27,8 @@ def jamba_tool_parser(jamba_tokenizer): return JambaToolParser(jamba_tokenizer) -def assert_tool_calls(actual_tool_calls: List[ToolCall], - expected_tool_calls: List[ToolCall]): +def assert_tool_calls(actual_tool_calls: list[ToolCall], + expected_tool_calls: list[ToolCall]): assert len(actual_tool_calls) == len(expected_tool_calls) for actual_tool_call, expected_tool_call in zip(actual_tool_calls, @@ -218,10 +219,10 @@ def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer, model_output, expected_tool_calls, expected_content): other_content: str = '' - function_names: List[str] = [] - function_args_strs: List[str] = [] + function_names: list[str] = [] + function_args_strs: list[str] = [] tool_call_idx: int = -1 - tool_call_ids: List[Optional[str]] = [] + tool_call_ids: list[Optional[str]] = [] for delta_message in stream_delta_message_generator( jamba_tool_parser, jamba_tokenizer, model_output): diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index b49a5e8e..910e0b2d 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, List, Optional +from typing import Optional import openai import pytest @@ -54,7 +54,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, assert isinstance(tool_call.function.arguments, str) parsed_arguments = json.loads(tool_call.function.arguments) - assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments, dict) assert isinstance(parsed_arguments.get("city"), str) assert isinstance(parsed_arguments.get("state"), str) @@ -73,8 +73,8 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, role_name: Optional[str] = None finish_reason_count: int = 0 - tool_call_names: List[str] = [] - tool_call_args: List[str] = [] + tool_call_names: list[str] = [] + tool_call_args: list[str] = [] tool_call_idx: int = -1 tool_call_id_count: int = 0 @@ -180,7 +180,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, logprobs=False, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index 45f1bfc4..b320b335 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -1,7 +1,7 @@ 
# SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, List, Optional +from typing import Optional import openai import pytest @@ -44,7 +44,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): # make sure the arguments parse properly parsed_arguments = json.loads(tool_calls[0].function.arguments) - assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments, dict) assert isinstance(parsed_arguments.get("city"), str) assert isinstance(parsed_arguments.get("state"), str) assert parsed_arguments.get("city") == "Dallas" @@ -117,7 +117,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): # validate arguments streamed_args = json.loads(function_args_str) - assert isinstance(streamed_args, Dict) + assert isinstance(streamed_args, dict) assert isinstance(streamed_args.get("city"), str) assert isinstance(streamed_args.get("state"), str) assert streamed_args.get("city") == "Dallas" @@ -128,7 +128,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): assert choice.message.role == role_name assert choice.message.tool_calls[0].function.name == function_name - # compare streamed with non-streamed args Dict-wise, not string-wise + # compare streamed with non-streamed args dict-wise, not string-wise # because character-to-character comparison might not work e.g. the tool # call parser adding extra spaces or something like that. we care about the # dicts matching not byte-wise match @@ -167,7 +167,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): logprobs=False, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 role_sent: bool = False diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index a7dfb107..fd947bd7 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from copy import deepcopy -from typing import Any, Dict, List, Optional +from typing import Any, Optional from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionToolParam) @@ -12,14 +12,14 @@ from tests.utils import VLLM_PATH class ServerConfig(TypedDict, total=False): model: str - arguments: List[str] + arguments: list[str] system_prompt: Optional[str] supports_parallel: Optional[bool] supports_rocm: Optional[bool] -def patch_system_prompt(messages: List[Dict[str, Any]], - system_prompt: str) -> List[Dict[str, Any]]: +def patch_system_prompt(messages: list[dict[str, Any]], + system_prompt: str) -> list[dict[str, Any]]: new_messages = deepcopy(messages) if new_messages[0]["role"] == "system": new_messages[0]["content"] = system_prompt @@ -28,8 +28,8 @@ def patch_system_prompt(messages: List[Dict[str, Any]], return new_messages -def ensure_system_prompt(messages: List[Dict[str, Any]], - config: ServerConfig) -> List[Dict[str, Any]]: +def ensure_system_prompt(messages: list[dict[str, Any]], + config: ServerConfig) -> list[dict[str, Any]]: prompt = config.get("system_prompt") if prompt: return patch_system_prompt(messages, prompt) @@ -39,9 +39,9 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], # universal args for all models go here. also good if you need to test locally # and change type or KV cache quantization or something. 
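Note: besides annotations, the tool-use tests above change runtime checks from `isinstance(parsed_arguments, Dict)` to the builtin `dict`, which is the idiomatic concrete-type check; if the intent were "any mapping", `collections.abc.Mapping` would be the abstract alternative. A small sketch (the JSON payload is made up):

import json

# Made-up payload; json.loads returns builtin containers, so dict is the natural check.
parsed_arguments = json.loads('{"city": "Dallas", "state": "TX"}')

assert isinstance(parsed_arguments, dict)             # was isinstance(..., Dict)
assert isinstance(parsed_arguments.get("city"), str)
print(parsed_arguments)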
-ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"] +ARGS: list[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"] -CONFIGS: Dict[str, ServerConfig] = { +CONFIGS: dict[str, ServerConfig] = { "hermes": { "model": "NousResearch/Hermes-3-Llama-3.1-8B", @@ -205,7 +205,7 @@ SEARCH_TOOL: ChatCompletionToolParam = { } } -MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{ +MESSAGES_WITHOUT_TOOLS: list[ChatCompletionMessageParam] = [{ "role": "user", "content": @@ -222,14 +222,14 @@ MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{ "Can you tell me a joke please?" }] -MESSAGES_ASKING_FOR_TOOLS: List[ChatCompletionMessageParam] = [{ +MESSAGES_ASKING_FOR_TOOLS: list[ChatCompletionMessageParam] = [{ "role": "user", "content": "What is the weather in Dallas, Texas in Fahrenheit?" }] -MESSAGES_WITH_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ +MESSAGES_WITH_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{ "role": "user", "content": @@ -258,7 +258,7 @@ MESSAGES_WITH_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ "cloudy skies and a low chance of rain." }] -MESSAGES_ASKING_FOR_PARALLEL_TOOLS: List[ChatCompletionMessageParam] = [{ +MESSAGES_ASKING_FOR_PARALLEL_TOOLS: list[ChatCompletionMessageParam] = [{ "role": "user", "content": @@ -266,7 +266,7 @@ MESSAGES_ASKING_FOR_PARALLEL_TOOLS: List[ChatCompletionMessageParam] = [{ "Fahrenheit?" }] -MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ +MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{ "role": "user", "content": diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 592775e8..5fc5d08b 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -2,8 +2,9 @@ import os import threading +from collections.abc import Iterable from concurrent import futures -from typing import Callable, Dict, Iterable, Literal +from typing import Callable, Literal import grpc import pytest @@ -25,7 +26,7 @@ FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value', def decode_value(value: AnyValue): - field_decoders: Dict[FieldName, Callable] = { + field_decoders: dict[FieldName, Callable] = { "bool_value": (lambda v: v.bool_value), "string_value": (lambda v: v.string_value), "int_value": (lambda v: v.int_value), diff --git a/tests/utils.py b/tests/utils.py index 2ad91ca2..5a97636e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,7 +11,7 @@ import time import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Type, Union +from typing import Any, Callable, Optional, Union import openai import pytest @@ -73,9 +73,9 @@ class RemoteOpenAIServer: def __init__(self, model: str, - vllm_serve_args: List[str], + vllm_serve_args: list[str], *, - env_dict: Optional[Dict[str, str]] = None, + env_dict: Optional[dict[str, str]] = None, auto_port: bool = True, max_wait_seconds: Optional[float] = None) -> None: if auto_port: @@ -183,7 +183,7 @@ def _test_completion( client: openai.OpenAI, model: str, prompt: str, - token_ids: List[int], + token_ids: list[int], ): results = [] @@ -400,10 +400,10 @@ def _test_image_text( def compare_two_settings(model: str, - arg1: List[str], - arg2: List[str], - env1: Optional[Dict[str, str]] = None, - env2: Optional[Dict[str, str]] = None, + arg1: list[str], + arg2: list[str], + env1: Optional[dict[str, str]] = None, + env2: Optional[dict[str, str]] = None, *, method: str = 
"generate", max_wait_seconds: Optional[float] = None) -> None: @@ -429,8 +429,8 @@ def compare_two_settings(model: str, def compare_all_settings(model: str, - all_args: List[List[str]], - all_envs: List[Optional[Dict[str, str]]], + all_args: list[list[str]], + all_envs: list[Optional[dict[str, str]]], *, method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: @@ -470,7 +470,7 @@ def compare_all_settings(model: str, prompt = "Hello, my name is" token_ids = tokenizer(prompt).input_ids - ref_results: List = [] + ref_results: list = [] for i, (args, env) in enumerate(zip(all_args, all_envs)): if can_force_load_format: # we are comparing the results and @@ -481,7 +481,7 @@ def compare_all_settings(model: str, # environment variable to force the load format, # e.g. in quantization tests. args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT] - compare_results: List = [] + compare_results: list = [] results = ref_results if i == 0 else compare_results with RemoteOpenAIServer(model, args, @@ -582,7 +582,7 @@ def multi_process_parallel( @contextmanager -def error_on_warning(category: Type[Warning] = Warning): +def error_on_warning(category: type[Warning] = Warning): """ Within the scope of this context manager, tests will fail if any warning of the given category is emitted. @@ -604,7 +604,7 @@ def get_physical_device_indices(devices): @_nvml() -def wait_for_gpu_memory_to_clear(devices: List[int], +def wait_for_gpu_memory_to_clear(devices: list[int], threshold_bytes: int, timeout_s: float = 120) -> None: # Use nvml instead of pytorch to reduce measurement error from torch cuda @@ -612,8 +612,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int], devices = get_physical_device_indices(devices) start_time = time.time() while True: - output: Dict[int, str] = {} - output_raw: Dict[int, float] = {} + output: dict[int, str] = {} + output_raw: dict[int, float] = {} for device in devices: if current_platform.is_rocm(): dev_handle = amdsmi_get_processor_handles()[device] @@ -758,13 +758,13 @@ def multi_gpu_test(*, num_gpus: int): async def completions_with_server_args( - prompts: List[str], + prompts: list[str], model_name: str, - server_cli_args: List[str], + server_cli_args: list[str], num_logprobs: Optional[int], max_wait_seconds: int = 240, max_tokens: Union[int, list] = 5, -) -> List[Completion]: +) -> list[Completion]: '''Construct a remote OpenAI server, obtain an async client to the server & invoke the completions API to obtain completions. @@ -807,7 +807,7 @@ async def completions_with_server_args( return outputs -def get_client_text_generations(completions: List[Completion]) -> List[str]: +def get_client_text_generations(completions: list[Completion]) -> list[str]: '''Extract generated tokens from the output of a request made to an Open-AI-protocol completions endpoint. 
''' @@ -816,7 +816,7 @@ def get_client_text_generations(completions: List[Completion]) -> List[str]: def get_client_text_logprob_generations( - completions: List[Completion]) -> List[TextTextLogprobs]: + completions: list[Completion]) -> list[TextTextLogprobs]: '''Operates on the output of a request made to an Open-AI-protocol completions endpoint; obtains top-rank logprobs for each token in each :class:`SequenceGroup` diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 8956393c..cce2fb2c 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching.""" -from typing import List import pytest @@ -434,7 +433,7 @@ def test_cache_blocks(): # Test that blocks are cached correctly for 2 full blocks from the start. blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: List[BlockHashType] = [] + block_hashes: list[BlockHashType] = [] block_pool.cache_full_blocks( request=req, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index eb730973..f45c21ab 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import Optional from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange @@ -48,9 +48,9 @@ def create_scheduler( def create_requests( num_requests: int, num_tokens: int = 10, - mm_positions: Optional[List[PlaceholderRange]] = None, + mm_positions: Optional[list[PlaceholderRange]] = None, max_tokens: int = 16, - stop_token_ids: Optional[List[int]] = None, + stop_token_ids: Optional[list[int]] = None, ): sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens, diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index 560dc312..8872f038 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Tuple - import pytest import torch from transformers import AutoTokenizer @@ -17,8 +15,8 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from tests.v1.engine.utils import FULL_STRINGS # isort: skip -EngineCoreSampleLogprobsType = List[Tuple[torch.Tensor, torch.Tensor]] -EngineCorePromptLogprobsType = Tuple[torch.Tensor, torch.Tensor] +EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]] +EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor] def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index d864cb2a..e7b91aeb 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -2,7 +2,7 @@ import asyncio from contextlib import ExitStack -from typing import List, Optional, Tuple +from typing import Optional import pytest @@ -47,7 +47,7 @@ async def generate(engine: AsyncLLM, prompt: PromptType, output_kind: RequestOutputKind, max_tokens: int, - prompt_logprobs: Optional[int] = None) -> Tuple[int, str]: + prompt_logprobs: Optional[int] = None) -> tuple[int, str]: # Ensure generate doesn't complete too fast for cancellation test. 
await asyncio.sleep(0.2) @@ -114,7 +114,7 @@ async def test_async_llm_refuses_prompt_logprobs_with_apc( (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio async def test_load(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: Tuple[AsyncEngineArgs, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the @@ -160,7 +160,7 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind, (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio async def test_abort(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: Tuple[AsyncEngineArgs, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): with monkeypatch.context() as m, ExitStack() as after: @@ -177,7 +177,7 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind, request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] # Create concurrent requests. - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] for request_id in request_ids: tasks.append( asyncio.create_task( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8c2998e5..11c22eff 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -5,7 +5,6 @@ import threading import time import uuid from concurrent.futures import Future -from typing import List import pytest from transformers import AutoTokenizer @@ -213,7 +212,7 @@ def test_engine_core_concurrent_batches(monkeypatch): class DummyExecutor(UniProcExecutor): def initialize_from_config( - self, kv_cache_configs: List[KVCacheConfig]) -> None: + self, kv_cache_configs: list[KVCacheConfig]) -> None: super().initialize_from_config(kv_cache_configs) # This executor actually can only run 1 batch at a time diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index a7c02322..3880a3dd 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -3,7 +3,7 @@ import asyncio import time import uuid -from typing import Dict, List, Optional +from typing import Optional import pytest from transformers import AutoTokenizer @@ -44,7 +44,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: ) -def loop_until_done(client: EngineCoreClient, outputs: Dict): +def loop_until_done(client: EngineCoreClient, outputs: dict): while True: engine_core_outputs = client.get_output().outputs @@ -62,7 +62,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict): break -async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): +async def loop_until_done_async(client: EngineCoreClient, outputs: dict): while True: engine_core_outputs = (await client.get_output_async()).outputs @@ -121,7 +121,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): client.add_request(request) time.sleep(0.01) - outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + outputs: dict[str, list] = {req_id: [] for req_id in request_ids} loop_until_done(client, outputs) for req_id in request_ids: @@ -207,7 +207,7 @@ async def test_engine_core_client_asyncio(monkeypatch): await client.add_request_async(request) await asyncio.sleep(0.01) - outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + outputs: dict[str, list] = {req_id: [] for req_id in request_ids} await loop_until_done_async(client, outputs) for 
req_id in request_ids: diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index de2a39ee..33c884e6 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import random -from typing import Dict, List, Optional, Tuple +from typing import Optional import pytest @@ -47,9 +47,9 @@ def vllm_model_apc(vllm_runner, monkeypatch): def _get_test_sampling_params( - prompt_list: List[str], + prompt_list: list[str], seed: Optional[int] = 42, -) -> Tuple[List[SamplingParams], List[int]]: +) -> tuple[list[SamplingParams], list[int]]: """Generate random sampling params for a batch.""" def get_mostly_n_gt1() -> int: @@ -81,7 +81,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: # Validate each request response for out, n in zip(outputs, n_list): - completion_counts: Dict[str, int] = {} + completion_counts: dict[str, int] = {} # Assert correct number of completions assert len(out.outputs) == n, ( f"{len(out.outputs)} completions; {n} expected.") diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 1d47df41..0de853ba 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -2,7 +2,7 @@ import math import time -from typing import Dict, List, Optional +from typing import Optional import pytest @@ -112,12 +112,12 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, def _validate_logprobs( - gen_tokens: Dict[str, List[int]], - gen_logprobs: Dict[str, Optional[SampleLogprobs]], - gen_prompt_logprobs: Dict[str, Optional[PromptLogprobs]], - gen_cumulative_logprob: Dict[str, float], + gen_tokens: dict[str, list[int]], + gen_logprobs: dict[str, Optional[SampleLogprobs]], + gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]], + gen_cumulative_logprob: dict[str, float], dtv: DummyOutputProcessorTestVectors, - request_id_list: List[str], + request_id_list: list[str], num_sample_logprobs: Optional[int], num_prompt_logprobs: Optional[int], ) -> None: diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 39248ce8..02baa480 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -2,7 +2,7 @@ import random from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -61,7 +61,7 @@ def _create_random_top_logprob_test_vector( def _create_random_top_logprob_test_matrix( - shape: Tuple, + shape: tuple, lower: float, upper: float, ) -> torch.Tensor: @@ -90,7 +90,7 @@ def _create_random_top_token_test_vector( lower: int, upper: int, sampled_token_id: int, - adjust_num_logprobs: bool = True) -> Tuple[torch.Tensor, int]: + adjust_num_logprobs: bool = True) -> tuple[torch.Tensor, int]: """Create a random vector of top logprob token indices Use to create fake sample logprobs for testing. The sampled token @@ -141,11 +141,11 @@ def _create_random_top_token_test_vector( def _create_random_top_token_test_matrix( - shape: Tuple[int, int], + shape: tuple[int, int], lower: int, upper: int, - tokens_list: List[int], -) -> Tuple[torch.Tensor, torch.Tensor]: + tokens_list: list[int], +) -> tuple[torch.Tensor, torch.Tensor]: """Create a random matrix of top logprob token indices Use to create fake prompt logprobs for testing. 
@@ -160,7 +160,7 @@ def _create_random_top_token_test_matrix( upper: upper range of token ids Returns: - Tuple containing: + tuple containing: - 2D num_tokens x num_logprobs+1 torch Tensor of token ids - 1D tensor of ranks of prompt tokens in their respective rows, or random values @@ -206,10 +206,10 @@ def decode_token( def generate_dummy_sample_logprobs( - sampled_tokens_list: List, + sampled_tokens_list: list, num_logprobs: int, tokenizer: PreTrainedTokenizer, -) -> List[Tuple[List[int], List[float], int]]: +) -> list[tuple[list[int], list[float], int]]: """Generate dummy sample logprobs Generate a test data structure which imitates the list of sample logprobs @@ -221,7 +221,7 @@ def generate_dummy_sample_logprobs( tokenizer: model tokenizer to use for detokenization Returns - List of (top token ids vector, logprobs vector, sampled token rank) + list of (top token ids vector, logprobs vector, sampled token rank) Python lists tuples; in each tuple the logprobs and top token ids vectors have the same length which is either `num_logprobs` or `num_logprobs+1`. Sampled token rank is the rank (index+1) of the @@ -253,7 +253,7 @@ def generate_dummy_sample_logprobs( def generate_dummy_prompt_logprobs_tensors( - prompt_tokens_list: List, + prompt_tokens_list: list, num_logprobs: int, tokenizer: PreTrainedTokenizer, ) -> LogprobsTensors: @@ -269,7 +269,7 @@ def generate_dummy_prompt_logprobs_tensors( tokenizer: model tokenizer to use for detokenization Returns - Single Tuple of (logprobs matrix, top token ids matrix) torch Tensor, + Single tuple of (logprobs matrix, top token ids matrix) torch Tensor, where both matrices have dimensions num_prompt_tokens x num_logprobs """ @@ -301,19 +301,19 @@ class DummyOutputProcessorTestVectors: tokenizer: GeneralTokenizerType tokenizer_group: BaseTokenizerGroup vllm_config: EngineArgs - full_tokens: List[List[int]] # Prompt + generated tokens - prompt_tokens: List[List[int]] - generation_tokens: List[List[int]] + full_tokens: list[list[int]] # Prompt + generated tokens + prompt_tokens: list[list[int]] + generation_tokens: list[list[int]] # Each request is associated with a tuple of # (top tokens, top logprobs, ranks) prompt logprobs tensors - prompt_logprobs: List[LogprobsTensors] + prompt_logprobs: list[LogprobsTensors] # Each request is associated with a sample logprobs; a request's # sample logprobs are a list of (top tokens, top logprobs, ranks) # sample logprobs tensors at each sequence position - generation_logprobs: List[List[Tuple[List[int], List[float], int]]] - prompt_strings: List[str] - prompt_strings_len: List[int] - generation_strings: List[str] + generation_logprobs: list[list[tuple[list[int], list[float], int]]] + prompt_strings: list[str] + prompt_strings_len: list[int] + generation_strings: list[str] class MockEngineCore: @@ -321,18 +321,18 @@ class MockEngineCore: def __init__( self, - tokens_list: List[List[int]], + tokens_list: list[list[int]], # For each request, for each sampled token offset, # a tuple of # (list of topk token ids, list of sample logprob vals, rank) - generated_logprobs_raw: Optional[List[List[Tuple[List[int], - List[float], + generated_logprobs_raw: Optional[list[list[tuple[list[int], + list[float], int]]]] = None, # For each request, a tuple of # (prompt logprob val matrix, prompt logprob tok id matrix); # each matrix has dimensions # (num prompt toks) x (num prompt logprobs+1) - prompt_logprobs_raw: Optional[List[LogprobsTensors]] = None, + prompt_logprobs_raw: Optional[list[LogprobsTensors]] = None, ) -> None: 
self.tokens_list = tokens_list self.current_idx = 0 @@ -341,7 +341,7 @@ class MockEngineCore: self.prompt_logprobs_raw = prompt_logprobs_raw self.do_prompt_logprobs = prompt_logprobs_raw is not None - def get_outputs(self) -> List[EngineCoreOutput]: + def get_outputs(self) -> list[EngineCoreOutput]: do_logprobs = self.do_logprobs do_prompt_logprobs = self.do_prompt_logprobs token_idx = self.current_idx diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 35e059cc..171c8417 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Dict, List, Optional +from typing import Optional import openai # use the official client for correctness check import pytest @@ -193,7 +193,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]): - params: Dict = { + params: dict = { "prompt": ["A robot may not injure another robot", "My name is"], "model": model_name, } @@ -237,7 +237,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks: List[str] = [] + chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -278,7 +278,7 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI, num_completions = len(completion.choices) assert num_completions == n, ( f"Num completions {num_completions} but expected {n}.") - completion_repeats: Dict[str, int] = {} + completion_repeats: dict[str, int] = {} for idx, choice in enumerate(completion.choices): # Assert correct completion index & some finish reason. assert choice.index == idx, ( @@ -321,7 +321,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.95, stream=True, seed=42) - chunks: List[List[str]] = [[] for i in range(n)] + chunks: list[list[str]] = [[] for i in range(n)] finish_reason_count = 0 async for chunk in stream: index = chunk.choices[0].index @@ -332,7 +332,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): # Assert `n` completions with correct finish reasons assert finish_reason_count == n, ( f"Expected {n} completions with valid indices and finish_reason.") - completion_repeats: Dict[str, int] = {} + completion_repeats: dict[str, int] = {} for chunk in chunks: chunk_len = len(chunk) # Assert correct number of completion tokens diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index a26a8c4e..d564a8c2 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import List, Tuple import pytest import torch @@ -46,8 +45,8 @@ def hf_model(hf_runner): def _repeat_logprob_config( test_prompts, - logprob_prompt_logprob_list: List[Tuple], -) -> List[Tuple]: + logprob_prompt_logprob_list: list[tuple], +) -> list[tuple]: """Ensure each test prompt has a logprob config. A logprob config specifies the optional (i.e. 
@@ -74,7 +73,7 @@ def _repeat_logprob_config( tuples Returns: - List of + list of (optional num sample logprob,optional num prompt logprob) tuples which is either identical to `logprob_prompt_logprob_list`, or else repeats @@ -177,7 +176,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( for r in range(1, num_top_logprobs + 1)) output_text = vllm_result.outputs[0].text - output_string_from_most_likely_tokens_lst: List[str] = [] + output_string_from_most_likely_tokens_lst: list[str] = [] for top_logprobs in vllm_result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) output_string_from_most_likely_tokens_lst.append( diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index f00585b4..b1862455 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List import pytest import torch @@ -13,7 +12,7 @@ def sampler(): return RejectionSampler() -def create_logits_tensor(token_ids: List[int], +def create_logits_tensor(token_ids: list[int], vocab_size: int = 100) -> torch.Tensor: """Helper function to create logits tensor that will produce desired token ids on argmax""" @@ -23,7 +22,7 @@ def create_logits_tensor(token_ids: List[int], return logits -def create_sampling_metadata(spec_tokens: List[List[int]]) -> SamplingMetadata: +def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: batch_size = len(spec_tokens) return SamplingMetadata( temperature=torch.tensor([]), @@ -106,7 +105,7 @@ def test_single_token_sequence(sampler): def test_empty_sequence(sampler): """Test handling empty sequence of speculated tokens""" - spec_tokens: List[List[int]] = [[]] + spec_tokens: list[list[int]] = [[]] output_tokens = [5] # Just the bonus token metadata = create_sampling_metadata(spec_tokens) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 435c1b7b..b702d9ed 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Set, Tuple +from typing import Optional import numpy as np import pytest @@ -32,7 +32,7 @@ def _create_penalty_tensor(batch_size: int, penalty_value: float, def _create_prompt_tokens_tensor( - prompt_token_ids: List[List[int]], + prompt_token_ids: list[list[int]], vocab_size: int, device: torch.device, ) -> torch.Tensor: @@ -49,8 +49,8 @@ def _create_logit_bias( batch_size: int, vocab_size: int, bias_value: float, -) -> List[Optional[Dict[int, float]]]: - res: List[Optional[Dict[int, float]]] = [] +) -> list[Optional[dict[int, float]]]: + res: list[Optional[dict[int, float]]] = [] for i in range(batch_size): logit_bias = {min(i, vocab_size - 1): bias_value} res.append(logit_bias) @@ -83,8 +83,8 @@ def _create_default_sampling_metadata( vocab_size: int, device: torch.device, ) -> SamplingMetadata: - output_token_ids: List[List[int]] = [] - prompt_token_ids: List[List[int]] = [] + output_token_ids: list[list[int]] = [] + prompt_token_ids: list[list[int]] = [] for _ in range(batch_size): output_token_ids.append( np.random.randint(0, vocab_size, size=num_output_tokens).tolist()) @@ -118,8 +118,8 @@ def _create_default_sampling_metadata( def _generate_min_token_penalties_and_stop_tokens( num_output_tokens: int, batch_size: int, vocab_size: int, - batch_indices_for_min_token_penalty: List[int] -) -> Dict[int, Tuple[int, Set[int]]]: + 
batch_indices_for_min_token_penalty: list[int] +) -> dict[int, tuple[int, set[int]]]: """ Generates and returns a dict of minimum token penalties and corresponding stop token IDs (`min_tokens`, `stop_token_ids`) for each @@ -130,7 +130,7 @@ def _generate_min_token_penalties_and_stop_tokens( and a random set of stop token IDs is created. Otherwise, a lower `min_tokens` value is assigned, and the stop token IDs set is empty. """ - min_tokens: Dict[int, Tuple[int, Set[int]]] = {} + min_tokens: dict[int, tuple[int, set[int]]] = {} for index in range(batch_size): if index in batch_indices_for_min_token_penalty: min_tokens[index] = ( @@ -147,7 +147,7 @@ def _generate_min_token_penalties_and_stop_tokens( def _create_weighted_output_token_list( batch_size: int, - vocab_size: int) -> Tuple[List[List[int]], List[List[int]]]: + vocab_size: int) -> tuple[list[list[int]], list[list[int]]]: """ Creates an output token list where each token occurs a distinct number of times. @@ -157,7 +157,7 @@ def _create_weighted_output_token_list( list, each with a different frequency. Returns: - Tuple[List[List[int]], List[List[int]]]: + tuple[list[list[int]], list[list[int]]]: - The first element is the output token list, where each sublist corresponds to a batch and contains tokens with weighted frequencies. @@ -165,8 +165,8 @@ def _create_weighted_output_token_list( batch, ordered by their frequency in the corresponding output list. """ - output_token_ids: List[List[int]] = [] - sorted_token_ids_in_output: List[List[int]] = [] + output_token_ids: list[list[int]] = [] + sorted_token_ids_in_output: list[list[int]] = [] for _ in range(batch_size): distinct_token_ids = np.random.choice(vocab_size, size=np.random.randint(1, 10), diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py index e1465b12..c69d0d49 100644 --- a/tests/v1/sample/utils.py +++ b/tests/v1/sample/utils.py @@ -1,12 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import List, Tuple from vllm import CompletionOutput -def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: +def get_test_batch(batch_logprobs_composition: str) -> list[tuple]: """Generate logprobs configs for a batch of requests A given request's logprobs configuration is (1) num_sample_logprobs and (2) @@ -32,7 +31,7 @@ def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: Returns: - List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + list of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) tuples """ if batch_logprobs_composition == "NONE": diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index 9b669ae0..b68f0838 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import torch from vllm.v1.utils import bind_kv_cache @@ -22,7 +20,7 @@ def test_bind_kv_cache(): 'layers.2.self_attn': torch.zeros((1, )), 'layers.3.self_attn': torch.zeros((1, )), } - runner_kv_caches: List[torch.Tensor] = [] + runner_kv_caches: list[torch.Tensor] = [] bind_kv_cache(kv_cache, ctx, runner_kv_caches) assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[ 'layers.0.self_attn'] @@ -52,7 +50,7 @@ def test_bind_kv_cache_non_attention(): 'model.layers.28.attn': torch.zeros((1, )), } - runner_kv_caches: List[torch.Tensor] = [] + runner_kv_caches: list[torch.Tensor] = [] bind_kv_cache(kv_cache, ctx, runner_kv_caches) assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[ diff --git 
a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 327370e7..72ec7370 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Set, Tuple +from typing import Optional import numpy as np import pytest @@ -22,22 +22,22 @@ MAX_NUM_PROMPT_TOKENS = 64 def _remove_requests( input_batch: InputBatch, batch_size: int, - reqs: List[CachedRequestState]) -> Tuple[Set[str], List[int]]: + reqs: list[CachedRequestState]) -> tuple[set[str], list[int]]: """ - Remove some requests randomly from the batch and returns a Tuple + Remove some requests randomly from the batch and returns a tuple of 1) set of request removed 2) indices of the requests removed ordered in descending order """ num_reqs_to_remove = np.random.randint(0, batch_size) - req_indices_to_remove: Set[int] = set() + req_indices_to_remove: set[int] = set() for _ in range(num_reqs_to_remove): req_index_to_remove = np.random.randint(0, batch_size) req_indices_to_remove.add(req_index_to_remove) req_indices_to_remove_list = list(req_indices_to_remove) req_indices_to_remove_list.sort(reverse=True) - req_ids_to_remove: Set[str] = set() + req_ids_to_remove: set[str] = set() for index in req_indices_to_remove: input_batch.remove_request(reqs[index].req_id) req_ids_to_remove.add(reqs[index].req_id) @@ -45,9 +45,9 @@ def _remove_requests( def _construct_expected_sampling_metadata( - reqs: List[CachedRequestState], - req_ids_retained: Set[int], - req_id_index_in_input_batch: Dict[str, int], + reqs: list[CachedRequestState], + req_ids_retained: set[int], + req_id_index_in_input_batch: dict[str, int], device: torch.device, ) -> SamplingMetadata: """ @@ -55,8 +55,8 @@ def _construct_expected_sampling_metadata( batch. 
""" num_reqs = len(req_ids_retained) - output_token_ids: List[List[int]] = [list() for _ in range(num_reqs)] - prompt_token_ids: List[List[int]] = [list() for _ in range(num_reqs)] + output_token_ids: list[list[int]] = [list() for _ in range(num_reqs)] + prompt_token_ids: list[list[int]] = [list() for _ in range(num_reqs)] presence_penalties = [0.0 for _ in range(num_reqs)] frequency_penalties = [0.0 for _ in range(num_reqs)] repetition_penalties = [1.0 for _ in range(num_reqs)] @@ -191,7 +191,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): pin_memory=is_pin_memory_available(), vocab_size=1024, ) - reqs: List[CachedRequestState] = [] + reqs: list[CachedRequestState] = [] req_id_reqs = {} req_id_output_token_ids = {} # Add requests diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index 392fd270..3b25980c 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -4,7 +4,8 @@ import contextlib import dataclasses import sys import traceback -from typing import Callable, Generator +from collections.abc import Generator +from typing import Callable @dataclasses.dataclass diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py index 44d45f26..27077f13 100644 --- a/tests/vllm_test_utils/vllm_test_utils/monitor.py +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -4,7 +4,8 @@ import contextlib import dataclasses import sys import traceback -from typing import Callable, Generator, Generic, TypeVar +from collections.abc import Generator +from typing import Callable, Generic, TypeVar _T = TypeVar("_T") diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 0ce0465a..3e237aac 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import List import pytest import torch @@ -43,7 +42,7 @@ def test_empty_seq_group(): enable_chunked_prefill=False, enforce_eager=True, ) - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) ( @@ -103,9 +102,9 @@ def test_prepare_prompt(batch_size): enforce_eager=True, ) - seq_lens: List[int] = [] - encoder_seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + encoder_seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} cross_block_table = [2] for i in range(batch_size): @@ -295,9 +294,9 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): enforce_eager=True, ) - seq_lens: List[int] = [] - encoder_seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + encoder_seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = { 0: [1], 1: [3] @@ -503,9 +502,9 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): } if multiple_seqs_per_seq_group else { 0: [1] } - seq_lens: List[int] = [] - encoder_seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + encoder_seq_lens: list[int] = [] + seq_group_metadata_list: 
list[SequenceGroupMetadata] = [] cross_block_table = [2] expanded_batch_size = 0 diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index eb341fb1..a41fc521 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import dataclasses -from typing import List, Tuple, Type import torch @@ -27,15 +26,15 @@ class MockAttentionBackend(AttentionBackend): raise NotImplementedError @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return AttentionMetadata @staticmethod - def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + def get_builder_cls() -> type["AttentionMetadataBuilder"]: return AttentionMetadataBuilder @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -44,7 +43,7 @@ class MockAttentionBackend(AttentionBackend): block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: raise NotImplementedError @staticmethod @@ -57,7 +56,7 @@ class MockAttentionBackend(AttentionBackend): @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: pass diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 3f9a0d6f..b8ba69b0 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch @@ -42,8 +40,8 @@ def test_prepare_prompt(batch_size): enable_chunked_prefill=False, ) - seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} for i in range(batch_size): # make sure all tokens fit into one block @@ -159,8 +157,8 @@ def test_prepare_decode_cuda_graph(batch_size): enable_chunked_prefill=False, ) - context_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + context_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] # Assume each seq group finishes prefill. for i in range(batch_size): # make sure all tokens fit into one block @@ -265,7 +263,7 @@ def test_empty_seq_group(): dtype="float16", enforce_eager=False, ) - seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) input_tokens, input_positions, attn_metadata = ( @@ -315,10 +313,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ) # Add prefill requests. 
- seq_lens: List[int] = [] - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - prefill_metadata_list: List[SequenceGroupMetadata] = [] - decode_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: list[int] = [] + seq_group_metadata_list: list[SequenceGroupMetadata] = [] + prefill_metadata_list: list[SequenceGroupMetadata] = [] + decode_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index adbb7301..9601b578 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -2,13 +2,12 @@ import argparse import json -from typing import Dict from vllm.profiler.layerwise_profile import ModelStatsEntry, SummaryStatsEntry from vllm.profiler.utils import TablePrinter, indent_string -def flatten_entries(entry_cls, profile_dict: Dict): +def flatten_entries(entry_cls, profile_dict: dict): entries_and_depth = [] def get_entries(node, curr_depth=0): diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index c527cdbe..8ec3dfc9 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -6,7 +6,7 @@ import json import math import os from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Any, Optional import matplotlib.pyplot as plt import pandas as pd @@ -24,7 +24,7 @@ def largest_dist_from_leaf(node: dict, depth: int = 0): def get_entries_at_depth(depth: int, - entries_and_traces: List[Tuple[Any, Any]], + entries_and_traces: list[tuple[Any, Any]], node: dict, curr_depth: int = 0, trace=()): @@ -48,9 +48,9 @@ def get_entries_at_depth(depth: int, trace=trace) -def fold_nodes(root: dict, nodes_to_fold: List[str]): +def fold_nodes(root: dict, nodes_to_fold: list[str]): - stack: List[dict] = [root] + stack: list[dict] = [root] while len(stack) != 0: node = stack.pop() if node['entry']['name'] in nodes_to_fold: @@ -427,12 +427,12 @@ def main( plot_metric: str, make_names_unique: bool, top_k: int, - json_nodes_to_fold: List[str]): + json_nodes_to_fold: list[str]): - def prepare_data(profile_json: dict, step_keys: List[str]) -> pd.DataFrame: + def prepare_data(profile_json: dict, step_keys: list[str]) -> pd.DataFrame: def get_entries_and_traces(key: str): - entries_and_traces: List[Tuple[Any, Any]] = [] + entries_and_traces: list[tuple[Any, Any]] = [] for root in profile_json[key]["summary_stats"]: # Fold nodes in the traces as per user request. i.e. simply # make the requested nodes leaf-nodes. 
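The hunks above are all mechanical applications of one rewrite, so a single before/after sketch covers them; the function and variable names below are illustrative only and do not come from this repository. Deprecated aliases such as List, Dict, Tuple, Set and Type become the builtin generics from PEP 585, abstract container types such as Iterable and Generator move to collections.abc, and Optional, Union and Callable stay in typing because the patch does not switch to PEP 604 syntax.

# Before (Python 3.8-era aliases):
#   from typing import Dict, Generator, List, Optional, Tuple
#
#   def token_counts(batches: List[List[int]]) -> Dict[int, int]: ...
#   def chunk(ids: List[int], size: Optional[int] = None
#             ) -> Generator[Tuple[int, ...], None, None]: ...

# After (builtin generics + collections.abc), runnable on Python 3.9+:
from collections.abc import Generator
from typing import Optional


def token_counts(batches: list[list[int]]) -> dict[int, int]:
    """Count token-id occurrences across a batch of token-id lists."""
    counts: dict[int, int] = {}
    for batch in batches:
        for tok in batch:
            counts[tok] = counts.get(tok, 0) + 1
    return counts


def chunk(ids: list[int],
          size: Optional[int] = None) -> Generator[tuple[int, ...], None, None]:
    """Yield the ids in fixed-size tuples (the last chunk may be shorter)."""
    step = size or len(ids) or 1
    for start in range(0, len(ids), step):
        yield tuple(ids[start:start + step])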
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 373f92a5..3c822028 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2,7 +2,7 @@ import contextlib import importlib -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import torch import torch.library @@ -198,7 +198,7 @@ def rms_norm_dynamic_per_token_quant( quant_dtype: torch.dtype, scale_ub: Optional[torch.Tensor] = None, residual: Optional[torch.Tensor] = None -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=quant_dtype) scales = torch.empty((input.numel() // input.shape[-1], 1), device=input.device, @@ -347,7 +347,7 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): @register_fake("_C::aqlm_gemm") def _aqlm_gemm_fake(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: List[int], + codebook_partition_sizes: list[int], bias: Optional[torch.Tensor]) -> torch.Tensor: out_features = codes.size(0) * codebooks.size(2) flat_input = input.reshape((-1, input.size(-1))) @@ -363,7 +363,7 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): @register_fake("_C::aqlm_dequant") def _aqlm_dequant_fake( codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: List[int]) -> torch.Tensor: + codebook_partition_sizes: list[int]) -> torch.Tensor: in_features = codes.size(1) * 8 out_features = codes.size(0) return torch.empty((out_features, in_features), @@ -554,7 +554,7 @@ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool: def cutlass_sparse_compress(a: torch.Tensor) \ - -> Tuple[torch.Tensor, torch.Tensor]: + -> tuple[torch.Tensor, torch.Tensor]: """ Compresses a sparse matrix for use with Cutlass sparse operations. @@ -571,7 +571,7 @@ def cutlass_sparse_compress(a: torch.Tensor) \ - `torch.float16` Returns: - Tuple[torch.Tensor, torch.Tensor]: + tuple[torch.Tensor, torch.Tensor]: A tuple containing: - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`. - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation. 
@@ -646,14 +646,14 @@ def cutlass_scaled_sparse_mm( # aqlm def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, - codebook_partition_sizes: List[int], + codebook_partition_sizes: list[int], bias: Optional[torch.Tensor]) -> torch.Tensor: return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, codebook_partition_sizes, bias) def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, - codebook_partition_sizes: List[int]) -> torch.Tensor: + codebook_partition_sizes: list[int]) -> torch.Tensor: return torch.ops._C.aqlm_dequant(codes, codebooks, codebook_partition_sizes) @@ -738,7 +738,7 @@ def machete_supported_schedules( group_zeros_type: Optional[torch.dtype] = None, channel_scales_type: Optional[torch.dtype] = None, token_scales_type: Optional[torch.dtype] = None, - out_type: Optional[torch.dtype] = None) -> List[str]: + out_type: Optional[torch.dtype] = None) -> list[str]: return torch.ops._C.machete_supported_schedules( a_type, b_type.id, group_scales_type, group_zeros_type, channel_scales_type, token_scales_type, out_type) @@ -783,7 +783,7 @@ def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor: # fp4 def scaled_fp4_quant( input: torch.Tensor, - input_global_scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + input_global_scale: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP4 and return quantized tensor and scale. @@ -798,7 +798,7 @@ def scaled_fp4_quant( input_global_scale: A scalar scaling factor for the entire tensor. Returns: - Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every + tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every two values are packed into a uint8 and float8_e4m3 scaling factors in the sizzled layout. """ @@ -845,7 +845,7 @@ def scaled_fp8_quant( num_token_padding: Optional[int] = None, scale_ub: Optional[torch.Tensor] = None, use_per_token_if_dynamic: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP8 and return quantized tensor and scale. @@ -866,12 +866,12 @@ def scaled_fp8_quant( in the dynamic quantization case. Returns: - Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and scaling factor. """ # This code assumes batch_dim and num_tokens are flattened assert (input.ndim == 2) - shape: Union[Tuple[int, int], torch.Size] = input.shape + shape: Union[tuple[int, int], torch.Size] = input.shape # For rocm, the output fp8 dtype is torch.float_e3m3fnuz out_dtype: torch.dtype = torch.float8_e4m3fnuz \ if current_platform.is_rocm() else torch.float8_e4m3fn @@ -903,7 +903,7 @@ def allspark_repack_weight( scale: torch.Tensor, zero_point: Optional[torch.Tensor] = None, has_zp: bool = False -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Rearrange qweight, scale, and zero_point(if asymmetric) to n32k16 format for Ampere W8A16 Fused Gemm kernel @@ -917,7 +917,7 @@ def allspark_repack_weight( if use asymmetric quantization, has_zp = True. Returns: - Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : rearranged weight, scale, and optionally zero_point. 
""" K = qweight.shape[0] @@ -964,7 +964,7 @@ def scaled_int8_quant( scale: Optional[torch.Tensor] = None, azp: Optional[torch.Tensor] = None, symmetric: bool = True -) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp. @@ -977,7 +977,7 @@ def scaled_int8_quant( symmetric: Whether to use symmetric quantization (scale only, azp ignored). Returns: - Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. """ output = torch.empty_like(input, dtype=torch.int8) if scale is not None: @@ -1165,13 +1165,13 @@ def concat_and_cache_mla( scale) -def copy_blocks(key_caches: List[torch.Tensor], - value_caches: List[torch.Tensor], +def copy_blocks(key_caches: list[torch.Tensor], + value_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) -def copy_blocks_mla(kv_caches: List[torch.Tensor], +def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) @@ -1209,7 +1209,7 @@ def get_max_shared_memory_per_block_device_attribute(device: int) -> int: # custom ar -def init_custom_ar(ipc_tensors: List[torch.Tensor], rank_data: torch.Tensor, +def init_custom_ar(ipc_tensors: list[torch.Tensor], rank_data: torch.Tensor, rank: int, full_nvlink: bool) -> int: return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink) @@ -1229,16 +1229,16 @@ def meta_size() -> int: return torch.ops._C_custom_ar.meta_size() -def register_buffer(fa: int, ipc_tensors: List[int]) -> None: +def register_buffer(fa: int, ipc_tensors: list[int]) -> None: return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors) -def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]: +def get_graph_buffer_ipc_meta(fa: int) -> tuple[list[int], list[int]]: return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa) -def register_graph_buffers(fa: int, handles: List[List[int]], - offsets: List[List[int]]) -> None: +def register_graph_buffers(fa: int, handles: list[list[int]], + offsets: list[list[int]]) -> None: torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) @@ -1246,7 +1246,7 @@ def get_flash_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, num_heads_k: int, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Arguments: cache_seqlens: (batch_size), dtype torch.int32. @@ -1272,7 +1272,7 @@ def flash_mla_with_kvcache( num_splits: torch.Tensor, softmax_scale: Optional[float] = None, causal: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: """ Arguments: q: (batch_size, seq_len_q, num_heads_q, head_dim). 
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index ccb67baa..a7b909d2 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Tuple +from typing import Optional import torch @@ -18,7 +18,7 @@ class ipex_ops: @staticmethod def _reshape_activation_tensor( - x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: num = x.size(0) d = x.size(1) // 2 x = x.reshape(num, 2, d) @@ -213,8 +213,8 @@ class ipex_ops: key, value, key_cache, value_cache, slot_mapping) @staticmethod - def copy_blocks(key_caches: List[torch.Tensor], - value_caches: List[torch.Tensor], + def copy_blocks(key_caches: list[torch.Tensor], + value_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: torch.xpu.copy_blocks( # type: ignore key_caches, diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 97b2b630..5d4ebdb7 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union from vllm.sequence import Logprob @@ -17,14 +17,14 @@ class BeamSearchSequence: about to be returned to the user. """ # The tokens includes the prompt. - tokens: List[int] - logprobs: List[Dict[int, Logprob]] + tokens: list[int] + logprobs: list[dict[int, Logprob]] cum_logprob: float = 0.0 text: Optional[str] = None finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None multi_modal_data: Optional["MultiModalDataDict"] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None + mm_processor_kwargs: Optional[dict[str, Any]] = None @dataclass @@ -33,20 +33,20 @@ class BeamSearchOutput: It contains the list of the best beam search sequences. The length of the list is equal to the beam width. 
""" - sequences: List[BeamSearchSequence] + sequences: list[BeamSearchSequence] class BeamSearchInstance: - def __init__(self, prompt_tokens: List[int]): - self.beams: List[BeamSearchSequence] = [ + def __init__(self, prompt_tokens: list[int]): + self.beams: list[BeamSearchSequence] = [ BeamSearchSequence(tokens=prompt_tokens, logprobs=[]) ] - self.completed: List[BeamSearchSequence] = [] + self.completed: list[BeamSearchSequence] = [] def get_beam_search_score( - tokens: List[int], + tokens: list[int], cumulative_logprob: float, eos_token_id: int, length_penalty: float = 1.0, diff --git a/vllm/config.py b/vllm/config.py index 54ed3841..f87d2d6e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -7,13 +7,14 @@ import hashlib import json import sys import warnings +from collections import Counter +from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field, replace from importlib.util import find_spec from pathlib import Path -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, - Final, List, Literal, Mapping, Optional, Protocol, Set, - Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, + Optional, Protocol, Union) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -67,20 +68,20 @@ _ResolvedTask = Literal["generate", "embed", "classify", "score", "reward", RunnerType = Literal["generate", "pooling", "draft", "transcription"] -_RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { +_RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = { "generate": ["generate"], "pooling": ["embed", "classify", "score", "reward"], "draft": ["draft"], "transcription": ["transcription"], } -_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { +_TASK_RUNNER: dict[_ResolvedTask, RunnerType] = { task: runner for runner, tasks in _RUNNER_TASKS.items() for task in tasks } -HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], +HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] @@ -92,7 +93,7 @@ class SupportsHash(Protocol): class SupportsMetricsInfo(Protocol): - def metrics_info(self) -> Dict[str, str]: + def metrics_info(self) -> dict[str, str]: ... @@ -209,7 +210,7 @@ class ModelConfig: excluding anything before input ids/embeddings and after the final hidden states. 
""" - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.model) factors.append(self.dtype) factors.append(self.quantization) @@ -233,7 +234,7 @@ class ModelConfig: allowed_local_media_path: str = "", revision: Optional[str] = None, code_revision: Optional[str] = None, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, rope_theta: Optional[float] = None, tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, @@ -244,19 +245,19 @@ class ModelConfig: max_logprobs: int = 20, disable_sliding_window: bool = False, skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, + served_model_name: Optional[Union[str, list[str]]] = None, limit_mm_per_prompt: Optional[Mapping[str, int]] = None, use_async_output_proc: bool = True, config_format: ConfigFormat = ConfigFormat.AUTO, hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, disable_mm_preprocessor_cache: bool = False, - override_neuron_config: Optional[Dict[str, Any]] = None, + override_neuron_config: Optional[dict[str, Any]] = None, override_pooler_config: Optional["PoolerConfig"] = None, logits_processor_pattern: Optional[str] = None, generation_config: Optional[str] = None, enable_sleep_mode: bool = False, - override_generation_config: Optional[Dict[str, Any]] = None, + override_generation_config: Optional[dict[str, Any]] = None, model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: self.model = model @@ -283,7 +284,7 @@ class ModelConfig: hf_overrides_fn = None if rope_scaling is not None: - hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} + hf_override: dict[str, Any] = {"rope_scaling": rope_scaling} hf_overrides_kw.update(hf_override) msg = ("`--rope-scaling` will be removed in a future release. 
" f"'Please instead use `--hf-overrides '{hf_override!r}'`") @@ -505,8 +506,8 @@ class ModelConfig: def _get_preferred_task( self, - architectures: List[str], - supported_tasks: Set[_ResolvedTask], + architectures: list[str], + supported_tasks: set[_ResolvedTask], ) -> Optional[_ResolvedTask]: model_id = self.model if get_pooling_config(model_id, self.revision): @@ -516,7 +517,7 @@ class ModelConfig: if self.registry.is_transcription_model(architectures): return "transcription" - suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ + suffix_to_preferred_task: list[tuple[str, _ResolvedTask]] = [ # Other models follow this pattern ("ForCausalLM", "generate"), ("ForConditionalGeneration", "generate"), @@ -537,27 +538,27 @@ class ModelConfig: def _resolve_task( self, task_option: Union[TaskOption, Literal["draft"]], - ) -> Tuple[Set[_ResolvedTask], _ResolvedTask]: + ) -> tuple[set[_ResolvedTask], _ResolvedTask]: if task_option == "draft": return {"draft"}, "draft" registry = self.registry architectures = self.architectures - runner_support: Dict[RunnerType, bool] = { + runner_support: dict[RunnerType, bool] = { # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "transcription": registry.is_transcription_model(architectures), "generate": registry.is_text_generation_model(architectures), "pooling": registry.is_pooling_model(architectures), } - supported_runner_types_lst: List[RunnerType] = [ + supported_runner_types_lst: list[RunnerType] = [ runner_type for runner_type, is_supported in runner_support.items() if is_supported ] - supported_tasks_lst: List[_ResolvedTask] = [ + supported_tasks_lst: list[_ResolvedTask] = [ task for runner_type in supported_runner_types_lst for task in _RUNNER_TASKS[runner_type] ] @@ -767,7 +768,7 @@ class ModelConfig: self.use_async_output_proc = False def get_hf_config_sliding_window( - self) -> Union[Optional[int], List[Optional[int]]]: + self) -> Union[Optional[int], list[Optional[int]]]: """Get the sliding window size, or None if disabled.""" # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in @@ -778,7 +779,7 @@ class ModelConfig: return None return getattr(self.hf_text_config, "sliding_window", None) - def get_sliding_window(self) -> Optional[Union[int, List[Optional[int]]]]: + def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: """Get the sliding window size, or None if disabled. """ # If user disables sliding window, return None. 
@@ -888,7 +889,7 @@ class ModelConfig: return num_heads // parallel_config.tensor_parallel_size def get_layers_start_end_indices( - self, parallel_config: "ParallelConfig") -> Tuple[int, int]: + self, parallel_config: "ParallelConfig") -> tuple[int, int]: from vllm.distributed.utils import get_pp_indices if self.hf_text_config.model_type == "deepseek_mtp": total_num_hidden_layers = getattr(self.hf_text_config, @@ -949,7 +950,7 @@ class ModelConfig: return self.multimodal_config - def try_get_generation_config(self) -> Dict[str, Any]: + def try_get_generation_config(self) -> dict[str, Any]: if self.generation_config is None or self.generation_config == "auto": config = try_get_generation_config( self.hf_config_path or self.model, @@ -967,7 +968,7 @@ class ModelConfig: return config.to_diff_dict() - def get_diff_sampling_param(self) -> Dict[str, Any]: + def get_diff_sampling_param(self) -> dict[str, Any]: """ This method returns a dictionary containing the parameters that differ from the default sampling parameters, but only @@ -975,7 +976,7 @@ class ModelConfig: set, an empty dictionary is returned. Returns: - Dict[str, Any]: A dictionary with the differing sampling + dict[str, Any]: A dictionary with the differing sampling parameters if `generation_config` is set, otherwise an empty dictionary. """ @@ -1032,7 +1033,7 @@ class ModelConfig: return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE @property - def supported_runner_types(self) -> Set[RunnerType]: + def supported_runner_types(self) -> set[RunnerType]: return {_TASK_RUNNER[task] for task in self.supported_tasks} @property @@ -1075,7 +1076,7 @@ class CacheConfig: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.cache_dtype) # `cpu_offload_gb` does not use `torch.compile` yet. hash_str = hashlib.md5(str(factors).encode()).hexdigest() @@ -1183,7 +1184,7 @@ class TokenizerPoolConfig: pool type. """ pool_size: int - pool_type: Union[str, Type["BaseTokenizerGroup"]] + pool_type: Union[str, type["BaseTokenizerGroup"]] extra_config: dict def compute_hash(self) -> str: @@ -1200,7 +1201,7 @@ class TokenizerPoolConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1214,7 +1215,7 @@ class TokenizerPoolConfig: @classmethod def create_config( cls, tokenizer_pool_size: int, - tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]], + tokenizer_pool_type: Union[str, type["BaseTokenizerGroup"]], tokenizer_pool_extra_config: Optional[Union[str, dict]] ) -> Optional["TokenizerPoolConfig"]: """Create a TokenizerPoolConfig from the given parameters. @@ -1285,7 +1286,7 @@ class LoadConfig: download_dir: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) - ignore_patterns: Optional[Union[List[str], str]] = None + ignore_patterns: Optional[Union[list[str], str]] = None def compute_hash(self) -> str: """ @@ -1301,7 +1302,7 @@ class LoadConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1359,7 +1360,7 @@ class ParallelConfig: # to "ray" if Ray is installed and fail otherwise. Note that tpu # and hpu only support Ray for distributed inference. 
distributed_executor_backend: Optional[Union[str, - Type["ExecutorBase"]]] = None + type["ExecutorBase"]]] = None # the full name of the worker class to use. If "auto", the worker class # will be determined based on the platform. @@ -1423,7 +1424,7 @@ class ParallelConfig: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.pipeline_parallel_size) factors.append(self.tensor_parallel_size) return hashlib.sha256(str(factors).encode()).hexdigest() @@ -1600,7 +1601,7 @@ class SchedulerConfig: # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) # or "mod.custom_class". - scheduler_cls: Union[str, Type[object]] = "vllm.core.scheduler.Scheduler" + scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" def compute_hash(self) -> str: """ @@ -1616,7 +1617,7 @@ class SchedulerConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1752,7 +1753,7 @@ class DeviceConfig: # no factors to consider. # the device/platform information will be summarized # by torch/vllm automatically. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -1798,7 +1799,7 @@ class SpeculativeConfig: """ # no factors to consider. # spec decode does not use `torch.compile` yet. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2261,7 +2262,7 @@ class LoRAConfig: lora_extra_vocab_size: int = 256 # This is a constant. lora_vocab_padding_size: ClassVar[int] = 256 - long_lora_scaling_factors: Optional[Tuple[float]] = None + long_lora_scaling_factors: Optional[tuple[float]] = None bias_enabled: bool = False def compute_hash(self) -> str: @@ -2278,7 +2279,7 @@ class LoRAConfig: """ # no factors to consider. # LoRA is not compatible with `torch.compile` . - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2350,7 +2351,7 @@ class PromptAdapterConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2395,7 +2396,7 @@ class MultiModalConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2431,7 +2432,7 @@ class PoolerConfig: are returned. """ - returned_token_ids: Optional[List[int]] = None + returned_token_ids: Optional[list[int]] = None """ A list of indices for the vocabulary dimensions to be extracted, such as the token IDs of ``good_token`` and ``bad_token`` in the @@ -2452,7 +2453,7 @@ class PoolerConfig: """ # no factors to consider. # this config will not affect the computation graph. 
- factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2469,7 +2470,7 @@ _STR_DTYPE_TO_TORCH_DTYPE = { "bfloat16": torch.bfloat16, } -_ROCM_NOT_SUPPORTED_DTYPE: List[str] = [] # +_ROCM_NOT_SUPPORTED_DTYPE: list[str] = [] # def _get_and_verify_dtype( @@ -2558,7 +2559,7 @@ def _get_and_verify_max_len( hf_config: PretrainedConfig, max_model_len: Optional[int], disable_sliding_window: bool, - sliding_window_len: Optional[Union[int, List[Optional[int]]]], + sliding_window_len: Optional[Union[int, list[Optional[int]]]], spec_target_max_model_len: Optional[int] = None, encoder_config: Optional[Any] = None, ) -> int: @@ -2684,7 +2685,7 @@ def _get_and_verify_max_len( def get_min_sliding_window( - sliding_window: Union[int, List[Optional[int]]]) -> int: + sliding_window: Union[int, list[Optional[int]]]) -> int: if isinstance(sliding_window, list): return min(s for s in sliding_window if s is not None) @@ -2692,7 +2693,7 @@ def get_min_sliding_window( def get_served_model_name(model: str, - served_model_name: Optional[Union[str, List[str]]]): + served_model_name: Optional[Union[str, list[str]]]): """ If the input is a non-empty list, the first model_name in `served_model_name` is taken. @@ -2731,7 +2732,7 @@ class DecodingConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2774,7 +2775,7 @@ class ObservabilityConfig: """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2833,7 +2834,7 @@ class KVTransferConfig(BaseModel): """ # no factors to consider. # this config will not affect the computation graph. - factors: List[Any] = [] + factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode()).hexdigest() return hash_str @@ -2930,7 +2931,7 @@ class CompilationConfig(BaseModel): torch.compile will handle cudagraph capture logic in the future. - cudagraph_capture_sizes: sizes to capture cudagraph. - None (default): capture sizes are inferred from vllm config. - - List[int]: capture sizes are specified as given. + - list[int]: capture sizes are specified as given. - cudagraph_num_of_warmups: number of warmup runs for cudagraph. It means the first several runs will be treated as warmup runs. 
Only after that, the execution will be recorded, and the recorded @@ -2972,17 +2973,17 @@ class CompilationConfig(BaseModel): debug_dump_path: str = "" cache_dir: str = "" backend: str = "" - custom_ops: List[str] = Field(default_factory=list) - splitting_ops: List[str] = Field(default=None) # type: ignore + custom_ops: list[str] = Field(default_factory=list) + splitting_ops: list[str] = Field(default=None) # type: ignore use_inductor: bool = True - compile_sizes: Optional[List[Union[int, str]]] = Field(default=None) - inductor_compile_config: Dict = Field(default_factory=dict) - inductor_passes: Dict[str, str] = Field(default_factory=dict) + compile_sizes: Optional[list[Union[int, str]]] = Field(default=None) + inductor_compile_config: dict = Field(default_factory=dict) + inductor_passes: dict[str, str] = Field(default_factory=dict) use_cudagraph: bool = False cudagraph_num_of_warmups: int = 0 - cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_capture_sizes: Optional[list[int]] = None cudagraph_copy_inputs: bool = False class PassConfig(BaseModel): @@ -2998,7 +2999,7 @@ class CompilationConfig(BaseModel): - enable_noop: whether to enable the custom no-op elimination pass. TODO(luka) better pass enabling system. """ - dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_stages: list[str] = Field(default_factory=list) dump_graph_dir: Path = Field(default=Path(".")) enable_fusion: bool = True enable_noop: bool = True @@ -3026,20 +3027,20 @@ class CompilationConfig(BaseModel): max_capture_size: int = PrivateAttr local_cache_dir: str = PrivateAttr # local cache dir for each rank # optimization: - # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. + # Intuitively, bs_to_padded_graph_size should be dict[int, int]. # since we know all keys are in a range [0, max_capture_size], - # we can optimize it to List[int] for better lookup performance. - bs_to_padded_graph_size: List[int] = PrivateAttr + # we can optimize it to list[int] for better lookup performance. + bs_to_padded_graph_size: list[int] = PrivateAttr # keep track of enabled and disabled custom ops enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr - traced_files: Set[str] = PrivateAttr + traced_files: set[str] = PrivateAttr compilation_time: float = PrivateAttr # Per-model forward context # Map from layer name to the attention cls - static_forward_context: Dict[str, Any] = PrivateAttr + static_forward_context: dict[str, Any] = PrivateAttr def compute_hash(self) -> str: """ @@ -3053,7 +3054,7 @@ class CompilationConfig(BaseModel): excluding anything before input ids/embeddings and after the final hidden states. """ - factors: List[Any] = [] + factors: list[Any] = [] factors.append(self.level) factors.append(self.backend) factors.append(self.custom_ops) @@ -3150,7 +3151,7 @@ class CompilationConfig(BaseModel): return VllmBackend(vllm_config) def init_with_cudagraph_sizes(self, - cudagraph_capture_sizes: List[int]) -> None: + cudagraph_capture_sizes: list[int]) -> None: """To complete the initialization of config, we need to know the cudagraph sizes.""" @@ -3243,10 +3244,10 @@ class VllmConfig: excluding anything before input ids/embeddings and after the final hidden states. 
""" - factors: List[Any] = [] + factors: list[Any] = [] # summarize vllm config - vllm_factors: List[Any] = [] + vllm_factors: list[Any] = [] from vllm import __version__ vllm_factors.append(__version__) if self.model_config: diff --git a/vllm/connections.py b/vllm/connections.py index dc060bb6..2c259bb7 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Mapping, MutableMapping from pathlib import Path -from typing import Mapping, MutableMapping, Optional +from typing import Optional from urllib.parse import urlparse import aiohttp diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 28b8c847..c81ff958 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -10,7 +10,8 @@ import asyncio import json import ssl from argparse import Namespace -from typing import Any, AsyncGenerator, Optional +from collections.abc import AsyncGenerator +from typing import Any, Optional from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c50c631d..b05842dd 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -5,10 +5,11 @@ import codecs import json from abc import ABC, abstractmethod from collections import defaultdict, deque +from collections.abc import Awaitable, Iterable from functools import cache, lru_cache, partial from pathlib import Path -from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, - Literal, Optional, Tuple, TypeVar, Union, cast) +from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union, + cast) import jinja2.nodes import transformers.utils.chat_template_utils as hf_chat_utils @@ -117,7 +118,7 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): role: Required[str] """The role of the message's author.""" - content: Union[str, List[ChatCompletionContentPartParam]] + content: Union[str, list[ChatCompletionContentPartParam]] """The contents of the message.""" name: str @@ -143,7 +144,7 @@ class ConversationMessage(TypedDict, total=False): role: Required[str] """The role of the message's author.""" - content: Union[Optional[str], List[Dict[str, str]]] + content: Union[Optional[str], list[dict[str, str]]] """The contents of the message""" tool_call_id: Optional[str] @@ -495,13 +496,13 @@ class BaseMultiModalContentParser(ABC): super().__init__() # multimodal placeholder_string : count - self._placeholder_counts: Dict[str, int] = defaultdict(lambda: 0) + self._placeholder_counts: dict[str, int] = defaultdict(lambda: 0) def _add_placeholder(self, placeholder: Optional[str]): if placeholder: self._placeholder_counts[placeholder] += 1 - def mm_placeholder_counts(self) -> Dict[str, int]: + def mm_placeholder_counts(self) -> dict[str, int]: return dict(self._placeholder_counts) @abstractmethod @@ -652,12 +653,12 @@ def load_chat_template( # TODO: Let user specify how to insert multimodal tokens into prompt # (similar to chat template) -def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], +def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int], text_prompt: str) -> str: """Combine multimodal prompts for a multimodal language model.""" # Look through the text prompt to check for missing placeholders - missing_placeholders: List[str] = [] + missing_placeholders: list[str] = [] for placeholder in 
placeholder_counts: # For any existing placeholder in the text prompt, we leave it as is @@ -684,10 +685,10 @@ _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) -_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio] +_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio] # Define a mapping from part types to their corresponding parsing functions. -MM_PARSER_MAP: Dict[ +MM_PARSER_MAP: dict[ str, Callable[[ChatCompletionContentPartParam], _ContentPart], ] = { @@ -749,7 +750,7 @@ def _parse_chat_message_content_mm_part( part) return "audio_url", audio_params.get("audio_url", "") if part.get("input_audio") is not None: - input_audio_params = cast(Dict[str, str], part) + input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params if part.get("video_url") is not None: video_params = cast(CustomChatCompletionContentSimpleVideoParam, @@ -773,7 +774,7 @@ def _parse_chat_message_content_parts( mm_tracker: BaseMultiModalItemTracker, *, wrap_dicts: bool, -) -> List[ConversationMessage]: +) -> list[ConversationMessage]: content = list[_ContentPart]() mm_parser = mm_tracker.create_parser() @@ -791,7 +792,7 @@ def _parse_chat_message_content_parts( # Parsing wraps images and texts as interleaved dictionaries return [ConversationMessage(role=role, content=content)] # type: ignore - texts = cast(List[str], content) + texts = cast(list[str], content) text_prompt = "\n".join(texts) mm_placeholder_counts = mm_parser.mm_placeholder_counts() if mm_placeholder_counts: @@ -866,7 +867,7 @@ def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, content_format: _ChatTemplateContentFormat, -) -> List[ConversationMessage]: +) -> list[ConversationMessage]: role = message["role"] content = message.get("content") @@ -900,7 +901,7 @@ def _parse_chat_message_content( return result -def _postprocess_messages(messages: List[ConversationMessage]) -> None: +def _postprocess_messages(messages: list[ConversationMessage]) -> None: # per the Transformers docs & maintainers, tool call arguments in # assistant-role messages with tool_calls need to be dicts not JSON str - # this is how tool-use chat templates will expect them moving forwards @@ -916,12 +917,12 @@ def _postprocess_messages(messages: List[ConversationMessage]) -> None: def parse_chat_messages( - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: - conversation: List[ConversationMessage] = [] +) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: + conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) for msg in messages: @@ -939,12 +940,12 @@ def parse_chat_messages( def parse_chat_messages_futures( - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: - conversation: List[ConversationMessage] = [] +) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: + conversation: list[ConversationMessage] = [] 
mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) for msg in messages: @@ -963,7 +964,7 @@ def parse_chat_messages_futures( def apply_hf_chat_template( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - conversation: List[ConversationMessage], + conversation: list[ConversationMessage], chat_template: Optional[str], *, tokenize: bool = False, # Different from HF's default @@ -985,10 +986,10 @@ def apply_hf_chat_template( def apply_mistral_chat_template( tokenizer: MistralTokenizer, - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], chat_template: Optional[str] = None, **kwargs: Any, -) -> List[int]: +) -> list[int]: if chat_template is not None: logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index 73df900f..21a7d48b 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -5,7 +5,7 @@ import argparse import os import signal import sys -from typing import List, Optional, Tuple +from typing import Optional from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam @@ -23,7 +23,7 @@ def _register_signal_handlers(): signal.signal(signal.SIGTSTP, signal_handler) -def _interactive_cli(args: argparse.Namespace) -> Tuple[str, OpenAI]: +def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]: _register_signal_handlers() base_url = args.url @@ -43,7 +43,7 @@ def _interactive_cli(args: argparse.Namespace) -> Tuple[str, OpenAI]: def chat(system_prompt: Optional[str], model_name: str, client: OpenAI) -> None: - conversation: List[ChatCompletionMessageParam] = [] + conversation: list[ChatCompletionMessageParam] = [] if system_prompt is not None: conversation.append({"role": "system", "content": system_prompt}) @@ -100,7 +100,7 @@ class ChatCommand(CLISubcommand): def cmd(args: argparse.Namespace) -> None: model_name, client = _interactive_cli(args) system_prompt = args.system_prompt - conversation: List[ChatCompletionMessageParam] = [] + conversation: list[ChatCompletionMessageParam] = [] if system_prompt is not None: conversation.append({"role": "system", "content": system_prompt}) @@ -168,5 +168,5 @@ class CompleteCommand(CLISubcommand): return complete_parser -def cmd_init() -> List[CLISubcommand]: +def cmd_init() -> list[CLISubcommand]: return [ChatCommand(), CompleteCommand()] diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 1afead8a..c345ece4 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import argparse -from typing import List import uvloop @@ -59,5 +58,5 @@ class ServeSubcommand(CLISubcommand): return make_arg_parser(serve_parser) -def cmd_init() -> List[CLISubcommand]: +def cmd_init() -> list[CLISubcommand]: return [ServeSubcommand()] diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3f3262f6..122e2ed8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -2,9 +2,9 @@ import itertools import warnings +from collections.abc import Sequence from contextlib import contextmanager -from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence, - Tuple, Type, Union, cast, overload) +from typing import Any, Callable, ClassVar, Optional, Union, cast, overload import cloudpickle import torch.nn as nn @@ -177,11 +177,11 @@ class LLM: disable_custom_all_reduce: bool = False, 
disable_async_output_proc: bool = False, hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] = None, - compilation_config: Optional[Union[int, Dict[str, Any]]] = None, + compilation_config: Optional[Union[int, dict[str, Any]]] = None, **kwargs, ) -> None: ''' @@ -246,7 +246,7 @@ class LLM: self.request_counter = Counter() @staticmethod - def get_engine_class() -> Type[LLMEngine]: + def get_engine_class() -> type[LLMEngine]: if envs.VLLM_USE_V1: # Lazy import: the v1 package isn't distributed from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine @@ -283,11 +283,11 @@ class LLM: Sequence[SamplingParams]]] = None, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: single (prompt + optional token ids) @@ -296,30 +296,30 @@ class LLM: self, prompts: str, sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, - prompt_token_ids: Optional[List[int]] = None, + list[SamplingParams]]] = None, + prompt_token_ids: Optional[list[int]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: multi (prompt + optional token ids) @deprecated("'prompt_token_ids' will become part of 'prompts'") def generate( self, - prompts: List[str], + prompts: list[str], sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, - prompt_token_ids: Optional[List[List[int]]] = None, + list[SamplingParams]]] = None, + prompt_token_ids: Optional[list[list[int]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: single (token ids + optional prompt) @@ -328,32 +328,32 @@ class LLM: self, prompts: Optional[str] = None, sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, + list[SamplingParams]]] = None, *, - prompt_token_ids: List[int], + prompt_token_ids: list[int], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... 
@overload # LEGACY: multi (token ids + optional prompt) @deprecated("'prompt_token_ids' will become part of 'prompts'") def generate( self, - prompts: Optional[List[str]] = None, + prompts: Optional[list[str]] = None, sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, + list[SamplingParams]]] = None, *, - prompt_token_ids: List[List[int]], + prompt_token_ids: list[list[int]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @overload # LEGACY: single or multi token ids [pos-only] @@ -362,13 +362,13 @@ class LLM: self, prompts: None, sampling_params: None, - prompt_token_ids: Union[List[int], List[List[int]]], + prompt_token_ids: Union[list[int], list[list[int]]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - ) -> List[RequestOutput]: + ) -> list[RequestOutput]: ... @deprecate_kwargs( @@ -379,17 +379,17 @@ class LLM: def generate( self, prompts: Union[Union[PromptType, Sequence[PromptType]], - Optional[Union[str, List[str]]]] = None, + Optional[Union[str, list[str]]]] = None, sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, - prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, + prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, - priority: Optional[List[int]] = None, - ) -> List[RequestOutput]: + priority: Optional[list[int]] = None, + ) -> list[RequestOutput]: """Generates the completions for the input prompts. This class automatically batches the given prompts, considering @@ -440,7 +440,7 @@ class LLM: if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( - prompts=cast(Optional[Union[str, List[str]]], prompts), + prompts=cast(Optional[Union[str, list[str]]], prompts), prompt_token_ids=prompt_token_ids, ) else: @@ -473,8 +473,8 @@ class LLM: def collective_rpc(self, method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -510,9 +510,9 @@ class LLM: def beam_search( self, - prompts: List[Union[TokensPrompt, TextPrompt]], + prompts: list[Union[TokensPrompt, TextPrompt]], params: BeamSearchParams, - ) -> List[BeamSearchOutput]: + ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. 
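The import hunks in this patch (for example chat_utils.py and llm.py above) move the abstract container types from typing to collections.abc. A minimal standalone sketch of that pattern, using a hypothetical helper rather than code from this repository:

from collections.abc import Iterable, Sequence


def flatten_token_ids(batches: Sequence[Iterable[int]]) -> list[int]:
    # The ABCs come from collections.abc; the concrete return type is the
    # builtin list, subscripted directly.
    flat: list[int] = []
    for batch in batches:
        flat.extend(batch)
    return flat


print(flatten_token_ids([[1, 2], (3, 4)]))  # [1, 2, 3, 4]

The typing re-exports of these ABCs have been deprecated since Python 3.9, and they alias the collections.abc classes, so the import swap is behavior-preserving for annotations.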
@@ -543,7 +543,7 @@ class LLM: beam_search_params = SamplingParams(logprobs=2 * beam_width, max_tokens=1, temperature=temperature) - instances: List[BeamSearchInstance] = [] + instances: list[BeamSearchInstance] = [] for prompt in prompts: if is_token_prompt(prompt): @@ -553,12 +553,12 @@ class LLM: instances.append(BeamSearchInstance(prompt_tokens)) for _ in range(max_tokens): - all_beams: List[BeamSearchSequence] = list( + all_beams: list[BeamSearchSequence] = list( sum((instance.beams for instance in instances), [])) pos = [0] + list( itertools.accumulate( len(instance.beams) for instance in instances)) - instance_start_and_end: List[Tuple[int, int]] = list( + instance_start_and_end: list[tuple[int, int]] = list( zip(pos[:-1], pos[1:])) if len(all_beams) == 0: @@ -620,19 +620,19 @@ class LLM: def chat( self, - messages: Union[List[ChatCompletionMessageParam], - List[List[ChatCompletionMessageParam]]], + messages: Union[list[ChatCompletionMessageParam], + list[list[ChatCompletionMessageParam]]], sampling_params: Optional[Union[SamplingParams, - List[SamplingParams]]] = None, + list[SamplingParams]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, chat_template_content_format: ChatTemplateContentFormatOption = "auto", add_generation_prompt: bool = True, continue_final_message: bool = False, - tools: Optional[List[Dict[str, Any]]] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - ) -> List[RequestOutput]: + tools: Optional[list[dict[str, Any]]] = None, + mm_processor_kwargs: Optional[dict[str, Any]] = None, + ) -> list[RequestOutput]: """ Generate responses for a chat conversation. @@ -678,17 +678,17 @@ class LLM: A list of ``RequestOutput`` objects containing the generated responses in the same order as the input messages. """ - list_of_messages: List[List[ChatCompletionMessageParam]] + list_of_messages: list[list[ChatCompletionMessageParam]] # Handle multi and single conversations if is_list_of(messages, list): - # messages is List[List[...]] - list_of_messages = cast(List[List[ChatCompletionMessageParam]], + # messages is list[list[...]] + list_of_messages = cast(list[list[ChatCompletionMessageParam]], messages) else: - # messages is List[...] + # messages is list[...] list_of_messages = [ - cast(List[ChatCompletionMessageParam], messages) + cast(list[ChatCompletionMessageParam], messages) ] tokenizer = self.get_tokenizer() @@ -699,7 +699,7 @@ class LLM: tokenizer, ) - prompts: List[Union[TokensPrompt, TextPrompt]] = [] + prompts: list[Union[TokensPrompt, TextPrompt]] = [] for msgs in list_of_messages: # NOTE: _parse_chat_message_content_parts() currently doesn't @@ -712,7 +712,7 @@ class LLM: content_format=resolved_content_format, ) - prompt_data: Union[str, List[int]] + prompt_data: Union[str, list[int]] if isinstance(tokenizer, MistralTokenizer): prompt_data = apply_mistral_chat_template( tokenizer, @@ -762,9 +762,9 @@ class LLM: Sequence[PoolingParams]]] = None, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... 
@overload # LEGACY: single (prompt + optional token ids) @@ -774,25 +774,25 @@ class LLM: prompts: str, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, - prompt_token_ids: Optional[List[int]] = None, + prompt_token_ids: Optional[list[int]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: multi (prompt + optional token ids) @deprecated("'prompt_token_ids' will become part of 'prompts'") def encode( self, - prompts: List[str], + prompts: list[str], pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, - prompt_token_ids: Optional[List[List[int]]] = None, + prompt_token_ids: Optional[list[list[int]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: single (token ids + optional prompt) @@ -803,26 +803,26 @@ class LLM: pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, *, - prompt_token_ids: List[int], + prompt_token_ids: list[int], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: multi (token ids + optional prompt) @deprecated("'prompt_token_ids' will become part of 'prompts'") def encode( self, - prompts: Optional[List[str]] = None, + prompts: Optional[list[str]] = None, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, *, - prompt_token_ids: List[List[int]], + prompt_token_ids: list[list[int]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... @overload # LEGACY: single or multi token ids [pos-only] @@ -831,11 +831,11 @@ class LLM: self, prompts: None, pooling_params: None, - prompt_token_ids: Union[List[int], List[List[int]]], + prompt_token_ids: Union[list[int], list[list[int]]], use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: ... 
@deprecate_kwargs( @@ -846,14 +846,14 @@ class LLM: def encode( self, prompts: Union[Union[PromptType, Sequence[PromptType]], - Optional[Union[str, List[str]]]] = None, + Optional[Union[str, list[str]]]] = None, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, - prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, + prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: """Apply pooling to the hidden states corresponding to the input prompts. @@ -898,7 +898,7 @@ class LLM: if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( - prompts=cast(Optional[Union[str, List[str]]], prompts), + prompts=cast(Optional[Union[str, list[str]]], prompts), prompt_token_ids=prompt_token_ids, ) else: @@ -926,9 +926,9 @@ class LLM: /, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> list[EmbeddingRequestOutput]: """ Generate an embedding vector for each prompt. @@ -966,9 +966,9 @@ class LLM: /, *, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ClassificationRequestOutput]: + ) -> list[ClassificationRequestOutput]: """ Generate class logits for each prompt. 
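Several hunks above also use builtin generics at runtime, not just in annotations, e.g. cast(list[list[ChatCompletionMessageParam]], messages) and content = list[_ContentPart](). A short standalone sketch of both uses, assuming nothing beyond the standard library (the names are made up for illustration):

from typing import Union, cast


def as_batches(messages: Union[list[str], list[list[str]]]) -> list[list[str]]:
    if messages and isinstance(messages[0], list):
        # cast() is a no-op at runtime; it only narrows the type for checkers.
        return cast(list[list[str]], messages)
    # A subscripted builtin such as list[str] is a types.GenericAlias and can
    # still be called; the subscript is discarded and a plain list is built.
    return [list[str](cast(list[str], messages))]


print(as_batches(["hi", "there"]))  # [['hi', 'there']]
print(as_batches([["a"], ["b"]]))   # [['a'], ['b']]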
@@ -1003,29 +1003,29 @@ class LLM: def _embedding_score( self, tokenizer: AnyTokenizer, - text_1: List[Union[str, TextPrompt, TokensPrompt]], - text_2: List[Union[str, TextPrompt, TokensPrompt]], + text_1: list[Union[str, TextPrompt, TokensPrompt]], + text_2: list[Union[str, TextPrompt, TokensPrompt]], truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ScoringRequestOutput]: + ) -> list[ScoringRequestOutput]: - encoded_output: List[PoolingRequestOutput] = self.encode( + encoded_output: list[PoolingRequestOutput] = self.encode( text_1 + text_2, use_tqdm=use_tqdm, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - encoded_output_1: List[PoolingRequestOutput] = encoded_output[ + encoded_output_1: list[PoolingRequestOutput] = encoded_output[ 0:len(text_1)] - encoded_output_2: List[PoolingRequestOutput] = encoded_output[ + encoded_output_2: list[PoolingRequestOutput] = encoded_output[ len(text_1):] if len(encoded_output_1) == 1: encoded_output_1 = encoded_output_1 * len(encoded_output_2) - scores: List[PoolingRequestOutput] = [] + scores: list[PoolingRequestOutput] = [] scores = _cosine_similarity(tokenizer=tokenizer, embed_1=encoded_output_1, @@ -1038,13 +1038,13 @@ class LLM: def _cross_encoding_score( self, tokenizer: AnyTokenizer, - text_1: List[str], - text_2: List[str], + text_1: list[str], + text_2: list[str], truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ScoringRequestOutput]: + ) -> list[ScoringRequestOutput]: if isinstance(tokenizer, MistralTokenizer): raise ValueError( @@ -1057,7 +1057,7 @@ class LLM: pooling_params = PoolingParams() - tokenization_kwargs: Dict[str, Any] = {} + tokenization_kwargs: dict[str, Any] = {} if truncate_prompt_tokens is not None: tokenization_kwargs["truncation"] = True tokenization_kwargs["max_length"] = truncate_prompt_tokens @@ -1094,9 +1094,9 @@ class LLM: *, truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, - lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[ScoringRequestOutput]: + ) -> list[ScoringRequestOutput]: """Generate similarity scores for all pairs ````. The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``. @@ -1162,12 +1162,12 @@ class LLM: if isinstance(text_1, (str, dict)): # Convert a single prompt to a list. text_1 = [text_1] - input_text_1: List[str] = [ensure_str(t) for t in text_1] + input_text_1: list[str] = [ensure_str(t) for t in text_1] if isinstance(text_2, (str, dict)): # Convert a single prompt to a list. 
text_2 = [text_2] - input_text_2: List[str] = [ensure_str(t) for t in text_2] + input_text_2: list[str] = [ensure_str(t) for t in text_2] _validate_score_input_lens(input_text_1, input_text_2) @@ -1226,8 +1226,8 @@ class LLM: # LEGACY def _convert_v1_inputs( self, - prompts: Optional[Union[str, List[str]]], - prompt_token_ids: Optional[Union[List[int], List[List[int]]]], + prompts: Optional[Union[str, list[str]]], + prompt_token_ids: Optional[Union[list[int], list[list[int]]]], ): # skip_tokenizer_init is now checked in engine @@ -1252,7 +1252,7 @@ class LLM: raise ValueError("Either prompts or prompt_token_ids must be " "provided.") - parsed_prompts: List[PromptType] = [] + parsed_prompts: list[PromptType] = [] for i in range(num_requests): item: PromptType @@ -1275,7 +1275,7 @@ class LLM: lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], prompt_adapter_request: Optional[PromptAdapterRequest], guided_options: Optional[GuidedDecodingRequest] = None, - priority: Optional[List[int]] = None, + priority: Optional[list[int]] = None, ) -> None: if guided_options is not None: warnings.warn( @@ -1357,7 +1357,7 @@ class LLM: def _run_engine( self, *, use_tqdm: bool - ) -> List[Union[RequestOutput, PoolingRequestOutput]]: + ) -> list[Union[RequestOutput, PoolingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -1370,7 +1370,7 @@ class LLM: ) # Run the engine. - outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] + outputs: list[Union[RequestOutput, PoolingRequestOutput]] = [] total_in_toks = 0 total_out_toks = 0 while self.llm_engine.has_unfinished_requests(): diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index e82b6ba6..ea575915 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Union +from typing import Optional, Union from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -22,7 +22,7 @@ class RequestLogger: self, request_id: str, prompt: Optional[str], - prompt_token_ids: Optional[List[int]], + prompt_token_ids: Optional[list[int]], params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1b65484c..ec2099d4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -13,10 +13,11 @@ import socket import tempfile import uuid from argparse import Namespace +from collections.abc import AsyncIterator from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import Annotated, AsyncIterator, Dict, Optional, Set, Tuple, Union +from typing import Annotated, Optional, Union import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request @@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) logger = init_logger('vllm.entrypoints.openai.api_server') -_running_tasks: Set[asyncio.Task] = set() +_running_tasks: set[asyncio.Task] = set() @asynccontextmanager @@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) -TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { +TASK_HANDLERS: dict[str, dict[str, tuple]] = { "generate": { 
"messages": (ChatCompletionRequest, create_chat_completion), "default": (CompletionRequest, create_completion), @@ -894,7 +895,7 @@ async def init_app_state( state.task = model_config.task -def create_server_socket(addr: Tuple[str, int]) -> socket.socket: +def create_server_socket(addr: tuple[str, int]) -> socket.socket: family = socket.AF_INET if is_valid_ipv6_address(addr[0]): family = socket.AF_INET6 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 8d877046..b8cc5743 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -8,7 +8,8 @@ purposes. import argparse import json import ssl -from typing import List, Optional, Sequence, Union, get_args +from collections.abc import Sequence +from typing import Optional, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, @@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action): if isinstance(values, str): raise TypeError("Expected values to be a list") - lora_list: List[LoRAModulePath] = [] + lora_list: list[LoRAModulePath] = [] for item in values: if item in [None, '']: # Skip if item is None or empty string continue @@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action): if isinstance(values, str): raise TypeError("Expected values to be a list") - adapter_list: List[PromptAdapterPath] = [] + adapter_list: list[PromptAdapterPath] = [] for item in values: name, path = item.split('=') adapter_list.append(PromptAdapterPath(name, path)) diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index 41e5eef4..04d5091a 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from functools import lru_cache, partial -from typing import Dict, FrozenSet, Iterable, List, Optional, Union +from typing import Optional, Union import torch @@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor: specific set of token ids.""" def __init__(self, allowed_ids: Iterable[int]): - self.allowed_ids: Optional[List[int]] = list(allowed_ids) + self.allowed_ids: Optional[list[int]] = list(allowed_ids) self.mask: Optional[torch.Tensor] = None - def __call__(self, token_ids: List[int], + def __call__(self, token_ids: list[int], logits: torch.Tensor) -> torch.Tensor: if self.mask is None: self.mask = torch.ones((logits.shape[-1], ), @@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor: @lru_cache(maxsize=32) def _get_allowed_token_ids_logits_processor( - allowed_token_ids: FrozenSet[int], + allowed_token_ids: frozenset[int], vocab_size: int, ) -> LogitsProcessor: if not allowed_token_ids: @@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor( def logit_bias_logits_processor( - logit_bias: Dict[int, float], - token_ids: List[int], + logit_bias: dict[int, float], + token_ids: list[int], logits: torch.Tensor, ) -> torch.Tensor: for token_id, bias in logit_bias.items(): @@ -53,16 +54,16 @@ def logit_bias_logits_processor( def get_logits_processors( - logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]], - allowed_token_ids: Optional[List[int]], + logit_bias: Optional[Union[dict[int, float], dict[str, float]]], + allowed_token_ids: Optional[list[int]], tokenizer: AnyTokenizer, -) -> List[LogitsProcessor]: - logits_processors: List[LogitsProcessor] = [] +) -> 
list[LogitsProcessor]: + logits_processors: list[LogitsProcessor] = [] if logit_bias: try: # Convert token_id to integer # Clamp the bias between -100 and 100 per OpenAI API spec - clamped_logit_bias: Dict[int, float] = { + clamped_logit_bias: dict[int, float] = { int(token_id): min(100.0, max(-100.0, bias)) for token_id, bias in logit_bias.items() } diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 31214211..14ce71cd 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,13 +5,13 @@ import re import time from argparse import Namespace -from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union +from typing import Annotated, Any, ClassVar, Literal, Optional, Union import torch from fastapi import UploadFile from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, ValidationInfo, field_validator, model_validator) -from typing_extensions import Annotated, TypeAlias +from typing_extensions import TypeAlias from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.logger import init_logger @@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel): model_config = ConfigDict(extra="allow") # Cache class field names - field_names: ClassVar[Optional[Set[str]]] = None + field_names: ClassVar[Optional[set[str]]] = None @model_validator(mode="wrap") @classmethod @@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel): root: Optional[str] = None parent: Optional[str] = None max_model_len: Optional[int] = None - permission: List[ModelPermission] = Field(default_factory=list) + permission: list[ModelPermission] = Field(default_factory=list) class ModelList(OpenAIBaseModel): object: str = "list" - data: List[ModelCard] = Field(default_factory=list) + data: list[ModelCard] = Field(default_factory=list) class PromptTokenUsageInfo(OpenAIBaseModel): @@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel): description: Optional[str] = None # schema is the field in openai but that causes conflicts with pydantic so # instead use json_schema with an alias - json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema') + json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema') strict: Optional[bool] = None @@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel): class FunctionDefinition(OpenAIBaseModel): name: str description: Optional[str] = None - parameters: Optional[Dict[str, Any]] = None + parameters: Optional[dict[str, Any]] = None class ChatCompletionToolsParam(OpenAIBaseModel): @@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): class LogitsProcessorConstructor(BaseModel): qualname: str - args: Optional[List[Any]] = None - kwargs: Optional[Dict[str, Any]] = None + args: Optional[list[Any]] = None + kwargs: Optional[dict[str, Any]] = None -LogitsProcessors = List[Union[str, LogitsProcessorConstructor]] +LogitsProcessors = list[Union[str, LogitsProcessorConstructor]] def get_logits_processors(processors: Optional[LogitsProcessors], - pattern: Optional[str]) -> Optional[List[Any]]: + pattern: Optional[str]) -> Optional[list[Any]]: if processors and pattern: logits_processors = [] for processor in processors: @@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors], class ChatCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/chat/create - messages: List[ChatCompletionMessageParam] + messages: 
list[ChatCompletionMessageParam] model: Optional[str] = None frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None + logit_bias: Optional[dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = 0 # TODO(#9845): remove max_tokens when field is removed from OpenAI API @@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel): presence_penalty: Optional[float] = 0.0 response_format: Optional[ResponseFormat] = None seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = Field(default_factory=list) stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None temperature: Optional[float] = None top_p: Optional[float] = None - tools: Optional[List[ChatCompletionToolsParam]] = None + tools: Optional[list[ChatCompletionToolsParam]] = None tool_choice: Optional[Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam]] = "none" @@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel): min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[List[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 @@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "special tokens so this should be set to false (as is the " "default)."), ) - documents: Optional[List[Dict[str, str]]] = Field( + documents: Optional[list[dict[str, str]]] = Field( default=None, description= ("A list of dicts representing documents that will be accessible to " @@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel): "allowed, so you must provide a chat template if the tokenizer " "does not define one."), ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( + chat_template_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the template renderer. 
" "Will be accessible by the chat template."), ) - mm_processor_kwargs: Optional[Dict[str, Any]] = Field( + mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) @@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel): description=( "If specified, the output will follow the regex pattern."), ) - guided_choice: Optional[List[str]] = Field( + guided_choice: Optional[list[str]] = Field( default=None, description=( "If specified, the output will be exactly one of the choices."), @@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create model: Optional[str] = None - prompt: Union[List[int], List[List[int]], str, List[str]] + prompt: Union[list[int], list[list[int]], str, list[str]] best_of: Optional[int] = None echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None + logit_bias: Optional[dict[str, float]] = None logprobs: Optional[int] = None max_tokens: Optional[int] = 16 n: int = 1 presence_penalty: Optional[float] = 0.0 seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = Field(default_factory=list) stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None @@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel): min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[List[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = Field(default_factory=list) include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 skip_special_tokens: bool = True spaces_between_special_tokens: bool = True truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - allowed_token_ids: Optional[List[int]] = None + allowed_token_ids: Optional[list[int]] = None prompt_logprobs: Optional[int] = None # doc: end-completion-sampling-params @@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel): description=( "If specified, the output will follow the regex pattern."), ) - guided_choice: Optional[List[str]] = Field( + guided_choice: Optional[list[str]] = Field( default=None, description=( "If specified, the output will be exactly one of the choices."), @@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings model: Optional[str] = None - input: Union[List[int], List[List[int]], str, List[str]] + input: Union[list[int], list[list[int]], str, list[str]] encoding_format: Literal["float", "base64"] = "float" dimensions: Optional[int] = None user: Optional[str] = None @@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): class EmbeddingChatRequest(OpenAIBaseModel): model: Optional[str] = None - messages: List[ChatCompletionMessageParam] + messages: list[ChatCompletionMessageParam] encoding_format: Literal["float", "base64"] = "float" dimensions: Optional[int] = None @@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel): "allowed, so you must provide a chat template if the tokenizer " "does not define one."), ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( + chat_template_kwargs: 
Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the template renderer. " "Will be accessible by the chat template."), ) - mm_processor_kwargs: Optional[Dict[str, Any]] = Field( + mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) @@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest] class ScoreRequest(OpenAIBaseModel): model: Optional[str] = None - text_1: Union[List[str], str] - text_2: Union[List[str], str] + text_1: Union[list[str], str] + text_2: Union[list[str], str] truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None # doc: begin-score-pooling-params @@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel): class RerankRequest(OpenAIBaseModel): model: Optional[str] = None query: str - documents: List[str] + documents: list[str] top_n: int = Field(default_factory=lambda: 0) truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None @@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel): id: str model: str usage: RerankUsage - results: List[RerankResult] + results: list[RerankResult] class CompletionLogProbs(OpenAIBaseModel): - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: List[Optional[Dict[str, + text_offset: list[int] = Field(default_factory=list) + token_logprobs: list[Optional[float]] = Field(default_factory=list) + tokens: list[str] = Field(default_factory=list) + top_logprobs: list[Optional[dict[str, float]]] = Field(default_factory=list) @@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel): "to stop, None if the completion finished for some other reason " "including encountering the EOS token"), ) - prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None + prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None class CompletionResponse(OpenAIBaseModel): @@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel): object: str = "text_completion" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[CompletionResponseChoice] + choices: list[CompletionResponseChoice] usage: UsageInfo @@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel): object: str = "text_completion" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[CompletionResponseStreamChoice] + choices: list[CompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) class EmbeddingResponseData(OpenAIBaseModel): index: int object: str = "embedding" - embedding: Union[List[float], str] + embedding: Union[list[float], str] class EmbeddingResponse(OpenAIBaseModel): @@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel): object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: List[EmbeddingResponseData] + data: list[EmbeddingResponseData] usage: UsageInfo class PoolingResponseData(OpenAIBaseModel): index: int object: str = "pooling" - data: Union[List[List[float]], List[float], str] + data: Union[list[list[float]], list[float], str] class PoolingResponse(OpenAIBaseModel): @@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel): object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: List[PoolingResponseData] + data: 
list[PoolingResponseData] usage: UsageInfo @@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel): object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: List[ScoreResponseData] + data: list[ScoreResponseData] usage: UsageInfo @@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel): tools_called: bool # extracted tool calls - tool_calls: List[ToolCall] + tool_calls: list[ToolCall] # content - per OpenAI spec, content AND tool calls can be returned rarely # But some models will do this intentionally @@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel): role: str reasoning_content: Optional[str] = None content: Optional[str] = None - tool_calls: List[ToolCall] = Field(default_factory=list) + tool_calls: list[ToolCall] = Field(default_factory=list) class ChatCompletionLogProb(OpenAIBaseModel): token: str logprob: float = -9999.0 - bytes: Optional[List[int]] = None + bytes: Optional[list[int]] = None class ChatCompletionLogProbsContent(ChatCompletionLogProb): - top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list) + top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list) class ChatCompletionLogProbs(OpenAIBaseModel): - content: Optional[List[ChatCompletionLogProbsContent]] = None + content: Optional[list[ChatCompletionLogProbsContent]] = None class ChatCompletionResponseChoice(OpenAIBaseModel): @@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel): object: Literal["chat.completion"] = "chat.completion" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseChoice] + choices: list[ChatCompletionResponseChoice] usage: UsageInfo - prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None + prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None reasoning_content: Optional[str] = None - tool_calls: List[DeltaToolCall] = Field(default_factory=list) + tool_calls: list[DeltaToolCall] = Field(default_factory=list) class ChatCompletionResponseStreamChoice(OpenAIBaseModel): @@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): object: Literal["chat.completion.chunk"] = "chat.completion.chunk" created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseStreamChoice] + choices: list[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) @@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel): class TokenizeChatRequest(OpenAIBaseModel): model: Optional[str] = None - messages: List[ChatCompletionMessageParam] + messages: list[ChatCompletionMessageParam] add_generation_prompt: bool = Field( default=True, @@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel): "allowed, so you must provide a chat template if the tokenizer " "does not define one."), ) - chat_template_kwargs: Optional[Dict[str, Any]] = Field( + chat_template_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the template renderer. 
" "Will be accessible by the chat template."), ) - mm_processor_kwargs: Optional[Dict[str, Any]] = Field( + mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) @@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest] class TokenizeResponse(OpenAIBaseModel): count: int max_model_len: int - tokens: List[int] + tokens: list[int] class DetokenizeRequest(OpenAIBaseModel): model: Optional[str] = None - tokens: List[int] + tokens: list[int] class DetokenizeResponse(OpenAIBaseModel): @@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel): to automatically increase the temperature until certain thresholds are hit. """ - timestamp_granularities: List[Literal["word", "segment"]] = Field( + timestamp_granularities: list[Literal["word", "segment"]] = Field( alias="timestamp_granularities[]", default=[]) """The timestamp granularities to populate for this transcription. @@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel): text: str """Text content of the segment.""" - tokens: List[int] + tokens: list[int] """Array of token IDs for the text content.""" @@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel): text: str """The transcribed text.""" - segments: Optional[List[TranscriptionSegment]] = None + segments: Optional[list[TranscriptionSegment]] = None """Segments of the transcribed text and their corresponding details.""" - words: Optional[List[TranscriptionWord]] = None + words: Optional[list[TranscriptionWord]] = None """Extracted words and their corresponding timestamps.""" diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py index b5df7e47..b3bc0e83 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import os +from collections.abc import Sequence from functools import cached_property -from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union +from typing import Callable, Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage) @@ -25,14 +26,14 @@ class ReasoningParser: self.model_tokenizer = tokenizer @cached_property - def vocab(self) -> Dict[str, int]: + def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from a complete model-generated string. @@ -47,7 +48,7 @@ class ReasoningParser: The request object that was used to generate the model_output. Returns: - Tuple[Optional[str], Optional[str]] + tuple[Optional[str], Optional[str]] A tuple containing the reasoning content and the content. """ @@ -77,10 +78,10 @@ class ReasoningParser: class ReasoningParserManager: - reasoning_parsers: Dict[str, Type] = {} + reasoning_parsers: dict[str, type] = {} @classmethod - def get_reasoning_parser(cls, name) -> Type: + def get_reasoning_parser(cls, name) -> type: """ Get reasoning parser by name which is registered by `register_module`. 
@@ -94,8 +95,8 @@ class ReasoningParserManager: @classmethod def _register_module(cls, - module: Type, - module_name: Optional[Union[str, List[str]]] = None, + module: type, + module_name: Optional[Union[str, list[str]]] = None, force: bool = True) -> None: if not issubclass(module, ReasoningParser): raise TypeError("module must be subclass of ReasoningParser, " @@ -114,9 +115,9 @@ class ReasoningParserManager: @classmethod def register_module( cls, - name: Optional[Union[str, List[str]]] = None, + name: Optional[Union[str, list[str]]] = None, force: bool = True, - module: Union[Type, None] = None) -> Union[type, Callable]: + module: Union[type, None] = None) -> Union[type, Callable]: """ Register module with the given name or name list. it can be used as a decoder(with module as None) or normal function(with module as not diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py index e5ab6e6b..1a2c66a6 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py +++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Optional, Sequence, Tuple, Union +from collections.abc import Sequence +from typing import Optional, Union from transformers import PreTrainedTokenizerBase @@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser): def extract_reasoning_content( self, model_output: str, request: ChatCompletionRequest - ) -> Tuple[Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str]]: # DeepSeek R1 doesn't generate now. # Thus we assume the reasoning content is always at the start. diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index e4496f61..0d06ba3d 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -2,9 +2,10 @@ import asyncio import tempfile +from collections.abc import Awaitable from http import HTTPStatus from io import StringIO -from typing import Awaitable, Callable, List, Optional +from typing import Callable, Optional import aiohttp import torch @@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str: async def write_local_file(output_path: str, - batch_outputs: List[BatchRequestOutput]) -> None: + batch_outputs: list[BatchRequestOutput]) -> None: """ Write the responses to a local file. output_path: The path to write the responses to. @@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str, f"Error message: {str(e)}.") from e -async def write_file(path_or_url: str, batch_outputs: List[BatchRequestOutput], +async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput], output_tmp_dir: str) -> None: """ Write batch_outputs to a file or upload to a URL. @@ -353,7 +354,7 @@ async def main(args): logger.info("Reading batch from %s...", args.input_file) # Submit all requests in the file to the engine "concurrently". - response_futures: List[Awaitable[BatchRequestOutput]] = [] + response_futures: list[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): # Skip empty lines. 
request_json = request_json.strip() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 02dd2c48..98e9ea0f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -3,10 +3,9 @@ import asyncio import json import time -from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, Final, List, - Optional) -from typing import Sequence as GenericSequence -from typing import Union +from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import Sequence as GenericSequence +from typing import Callable, Final, Optional, Union from fastapi import Request @@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing): raw_request.state.request_metadata = request_metadata # Schedule the request and get the result generator. - generators: List[AsyncGenerator[RequestOutput, None]] = [] + generators: list[AsyncGenerator[RequestOutput, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] @@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing): result_generator: AsyncIterator[RequestOutput], request_id: str, model_name: str, - conversation: List[ConversationMessage], + conversation: list[ConversationMessage], tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> AsyncGenerator[str, None]: @@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing): should_stream_with_reasoning_parsing = ( self._should_stream_with_reasoning_parsing(request)) - all_previous_token_ids: Optional[List[List[int]]] + all_previous_token_ids: Optional[list[list[int]]] # Only one of these will be used, thus previous_texts and # all_previous_token_ids will not be used twice in the same iteration. 
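serving_chat.py above now imports AsyncGenerator and AsyncIterator from collections.abc rather than typing. A small self-contained example of annotating an async streaming helper the same way; stream_chunks is invented for illustration and is not part of vLLM:

import asyncio
from collections.abc import AsyncGenerator

async def stream_chunks(text: str, size: int = 4) -> AsyncGenerator[str, None]:
    # Yield the text in fixed-size pieces, loosely like a streamed response body.
    for start in range(0, len(text), size):
        yield text[start:start + size]
        await asyncio.sleep(0)

async def main() -> None:
    async for chunk in stream_chunks("data: hello world\n\n"):
        print(chunk, end="")

asyncio.run(main())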
@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing): # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: - tool_parsers: List[Optional[ToolParser]] = [ + tool_parsers: list[Optional[ToolParser]] = [ self.tool_parser(tokenizer) ] * num_choices else: @@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing): # Send response to echo the input portion of the # last message if request.echo: - last_msg_content: Union[str, List[Dict[str, str]]] = "" + last_msg_content: Union[str, list[dict[str, str]]] = "" if conversation and "content" in conversation[ -1] and conversation[-1].get("role") == role: last_msg_content = conversation[-1]["content"] or "" @@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing): result_generator: AsyncIterator[RequestOutput], request_id: str, model_name: str, - conversation: List[ConversationMessage], + conversation: list[ConversationMessage], tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> Union[ErrorResponse, ChatCompletionResponse]: @@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing): assert final_res is not None - choices: List[ChatCompletionResponseChoice] = [] + choices: list[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: @@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing): choices.append(choice_data) if request.echo: - last_msg_content: Union[str, List[Dict[str, str]]] = "" + last_msg_content: Union[str, list[dict[str, str]]] = "" if conversation and "content" in conversation[-1] and conversation[ -1].get("role") == role: last_msg_content = conversation[-1]["content"] or "" @@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing): return response def _get_top_logprobs( - self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], - tokenizer: AnyTokenizer) -> List[ChatCompletionLogProb]: + self, logprobs: dict[int, Logprob], top_logprobs: Optional[int], + tokenizer: AnyTokenizer) -> list[ChatCompletionLogProb]: return [ ChatCompletionLogProb(token=(token := self._get_decoded_token( p[1], @@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing): def _create_chat_logprobs( self, token_ids: GenericSequence[int], - top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + top_logprobs: GenericSequence[Optional[dict[int, Logprob]]], tokenizer: AnyTokenizer, num_output_top_logprobs: Optional[int] = None, ) -> ChatCompletionLogProbs: """Create OpenAI-style logprobs.""" - logprobs_content: List[ChatCompletionLogProbsContent] = [] + logprobs_content: list[ChatCompletionLogProbsContent] = [] for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 840f0f9b..ed09af84 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -2,9 +2,9 @@ import asyncio import time -from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple, Union, cast +from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import Sequence as GenericSequence +from typing import Optional, Union, cast from fastapi import Request @@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing): return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[RequestOutput, None]] = [] + generators: list[AsyncGenerator[RequestOutput, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] @@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing): request_metadata=request_metadata) # Non-streaming response - final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts + final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res @@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing): if final_res.prompt is None: final_res.prompt = request_prompts[i]["prompt"] - final_res_batch_checked = cast(List[RequestOutput], + final_res_batch_checked = cast(list[RequestOutput], final_res_batch) response = self.request_output_to_completion_response( @@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing): async def completion_stream_generator( self, request: CompletionRequest, - result_generator: AsyncIterator[Tuple[int, RequestOutput]], + result_generator: AsyncIterator[tuple[int, RequestOutput]], request_id: str, created_time: int, model_name: str, @@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing): num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) delta_token_ids: GenericSequence[int] - out_logprobs: Optional[GenericSequence[Optional[Dict[ + out_logprobs: Optional[GenericSequence[Optional[dict[ int, Logprob]]]] for output in res.outputs: @@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing): def request_output_to_completion_response( self, - final_res_batch: List[RequestOutput], + final_res_batch: list[RequestOutput], request: CompletionRequest, request_id: str, created_time: int, @@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, ) -> CompletionResponse: - choices: List[CompletionResponseChoice] = [] + choices: list[CompletionResponseChoice] = [] num_prompt_tokens = 0 num_generated_tokens = 0 @@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing): prompt_text = final_res.prompt token_ids: GenericSequence[int] - out_logprobs: Optional[GenericSequence[Optional[Dict[int, + out_logprobs: Optional[GenericSequence[Optional[dict[int, Logprob]]]] for output in final_res.outputs: @@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing): def _create_completion_logprobs( self, token_ids: GenericSequence[int], - top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + top_logprobs: GenericSequence[Optional[dict[int, Logprob]]], num_output_top_logprobs: int, tokenizer: AnyTokenizer, initial_text_offset: int = 0, ) -> CompletionLogProbs: """Create logprobs for OpenAI Completion API.""" - out_text_offset: List[int] = [] - out_token_logprobs: List[Optional[float]] = [] - out_tokens: List[str] = [] - out_top_logprobs: List[Optional[Dict[str, float]]] = [] + out_text_offset: list[int] = [] + out_token_logprobs: list[Optional[float]] = [] + out_tokens: list[str] = [] + out_top_logprobs: list[Optional[dict[str, float]]] = [] last_token_len = 0 diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 607dbd96..5f6e06e6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -3,7 +3,8 @@ import asyncio import base64 import time -from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast +from 
collections.abc import AsyncGenerator +from typing import Final, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -31,7 +32,7 @@ logger = init_logger(__name__) def _get_embedding( output: EmbeddingOutput, encoding_format: Literal["float", "base64"], -) -> Union[List[float], str]: +) -> Union[list[float], str]: if encoding_format == "float": return output.embedding elif encoding_format == "base64": @@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing): return self.create_error_response(str(e)) # Schedule the request and get the result generator. - generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() @@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing): num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch: list[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: async for i, res in result_generator: @@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing): assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch) response = self.request_output_to_embedding_response( @@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing): def request_output_to_embedding_response( self, - final_res_batch: List[PoolingRequestOutput], + final_res_batch: list[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, encoding_format: Literal["float", "base64"], ) -> EmbeddingResponse: - items: List[EmbeddingResponseData] = [] + items: list[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d097bfcf..59333dbf 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,15 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 import json +from collections.abc import Iterable, Iterator, Mapping, Sequence from concurrent.futures.thread import ThreadPoolExecutor from http import HTTPStatus -from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, - Optional, Sequence, Tuple, TypedDict, Union) +from typing import Annotated, Any, Callable, Optional, TypedDict, Union from fastapi import Request from pydantic import Field from starlette.datastructures import Headers -from typing_extensions import Annotated from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient @@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, class TextTokensPrompt(TypedDict): prompt: str - prompt_token_ids: List[int] + prompt_token_ids: list[int] -RequestPrompt = Union[List[int], str, TextTokensPrompt] +RequestPrompt = Union[list[int], str, TextTokensPrompt] class OpenAIServing: @@ -144,7 +143,7 @@ class OpenAIServing: def _maybe_get_adapters( self, request: AnyRequest - ) -> Union[Tuple[None, None], Tuple[LoRARequest, None], Tuple[ + ) -> Union[tuple[None, None], tuple[LoRARequest, None], tuple[ None, PromptAdapterRequest]]: if self._is_model_supported(request.model): return None, None @@ -188,7 +187,7 @@ class OpenAIServing: self, request: AnyRequest, tokenizer: AnyTokenizer, - prompt_ids: List[int], + prompt_ids: list[int], 
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], ) -> TextTokensPrompt: if truncate_prompt_tokens is None: @@ -203,7 +202,7 @@ class OpenAIServing: def _validate_input( self, request: AnyRequest, - input_ids: List[int], + input_ids: list[int], input_text: str, ) -> TextTokensPrompt: token_num = len(input_ids) @@ -259,7 +258,7 @@ class OpenAIServing: self, request: AnyRequest, tokenizer: AnyTokenizer, - prompt_input: Union[str, List[int]], + prompt_input: Union[str, list[int]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, ) -> TextTokensPrompt: @@ -280,7 +279,7 @@ class OpenAIServing: self, request: AnyRequest, tokenizer: AnyTokenizer, - prompt_inputs: Iterable[Union[str, List[int]]], + prompt_inputs: Iterable[Union[str, list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, ) -> Iterator[TextTokensPrompt]: @@ -309,10 +308,10 @@ class OpenAIServing: self, request: AnyRequest, tokenizer: AnyTokenizer, - input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + input_or_inputs: Union[str, list[str], list[int], list[list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> List[TextTokensPrompt]: + ) -> list[TextTokensPrompt]: """ Tokenize/detokenize depending on the input format. @@ -344,10 +343,10 @@ class OpenAIServing: self, request: CompletionLikeRequest, tokenizer: AnyTokenizer, - input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + input_or_inputs: Union[str, list[str], list[int], list[list[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Tuple[List[TextTokensPrompt], List[TokensPrompt]]: + ) -> tuple[list[TextTokensPrompt], list[TokensPrompt]]: request_prompts = await self._tokenize_prompt_input_or_inputs_async( request, tokenizer, @@ -367,19 +366,19 @@ class OpenAIServing: self, request: ChatLikeRequest, tokenizer: AnyTokenizer, - messages: List[ChatCompletionMessageParam], + messages: list[ChatCompletionMessageParam], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, add_generation_prompt: bool = True, continue_final_message: bool = False, - tool_dicts: Optional[List[Dict[str, Any]]] = None, - documents: Optional[List[Dict[str, str]]] = None, - chat_template_kwargs: Optional[Dict[str, Any]] = None, + tool_dicts: Optional[list[dict[str, Any]]] = None, + documents: Optional[list[dict[str, str]]] = None, + chat_template_kwargs: Optional[dict[str, Any]] = None, tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = False, - ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], - List[TokensPrompt]]: + ) -> tuple[list[ConversationMessage], Sequence[RequestPrompt], + list[TokensPrompt]]: resolved_content_format = resolve_chat_template_content_format( chat_template, chat_template_content_format, @@ -392,7 +391,7 @@ class OpenAIServing: content_format=resolved_content_format, ) - _chat_template_kwargs: Dict[str, Any] = dict( + _chat_template_kwargs: dict[str, Any] = dict( chat_template=chat_template, add_generation_prompt=add_generation_prompt, continue_final_message=continue_final_message, @@ -401,7 +400,7 @@ class OpenAIServing: ) _chat_template_kwargs.update(chat_template_kwargs or {}) - request_prompt: Union[str, List[int]] + 
request_prompt: Union[str, list[int]] if isinstance(tokenizer, MistralTokenizer): request_prompt = apply_mistral_chat_template( tokenizer, diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 0f4a174a..38a66583 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -4,7 +4,7 @@ import json import pathlib from dataclasses import dataclass from http import HTTPStatus -from typing import List, Optional, Union +from typing import Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient @@ -53,10 +53,10 @@ class OpenAIServingModels: self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + base_model_paths: list[BaseModelPath], *, - lora_modules: Optional[List[LoRAModulePath]] = None, - prompt_adapters: Optional[List[PromptAdapterPath]] = None, + lora_modules: Optional[list[LoRAModulePath]] = None, + prompt_adapters: Optional[list[PromptAdapterPath]] = None, ): super().__init__() @@ -65,7 +65,7 @@ class OpenAIServingModels: self.engine_client = engine_client self.static_lora_modules = lora_modules - self.lora_requests: List[LoRARequest] = [] + self.lora_requests: list[LoRARequest] = [] self.lora_id_counter = AtomicCounter(0) self.prompt_adapter_requests = [] diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index bbf5aed1..0a3ca2aa 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -3,7 +3,8 @@ import asyncio import base64 import time -from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast +from collections.abc import AsyncGenerator +from typing import Final, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -29,7 +30,7 @@ logger = init_logger(__name__) def _get_data( output: PoolingOutput, encoding_format: Literal["float", "base64"], -) -> Union[List[float], str]: +) -> Union[list[float], str]: if encoding_format == "float": return output.data.tolist() elif encoding_format == "base64": @@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing): return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() @@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing): num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch: list[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: async for i, res in result_generator: @@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing): assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch_checked = cast(list[PoolingRequestOutput], final_res_batch) response = self.request_output_to_pooling_response( @@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing): def request_output_to_pooling_response( self, - final_res_batch: List[PoolingRequestOutput], + final_res_batch: list[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, encoding_format: Literal["float", "base64"], ) -> PoolingResponse: - items: List[PoolingResponseData] = [] + items: list[PoolingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index a087a8d9..73b4288c 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio import time -from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Union +from collections.abc import AsyncGenerator, Mapping +from typing import Any, Optional, Union from fastapi import Request @@ -48,8 +49,8 @@ class ServingScores(OpenAIServing): async def _embedding_score( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - texts_1: List[str], - texts_2: List[str], + texts_1: list[str], + texts_2: list[str], request: Union[RerankRequest, ScoreRequest], request_id=str, tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -57,11 +58,11 @@ class ServingScores(OpenAIServing): prompt_adapter_request: Optional[Union[PromptAdapterRequest, None]] = None, trace_headers: Optional[Mapping[str, str]] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: input_texts = texts_1 + texts_2 - engine_prompts: List[TokensPrompt] = [] + engine_prompts: list[TokensPrompt] = [] tokenize_async = make_async(tokenizer.__call__, executor=self._tokenizer_executor) @@ -82,7 +83,7 @@ class ServingScores(OpenAIServing): prompt_token_ids=text_token_prompt["prompt_token_ids"])) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] pooling_params = request.to_pooling_params() for i, engine_prompt in enumerate(engine_prompts): @@ -108,16 +109,16 @@ class ServingScores(OpenAIServing): result_generator = merge_async_iterators(*generators) # Non-streaming response - final_res_batch: List[PoolingRequestOutput] = [] + final_res_batch: list[PoolingRequestOutput] = [] - embeddings: List[Optional[PoolingRequestOutput]] =\ + embeddings: list[Optional[PoolingRequestOutput]] =\ [None] * len(engine_prompts) async for i, res in result_generator: embeddings[i] = res - emb_texts_1: List[PoolingRequestOutput] = [] - emb_texts_2: List[PoolingRequestOutput] = [] + emb_texts_1: list[PoolingRequestOutput] = [] + emb_texts_2: list[PoolingRequestOutput] = [] for i in range(0, len(texts_1)): assert (emb := embeddings[i]) is not None @@ -139,8 +140,8 @@ class ServingScores(OpenAIServing): async def _cross_encoding_score( self, tokenizer: Union[AnyTokenizer], - texts_1: List[str], - texts_2: List[str], + texts_1: list[str], + texts_2: list[str], request: Union[RerankRequest, ScoreRequest], request_id=str, tokenization_kwargs: Optional[dict[str, Any]] = None, @@ -148,10 +149,10 @@ class ServingScores(OpenAIServing): prompt_adapter_request: Optional[Union[PromptAdapterRequest, None]] = None, trace_headers: Optional[Mapping[str, str]] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: - request_prompts: List[str] = [] - engine_prompts: List[TokensPrompt] = [] + request_prompts: list[str] = [] + engine_prompts: list[TokensPrompt] = [] if len(texts_1) == 1: texts_1 = texts_1 * len(texts_2) @@ -185,7 +186,7 @@ class ServingScores(OpenAIServing): engine_prompts.append(engine_prompt) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] pooling_params = request.to_pooling_params() @@ -212,7 +213,7 @@ class ServingScores(OpenAIServing): result_generator = merge_async_iterators(*generators) # Non-streaming response - final_res_batch: List[ + final_res_batch: list[ Optional[PoolingRequestOutput]] = [None] * len(engine_prompts) async for i, res in result_generator: @@ -228,9 +229,9 @@ class ServingScores(OpenAIServing): request_id: str, raw_request: Optional[Request] = None, truncate_prompt_tokens: Optional[int] = None, - ) -> List[PoolingRequestOutput]: + ) -> list[PoolingRequestOutput]: - tokenization_kwargs: Dict[str, Any] = {} + tokenization_kwargs: dict[str, Any] = {} if truncate_prompt_tokens is not None: tokenization_kwargs["truncation"] = True tokenization_kwargs["max_length"] = truncate_prompt_tokens @@ -372,12 +373,12 @@ class ServingScores(OpenAIServing): def request_output_to_score_response( self, - final_res_batch: List[PoolingRequestOutput], + final_res_batch: list[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, ) -> ScoreResponse: - items: List[ScoreResponseData] = [] + items: list[ScoreResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): @@ -406,13 +407,13 @@ class ServingScores(OpenAIServing): ) def request_output_to_rerank_response( - self, final_res_batch: List[PoolingRequestOutput], request_id: str, - model_name: str, documents: List[str], + self, final_res_batch: list[PoolingRequestOutput], request_id: str, + model_name: str, documents: list[str], top_n: int) -> RerankResponse: """ Convert the output of do_rank to a RerankResponse """ - results: List[RerankResult] = [] + results: list[RerankResult] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): classify_res = ScoringRequestOutput.from_base(final_res) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 6c79adf9..4e95ef59 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Final, List, Optional, Union +from typing import Final, Optional, Union from fastapi import Request @@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing): logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - input_ids: List[int] = [] + input_ids: list[int] = [] for i, engine_prompt in enumerate(engine_prompts): self._log_inputs(request_id, request_prompts[i], diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 0bedb571..77f016a5 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio import io -from typing import AsyncGenerator, Optional, Union, cast +from collections.abc import AsyncGenerator +from typing import Optional, Union, cast from fastapi import Request diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 7cdd6d4c..931d5aab 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 
import os +from collections.abc import Sequence from functools import cached_property -from typing import Callable, Dict, List, Optional, Sequence, Type, Union +from typing import Callable, Optional, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, @@ -22,16 +23,16 @@ class ToolParser: """ def __init__(self, tokenizer: AnyTokenizer): - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] # the index of the tool call that is currently being parsed self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [] + self.streamed_args_for_tool: list[str] = [] self.model_tokenizer = tokenizer @cached_property - def vocab(self) -> Dict[str, int]: + def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() @@ -79,10 +80,10 @@ class ToolParser: class ToolParserManager: - tool_parsers: Dict[str, Type] = {} + tool_parsers: dict[str, type] = {} @classmethod - def get_tool_parser(cls, name) -> Type: + def get_tool_parser(cls, name) -> type: """ Get tool parser by name which is registered by `register_module`. @@ -95,8 +96,8 @@ class ToolParserManager: @classmethod def _register_module(cls, - module: Type, - module_name: Optional[Union[str, List[str]]] = None, + module: type, + module_name: Optional[Union[str, list[str]]] = None, force: bool = True) -> None: if not issubclass(module, ToolParser): raise TypeError( @@ -116,9 +117,9 @@ class ToolParserManager: @classmethod def register_module( cls, - name: Optional[Union[str, List[str]]] = None, + name: Optional[Union[str, list[str]]] = None, force: bool = True, - module: Union[Type, None] = None) -> Union[type, Callable]: + module: Union[type, None] = None) -> Union[type, Callable]: """ Register module with the given name or name list. it can be used as a decoder(with module as None) or normal function(with module as not diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 002bf173..76da63c5 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -2,8 +2,9 @@ import json import re +from collections.abc import Sequence from json import JSONDecoder -from typing import Dict, Sequence, Union +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser): return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. 
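The ToolParserManager hunks above keep the docstring's contract that register_module works either as a decorator (module left as None) or as a plain call. A minimal sketch of both forms, assuming vLLM is importable; ExampleToolParser and the "example" names are made up:

from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser, ToolParserManager)

@ToolParserManager.register_module("example")  # decorator form, module=None
class ExampleToolParser(ToolParser):
    pass

# Plain-call form registering the same class under another name.
ToolParserManager.register_module(name="example_alias",
                                  module=ExampleToolParser)

print(ToolParserManager.get_tool_parser("example"))  # ExampleToolParser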
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index c948ed78..91afc88e 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser): return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] + current_tool_call: dict = tool_call_arr[self.current_tool_id] delta = None # case: we are starting a new tool in the array diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 4841b287..4c39e9b0 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -2,7 +2,8 @@ import json import re -from typing import Dict, List, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -33,9 +34,9 @@ class Hermes2ProToolParser(ToolParser): self.model_tokenizer = self.model_tokenizer.tokenizer self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.tool_call_start_token: str = "" diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index b9215e79..57d7c77c 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Dict, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -90,7 +91,7 @@ class Internlm2ToolParser(ToolParser): # tool calls are generated in an object in inernlm2 # it's not support parallel tool calls try: - tool_call_arr: Dict = partial_json_parser.loads( + tool_call_arr: dict = partial_json_parser.loads( parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 7c4d63e1..8df106bf 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -2,7 +2,8 @@ import json import re -from typing import Dict, List, Sequence, Union +from collections.abc import Sequence +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -35,9 +36,9 @@ class JambaToolParser(ToolParser): ) self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 - 
self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.tool_calls_start_token: str = "" @@ -157,7 +158,7 @@ class JambaToolParser(ToolParser): # tool calls are generated in an array, so do partial JSON # parsing on the entire array try: - tool_call_arr: List[Dict] = partial_json_parser.loads( + tool_call_arr: list[dict] = partial_json_parser.loads( parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') @@ -165,7 +166,7 @@ class JambaToolParser(ToolParser): # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 6a7b1136..20c3238f 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -2,8 +2,9 @@ import json import re +from collections.abc import Sequence from json import JSONDecoder -from typing import Dict, List, Sequence, Union +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -40,10 +41,10 @@ class Llama3JsonToolParser(ToolParser): # initialize properties used for state when parsing tool calls in # streaming mode - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "<|python_tag|>" self.bot_token_id = tokenizer.encode(self.bot_token, @@ -78,7 +79,7 @@ class Llama3JsonToolParser(ToolParser): start_idx += end_idx + len('; ') function_call_arr.append(obj) - tool_calls: List[ToolCall] = [ + tool_calls: list[ToolCall] = [ ToolCall( type="function", function=FunctionCall( @@ -152,7 +153,7 @@ class Llama3JsonToolParser(ToolParser): return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. 
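Several of the streaming parsers touched above rely on partial_json_parser to read a tool-call array the model has not finished generating yet. A small standalone sketch of that idea; the truncated chunk below is invented, and the printed result is indicative rather than guaranteed:

import partial_json_parser
from partial_json_parser.core.options import Allow

# A tool-call array cut off mid-stream, before its closing brackets arrive.
chunk = '[{"name": "get_weather", "arguments": {"city": "Paris"'
flags = Allow.ALL  # accept any partially generated value

tool_call_arr: list[dict] = partial_json_parser.loads(chunk, flags)
print(tool_call_arr)  # e.g. [{'name': 'get_weather', 'arguments': {'city': 'Paris'}}]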
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 4f048088..06614456 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -2,9 +2,10 @@ import json import re +from collections.abc import Sequence from random import choices from string import ascii_letters, digits -from typing import Dict, List, Sequence, Union +from typing import Union import partial_json_parser from partial_json_parser.core.options import Allow @@ -56,10 +57,10 @@ class MistralToolParser(ToolParser): # initialize properties used for state when parsing tool calls in # streaming mode - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [ + self.streamed_args_for_tool: list[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) @@ -104,7 +105,7 @@ class MistralToolParser(ToolParser): function_call_arr = json.loads(raw_tool_call) # Tool Call - tool_calls: List[MistralToolCall] = [ + tool_calls: list[MistralToolCall] = [ MistralToolCall( type="function", function=FunctionCall( @@ -172,7 +173,7 @@ class MistralToolParser(ToolParser): # tool calls are generated in an array, so do partial JSON # parsing on the entire array try: - tool_call_arr: List[Dict] = partial_json_parser.loads( + tool_call_arr: list[dict] = partial_json_parser.loads( parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') @@ -180,7 +181,7 @@ class MistralToolParser(ToolParser): # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. 
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 5c282b5c..1b9317f1 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -3,7 +3,8 @@ import ast import json import re -from typing import Any, Sequence, Tuple, Union +from collections.abc import Sequence +from typing import Any, Union from transformers import PreTrainedTokenizerBase @@ -204,7 +205,7 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments=json.dumps(arguments))) -def _make_valid_python(text: str) -> Union[Tuple[str, str], None]: +def _make_valid_python(text: str) -> Union[tuple[str, str], None]: bracket_stack = [] for index, char in enumerate(text): if char in {"[", "(", "{"}: diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index 945cbd68..7997629d 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -2,7 +2,7 @@ import json from json import JSONDecodeError, JSONDecoder -from typing import Any, List, Tuple +from typing import Any import partial_json_parser from partial_json_parser.core.options import Allow @@ -82,7 +82,7 @@ def extract_intermediate_diff(curr: str, old: str) -> str: return diff -def find_all_indices(string: str, substring: str) -> List[int]: +def find_all_indices(string: str, substring: str) -> list[int]: """ Find all (starting) indices of a substring in a given string. Useful for tool call extraction @@ -99,7 +99,7 @@ def find_all_indices(string: str, substring: str) -> List[int]: # partial_json_parser doesn't support extra data and # JSONDecorder.raw_decode doesn't support partial JSON -def partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]: +def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]: try: return (partial_json_parser.loads(input_str, flags), len(input_str)) except JSONDecodeError as e: diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 6ec0b5fb..53411a27 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union +from typing import Union from torch.nn import CosineSimilarity @@ -10,12 +10,12 @@ from vllm.transformers_utils.tokenizer import (PreTrainedTokenizer, def _cosine_similarity( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - embed_1: List[PoolingRequestOutput], - embed_2: List[PoolingRequestOutput], -) -> List[PoolingRequestOutput]: + embed_1: list[PoolingRequestOutput], + embed_2: list[PoolingRequestOutput], +) -> list[PoolingRequestOutput]: scorer = CosineSimilarity(0) - scores: Union[List[PoolingRequestOutput]] = [] + scores: Union[list[PoolingRequestOutput]] = [] for emb_1, emb_2 in zip(embed_1, embed_2): pair_score = scorer(emb_1.outputs.data, emb_2.outputs.data) @@ -38,8 +38,8 @@ def _cosine_similarity( def _validate_score_input_lens( - texts_1: Union[List[str], List[dict]], - texts_2: Union[List[str], List[dict]], + texts_1: Union[list[str], list[dict]], + texts_2: Union[list[str], list[dict]], ): if len(texts_1) > 1 and len(texts_1) != len(texts_2): raise ValueError("Input lengths must be either 1:1, 1:N or N:N") diff --git a/vllm/envs.py b/vllm/envs.py index 048d63bf..bf64cd70 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,7 +2,7 @@ import os import tempfile -from typing 
import TYPE_CHECKING, Any, Callable, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional if TYPE_CHECKING: VLLM_HOST_IP: str = "" @@ -67,12 +67,12 @@ if TYPE_CHECKING: VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False VLLM_TEST_FORCE_FP8_MARLIN: bool = False VLLM_RPC_TIMEOUT: int = 10000 # ms - VLLM_PLUGINS: Optional[List[str]] = None + VLLM_PLUGINS: Optional[list[str]] = None VLLM_TORCH_PROFILER_DIR: Optional[str] = None VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False - VLLM_DISABLED_KERNELS: List[str] = [] + VLLM_DISABLED_KERNELS: list[str] = [] VLLM_USE_V1: bool = False VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True @@ -123,7 +123,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # begin-env-vars-definition -environment_variables: Dict[str, Callable[[], Any]] = { +environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== diff --git a/vllm/forward_context.py b/vllm/forward_context.py index b91816af..c3d20cff 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -4,7 +4,7 @@ import time from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Optional import torch import torch.distributed as dist @@ -28,13 +28,13 @@ batchsize_forward_time: defaultdict = defaultdict(list) @dataclass class ForwardContext: # copy from vllm_config.compilation_config.static_forward_context - attn_layers: Dict[str, Any] + attn_layers: dict[str, Any] # TODO: extend to support per-layer dynamic forward context attn_metadata: "AttentionMetadata" # set dynamically for each forward pass # TODO: remove after making all virtual_engines share the same kv cache virtual_engine: int # set dynamically for each forward pass num_tokens_across_dp: Optional[ - List[int]] = None # set dynamically for each forward pass + list[int]] = None # set dynamically for each forward pass _forward_context: Optional[ForwardContext] = None diff --git a/vllm/logger.py b/vllm/logger.py index 0ee47de1..2b0b9da2 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -109,7 +109,7 @@ def _configure_vllm_root_logger() -> None: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): - raise ValueError("Invalid logging config. Expected Dict, got %s.", + raise ValueError("Invalid logging config. 
Expected dict, got %s.", type(custom_config).__name__) logging_config = custom_config diff --git a/vllm/logits_process.py b/vllm/logits_process.py index a810be7b..e3faf200 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Tuple, Union +from typing import Callable, Union import torch from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], - Callable[[List[int], List[int], torch.Tensor], +LogitsProcessor = Union[Callable[[list[int], torch.Tensor], torch.Tensor], + Callable[[list[int], list[int], torch.Tensor], torch.Tensor]] """LogitsProcessor is a function that takes a list of previously generated tokens, the logits tensor @@ -17,9 +17,9 @@ to sample from.""" def get_bad_words_logits_processors( - bad_words: List[str], - tokenizer: AnyTokenizer) -> List[LogitsProcessor]: - bad_words_ids: List[List[int]] = list() + bad_words: list[str], + tokenizer: AnyTokenizer) -> list[LogitsProcessor]: + bad_words_ids: list[list[int]] = list() for bad_word in bad_words: # To prohibit words both at the beginning @@ -51,13 +51,13 @@ class NoBadWordsLogitsProcessor: _SMALLEST_LOGIT = float("-inf") _NEUTRAL_LOGIT = 0.0 - def __init__(self, bad_words_ids: List[List[int]]): + def __init__(self, bad_words_ids: list[list[int]]): self.bad_words_ids = bad_words_ids self.word_bias: torch.FloatTensor = None def __call__( self, - past_tokens_ids: Union[List[int], Tuple[int]], + past_tokens_ids: Union[list[int], tuple[int]], logits: torch.FloatTensor, ) -> torch.Tensor: if self.word_bias is None: diff --git a/vllm/outputs.py b/vllm/outputs.py index 03011971..8c355c89 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import time +from collections.abc import MutableSequence +from collections.abc import Sequence as GenericSequence from dataclasses import dataclass -from typing import Dict, Generic, List, MutableSequence, Optional -from typing import Sequence as GenericSequence -from typing import Union +from typing import Generic, Optional, Union import torch from typing_extensions import TypeVar, deprecated @@ -109,14 +109,14 @@ class RequestOutput: self, request_id: str, prompt: Optional[str], - prompt_token_ids: Optional[List[int]], + prompt_token_ids: Optional[list[int]], prompt_logprobs: Optional[PromptLogprobs], - outputs: List[CompletionOutput], + outputs: list[CompletionOutput], finished: bool, metrics: Optional[RequestMetrics] = None, lora_request: Optional[LoRARequest] = None, encoder_prompt: Optional[str] = None, - encoder_prompt_token_ids: Optional[List[int]] = None, + encoder_prompt_token_ids: Optional[list[int]] = None, num_cached_tokens: Optional[int] = None, *, multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, @@ -139,9 +139,9 @@ class RequestOutput: cls, request_id: str, prompt: Optional[str], - prompt_token_ids: Optional[List[int]], + prompt_token_ids: Optional[list[int]], text: str, - token_ids: List[int], + token_ids: list[int], logprobs: Optional[SampleLogprobs], prompt_logprobs: Optional[PromptLogprobs], cumulative_logprob: Optional[float], @@ -189,7 +189,7 @@ class RequestOutput: @classmethod def from_seq_group( cls, seq_group: SequenceGroup, use_cache: bool, - seq_id_to_seq_group: Dict[str, SequenceGroupBase] + seq_id_to_seq_group: dict[str, SequenceGroupBase] ) -> Optional["RequestOutput"]: finished = 
seq_group.is_finished() @@ -363,12 +363,12 @@ class PoolingRequestOutput(Generic[_O]): Args: request_id (str): A unique identifier for the pooling request. outputs (PoolingOutput): The pooling results for the given input. - prompt_token_ids (List[int]): A list of token IDs used in the prompt. + prompt_token_ids (list[int]): A list of token IDs used in the prompt. finished (bool): A flag indicating whether the pooling is completed. """ def __init__(self, request_id: str, outputs: _O, - prompt_token_ids: List[int], finished: bool): + prompt_token_ids: list[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids self.finished = finished @@ -407,7 +407,7 @@ class RequestOutputFactory: @staticmethod def create(seq_group: SequenceGroup, - seq_id_to_seq_group: Dict[str, SequenceGroupBase], + seq_id_to_seq_group: dict[str, SequenceGroupBase], use_cache: bool = False): if seq_group.pooled_data is not None: return PoolingRequestOutput.from_seq_group(seq_group) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 2ce87283..17e4e433 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -4,11 +4,10 @@ import copy from dataclasses import dataclass from enum import Enum, IntEnum from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Union +from typing import Annotated, Any, Optional, Union import msgspec from pydantic import BaseModel -from typing_extensions import Annotated from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -29,9 +28,9 @@ class SamplingType(IntEnum): @dataclass class GuidedDecodingParams: """One of these fields will be used to build a logit processor.""" - json: Optional[Union[str, Dict]] = None + json: Optional[Union[str, dict]] = None regex: Optional[str] = None - choice: Optional[List[str]] = None + choice: Optional[list[str]] = None grammar: Optional[str] = None json_object: Optional[bool] = None """These are other options that can be set""" @@ -40,9 +39,9 @@ class GuidedDecodingParams: @staticmethod def from_optional( - json: Optional[Union[Dict, BaseModel, str]] = None, + json: Optional[Union[dict, BaseModel, str]] = None, regex: Optional[str] = None, - choice: Optional[List[str]] = None, + choice: Optional[list[str]] = None, grammar: Optional[str] = None, json_object: Optional[bool] = None, backend: Optional[str] = None, @@ -72,7 +71,7 @@ class GuidedDecodingParams: """ return (self.backend or "").split(":")[0] - def backend_options(self) -> List[str]: + def backend_options(self) -> list[str]: """Return the backend options as a list of strings.""" if not self.backend or ":" not in self.backend: return [] @@ -144,12 +143,12 @@ class SamplingParams( considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this. seed: Random seed to use for the generation. - stop: List of strings that stop the generation when they are generated. + stop: list of strings that stop the generation when they are generated. The returned output will not contain the stop strings. - stop_token_ids: List of tokens that stop the generation when they are + stop_token_ids: list of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. - bad_words: List of words that are not allowed to be generated. + bad_words: list of words that are not allowed to be generated. 
More precisely, only the last token of a corresponding token sequence is not allowed when the next generated token can complete the sequence. @@ -172,7 +171,7 @@ class SamplingParams( skip_special_tokens: Whether to skip special tokens in the output. spaces_between_special_tokens: Whether to add spaces between special tokens in the output. Defaults to True. - logits_processors: List of functions that modify logits based on + logits_processors: list of functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument. truncate_prompt_tokens: If set to an integer k, will use only the last k @@ -198,9 +197,9 @@ class SamplingParams( top_k: int = -1 min_p: float = 0.0 seed: Optional[int] = None - stop: Optional[Union[str, List[str]]] = None - stop_token_ids: Optional[List[int]] = None - bad_words: Optional[List[str]] = None + stop: Optional[Union[str, list[str]]] = None + stop_token_ids: Optional[list[int]] = None + bad_words: Optional[list[str]] = None ignore_eos: bool = False max_tokens: Optional[int] = 16 min_tokens: int = 0 @@ -212,8 +211,8 @@ class SamplingParams( detokenize: bool = True skip_special_tokens: bool = True spaces_between_special_tokens: bool = True - # Optional[List[LogitsProcessor]] type. We use Any here because - # Optional[List[LogitsProcessor]] type is not supported by msgspec. + # Optional[list[LogitsProcessor]] type. We use Any here because + # Optional[list[LogitsProcessor]] type is not supported by msgspec. logits_processors: Optional[Any] = None include_stop_str_in_output: bool = False truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None @@ -222,12 +221,12 @@ class SamplingParams( # The below fields are not supposed to be used as an input. # They are set in post_init. 
output_text_buffer_length: int = 0 - _all_stop_token_ids: Set[int] = msgspec.field(default_factory=set) + _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors guided_decoding: Optional[GuidedDecodingParams] = None - logit_bias: Optional[Dict[int, float]] = None - allowed_token_ids: Optional[List[int]] = None + logit_bias: Optional[dict[int, float]] = None + allowed_token_ids: Optional[list[int]] = None @staticmethod def from_optional( @@ -241,9 +240,9 @@ class SamplingParams( top_k: int = -1, min_p: float = 0.0, seed: Optional[int] = None, - stop: Optional[Union[str, List[str]]] = None, - stop_token_ids: Optional[List[int]] = None, - bad_words: Optional[List[str]] = None, + stop: Optional[Union[str, list[str]]] = None, + stop_token_ids: Optional[list[int]] = None, + bad_words: Optional[list[str]] = None, include_stop_str_in_output: bool = False, ignore_eos: bool = False, max_tokens: Optional[int] = 16, @@ -253,13 +252,13 @@ class SamplingParams( detokenize: bool = True, skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, - logits_processors: Optional[List[LogitsProcessor]] = None, + logits_processors: Optional[list[LogitsProcessor]] = None, truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, guided_decoding: Optional[GuidedDecodingParams] = None, - logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]] = None, - allowed_token_ids: Optional[List[int]] = None, + logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, + allowed_token_ids: Optional[list[int]] = None, ) -> "SamplingParams": if logit_bias is not None: # Convert token_id to integer @@ -435,7 +434,7 @@ class SamplingParams( def update_from_generation_config( self, - generation_config: Dict[str, Any], + generation_config: dict[str, Any], model_eos_token_id: Optional[int] = None) -> None: """Update if there are non-default values from generation_config""" @@ -468,7 +467,7 @@ class SamplingParams( return SamplingType.RANDOM @property - def all_stop_token_ids(self) -> Set[int]: + def all_stop_token_ids(self) -> set[int]: return self._all_stop_token_ids def clone(self) -> "SamplingParams": diff --git a/vllm/sequence.py b/vllm/sequence.py index c0425ba3..6a7b1e62 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,11 +5,11 @@ import enum from abc import ABC, abstractmethod from array import array from collections import defaultdict +from collections.abc import Mapping +from collections.abc import Sequence as GenericSequence from dataclasses import dataclass, field from functools import reduce -from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional -from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union +from typing import Any, Callable, Optional, Union import msgspec import torch @@ -50,9 +50,9 @@ class Logprob: # {token_id -> logprob} per each sequence group. None if the corresponding # sequence group doesn't require prompt logprob. -PromptLogprobs = List[Optional[Dict[int, Logprob]]] +PromptLogprobs = list[Optional[dict[int, Logprob]]] # {token_id -> logprob} for each sequence group. 
-SampleLogprobs = List[Dict[int, Logprob]] +SampleLogprobs = list[dict[int, Logprob]] class SequenceStatus(enum.IntEnum): @@ -129,7 +129,7 @@ class SequenceDataDelta( omit_defaults=True): # type: ignore[call-arg] """Delta SequenceData to send to workers per step.""" # A new token to be appended to existing SequenceData. - new_output_token_ids: List[int] + new_output_token_ids: list[int] # Overwriting existing `cumulative_logprob` new_cumulative_logprob: float # Overwriting existing `num_computed_tokens`. @@ -152,7 +152,7 @@ class SequenceData(msgspec.Struct, output_token_ids: The token IDs of the output. cumulative_logprob: The cumulative log probability of the output. """ - # NOTE: we cannot use Union[List, array] because msgspec cannot support + # NOTE: we cannot use Union[list, array] because msgspec cannot support # union of 2 list types. _prompt_token_ids: array _output_token_ids: array = msgspec.field( @@ -160,25 +160,25 @@ class SequenceData(msgspec.Struct, ### The below fields should not be passed as an argument ### _cumulative_logprob: float = 0.0 - _prompt_token_ids_tuple: Tuple[int, + _prompt_token_ids_tuple: tuple[int, ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 # The number of tokens with prefix cache hit. _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL - _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) + _cached_all_token_ids: list[int] = msgspec.field(default_factory=list) # It is used to get delta input. It is reset when `get_delta_and_reset` # is called. - _new_appended_tokens: List[int] = msgspec.field(default_factory=list) + _new_appended_tokens: list[int] = msgspec.field(default_factory=list) # It is used to compute mrope_position_ids. _mrope_position_delta: Optional[int] = None @staticmethod def from_prompt_token_counts( - *token_counts: Tuple[int, int]) -> "SequenceData": + *token_counts: tuple[int, int]) -> "SequenceData": """ Construct a :class:`SequenceData` instance by concatenating prompt token sequences. @@ -220,14 +220,14 @@ class SequenceData(msgspec.Struct, def __post_init__(self) -> None: assert self._prompt_token_ids.typecode == "l" assert self._output_token_ids.typecode == "l" - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple( + self._prompt_token_ids_tuple: tuple[int, ...] 
= tuple( self._prompt_token_ids) self._update_cached_all_tokens() def _update_cached_all_tokens(self): assert isinstance(self._prompt_token_ids, array) assert isinstance(self._output_token_ids, array) - self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._cached_all_token_ids: list[int] = list(self._prompt_token_ids + self._output_token_ids) @property @@ -235,7 +235,7 @@ class SequenceData(msgspec.Struct, return self._cumulative_logprob @property - def prompt_token_ids(self) -> Tuple[int, ...]: + def prompt_token_ids(self) -> tuple[int, ...]: return self._prompt_token_ids_tuple @prompt_token_ids.setter @@ -252,7 +252,7 @@ class SequenceData(msgspec.Struct, return self._prompt_token_ids @property - def output_token_ids(self) -> Tuple[int, ...]: + def output_token_ids(self) -> tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter @@ -295,12 +295,12 @@ class SequenceData(msgspec.Struct, def get_output_len(self) -> int: return len(self._output_token_ids) - def get_token_ids(self) -> List[int]: + def get_token_ids(self) -> list[int]: return self._cached_all_token_ids def get_prefix_token_ids( self, num_tokens: int - ) -> Tuple[Tuple[int, ...], Optional[Tuple[int, ...]]]: + ) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]: """Get prefix tokens, and make the return value hashable""" prompt_length = self.get_prompt_len() if num_tokens > prompt_length: @@ -351,10 +351,10 @@ class SequenceData(msgspec.Struct, return self._prompt_token_ids[-1] return self._output_token_ids[-1] - def get_prompt_token_ids(self) -> Tuple[int, ...]: + def get_prompt_token_ids(self) -> tuple[int, ...]: return self.prompt_token_ids - def get_output_token_ids(self) -> Tuple[int, ...]: + def get_output_token_ids(self) -> tuple[int, ...]: return self.output_token_ids def get_delta_and_reset(self) -> SequenceDataDelta: @@ -432,7 +432,7 @@ class Sequence: self.prefix_offset = 0 self.read_offset = 0 # Input + output tokens - self.tokens: Optional[List[str]] = None + self.tokens: Optional[list[str]] = None @property def n_blocks(self) -> int: @@ -443,7 +443,7 @@ class Sequence: return self.inputs.prompt @property - def prompt_token_ids(self) -> List[int]: + def prompt_token_ids(self) -> list[int]: return self.inputs.prompt_token_ids @property @@ -451,7 +451,7 @@ class Sequence: return self.inputs.prompt_embeds @property - def token_type_ids(self) -> List[int]: + def token_type_ids(self) -> list[int]: return self.inputs.token_type_ids @property @@ -463,7 +463,7 @@ class Sequence: return self.inputs.multi_modal_placeholders @property - def mm_processor_kwargs(self) -> Dict[str, Any]: + def mm_processor_kwargs(self) -> dict[str, Any]: return self.inputs.mm_processor_kwargs @property @@ -548,7 +548,7 @@ class Sequence: """Reset the sequence states for recomputation.""" self.data.reset_state_for_recompute() - def append_token_id(self, token_id: int, logprobs: Dict[int, + def append_token_id(self, token_id: int, logprobs: dict[int, Logprob]) -> None: assert token_id in logprobs self.output_logprobs.append(logprobs) @@ -563,16 +563,16 @@ class Sequence: def get_output_len(self) -> int: return self.data.get_output_len() - def get_token_ids(self) -> List[int]: + def get_token_ids(self) -> list[int]: return self.data.get_token_ids() - def get_prompt_token_ids(self) -> Tuple[int, ...]: + def get_prompt_token_ids(self) -> tuple[int, ...]: return self.data.get_prompt_token_ids() def get_last_token_id(self) -> int: return self.data.get_last_token_id() - def get_output_token_ids(self) -> 
Tuple[int, ...]: + def get_output_token_ids(self) -> tuple[int, ...]: return self.data.get_output_token_ids() def get_cumulative_logprob(self) -> float: @@ -644,7 +644,7 @@ class SequenceGroup: def __init__( self, request_id: str, - seqs: List[Sequence], + seqs: list[Sequence], arrival_time: float, sampling_params: Optional[SamplingParams] = None, lora_request: Optional[LoRARequest] = None, @@ -686,7 +686,7 @@ class SequenceGroup: return self.first_seq.prompt @property - def prompt_token_ids(self) -> List[int]: + def prompt_token_ids(self) -> list[int]: return self.first_seq.prompt_token_ids @property @@ -698,7 +698,7 @@ class SequenceGroup: if self.encoder_seq is not None else None) @property - def encoder_prompt_token_ids(self) -> Optional[List[int]]: + def encoder_prompt_token_ids(self) -> Optional[list[int]]: # There are either 0 or 1 encoder sequences # If one is present, its prompt token ids are # distinct from the decoder's. @@ -706,7 +706,7 @@ class SequenceGroup: if self.encoder_seq is not None else None) @property - def token_type_ids(self) -> Optional[List[int]]: + def token_type_ids(self) -> Optional[list[int]]: return self.first_seq.token_type_ids @property @@ -726,7 +726,7 @@ class SequenceGroup: return {} @property - def mm_processor_kwargs(self) -> Dict[str, Any]: + def mm_processor_kwargs(self) -> dict[str, Any]: if self.first_seq.multi_modal_data: return self.first_seq.mm_processor_kwargs elif self.encoder_seq is not None: @@ -823,7 +823,7 @@ class SequenceGroup: def get_seqs( self, status: Optional[SequenceStatus] = None, - ) -> List[Sequence]: + ) -> list[Sequence]: if status is None: return self.seqs @@ -838,7 +838,7 @@ class SequenceGroup: def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq - def get_finished_seqs(self) -> List[Sequence]: + def get_finished_seqs(self) -> list[Sequence]: if self.is_single_seq: return self.seqs if self.first_seq.is_finished() else [] @@ -897,13 +897,13 @@ class SequenceGroupMetadataDelta( After sending the first SequenceGroupMetadata, vLLM scheduler only sends delta to reduce the data payload size. """ - seq_data_delta: Dict[int, SequenceDataDelta] + seq_data_delta: dict[int, SequenceDataDelta] request_id: str - block_tables: Dict[int, List[int]] + block_tables: dict[int, list[int]] is_prompt: bool do_sample: bool = True token_chunk_size: Optional[int] = None - computed_block_nums: Optional[List[int]] = None + computed_block_nums: Optional[list[int]] = None state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) @@ -947,23 +947,23 @@ class SequenceGroupMetadata( request_id: str is_prompt: bool - seq_data: Dict[int, SequenceData] + seq_data: dict[int, SequenceData] sampling_params: Optional[SamplingParams] - block_tables: Dict[int, List[int]] + block_tables: dict[int, list[int]] do_sample: bool = True pooling_params: Optional[PoolingParams] = None lora_request: Optional[LoRARequest] = None - computed_block_nums: Optional[List[int]] = None + computed_block_nums: Optional[list[int]] = None state: Optional[SequenceGroupState] = msgspec.field( default_factory=lambda: SequenceGroupState()) # "MultiModalDataDict" types. We have to use Any due to msgspec # doesn't allow to have union of 2 different dicts. 
- token_type_ids: Optional[List[int]] = None + token_type_ids: Optional[list[int]] = None multi_modal_data: Optional[Any] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None + mm_processor_kwargs: Optional[dict[str, Any]] = None encoder_seq_data: Optional[SequenceData] = None - cross_block_table: Optional[List[int]] = None + cross_block_table: Optional[list[int]] = None prompt_adapter_request: Optional[PromptAdapterRequest] = None token_chunk_size: Optional[int] = None @@ -1042,7 +1042,7 @@ class SequenceOutput( """ parent_seq_id: int output_token: int - logprobs: Dict[int, Logprob] + logprobs: dict[int, Logprob] def __repr__(self) -> str: return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " @@ -1076,7 +1076,7 @@ class CompletionSequenceGroupOutput( array_like=True): # type: ignore[call-arg] """The model output associated with a completion sequence group.""" __metaclass__ = SequenceGroupOutput - samples: List[SequenceOutput] + samples: list[SequenceOutput] # Prompt logprob for each prompt query token. prompt_logprobs: Optional[PromptLogprobs] @@ -1119,7 +1119,7 @@ class IntermediateTensors: contains the hidden states and residuals for a request. """ - tensors: Dict[str, torch.Tensor] + tensors: dict[str, torch.Tensor] def __init__(self, tensors): # manually define this function, so that @@ -1155,7 +1155,7 @@ class PoolerOutput( omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] """The output from a pooling operation in the pooling model.""" - outputs: List[PoolingSequenceGroupOutput] + outputs: list[PoolingSequenceGroupOutput] def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput: return self.outputs[idx] @@ -1172,7 +1172,7 @@ class PoolerOutput( def get_all_seq_ids( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: + seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: """Given a list of SequenceGroupMetadata, create a list of all sequence ids. """ @@ -1180,13 +1180,13 @@ def get_all_seq_ids( def get_all_seq_ids_and_request_ids( - seq_group_metadata_list: List[SequenceGroupMetadata] -) -> Tuple[List[int], Dict[str, Set[int]]]: + seq_group_metadata_list: list[SequenceGroupMetadata] +) -> tuple[list[int], dict[str, set[int]]]: """Given a list of SequenceGroupMetadata, create a list of all sequence ids. """ - seq_ids: List[int] = [] - request_id_seq_ids_mapping: DefaultDict[str, Set[int]] = defaultdict(set) + seq_ids: list[int] = [] + request_id_seq_ids_mapping: defaultdict[str, set[int]] = defaultdict(set) for sg in seq_group_metadata_list: for seq_id in sg.seq_data: seq_ids.append(seq_id) @@ -1206,14 +1206,14 @@ class HiddenStates(msgspec.Struct, array_like=True, # all tokens, whereas for decode step, it use used for last accepted tokens. hidden_states: torch.Tensor # The sequence group metadata list. Only needed for decode step. - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None + seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None # Scorer hidden states of the 2nd last token proposed by the proposer ( # irrespective of whether it was accepted or not). Only used for cases when # last proposed token is accepted (i.e., in case of bonus tokens). For the # case of no bonus tokens, these are ignored. 
second_last_token_hidden_states: Optional[torch.Tensor] = None - _seq_ids: List[int] = msgspec.field(default_factory=list) + _seq_ids: list[int] = msgspec.field(default_factory=list) def __post_init__(self): if self.seq_group_metadata_list is not None: @@ -1221,12 +1221,12 @@ class HiddenStates(msgspec.Struct, array_like=True, self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list) @property - def seq_ids(self) -> List[int]: + def seq_ids(self) -> list[int]: return self._seq_ids def update(self, hidden_states: torch.Tensor, - seq_group_metadata_list: List[SequenceGroupMetadata], + seq_group_metadata_list: list[SequenceGroupMetadata], second_last_token_hidden_states: Optional[torch.Tensor] = None): """Update hidden states from target model invocation. Only used for decode steps""" @@ -1244,7 +1244,7 @@ class HiddenStates(msgspec.Struct, array_like=True, ]) def prune(self, - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: list[SequenceGroupMetadata]) -> None: """Prune to provided list of sequence ids. Only used for decode steps. """ # Currently this prunes all seq_ids not present in @@ -1287,16 +1287,16 @@ class ExecuteModelRequest( """The model execution request, containing CPU metadata only. The LLM engine should create an instance of this class for each request batch.""" # The sequence group metadata list. - seq_group_metadata_list: List[Union[SequenceGroupMetadata, + seq_group_metadata_list: list[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]] # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, + blocks_to_swap_in: list[tuple[int, int]] = msgspec.field(default_factory=list) # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, + blocks_to_swap_out: list[tuple[int, int]] = msgspec.field(default_factory=list) # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] = msgspec.field(default_factory=list) + blocks_to_copy: list[tuple[int, int]] = msgspec.field(default_factory=list) # Virtual engine ID for pipeline parallel. virtual_engine: int = 0 # The number of slots for lookahead decoding. @@ -1310,7 +1310,7 @@ class ExecuteModelRequest( # The step index for spec model input. spec_step_idx: Optional[int] = None # Finished request ids since last step. - finished_requests_ids: List[str] = msgspec.field(default_factory=list) + finished_requests_ids: list[str] = msgspec.field(default_factory=list) # The last sampled token ids for multi step decoding. 
last_sampled_token_ids: Optional[torch.Tensor] = None # Async callback @@ -1344,7 +1344,7 @@ class ExecuteModelRequest( return state.current_step def clone( - self, seq_group_metadata_list: List[Union[SequenceGroupMetadata, + self, seq_group_metadata_list: list[Union[SequenceGroupMetadata, SequenceGroupMetadataDelta]] ) -> "ExecuteModelRequest": """Clone the request with a new sequence group metadata list.""" @@ -1371,13 +1371,13 @@ class SequenceGroupBase: assembled_seq_group: Optional[SequenceGroup] = None # seq id to a unique index inside this group - seq_id_to_index: Dict[str, int] = field(default_factory=dict) + seq_id_to_index: dict[str, int] = field(default_factory=dict) # seq ids to be finished - to_be_finished: Dict[str, SequenceGroup] = field(default_factory=dict) + to_be_finished: dict[str, SequenceGroup] = field(default_factory=dict) # seq id to finished sequences - finished_reqs: Dict[str, SequenceGroup] = field(default_factory=dict) + finished_reqs: dict[str, SequenceGroup] = field(default_factory=dict) streaming: bool = False diff --git a/vllm/tracing.py b/vllm/tracing.py index bf069ad8..557ae40b 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Mapping, Optional +from collections.abc import Mapping +from typing import Optional from vllm.logger import init_logger from vllm.utils import run_once diff --git a/vllm/utils.py b/vllm/utils.py index 29e60a9c..26c9e1a9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -28,12 +28,12 @@ import warnings import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import OrderedDict, UserDict, defaultdict -from collections.abc import Hashable, Iterable, Mapping +from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, + Iterable, Iterator, Mapping) from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps -from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, - Dict, Generator, Generic, Iterator, List, Literal, - NamedTuple, Optional, Tuple, Type, TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, + Optional, TypeVar, Union) from uuid import uuid4 import cloudpickle @@ -400,7 +400,7 @@ def _next_task(iterator: AsyncGenerator[T, None], async def merge_async_iterators( *iterators: AsyncGenerator[T, - None], ) -> AsyncGenerator[Tuple[int, T], None]: + None], ) -> AsyncGenerator[tuple[int, T], None]: """Merge multiple asynchronous iterators into a single iterator. This method handle the case where some iterators finish before others. 
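The vllm/utils.py hunks above swap the typing aliases for their collections.abc counterparts, which are subscriptable at runtime from Python 3.9 onward, and annotate yielded pairs as tuple[int, T]. A minimal sketch of that annotation style, using a hypothetical enumerate_async helper rather than vLLM's own code:

import asyncio
from collections.abc import AsyncGenerator, Iterable

async def enumerate_async(items: Iterable[str]) -> AsyncGenerator[tuple[int, str], None]:
    # Yield (index, item) pairs, mirroring the tuple[int, T] style used by
    # merge_async_iterators above. enumerate_async itself is illustrative only.
    for i, item in enumerate(items):
        yield i, item

async def _demo() -> list[tuple[int, str]]:
    # Collect the async generator into a plain list, as collect_from_async_generator does.
    return [pair async for pair in enumerate_async(["a", "b"])]

assert asyncio.run(_demo()) == [(0, "a"), (1, "b")]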
@@ -433,7 +433,7 @@ async def merge_async_iterators( async def collect_from_async_generator( - iterator: AsyncGenerator[T, None]) -> List[T]: + iterator: AsyncGenerator[T, None]) -> list[T]: """Collect all items from an async generator into a list.""" items = [] async for item in iterator: @@ -560,7 +560,7 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: return None -def update_environment_variables(envs: Dict[str, str]): +def update_environment_variables(envs: dict[str, str]): for k, v in envs.items(): if k in os.environ and os.environ[k] != v: logger.warning( @@ -569,7 +569,7 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def chunk_list(lst: List[T], chunk_size: int): +def chunk_list(lst: list[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): yield lst[i:i + chunk_size] @@ -642,7 +642,7 @@ def create_kv_caches_with_random_flash( model_dtype: Optional[Union[str, torch.dtype]] = None, seed: int = 0, device: Optional[str] = "cuda", -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: +) -> tuple[list[torch.Tensor], list[torch.Tensor]]: from vllm.platforms import current_platform current_platform.seed_everything(seed) @@ -650,8 +650,8 @@ def create_kv_caches_with_random_flash( key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) scale = head_size**-0.5 - key_caches: List[torch.Tensor] = [] - value_caches: List[torch.Tensor] = [] + key_caches: list[torch.Tensor] = [] + value_caches: list[torch.Tensor] = [] for _ in range(num_layers): key_value_cache = torch.empty(size=key_value_cache_shape, @@ -679,7 +679,7 @@ def create_kv_caches_with_random( model_dtype: Optional[Union[str, torch.dtype]] = None, seed: int = 0, device: Optional[str] = "cuda", -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: +) -> tuple[list[torch.Tensor], list[torch.Tensor]]: if cache_dtype == "fp8" and head_size % 16: raise ValueError( @@ -693,7 +693,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches: List[torch.Tensor] = [] + key_caches: list[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, @@ -708,7 +708,7 @@ def create_kv_caches_with_random( key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches: List[torch.Tensor] = [] + value_caches: list[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, @@ -754,7 +754,7 @@ class DeviceMemoryProfiler: def make_ndarray_with_pad( - x: List[List[T]], + x: list[list[T]], pad: T, dtype: npt.DTypeLike, *, @@ -779,7 +779,7 @@ def make_ndarray_with_pad( def make_tensor_with_pad( - x: List[List[T]], + x: list[list[T]], pad: T, dtype: torch.dtype, *, @@ -831,7 +831,7 @@ def is_list_of( typ: Union[type[T], tuple[type[T], ...]], *, check: Literal["first", "all"] = "first", -) -> TypeIs[List[T]]: +) -> TypeIs[list[T]]: if not isinstance(value, list): return False @@ -843,8 +843,8 @@ def is_list_of( assert_never(check) -JSONTree = Union[Dict[str, "JSONTree[T]"], List["JSONTree[T]"], - Tuple["JSONTree[T]", ...], T] +JSONTree = Union[dict[str, "JSONTree[T]"], list["JSONTree[T]"], + tuple["JSONTree[T]", ...], T] """A nested JSON structure where the leaves need not be JSON-serializable.""" @@ -859,7 +859,7 @@ def 
json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]: return func(value) -def flatten_2d_lists(lists: List[List[T]]) -> List[T]: +def flatten_2d_lists(lists: list[list[T]]) -> list[T]: """Flatten a list of lists to a single list.""" return [item for sublist in lists for item in sublist] @@ -1226,7 +1226,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser): return value - def _pull_args_from_config(self, args: List[str]) -> List[str]: + def _pull_args_from_config(self, args: list[str]) -> list[str]: """Method to pull arguments specified in the config file into the command-line args variable. @@ -1291,7 +1291,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser): return args - def _load_config_file(self, file_path: str) -> List[str]: + def _load_config_file(self, file_path: str) -> list[str]: """Loads a yaml file and returns the key value pairs as a flattened list with argparse like pattern ```yaml @@ -1313,9 +1313,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser): %s supplied", extension) # only expecting a flat dictionary of atomic types - processed_args: List[str] = [] + processed_args: list[str] = [] - config: Dict[str, Union[int, str]] = {} + config: dict[str, Union[int, str]] = {} try: with open(file_path) as config_file: config = yaml.safe_load(config_file) @@ -1399,7 +1399,7 @@ def resolve_mm_processor_kwargs( *, requires_kw_only: bool = True, allow_var_kwargs: bool = False, -) -> Dict[str, Any]: +) -> dict[str, Any]: """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., those who are not explicit keywords to the given callable (of one is given; otherwise no filtering is done), then merges the kwarg dicts, @@ -1440,7 +1440,7 @@ def get_allowed_kwarg_only_overrides( *, requires_kw_only: bool = True, allow_var_kwargs: bool = False, -) -> Dict[str, Any]: +) -> dict[str, Any]: """ Given a callable which has one or more keyword only params and a dict mapping param names to values, drop values that can be not be kwarg @@ -1531,9 +1531,9 @@ class AtomicCounter: # Adapted from: https://stackoverflow.com/a/47212782/5082708 class LazyDict(Mapping[str, T], Generic[T]): - def __init__(self, factory: Dict[str, Callable[[], T]]): + def __init__(self, factory: dict[str, Callable[[], T]]): self._factory = factory - self._dict: Dict[str, T] = {} + self._dict: dict[str, T] = {} def __getitem__(self, key: str) -> T: if key not in self._dict: @@ -1552,9 +1552,9 @@ class LazyDict(Mapping[str, T], Generic[T]): return len(self._factory) -class ClassRegistry(UserDict[Type[T], _V]): +class ClassRegistry(UserDict[type[T], _V]): - def __getitem__(self, key: Type[T]) -> _V: + def __getitem__(self, key: type[T]) -> _V: for cls in key.mro(): if cls in self.data: return self.data[cls] @@ -1584,8 +1584,8 @@ def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: def weak_ref_tensors( - tensors: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]] -) -> Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]: + tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]] +) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]]: """ Convenience function to create weak references to tensors, for single tensor, list of tensors or tuple of tensors. 
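The ClassRegistry hunk above is representative of the Type[T] to type[T] rewrite: the builtin type is used directly as a generic, including as the key type of a UserDict. A small sketch of the same MRO-walking lookup, with a hypothetical MroRegistry standing in for vLLM's class, just to show the lowercase form in context:

from collections import UserDict

class MroRegistry(UserDict[type, str]):
    # Hypothetical stand-in for ClassRegistry: resolve a class to the entry
    # registered for it or for the nearest base class in its MRO.
    def __getitem__(self, key: type) -> str:
        for cls in key.mro():
            if cls in self.data:
                return self.data[cls]
        raise KeyError(key)

class Base: ...
class Child(Base): ...

reg = MroRegistry({Base: "base-handler"})
assert reg[Child] == "base-handler"  # falls back to the Base entry via the MRO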
@@ -1857,7 +1857,7 @@ vllm_lib = Library("vllm", "FRAGMENT") # noqa def direct_register_custom_op( op_name: str, op_func: Callable, - mutates_args: List[str], + mutates_args: list[str], fake_impl: Optional[Callable] = None, target_lib: Optional[Library] = None, dispatch_key: str = "CUDA", @@ -2177,8 +2177,8 @@ def get_mp_context(): def bind_kv_cache( - ctx: Dict[str, Any], - kv_cache: List[List[torch.Tensor]], # [virtual_engine][layer_index] + ctx: dict[str, Any], + kv_cache: list[list[torch.Tensor]], # [virtual_engine][layer_index] ) -> None: # Bind the kv_cache tensor to Attention modules, similar to # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)] @@ -2210,8 +2210,8 @@ def bind_kv_cache( forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] -def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], - kwargs: Dict[str, Any]) -> Any: +def run_method(obj: Any, method: Union[str, bytes, Callable], args: tuple[Any], + kwargs: dict[str, Any]) -> Any: """ Run a method of an object with the given arguments and keyword arguments. If the method is string, it will be converted to a method using getattr. @@ -2263,7 +2263,7 @@ def import_pynvml(): return pynvml -def warn_for_unimplemented_methods(cls: Type[T]) -> Type[T]: +def warn_for_unimplemented_methods(cls: type[T]) -> type[T]: """ A replacement for `abc.ABC`. When we use `abc.ABC`, subclasses will fail to instantiate diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 353bf46d..8bf7f358 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Optional import numpy as np import torch @@ -30,7 +30,7 @@ class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod @@ -38,15 +38,15 @@ class FlashAttentionBackend(AttentionBackend): return "FLASH_ATTN_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["FlashAttentionImpl"]: + def get_impl_cls() -> type["FlashAttentionImpl"]: return FlashAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod - def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: + def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: return FlashAttentionMetadataBuilder @staticmethod @@ -55,7 +55,7 @@ class FlashAttentionBackend(AttentionBackend): block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -158,10 +158,10 @@ class FlashAttentionImpl(AttentionImpl): head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, ) -> 
None: @@ -381,7 +381,7 @@ def cascade_attention( max_kv_len: int, softmax_scale: float, alibi_slopes: Optional[torch.Tensor], - sliding_window: Tuple[int, int], + sliding_window: tuple[int, int], logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 30bce5cc..824ffcfd 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -195,8 +195,7 @@ return curr_o @ W_O import functools from abc import abstractmethod from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple, - Type, TypeVar) +from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar import torch from compressed_tensors.quantization import QuantizationStrategy @@ -250,11 +249,11 @@ class MLACommonBackend(AttentionBackend): return "TRITON_MLA_VLLM_V1" @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return MLACommonMetadata @staticmethod - def get_builder_cls() -> Type["MLACommonMetadataBuilder"]: + def get_builder_cls() -> type["MLACommonMetadataBuilder"]: return MLACommonMetadataBuilder @staticmethod @@ -263,11 +262,11 @@ class MLACommonBackend(AttentionBackend): block_size: int, num_kv_heads: int, # assumed to be 1 for MLA head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_blocks, block_size, head_size) @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [576] @staticmethod @@ -317,8 +316,8 @@ class MLACommonMetadata: has_context: bool = False context_chunk_cu_seq_lens: Optional[torch.Tensor] = None context_chunk_starts: Optional[torch.Tensor] = None - context_chunk_seq_tot: Optional[List[int]] = None - context_chunk_max_seq_lens: Optional[List[int]] = None + context_chunk_seq_tot: Optional[list[int]] = None + context_chunk_max_seq_lens: Optional[list[int]] = None chunked_prefill_workspace: Optional[torch.Tensor] = None def __post_init__(self): @@ -538,10 +537,10 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments @@ -634,7 +633,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): # # returns input_group_shape, weight_group_shape def get_scale_group_shapes_for_fp8(layer: LinearBase) -> \ - Tuple[Tuple[int, int], Tuple[int, int]]: + tuple[tuple[int, int], tuple[int, int]]: if isinstance(layer.quant_method, Fp8LinearMethod): if layer.quant_method.block_quant: weight_block_size = \ diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 8a7b7b97..b357d714 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch @@ -25,21 +25,21 @@ class FlashMLABackend(MLACommonBackend): return "FLASHMLA_VLLM_V1" @staticmethod - def get_metadata_cls() -> Type["FlashMLAMetadata"]: + def get_metadata_cls() -> 
type["FlashMLAMetadata"]: return FlashMLAMetadata @staticmethod - def get_builder_cls() -> Type["FlashMLAMetadataBuilder"]: + def get_builder_cls() -> type["FlashMLAMetadataBuilder"]: return FlashMLAMetadataBuilder @staticmethod - def get_impl_cls() -> Type["FlashMLAImpl"]: + def get_impl_cls() -> type["FlashMLAImpl"]: return FlashMLAImpl @dataclass class FlashMLAMetadata(MLACommonMetadata): - decode_tile_scheduler_metadata: Optional[Tuple[torch.Tensor, + decode_tile_scheduler_metadata: Optional[tuple[torch.Tensor, torch.Tensor]] = None decode_num_splits: Optional[torch.Tensor] = None @@ -76,10 +76,10 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 7747509f..3f9b349a 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import torch @@ -21,7 +21,7 @@ class TritonMLABackend(MLACommonBackend): return "TRITON_MLA_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["TritonMLAImpl"]: + def get_impl_cls() -> type["TritonMLAImpl"]: return TritonMLAImpl @@ -33,10 +33,10 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], + blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, # MLA Specific Arguments diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index a9f7b3fd..bf4a05da 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch # Required to register custom ops. 
@@ -22,15 +22,15 @@ class PallasAttentionBackend(AttentionBackend): return "PALLAS_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: + def get_impl_cls() -> type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl @staticmethod - def get_metadata_cls() -> Type["PallasMetadata"]: + def get_metadata_cls() -> type["PallasMetadata"]: return PallasMetadata @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: + def get_state_cls() -> type["CommonAttentionState"]: return CommonAttentionState @staticmethod @@ -39,7 +39,7 @@ class PallasAttentionBackend(AttentionBackend): block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (num_kv_heads, num_blocks, block_size, head_size) @staticmethod @@ -77,10 +77,10 @@ class PallasAttentionBackendImpl(AttentionImpl): head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, ) -> None: @@ -120,7 +120,7 @@ class PallasAttentionBackendImpl(AttentionImpl): query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], + kv_cache: tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 5c7d759b..a625d99f 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with PagedAttention on rocm""" -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Optional import torch @@ -20,7 +20,7 @@ class ROCmAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @staticmethod - def get_supported_head_sizes() -> List[int]: + def get_supported_head_sizes() -> list[int]: return [32, 64, 96, 128, 160, 192, 224, 256] @staticmethod @@ -28,11 +28,11 @@ class ROCmAttentionBackend(AttentionBackend): return "ROCM_ATTN_VLLM_V1" @staticmethod - def get_impl_cls() -> Type["ROCmAttentionImpl"]: + def get_impl_cls() -> type["ROCmAttentionImpl"]: return ROCmAttentionImpl @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: + def get_metadata_cls() -> type["AttentionMetadata"]: return FlashAttentionMetadata @staticmethod @@ -41,7 +41,7 @@ class ROCmAttentionBackend(AttentionBackend): block_size: int, num_kv_heads: int, head_size: int, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if block_size % 16 != 0: raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) @@ -51,7 +51,7 @@ class ROCmAttentionBackend(AttentionBackend): return False @staticmethod - def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: + def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: return FlashAttentionMetadataBuilder @@ -63,10 +63,10 @@ class ROCmAttentionImpl(AttentionImpl): head_size: int, scale: float, num_kv_heads: int, - alibi_slopes: Optional[List[float]], + alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, + blocksparse_params: 
Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, ) -> None: diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 1b5c7f96..394b47fd 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections import defaultdict -from typing import Dict, Iterable, List, Optional +from collections.abc import Iterable +from typing import Optional from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, @@ -29,7 +30,7 @@ class BlockPool: self.num_gpu_blocks = num_gpu_blocks self.enable_caching = enable_caching # All kv-cache blocks. - self.blocks: List[KVCacheBlock] = [ + self.blocks: list[KVCacheBlock] = [ KVCacheBlock(idx) for idx in range(num_gpu_blocks) ] # Free block queue that constructs and manipulates a doubly linked @@ -46,7 +47,7 @@ class BlockPool: # if there is already an identical block in the cache. This is because # we want to make sure the allocated block IDs won't change so that # block tables are append-only. - self.cached_block_hash_to_block: Dict[BlockHashType, Dict[ + self.cached_block_hash_to_block: dict[BlockHashType, dict[ int, KVCacheBlock]] = defaultdict(dict) def get_cached_block(self, @@ -69,8 +70,8 @@ class BlockPool: def cache_full_blocks( self, request: Request, - blocks: List[KVCacheBlock], - block_hashes: List[BlockHashType], + blocks: list[KVCacheBlock], + block_hashes: list[BlockHashType], num_cached_blocks: int, num_full_blocks: int, block_size: int, @@ -146,7 +147,7 @@ class BlockPool: self.cached_block_hash_to_block[block_hash][blk.block_id] = blk prev_block_hash_value = block_hash.hash_value - def get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: + def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]: """Get new blocks from the free block pool. Note that we do not check block cache in this function. @@ -161,7 +162,7 @@ class BlockPool: raise ValueError( f"Cannot get {num_blocks} free blocks from the pool") - ret: List[KVCacheBlock] = [] + ret: list[KVCacheBlock] = [] idx = 0 while idx < num_blocks: # First allocate blocks. @@ -200,7 +201,7 @@ class BlockPool: return True return False - def touch(self, blocks: List[KVCacheBlock]) -> None: + def touch(self, blocks: list[KVCacheBlock]) -> None: """Touch a block increases its reference count by 1, and may remove the block from the free queue. This is used when a block is hit by another request with the same prefix. 
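The BlockPool diff that ends here leans on the fact that, from Python 3.9, the concrete collection classes such as defaultdict accept subscripts directly, so the DefaultDict/Dict/List aliases can be dropped. A minimal sketch of the same nested-annotation pattern, with made-up string placeholders instead of KVCacheBlock objects:

from collections import defaultdict

# Mirrors the cached_block_hash_to_block annotation above: nested builtin
# generics on a defaultdict, no typing imports required.
cached_block_hash_to_block: defaultdict[str, dict[int, str]] = defaultdict(dict)

cached_block_hash_to_block["hash-a"][0] = "block-0"
cached_block_hash_to_block["hash-a"][1] = "block-1"

blocks: list[str] = [
    block for per_hash in cached_block_hash_to_block.values()
    for block in per_hash.values()
]
assert blocks == ["block-0", "block-1"]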
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 13ad14e4..018379c1 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import TYPE_CHECKING, Dict, List, Set, Tuple +from typing import TYPE_CHECKING from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY @@ -18,9 +18,9 @@ class EncoderCacheManager: self.cache_size = cache_size self.num_free_slots = cache_size # req_id -> cached input ids - self.cached: Dict[str, Set[int]] = {} - # List of [req_id, input_id] - self.freed: List[Tuple[str, int]] = [] + self.cached: dict[str, set[int]] = {} + # list of [req_id, input_id] + self.freed: list[tuple[str, int]] = [] def has_cache(self, request: Request, input_id: int) -> bool: req_id = request.request_id @@ -37,7 +37,7 @@ class EncoderCacheManager: self.cached[req_id].add(input_id) self.num_free_slots -= request.get_num_encoder_tokens(input_id) - def get_cached_input_ids(self, request: Request) -> Set[int]: + def get_cached_input_ids(self, request: Request) -> set[int]: return self.cached.get(request.request_id, set()) def free_encoder_input(self, request: Request, input_id: int) -> None: @@ -58,7 +58,7 @@ class EncoderCacheManager: for input_id in input_ids: self.free_encoder_input(request, input_id) - def get_freed_ids(self) -> List[Tuple[str, int]]: + def get_freed_ids(self) -> list[tuple[str, int]]: freed = self.freed self.freed = [] return freed @@ -67,7 +67,7 @@ class EncoderCacheManager: def compute_encoder_budget( model_config: "ModelConfig", scheduler_config: "SchedulerConfig", -) -> Tuple[int, int]: +) -> tuple[int, int]: """Compute the encoder cache budget based on the model and scheduler configurations. @@ -97,7 +97,7 @@ def compute_encoder_budget( def _compute_encoder_budget_multimodal( model_config: "ModelConfig", scheduler_config: "SchedulerConfig", -) -> Tuple[int, int]: +) -> tuple[int, int]: """Compute the encoder cache budget based on the model and scheduler configurations for a multimodal model. diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 030574de..6c6be01a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 from collections import defaultdict -from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Optional from vllm.logger import init_logger from vllm.utils import cdiv @@ -52,20 +53,20 @@ class KVCacheManager: # Mapping from request ID to blocks to track the blocks allocated # for each request, so that we can free the blocks when the request # is finished. - self.req_to_blocks: DefaultDict[str, - List[KVCacheBlock]] = defaultdict(list) + self.req_to_blocks: defaultdict[str, + list[KVCacheBlock]] = defaultdict(list) # Mapping from request ID to kv block hashes. # This is to avoid recomputing the block hashes for each call of # `get_computed_blocks` or `allocate_slots`. - self.req_to_block_hashes: DefaultDict[ - str, List[BlockHashType]] = defaultdict(list) + self.req_to_block_hashes: defaultdict[ + str, list[BlockHashType]] = defaultdict(list) # {req_id: The number of cached blocks for this given request} # This is used to track the number of cached blocks for each request. # This is only used to track the RUNNING requests, we do not track the # data for reempted ones. 
- self.num_cached_block: Dict[str, int] = {} + self.num_cached_block: dict[str, int] = {} self.prefix_cache_stats = PrefixCacheStats() @property @@ -88,7 +89,7 @@ class KVCacheManager: return stats def get_computed_blocks( - self, request: Request) -> Tuple[List[KVCacheBlock], int]: + self, request: Request) -> tuple[list[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. Note that the computed blocks must be full. @@ -136,8 +137,8 @@ class KVCacheManager: self, request: Request, num_tokens: int, - new_computed_blocks: Optional[List[KVCacheBlock]] = None - ) -> Optional[List[KVCacheBlock]]: + new_computed_blocks: Optional[list[KVCacheBlock]] = None + ) -> Optional[list[KVCacheBlock]]: """Add slots for a request with new tokens to append. Args: diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 546fddf6..adadcab5 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -3,7 +3,7 @@ from collections import deque from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, List, NamedTuple, Optional, Tuple +from typing import Any, NamedTuple, Optional from vllm.config import VllmConfig from vllm.logger import init_logger @@ -25,7 +25,7 @@ class BlockHashType(NamedTuple): # Hash value of the block in an integer. hash_value: int # Token IDs in the block. - token_ids: Tuple[int, ...] + token_ids: tuple[int, ...] # Extra keys for the block. extra_keys: Optional[Any] = None @@ -45,7 +45,7 @@ class PrefixCachingMetrics: self.aggregated_query_total = 0 self.aggregated_query_hit = 0 # A deque of (requests, queries, hits) for the most recent requests. - self.query_queue: deque[Tuple[int, int, int]] = deque() + self.query_queue: deque[tuple[int, int, int]] = deque() def observe(self, stats: PrefixCacheStats): """Observe the prefix caching for a set of requests. @@ -164,7 +164,7 @@ class FreeKVCacheBlockQueue: blocks: A list of KVCacheBlock objects. """ - def __init__(self, blocks: List[KVCacheBlock]) -> None: + def __init__(self, blocks: list[KVCacheBlock]) -> None: self.num_free_blocks = len(blocks) # Initialize the doubly linked list of free blocks. @@ -233,7 +233,7 @@ class FreeKVCacheBlockQueue: block.next_free_block = None self.num_free_blocks += 1 - def get_all_free_blocks(self) -> List[KVCacheBlock]: + def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. Returns: @@ -264,7 +264,7 @@ def need_extra_keys(request: Request) -> bool: def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, end_token_idx: int, - start_mm_idx: int) -> Tuple[List[Any], int]: + start_mm_idx: int) -> tuple[list[Any], int]: """Generate extra keys related to MultiModal request for block hash computation. For multi-modal inputs, the extra keys are (mm_hash, start_offset) that indicate a mm input contained in the @@ -279,7 +279,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, Returns: A tuple of extra keys and the next multi-modal index. """ - extra_keys: List[Any] = [] + extra_keys: list[Any] = [] mm_positions, mm_hashes = request.mm_positions, request.mm_hashes if not mm_positions: @@ -331,7 +331,7 @@ def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, return extra_keys, curr_mm_idx -def _gen_lora_extra_hash_keys(request: Request) -> List[int]: +def _gen_lora_extra_hash_keys(request: Request) -> list[int]: """Generate extra keys related to LoRA for block hash computation. 
Args: @@ -348,7 +348,7 @@ def _gen_lora_extra_hash_keys(request: Request) -> List[int]: def generate_block_hash_extra_keys( request: Request, start_token_idx: int, end_token_idx: int, - start_mm_idx: int) -> Tuple[Optional[Tuple[Any, ...]], int]: + start_mm_idx: int) -> tuple[Optional[tuple[Any, ...]], int]: """Generate extra keys for the block hash. The extra keys can come from the multi-modal inputs and request specific metadata (e.g., LoRA ID). @@ -361,12 +361,12 @@ def generate_block_hash_extra_keys( Returns: A tuple of extra keys and the next multi-modal index. """ - mm_extra_keys: List[Any] + mm_extra_keys: list[Any] mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys( request, start_token_idx, end_token_idx, start_mm_idx) - lora_extra_keys: List[int] = _gen_lora_extra_hash_keys(request) + lora_extra_keys: list[int] = _gen_lora_extra_hash_keys(request) - extra_keys: List[Any] = lora_extra_keys + mm_extra_keys + extra_keys: list[Any] = lora_extra_keys + mm_extra_keys if not extra_keys: return None, new_start_mm_idx @@ -377,7 +377,7 @@ def generate_block_hash_extra_keys( def hash_block_tokens( parent_block_hash: Optional[int], curr_block_token_ids: Sequence[int], - extra_keys: Optional[Tuple[Any, ...]] = None) -> BlockHashType: + extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. We use LRU cache for this function to avoid recomputing @@ -410,7 +410,7 @@ def hash_block_tokens( def hash_request_tokens(block_size: int, - request: Request) -> List[BlockHashType]: + request: Request) -> list[BlockHashType]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. @@ -554,8 +554,8 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, def get_kv_cache_configs(vllm_config: VllmConfig, - kv_cache_specs: List[KVCacheSpec], - available_memory: int) -> List[KVCacheConfig]: + kv_cache_specs: list[KVCacheSpec], + available_memory: int) -> list[KVCacheConfig]: """ Generates the KV cache configuration for a model TODO: support hybrid models with more than one type of KV cache. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 87c9c0cd..db14c945 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -2,7 +2,8 @@ import time from collections import deque -from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig, SpeculativeConfig) @@ -57,24 +58,24 @@ class Scheduler: self.block_size = self.cache_config.block_size # req_id -> Request - self.requests: Dict[str, Request] = {} + self.requests: dict[str, Request] = {} # Priority queues for requests. - self.waiting: Deque[Request] = deque() - self.running: List[Request] = [] + self.waiting: deque[Request] = deque() + self.running: list[Request] = [] # The requests that have been scheduled and are being executed # by the executor. - self.scheduled_req_ids: Set[str] = set() + self.scheduled_req_ids: set[str] = set() # The request IDs that are finished in between the previous and the # current steps. This is used to notify the workers about the finished # requests so that they can free the cached states for those requests. # This is flushed at the end of each scheduling step. 
- self.finished_req_ids: Set[str] = set() + self.finished_req_ids: set[str] = set() # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. # Request id -> CachedRequestData - self._cached_reqs_data: Dict[str, CachedRequestData] = {} + self._cached_reqs_data: dict[str, CachedRequestData] = {} # Encoder-related. # Calculate encoder cache size if applicable @@ -108,19 +109,19 @@ class Scheduler: # chunked prefills, prefix caching, speculative decoding, # and the "jump decoding" optimization in the future. - scheduled_new_reqs: List[Request] = [] - scheduled_resumed_reqs: List[Request] = [] - scheduled_running_reqs: List[Request] = [] - preempted_reqs: List[Request] = [] + scheduled_new_reqs: list[Request] = [] + scheduled_resumed_reqs: list[Request] = [] + scheduled_running_reqs: list[Request] = [] + preempted_reqs: list[Request] = [] - req_to_new_block_ids: Dict[str, List[int]] = {} - num_scheduled_tokens: Dict[str, int] = {} + req_to_new_block_ids: dict[str, list[int]] = {} + num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. - scheduled_encoder_inputs: Dict[str, List[int]] = {} + scheduled_encoder_inputs: dict[str, list[int]] = {} encoder_budget = self.max_num_encoder_input_tokens # Spec decode-related. - scheduled_spec_decode_tokens: Dict[str, List[int]] = {} + scheduled_spec_decode_tokens: dict[str, list[int]] = {} # For logging. scheduled_timestamp = time.monotonic() @@ -211,7 +212,7 @@ class Scheduler: encoder_budget = new_encoder_budget # Record the LoRAs in scheduled_running_reqs - requested_loras: Set[int] = set() + requested_loras: set[int] = set() if self.lora_config: requested_loras = set( req.lora_request.lora_int_id for req in scheduled_running_reqs @@ -378,7 +379,7 @@ class Scheduler: request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: List[int], + new_block_ids: list[int], resumed_from_preemption: bool, ) -> "CachedRequestData": # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -407,7 +408,7 @@ class Scheduler: num_computed_tokens: int, num_new_tokens: int, encoder_budget: int, - ) -> Tuple[List[int], int, int]: + ) -> tuple[list[int], int, int]: """ Determine which encoder inputs need to be scheduled in the current step, and update `num_new_tokens` and encoder token budget accordingly. @@ -427,7 +428,7 @@ class Scheduler: if not request.has_encoder_inputs(): return [], num_new_tokens, encoder_budget - encoder_inputs_to_schedule: List[int] = [] + encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions assert mm_positions is not None assert len(mm_positions) > 0 @@ -482,8 +483,8 @@ class Scheduler: prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict num_scheduled_tokens = scheduler_output.num_scheduled_tokens - new_running: List[Request] = [] - outputs: List[EngineCoreOutput] = [] + new_running: list[Request] = [] + outputs: list[EngineCoreOutput] = [] # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below # loop can be a performance bottleneck. 
We should do our best to avoid @@ -543,7 +544,7 @@ class Scheduler: stopped = False new_logprobs = None - new_token_ids: List[int] = [] + new_token_ids: list[int] = [] if request.num_computed_tokens >= request.num_tokens: for output_token_id in generated_token_ids: diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py index 47413527..b6caa8b4 100644 --- a/vllm/v1/core/scheduler_output.py +++ b/vllm/v1/core/scheduler_output.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from vllm.lora.request import LoRARequest @@ -15,13 +15,13 @@ if TYPE_CHECKING: class NewRequestData: req_id: str - prompt_token_ids: List[int] + prompt_token_ids: list[int] prompt: Optional[str] - mm_inputs: List["MultiModalKwargs"] - mm_hashes: List[str] - mm_positions: List["PlaceholderRange"] + mm_inputs: list["MultiModalKwargs"] + mm_hashes: list[str] + mm_positions: list["PlaceholderRange"] sampling_params: "SamplingParams" - block_ids: List[int] + block_ids: list[int] num_computed_tokens: int lora_request: Optional["LoRARequest"] @@ -29,7 +29,7 @@ class NewRequestData: def from_request( cls, request: "Request", - block_ids: List[int], + block_ids: list[int], ) -> "NewRequestData": return cls( req_id=request.request_id, @@ -53,8 +53,8 @@ class CachedRequestData: # the request's block IDs. If True, new_block_ids will be used as the # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool - new_token_ids: List[int] - new_block_ids: List[int] + new_token_ids: list[int] + new_block_ids: list[int] num_computed_tokens: int @classmethod @@ -62,8 +62,8 @@ class CachedRequestData: cls, request: "Request", resumed_from_preemption: bool, - new_token_ids: List[int], - new_block_ids: List[int], + new_token_ids: list[int], + new_block_ids: list[int], ) -> "CachedRequestData": return cls( req_id=request.request_id, @@ -77,29 +77,29 @@ class CachedRequestData: @dataclass class SchedulerOutput: - # List of the requests that are scheduled for the first time. + # list of the requests that are scheduled for the first time. # We cache the request's data in each worker process, so that we don't # need to re-send it every scheduling step. - scheduled_new_reqs: List[NewRequestData] - # List of the requests that have been scheduled before. + scheduled_new_reqs: list[NewRequestData] + # list of the requests that have been scheduled before. # Since the request's data is already cached in the worker processes, # we only send the diff to minimize the communication cost. - scheduled_cached_reqs: List[CachedRequestData] + scheduled_cached_reqs: list[CachedRequestData] # req_id -> num_scheduled_tokens # Number of tokens scheduled for each request. - num_scheduled_tokens: Dict[str, int] + num_scheduled_tokens: dict[str, int] # Total number of tokens scheduled for all requests. # Equal to sum(num_scheduled_tokens.values()) total_num_scheduled_tokens: int # req_id -> spec_token_ids # If a request does not have any spec decode tokens, it will not be # included in the dictionary. - scheduled_spec_decode_tokens: Dict[str, List[int]] + scheduled_spec_decode_tokens: dict[str, list[int]] # req_id -> encoder input indices that need processing. # E.g., if a request has [0, 1], it could mean the vision encoder needs # to process that the request's 0-th and 1-th images in the current step. 
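
Note: the scheduler-output hunks above are representative of the whole patch: `List`/`Dict`/`Set`/`Tuple` from `typing` are replaced by the built-in generics standardized in PEP 585, which are subscriptable at runtime from Python 3.9 onward. A small self-contained illustration follows; `ToyRequestData` is invented for this note and is not part of vLLM.

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ToyRequestData:
    # Built-in generics replace typing.List / typing.Dict / typing.Set.
    req_id: str
    prompt_token_ids: list[int]
    block_ids: list[int] = field(default_factory=list)
    num_scheduled_tokens: dict[str, int] = field(default_factory=dict)
    finished_req_ids: set[str] = field(default_factory=set)
    free_encoder_input_ids: list[tuple[str, int]] = field(default_factory=list)
    lora_id: Optional[int] = None  # Optional still comes from typing


req = ToyRequestData(req_id="r0", prompt_token_ids=[1, 2, 3])
req.num_scheduled_tokens["r0"] = 3
print(ToyRequestData.__annotations__["prompt_token_ids"])  # list[int]
```

`Optional` and `Union` are left on `typing` throughout the patch; the `X | None` syntax would additionally require Python 3.10 at runtime.
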
- scheduled_encoder_inputs: Dict[str, List[int]] + scheduled_encoder_inputs: dict[str, list[int]] # Number of common prefix blocks for all requests. # This can be used for cascade attention. num_common_prefix_blocks: int @@ -107,7 +107,7 @@ class SchedulerOutput: # Request IDs that are finished in between the previous and the current # steps. This is used to notify the workers about the finished requests # so that they can free the cached states for those requests. - finished_req_ids: Set[str] - # List of (req_id, encoder_input_index) tuples. + finished_req_ids: set[str] + # list of (req_id, encoder_input_index) tuples. # Used to free the encoder cache. - free_encoder_input_ids: List[Tuple[str, int]] + free_encoder_input_ids: list[tuple[str, int]] diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 32fb3c5b..cd29c2d7 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -2,7 +2,7 @@ import enum import time -from typing import Any, List, Optional, Union +from typing import Any, Optional, Union import msgspec @@ -51,10 +51,10 @@ class EngineCoreRequest( # NOTE(ywang96): original text prompt is needed when a request is added to # Detokenizer, but set to None when it is added to EngineCoreClient. prompt: Optional[str] - prompt_token_ids: List[int] - mm_inputs: Optional[List[Optional[MultiModalKwargs]]] - mm_hashes: Optional[List[str]] - mm_placeholders: Optional[List[PlaceholderRange]] + prompt_token_ids: list[int] + mm_inputs: Optional[list[Optional[MultiModalKwargs]]] + mm_hashes: Optional[list[str]] + mm_placeholders: Optional[list[PlaceholderRange]] sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float @@ -93,14 +93,14 @@ class EngineCoreOutput( gc=False): # type: ignore[call-arg] request_id: str - new_token_ids: List[int] + new_token_ids: list[int] new_logprobs: Optional[LogprobsLists] = None new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None - events: Optional[List[EngineCoreEvent]] = None + events: Optional[list[EngineCoreEvent]] = None @property def finished(self) -> bool: @@ -129,7 +129,7 @@ class EngineCoreOutputs( # e.g. 
columnwise layout # [num_reqs] - outputs: List[EngineCoreOutput] = [] + outputs: list[EngineCoreOutput] = [] scheduler_stats: Optional[SchedulerStats] = None timestamp: float = 0.0 diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0c04e14c..ab3cdc4e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,8 @@ import asyncio import os -from typing import AsyncGenerator, List, Mapping, Optional, Set, Type, Union +from collections.abc import AsyncGenerator, Mapping +from typing import Optional, Union import numpy as np @@ -39,7 +40,7 @@ class AsyncLLM(EngineClient): def __init__( self, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, input_registry: InputRegistry = INPUT_REGISTRY, @@ -54,7 +55,7 @@ class AsyncLLM(EngineClient): self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers: List[StatLoggerBase] = [] + self.stat_loggers: list[StatLoggerBase] = [] if self.log_stats: self.stat_loggers.extend([ LoggingStatLogger(), @@ -400,7 +401,7 @@ class AsyncLLM(EngineClient): """Remove an already loaded LoRA adapter.""" return await self.engine_core.remove_lora_async(lora_id) - async def list_loras(self) -> Set[int]: + async def list_loras(self) -> set[int]: """List all registered adapters.""" return await self.engine_core.list_loras_async() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 041896f1..b9bf8fac 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,7 +7,7 @@ import time from concurrent.futures import Future from inspect import isclass, signature from multiprocessing.connection import Connection -from typing import Any, List, Optional, Set, Tuple, Type +from typing import Any, Optional import msgspec import psutil @@ -42,7 +42,7 @@ class EngineCore: def __init__( self, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ): assert vllm_config.model_config.runner_type != "pooling" @@ -80,7 +80,7 @@ class EngineCore: # schedule and execute batches, and is required by pipeline parallelism # to eliminate pipeline bubbles. 
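
Note: `EngineCoreOutput`/`EngineCoreOutputs` above are `msgspec` Structs, so the converted annotations also act as the wire schema between the engine-core process and its client. A minimal round-trip sketch with an invented `ToyOutput` struct (not vLLM's actual schema), assuming only that `msgspec` is available:

```python
from typing import Optional, Union

import msgspec


class ToyOutput(msgspec.Struct, array_like=True, omit_defaults=True):
    # Built-in generics are used directly in the schema annotations.
    request_id: str
    new_token_ids: list[int]
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None


buf = msgspec.msgpack.encode(ToyOutput("req-0", [11, 12, 13]))
decoded = msgspec.msgpack.decode(buf, type=ToyOutput)
assert decoded.new_token_ids == [11, 12, 13]
```
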
self.batch_queue_size = self.model_executor.max_concurrent_batches - self.batch_queue: Optional[queue.Queue[Tuple[Future[ModelRunnerOutput], + self.batch_queue: Optional[queue.Queue[tuple[Future[ModelRunnerOutput], SchedulerOutput]]] = None if self.batch_queue_size > 1: logger.info("Batch queue is enabled with size %d", @@ -88,7 +88,7 @@ class EngineCore: self.batch_queue = queue.Queue(self.batch_queue_size) def _initialize_kv_caches(self, - vllm_config: VllmConfig) -> Tuple[int, int]: + vllm_config: VllmConfig) -> tuple[int, int]: start = time.time() # Get all kv cache needed by the model @@ -134,7 +134,7 @@ class EngineCore: self.scheduler.add_request(req) - def abort_requests(self, request_ids: List[str]): + def abort_requests(self, request_ids: list[str]): """Abort requests from the scheduler.""" # TODO: The scheduler doesn't really need to know the @@ -228,7 +228,7 @@ class EngineCore: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_executor.list_loras() def pin_lora(self, lora_id: int) -> bool: @@ -244,7 +244,7 @@ class EngineCoreProc(EngineCore): output_path: str, ready_pipe: Connection, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ): super().__init__(vllm_config, executor_class, log_stats) @@ -254,7 +254,7 @@ class EngineCoreProc(EngineCore): # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue: queue.Queue[Tuple[EngineCoreRequestType, + self.input_queue: queue.Queue[tuple[EngineCoreRequestType, Any]] = queue.Queue() self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() threading.Thread(target=self.process_input_socket, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 9f36e11d..cdce14af 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -10,7 +10,7 @@ from abc import ABC, abstractmethod from concurrent.futures import Future from dataclasses import dataclass from threading import Thread -from typing import Any, Dict, List, Optional, Set, Type, Union +from typing import Any, Optional, Union import zmq import zmq.asyncio @@ -48,7 +48,7 @@ class EngineCoreClient(ABC): multiprocess_mode: bool, asyncio_mode: bool, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ) -> "EngineCoreClient": @@ -94,7 +94,7 @@ class EngineCoreClient(ABC): async def execute_dummy_batch_async(self) -> None: raise NotImplementedError - def abort_requests(self, request_ids: List[str]) -> None: + def abort_requests(self, request_ids: list[str]) -> None: raise NotImplementedError def add_lora(self, lora_request: LoRARequest) -> bool: @@ -103,7 +103,7 @@ class EngineCoreClient(ABC): def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: raise NotImplementedError def pin_lora(self, lora_id: int) -> bool: @@ -127,7 +127,7 @@ class EngineCoreClient(ABC): async def wake_up_async(self) -> None: raise NotImplementedError - async def abort_requests_async(self, request_ids: List[str]) -> None: + async def abort_requests_async(self, request_ids: list[str]) -> None: raise NotImplementedError async def add_lora_async(self, lora_request: LoRARequest) -> bool: @@ -136,7 +136,7 @@ class 
EngineCoreClient(ABC): async def remove_lora_async(self, lora_id: int) -> bool: raise NotImplementedError - async def list_loras_async(self) -> Set[int]: + async def list_loras_async(self) -> set[int]: raise NotImplementedError async def pin_lora_async(self, lora_id: int) -> bool: @@ -162,7 +162,7 @@ class InprocClient(EngineCoreClient): def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) - def abort_requests(self, request_ids: List[str]) -> None: + def abort_requests(self, request_ids: list[str]) -> None: if len(request_ids) > 0: self.engine_core.abort_requests(request_ids) @@ -190,7 +190,7 @@ class InprocClient(EngineCoreClient): def remove_lora(self, lora_id: int) -> bool: return self.engine_core.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.engine_core.list_loras() def pin_lora(self, lora_id: int) -> bool: @@ -239,7 +239,7 @@ class MPClient(EngineCoreClient): self, asyncio_mode: bool, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, ): # The child processes will send SIGUSR1 when unrecoverable @@ -293,14 +293,14 @@ class MPClient(EngineCoreClient): self.output_socket = resources.output_socket self.input_socket = resources.input_socket - self.utility_results: Dict[int, AnyFuture] = {} + self.utility_results: dict[int, AnyFuture] = {} def shutdown(self): self._finalizer() def _process_utility_output(output: UtilityOutput, - utility_results: Dict[int, AnyFuture]): + utility_results: dict[int, AnyFuture]): """Set the result from a utility method in the waiting future""" future = utility_results.pop(output.call_id) if output.failure_message is not None: @@ -312,7 +312,7 @@ def _process_utility_output(output: UtilityOutput, class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], + def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): super().__init__( asyncio_mode=False, @@ -373,7 +373,7 @@ class SyncMPClient(MPClient): request.prompt = None self._send_input(EngineCoreRequestType.ADD, request) - def abort_requests(self, request_ids: List[str]) -> None: + def abort_requests(self, request_ids: list[str]) -> None: if len(request_ids) > 0: self._send_input(EngineCoreRequestType.ABORT, request_ids) @@ -389,7 +389,7 @@ class SyncMPClient(MPClient): def remove_lora(self, lora_id: int) -> bool: return self._call_utility("remove_lora", lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self._call_utility("list_loras") def pin_lora(self, lora_id: int) -> bool: @@ -408,7 +408,7 @@ class SyncMPClient(MPClient): class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], + def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): super().__init__( asyncio_mode=True, @@ -471,7 +471,7 @@ class AsyncMPClient(MPClient): request.prompt = None await self._send_input(EngineCoreRequestType.ADD, request) - async def abort_requests_async(self, request_ids: List[str]) -> None: + async def abort_requests_async(self, request_ids: list[str]) -> None: if len(request_ids) > 0: await self._send_input(EngineCoreRequestType.ABORT, request_ids) @@ -496,7 +496,7 @@ class AsyncMPClient(MPClient): async def remove_lora_async(self, lora_id: int) 
-> bool: return await self._call_utility_async("remove_lora", lora_id) - async def list_loras_async(self) -> Set[int]: + async def list_loras_async(self) -> set[int]: return await self._call_utility_async("list_loras") async def pin_lora_async(self, lora_id: int) -> bool: diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 629da06f..4a1636f4 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import List, Optional +from typing import Optional from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -17,12 +17,12 @@ class IncrementalDetokenizer: # Generation data output_text: str - tokens: List[str] - token_ids: List[int] + tokens: list[str] + token_ids: list[int] prompt_len: int # Stop strings - stop: List[str] + stop: list[str] include_stop_str_in_output: bool # Metadata for incremental detokenization @@ -41,7 +41,7 @@ class IncrementalDetokenizer: _last_output_text_offset: int = 0 @property - def output_token_ids(self) -> List[int]: + def output_token_ids(self) -> list[int]: return self.token_ids[self.prompt_len:] @classmethod @@ -84,7 +84,7 @@ class IncrementalDetokenizer: stop_buffer_length=stop_buffer_length, ) - def update(self, new_token_ids: List[int]) -> Optional[str]: + def update(self, new_token_ids: list[int]) -> Optional[str]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index ccf52250..2e76694a 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Mapping, Optional, Set, Type, Union +from collections.abc import Mapping +from typing import Optional, Union from typing_extensions import TypeVar @@ -36,10 +37,10 @@ class LLMEngine: def __init__( self, vllm_config: VllmConfig, - executor_class: Type[Executor], + executor_class: type[Executor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, @@ -97,7 +98,7 @@ class LLMEngine: cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, enable_multiprocessing: bool = False, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" @@ -139,7 +140,7 @@ class LLMEngine: def validate_outputs(cls, outputs, output_type): return outputs - def abort_request(self, request_ids: List[str]) -> None: + def abort_request(self, request_ids: list[str]) -> None: """Remove request_ids from EngineCore and Detokenizer.""" self.engine_core.abort_requests(request_ids) @@ -199,7 +200,7 @@ class LLMEngine: # 3) Add the request to EngineCore. 
self.engine_core.add_request(request) - def step(self) -> List[RequestOutput]: + def step(self) -> list[RequestOutput]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False @@ -241,7 +242,7 @@ class LLMEngine: def get_tokenizer_group( self, - group_type: Type[_G] = BaseTokenizerGroup, + group_type: type[_G] = BaseTokenizerGroup, ) -> _G: tokenizer_group = self.tokenizer @@ -263,7 +264,7 @@ class LLMEngine: """Remove an already loaded LoRA adapter.""" return self.engine_core.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: """List all registered adapters.""" return self.engine_core.list_loras() diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 4622cafa..7f572163 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -2,7 +2,7 @@ import itertools from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Optional from vllm.logger import init_logger from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs @@ -151,12 +151,12 @@ class LogprobsProcessor: @staticmethod def _make_logprob_dict( - logprobs: List[float], - logprob_token_ids: List[int], - decoded_tokens: List[str], + logprobs: list[float], + logprob_token_ids: list[int], + decoded_tokens: list[str], rank: int, num_logprobs: int, - ) -> Dict[int, Logprob]: + ) -> dict[int, Logprob]: """Make a Logprob dictionary for a position. Args: @@ -168,7 +168,7 @@ class LogprobsProcessor: by the user (in addition to sampled logprob) Returns: - Dict[token id, Logprob] + dict[token id, Logprob] """ # We do not need a special case for the sampled token diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index a1d802bf..0f66f681 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Optional from vllm.config import ModelConfig from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE @@ -68,10 +68,10 @@ class MMInputCacheClient: def process_inputs( self, mm_data: MultiModalDataDict, - mm_hashes: Optional[List[str]], - mm_processor_kwargs: Optional[Dict[str, Any]], - precomputed_mm_inputs: Optional[List[MultiModalKwargs]], - ) -> List[MultiModalKwargs]: + mm_hashes: Optional[list[str]], + mm_processor_kwargs: Optional[dict[str, Any]], + precomputed_mm_inputs: Optional[list[MultiModalKwargs]], + ) -> list[MultiModalKwargs]: if precomputed_mm_inputs is None: image_inputs = mm_data["image"] if not isinstance(image_inputs, list): @@ -88,7 +88,7 @@ class MMInputCacheClient: # Process each image input separately, so that later we can schedule # them in a fine-grained manner. 
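
Note: the `LogprobsProcessor._make_logprob_dict` hunk above only changes annotations, but the data shape is worth spelling out: three parallel lists (logprob values, token IDs, decoded strings) are zipped into a `dict[int, Logprob]` keyed by token ID. A rough standalone sketch with a stand-in `Logprob` dataclass (the real one lives in `vllm.sequence`); rank handling is simplified here.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Logprob:
    # Stand-in for vllm.sequence.Logprob: value, rank, decoded token text.
    logprob: float
    rank: Optional[int] = None
    decoded_token: Optional[str] = None


def make_logprob_dict(logprobs: list[float],
                      logprob_token_ids: list[int],
                      decoded_tokens: list[str]) -> dict[int, Logprob]:
    # Parallel lists -> dict keyed by token ID; ranks here are simply the
    # positions in the already-sorted top-k lists.
    return {
        token_id: Logprob(logprob=lp, rank=idx + 1, decoded_token=text)
        for idx, (token_id, lp, text) in enumerate(
            zip(logprob_token_ids, logprobs, decoded_tokens))
    }


print(make_logprob_dict([-0.1, -2.3], [42, 7], ["Hello", "Hi"]))
```
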
# Apply caching (if enabled) and reuse precomputed inputs (if provided) - ret_inputs: List[MultiModalKwargs] = [] + ret_inputs: list[MultiModalKwargs] = [] for input_id in range(num_inputs): if self.mm_debug_cache_hit_ratio_steps is not None: self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps) @@ -133,9 +133,9 @@ class MMInputCacheServer: def get_and_update( self, - mm_inputs: List[Optional[MultiModalKwargs]], - mm_hashes: List[str], - ) -> List[MultiModalKwargs]: + mm_inputs: list[Optional[MultiModalKwargs]], + mm_hashes: list[str], + ) -> list[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) if not self.use_cache: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 9ae8303d..22bbb8a0 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,7 +2,7 @@ import asyncio from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Optional, Union from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind @@ -18,8 +18,8 @@ from vllm.v1.metrics.stats import (IterationStats, LoRARequestStates, @dataclass class OutputProcessorOutput: - request_outputs: List[RequestOutput] - reqs_to_abort: List[str] + request_outputs: list[RequestOutput] + reqs_to_abort: list[str] class RequestState: @@ -30,7 +30,7 @@ class RequestState: lora_name: Optional[str], output_kind: RequestOutputKind, prompt: Optional[str], - prompt_token_ids: List[int], + prompt_token_ids: list[int], logprobs_processor: LogprobsProcessor, detokenizer: IncrementalDetokenizer, arrival_time: float, @@ -90,7 +90,7 @@ class OutputProcessor: ): self.log_stats = log_stats self.tokenizer = tokenizer - self.request_states: Dict[str, RequestState] = {} + self.request_states: dict[str, RequestState] = {} self.lora_states = LoRARequestStates() def is_request_active(self, request_id: str) -> bool: @@ -104,7 +104,7 @@ class OutputProcessor: def abort_requests( self, - request_ids: List[str], + request_ids: list[str], ) -> None: for request_id in request_ids: req_state = self.request_states.pop(request_id, None) @@ -130,7 +130,7 @@ class OutputProcessor: def process_outputs( self, - engine_core_outputs: List[EngineCoreOutput], + engine_core_outputs: list[EngineCoreOutput], engine_core_timestamp: Optional[float] = None, iteration_stats: Optional[IterationStats] = None, ) -> OutputProcessorOutput: @@ -158,8 +158,8 @@ class OutputProcessor: ********************************************************** """ - request_outputs: List[RequestOutput] = [] - reqs_to_abort: List[str] = [] + request_outputs: list[RequestOutput] = [] + reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) @@ -265,7 +265,7 @@ class OutputProcessor: @staticmethod def _make_request_output( request_state: RequestState, - new_token_ids: List[int], + new_token_ids: list[int], finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], ) -> Optional[RequestOutput]: diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 5d4ea111..29136077 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import AsyncGenerator, Mapping from copy import copy -from typing import (AsyncGenerator, Dict, List, Mapping, Optional, Protocol, - Tuple, Union) +from typing import Optional, 
Protocol, Union from vllm.inputs import PromptType from vllm.lora.request import LoRARequest @@ -137,7 +137,7 @@ class ParallelSamplingRequest: key=lambda x: x.index) return self.request_output - def get_child_info(self, index: int) -> Tuple[str, SamplingParams]: + def get_child_info(self, index: int) -> tuple[str, SamplingParams]: """Get child request ID and sampling params. Args: @@ -237,9 +237,9 @@ class SyncParallelSamplingManager: def __init__(self): # Parent req ID -> parent request manager - self.parent_reqs: Dict[str, ParallelSamplingRequest] = {} + self.parent_reqs: dict[str, ParallelSamplingRequest] = {} # Child req ID -> (child req index, parent req ID) - self.child_reqs: Dict[str, Tuple[int, str]] = {} + self.child_reqs: dict[str, tuple[int, str]] = {} def _register_parent_request(self, req: ParallelSamplingRequest) -> None: """Register parallel sampling parent request.""" @@ -299,8 +299,8 @@ class SyncParallelSamplingManager: def step( self, - outputs: List[RequestOutput], - ) -> List[RequestOutput]: + outputs: list[RequestOutput], + ) -> list[RequestOutput]: """Build parallel sampling request outputs. Extract child request outputs, aggregate them @@ -355,7 +355,7 @@ async def generate_parallel_sampling_async( parent_req = ParallelSamplingRequest(request_id, sampling_params) # Aggregate generators for n child requests - gens: List[AsyncGenerator[RequestOutput, None]] = [] + gens: list[AsyncGenerator[RequestOutput, None]] = [] for idx in range(parent_req.n): child_req_id, child_params = parent_req.get_child_info(idx) child_gen = generate( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 2547ceba..3a3fc69e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import time -from typing import Mapping, Optional, Union +from collections.abc import Mapping +from typing import Optional, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 11002ad0..aa6ae83c 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from concurrent.futures import Future -from typing import List, Type, Union +from typing import Union import torch import torch.distributed as dist @@ -22,8 +22,8 @@ class Executor(ExecutorBase): For methods shared by v0 and v1, define them in ExecutorBase""" @staticmethod - def get_class(vllm_config: VllmConfig) -> Type["Executor"]: - executor_class: Type[Executor] + def get_class(vllm_config: VllmConfig) -> type["Executor"]: + executor_class: type[Executor] parallel_config = vllm_config.parallel_config distributed_executor_backend = ( parallel_config.distributed_executor_backend) @@ -53,7 +53,7 @@ class Executor(ExecutorBase): return executor_class def initialize_from_config(self, - kv_cache_configs: List[KVCacheConfig]) -> None: + kv_cache_configs: list[KVCacheConfig]) -> None: """ Initialize the KV caches and begin the model execution loop of the underlying workers. @@ -69,7 +69,7 @@ class Executor(ExecutorBase): # operators can be applied to all workers. 
return min(output) - def get_kv_cache_specs(self) -> List[KVCacheSpec]: + def get_kv_cache_specs(self) -> list[KVCacheSpec]: output = self.collective_rpc("get_kv_cache_spec") return output diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 25b5c1c1..b2cbba51 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -10,7 +10,7 @@ from dataclasses import dataclass from enum import Enum, auto from functools import partial from multiprocessing.process import BaseProcess -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import cloudpickle import psutil @@ -77,7 +77,7 @@ class MultiprocExecutor(Executor): scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers - self.workers: List[WorkerProcHandle] = [] + self.workers: list[WorkerProcHandle] = [] for rank in range(self.world_size): worker = WorkerProc.make_worker_process(self.vllm_config, rank, rank, @@ -94,8 +94,8 @@ class MultiprocExecutor(Executor): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict] = None) -> list[Any]: start_time = time.monotonic() kwargs = kwargs or {} @@ -208,7 +208,7 @@ class WorkerProc: self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) # TODO: move `init_worker` to executor level as a collective rpc call - all_kwargs: List[Dict] = [ + all_kwargs: list[dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) ] all_kwargs[rank] = { diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index eddfb594..dfef1039 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List import torch @@ -74,7 +73,7 @@ class FullAttentionSpec(KVCacheSpecBase): return cdiv(num_tokens, self.block_size) * self.page_size_bytes -KVCacheSpec = Dict[str, KVCacheSpecBase] +KVCacheSpec = dict[str, KVCacheSpecBase] @dataclass @@ -95,7 +94,7 @@ class KVCacheConfig: """The number of KV cache blocks""" num_blocks: int """layer_name -> how to initialize KV cache for that layer""" - tensors: Dict[str, KVCacheTensor] + tensors: dict[str, KVCacheTensor] """ A list of kv-cache groups. Each group includes a set of layers with the same kv-cache spec, and the total page_size of layers inside a group @@ -108,6 +107,6 @@ class KVCacheConfig: 3. (not implemented yet) A model with 2 full attention layers and 4 sliding window attention layers: three groups, (full * 2), (sw * 2), (sw * 2). """ - groups: List[List[str]] + groups: list[list[str]] """the KVCacheSpec of the model""" kv_cache_spec: KVCacheSpec diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 40dfc566..5a2a1c30 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -2,7 +2,7 @@ import time from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Optional import numpy as np import prometheus_client @@ -35,8 +35,8 @@ class LoggingStatLogger(StatLoggerBase): self.last_log_time = now # Tracked stats over current local logging interval. 
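
Note: `MultiprocExecutor.collective_rpc` above keeps the fan-out/fan-in signature (`args: tuple = (), kwargs: Optional[dict] = None -> list[Any]`). A toy in-process stand-in is sketched below to show the calling convention; the real executor dispatches over a broadcast message queue to worker processes, and `ToyWorker` is invented for illustration.

```python
from typing import Any, Optional


class ToyWorker:
    def __init__(self, rank: int) -> None:
        self.rank = rank

    def get_kv_cache_spec(self) -> dict[str, int]:
        # Pretend each rank reports a per-layer page size.
        return {"layer_0": 16 * (self.rank + 1)}


def collective_rpc(workers: list[ToyWorker],
                   method: str,
                   args: tuple = (),
                   kwargs: Optional[dict] = None) -> list[Any]:
    # Sequential stand-in for the multiprocess fan-out: call the named
    # method on every worker and gather one result per rank.
    kwargs = kwargs or {}
    return [getattr(w, method)(*args, **kwargs) for w in workers]


print(collective_rpc([ToyWorker(0), ToyWorker(1)], "get_kv_cache_spec"))
# [{'layer_0': 16}, {'layer_0': 32}]
```
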
- self.num_prompt_tokens: List[int] = [] - self.num_generation_tokens: List[int] = [] + self.num_prompt_tokens: list[int] = [] + self.num_generation_tokens: list[int] = [] # Prefix cache metrics. TODO: Make the interval configurable. self.prefix_caching_metrics = PrefixCachingMetrics() @@ -52,7 +52,7 @@ class LoggingStatLogger(StatLoggerBase): self.num_generation_tokens.append( iteration_stats.num_generation_tokens) - def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + def _get_throughput(self, tracked_stats: list[int], now: float) -> float: # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) @@ -147,7 +147,7 @@ class PrometheusStatLogger(StatLoggerBase): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) - self.counter_request_success: Dict[FinishReason, + self.counter_request_success: dict[FinishReason, prometheus_client.Counter] = {} counter_request_success_base = prometheus_client.Counter( name="vllm:request_success_total", @@ -338,14 +338,14 @@ class PrometheusStatLogger(StatLoggerBase): prometheus_client.REGISTRY.unregister(collector) -def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: +def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values until the value exceeds the specified maximum. """ exponent = 0 - buckets: List[int] = [] + buckets: list[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent @@ -356,7 +356,7 @@ def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: exponent += 1 -def build_1_2_5_buckets(max_value: int) -> List[int]: +def build_1_2_5_buckets(max_value: int) -> list[int]: """ Example: >>> build_1_2_5_buckets(100) @@ -365,7 +365,7 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: return build_buckets([1, 2, 5], max_value) -def build_cudagraph_buckets(vllm_config: VllmConfig) -> List[int]: +def build_cudagraph_buckets(vllm_config: VllmConfig) -> list[int]: if not vllm_config.model_config.enforce_eager: buckets = vllm_config.compilation_config.\ cudagraph_capture_sizes.copy() diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 30f460e5..625edb60 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -2,7 +2,7 @@ import time from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from vllm.outputs import RequestOutput @@ -39,8 +39,8 @@ class SchedulerStats: @dataclass class LoRAStats: - waiting_requests: Set[str] = field(default_factory=set) - running_requests: Set[str] = field(default_factory=set) + waiting_requests: set[str] = field(default_factory=set) + running_requests: set[str] = field(default_factory=set) @dataclass @@ -81,11 +81,11 @@ class IterationStats: self.num_generation_tokens = 0 self.num_prompt_tokens = 0 self.num_preempted_reqs = 0 - self.finished_requests: List[FinishedRequestStats] = [] - self.time_to_first_tokens_iter: List[float] = [] - self.time_per_output_tokens_iter: List[float] = [] - self.waiting_lora_adapters: Dict[str, int] = {} - self.running_lora_adapters: Dict[str, int] = {} + self.finished_requests: list[FinishedRequestStats] = [] + self.time_to_first_tokens_iter: list[float] = [] + self.time_per_output_tokens_iter: list[float] = [] + self.waiting_lora_adapters: dict[str, int] = {} + 
self.running_lora_adapters: dict[str, int] = {} def _time_since(self, start: float) -> float: """Calculate an interval relative to this iteration's timestamp.""" @@ -132,7 +132,7 @@ class IterationStats: if num_new_generation_tokens > 0: req_stats.last_token_ts = engine_core_timestamp - def update_from_events(self, req_id: str, events: List["EngineCoreEvent"], + def update_from_events(self, req_id: str, events: list["EngineCoreEvent"], is_prefilling: bool, req_stats: RequestStateStats, lora_stats: Optional[LoRAStats]): # Avoid circular dependency @@ -185,7 +185,7 @@ class LoRARequestStates: """Per-LoRA request state stats.""" def __init__(self): - self.lora_name_to_stats: Dict[str, LoRAStats] = {} + self.lora_name_to_stats: dict[str, LoRAStats] = {} def get_stats(self, req_state: 'RequestState') -> Optional[LoRAStats]: if req_state.lora_name is None: diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index f461d52c..dc3ad402 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, NamedTuple, Optional +from typing import NamedTuple, Optional import torch @@ -9,11 +9,11 @@ import torch class LogprobsLists(NamedTuple): # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: List[List[int]] + logprob_token_ids: list[list[int]] # [num_reqs, max_num_logprobs + 1] - logprobs: List[List[float]] + logprobs: list[list[float]] # [num_reqs] - sampled_token_ranks: List[int] + sampled_token_ranks: list[int] def slice(self, start: int, end: int): return LogprobsLists( @@ -52,23 +52,23 @@ class SamplerOutput: # ModelRunnerOutput is serialized and sent to the scheduler process. -# This is expensive for torch.Tensor so prefer to use List instead. +# This is expensive for torch.Tensor so prefer to use list instead. @dataclass class ModelRunnerOutput: # [num_reqs] - req_ids: List[str] + req_ids: list[str] # req_id -> index - req_id_to_index: Dict[str, int] + req_id_to_index: dict[str, int] # num_reqs x num_generated_tokens # num_generated_tokens is the number of tokens # generated in the current step. It can be different for # each request due to speculative/jump decoding. 
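
Note: the bucket helpers in `vllm/v1/metrics/loggers.py` are small enough to reassemble from the hunk above; the sketch below restates them verbatim in spirit so the `>>> build_1_2_5_buckets(100)` docstring example is concrete.

```python
def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
    # Powers of 10 times each mantissa, in increasing order, until the
    # value exceeds max_value.
    exponent = 0
    buckets: list[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


def build_1_2_5_buckets(max_value: int) -> list[int]:
    return build_buckets([1, 2, 5], max_value)


print(build_1_2_5_buckets(100))
# [1, 2, 5, 10, 20, 50, 100]
```
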
- sampled_token_ids: List[List[int]] + sampled_token_ids: list[list[int]] # num_reqs x num_spec_tokens - spec_token_ids: Optional[List[List[int]]] + spec_token_ids: Optional[list[list[int]]] # [num_reqs, max_num_logprobs + 1] # [num_reqs, max_num_logprobs + 1] @@ -79,4 +79,4 @@ class ModelRunnerOutput: # [prompt_len, num_prompt_logprobs] # [prompt_len, num_prompt_logprobs] # [prompt_len] - prompt_logprobs_dict: Dict[str, Optional[LogprobsTensors]] + prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 52d7faee..99df5473 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import enum -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, Optional, Union from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -20,10 +20,10 @@ class Request: self, request_id: str, prompt: Optional[str], - prompt_token_ids: List[int], - multi_modal_inputs: Optional[List["MultiModalKwargs"]], - multi_modal_hashes: Optional[List[str]], - multi_modal_placeholders: Optional[List["PlaceholderRange"]], + prompt_token_ids: list[int], + multi_modal_inputs: Optional[list["MultiModalKwargs"]], + multi_modal_hashes: Optional[list[str]], + multi_modal_placeholders: Optional[list["PlaceholderRange"]], sampling_params: SamplingParams, eos_token_id: Optional[int], arrival_time: float, @@ -36,7 +36,7 @@ class Request: self.lora_request = lora_request self.status = RequestStatus.WAITING - self.events: List[EngineCoreEvent] = [] + self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens @@ -44,15 +44,15 @@ class Request: self.prompt = prompt self.prompt_token_ids = prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) - self._output_token_ids: List[int] = [] - self._all_token_ids: List[int] = self.prompt_token_ids.copy() - self.spec_token_ids: List[int] = [] + self._output_token_ids: list[int] = [] + self._all_token_ids: list[int] = self.prompt_token_ids.copy() + self.spec_token_ids: list[int] = [] self.num_computed_tokens = 0 # Multi-modal related self.mm_positions = multi_modal_placeholders or [] self.mm_inputs = multi_modal_inputs or [] - self.mm_hashes: List[str] = multi_modal_hashes or [] + self.mm_hashes: list[str] = multi_modal_hashes or [] # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) @@ -89,7 +89,7 @@ class Request: EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED, timestamp)) - def take_events(self) -> Optional[List[EngineCoreEvent]]: + def take_events(self) -> Optional[list[EngineCoreEvent]]: if not self.events: return None events, self.events = self.events, [] @@ -97,7 +97,7 @@ class Request: def append_output_token_ids( self, - token_ids: Union[int, List[int]], + token_ids: Union[int, list[int]], ) -> None: if isinstance(token_ids, int): token_ids = [token_ids] diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index b757a1dc..55d9739b 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional, Set, Tuple +from typing import Optional import torch @@ -17,7 +17,7 @@ class SamplingMetadata: top_k: Optional[torch.Tensor] min_p: Optional[torch.Tensor] - generators: Dict[int, torch.Generator] + generators: 
dict[int, torch.Generator] # None means no logprobs, 0 means sampled token logprobs only max_num_logprobs: Optional[int] @@ -28,12 +28,12 @@ class SamplingMetadata: presence_penalties: torch.Tensor repetition_penalties: torch.Tensor - output_token_ids: List[List[int]] + output_token_ids: list[list[int]] # req_index -> (min_tokens, stop_token_ids) - min_tokens: Dict[int, Tuple[int, Set[int]]] + min_tokens: dict[int, tuple[int, set[int]]] - logit_bias: List[Optional[Dict[int, float]]] + logit_bias: list[Optional[dict[int, float]]] # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, # vocab size). diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index 8d9f6529..ed05e3f4 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Set, Tuple - import torch from vllm.model_executor.layers.utils import apply_penalties @@ -9,13 +7,13 @@ from vllm.utils import is_pin_memory_available, make_tensor_with_pad def apply_min_token_penalties( - logits: torch.Tensor, output_token_ids: List[List[int]], - min_tokens: Dict[int, Tuple[int, Set[int]]]) -> None: + logits: torch.Tensor, output_token_ids: list[list[int]], + min_tokens: dict[int, tuple[int, set[int]]]) -> None: """ Applies minimum token penalty by setting the logits of the stop tokens to -inf. """ - min_tokens_logits_to_penalize: List[Tuple[int, int]] = [] + min_tokens_logits_to_penalize: list[tuple[int, int]] = [] for index, (min_token, stop_token_ids) in min_tokens.items(): if len(output_token_ids[index]) < min_token: for stop_token_id in stop_token_ids: @@ -30,7 +28,7 @@ def apply_all_penalties( presence_penalties: torch.Tensor, frequency_penalties: torch.Tensor, repetition_penalties: torch.Tensor, - output_token_ids: List[List[int]], + output_token_ids: list[list[int]], ) -> torch.Tensor: """ Applies presence, frequency and repetition penalties to the logits. @@ -43,7 +41,7 @@ def apply_all_penalties( repetition_penalties) -def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int, +def _convert_to_tensors(output_token_ids: list[list[int]], vocab_size: int, device: torch.device) -> torch.Tensor: """ Convert the different list data structures to tensors. diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 78c88ad8..1bb950be 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Optional +from typing import Optional import torch import torch.nn as nn @@ -54,7 +54,7 @@ class TopKTopPSampler(nn.Module): def forward_native( self, logits: torch.Tensor, - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], ) -> torch.Tensor: @@ -66,7 +66,7 @@ class TopKTopPSampler(nn.Module): def forward_cuda( self, logits: torch.Tensor, - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], k: Optional[torch.Tensor], p: Optional[torch.Tensor], ) -> torch.Tensor: @@ -117,7 +117,7 @@ def apply_top_k_top_p( def random_sample( probs: torch.Tensor, - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], ) -> torch.Tensor: """Randomly sample from the probabilities. 
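
Note: the `apply_min_token_penalties` hunk above encodes a simple rule: while a request has produced fewer than `min_tokens` outputs, its stop-token logits are pushed to `-inf` so it cannot terminate early. A toy standalone version follows (plain per-element assignment instead of batched indexing, purely for clarity):

```python
import torch


def apply_min_token_penalties_toy(
        logits: torch.Tensor,
        output_token_ids: list[list[int]],
        min_tokens: dict[int, tuple[int, set[int]]]) -> None:
    # req_index -> (min_tokens, stop_token_ids): while a request has produced
    # fewer than min_tokens outputs, its stop tokens must not be sampled.
    to_penalize: list[tuple[int, int]] = []
    for index, (min_token, stop_token_ids) in min_tokens.items():
        if len(output_token_ids[index]) < min_token:
            to_penalize.extend((index, t) for t in stop_token_ids)
    for row, col in to_penalize:
        logits[row, col] = -float("inf")


logits = torch.zeros(2, 8)
apply_min_token_penalties_toy(logits, [[5], [5, 6, 7]],
                              {0: (4, {2}), 1: (2, {2})})
print(logits[0, 2].item(), logits[1, 2].item())  # -inf 0.0
```
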
@@ -143,7 +143,7 @@ def flashinfer_sample( probs: torch.Tensor, k: Optional[torch.Tensor], p: Optional[torch.Tensor], - generators: Dict[int, torch.Generator], + generators: dict[int, torch.Generator], ) -> torch.Tensor: """Sample from the probabilities using FlashInfer. diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 2e392734..80a4b241 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List import torch import torch.nn as nn @@ -54,7 +53,7 @@ class RejectionSampler(nn.Module): else: self.forward_method = self.forward_native - def forward(self, draft_token_ids: List[List[int]], + def forward(self, draft_token_ids: list[list[int]], target_probs: torch.Tensor, sampling_metadata: SamplingMetadata) -> SamplerOutput: if not sampling_metadata.all_greedy: @@ -66,7 +65,7 @@ class RejectionSampler(nn.Module): def flashinfer_sample( self, - draft_token_ids: List[List[int]], + draft_token_ids: list[list[int]], target_probs: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: @@ -119,7 +118,7 @@ class RejectionSampler(nn.Module): # TODO: The following method can be optimized for better performance. def forward_native( self, - draft_token_ids: List[List[int]], + draft_token_ids: list[list[int]], target_probs: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 09d38263..46818977 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -4,7 +4,7 @@ import time from dataclasses import dataclass from dataclasses import field as dataclass_field from enum import IntEnum -from typing import ClassVar, Dict, List, Optional, Set +from typing import ClassVar, Optional import msgspec from msgspec import field as msgspec_field @@ -78,7 +78,7 @@ class RequestStatsUpdate( ▼ FINISHED (All could go to FINISHED) """ - _VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = { + _VALID_TRANSITIONS: ClassVar[dict[Type, set[Type]]] = { Type.ARRIVED: { Type.INPUT_PROCESSED, Type.FINISHED, @@ -140,7 +140,7 @@ class RequestStatsUpdate( finish_reason: Optional[str] = None # Non-optional fields for each update type. - _REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = { + _REQUIRED_FIELDS: ClassVar[dict[Type, list[str]]] = { Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], Type.DETOKENIZED: ["num_new_tokens"], @@ -218,13 +218,13 @@ class RequestStats: # 2. the request was preempted and resumed. It is equivalent to running # a prefill of the original prefill tokens + generated output tokens # before preemption. - prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list) + prefill_start_ts_s_lst: list[float] = dataclass_field(default_factory=list) # A list of timestamps when a token is decoded by the engine core. - decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list) + decoding_ts_s_lst: list[float] = dataclass_field(default_factory=list) # A sorted list of timestamps for each output token. - output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list) + output_token_ts_s_lst: list[float] = dataclass_field(default_factory=list) # First token's timestamp. first_token_ts_s: Optional[float] = None @@ -241,7 +241,7 @@ class RequestStats: # metric to measure the impact of preemption other than observation of # large P99 TPOT. 
Ideally we could quantify the impact of preemption by # measuring the number of tokens re-computed due to preemption. - preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list) + preempted_ts_s_lst: list[float] = dataclass_field(default_factory=list) # Timestamp when the request was finished at the engine core. finished_ts_s: Optional[float] = None @@ -308,7 +308,7 @@ class RequestStats: return self.e2e_latency_s - self.first_token_latency_s @property - def output_token_latency_s_lst(self) -> List[float]: + def output_token_latency_s_lst(self) -> list[float]: if len(self.output_token_ts_s_lst) == 0: return [] latency_s_lst = [] @@ -442,7 +442,7 @@ class EngineCoreStatsSnapshot( default_factory=SchedulerStats) # Per request stats updates. - requests_stats_updates: List[RequestStatsUpdate] = msgspec_field( + requests_stats_updates: list[RequestStatsUpdate] = msgspec_field( default_factory=list) # Engine core's queue stats. diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 62271255..8e1fb18c 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -5,8 +5,8 @@ import os import weakref from collections import defaultdict from collections.abc import Sequence -from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List, - Optional, TypeVar, Union, overload) +from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, + Union, overload) import torch @@ -24,7 +24,7 @@ T = TypeVar("T") class ConstantList(Generic[T], Sequence): - def __init__(self, x: List[T]) -> None: + def __init__(self, x: list[T]) -> None: self._x = x def append(self, item): @@ -57,10 +57,10 @@ class ConstantList(Generic[T], Sequence): ... @overload - def __getitem__(self, s: slice, /) -> List[T]: + def __getitem__(self, s: slice, /) -> list[T]: ... - def __getitem__(self, item: Union[int, slice]) -> Union[T, List[T]]: + def __getitem__(self, item: Union[int, slice]) -> Union[T, list[T]]: return self._x[item] @overload @@ -71,7 +71,7 @@ class ConstantList(Generic[T], Sequence): def __setitem__(self, s: slice, value: T, /): ... 
- def __setitem__(self, item: Union[int, slice], value: Union[T, List[T]]): + def __setitem__(self, item: Union[int, slice], value: Union[T, list[T]]): raise Exception("Cannot set item in a constant list") def __delitem__(self, item): @@ -99,7 +99,7 @@ class BackgroundProcHandle: output_path: str, process_name: str, target_fn: Callable, - process_kwargs: Dict[Any, Any], + process_kwargs: dict[Any, Any], ): context = get_mp_context() reader, writer = context.Pipe(duplex=False) @@ -146,9 +146,9 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): def bind_kv_cache( - kv_caches: Dict[str, torch.Tensor], - forward_context: Dict[str, "Attention"], - runner_kv_caches: List[torch.Tensor], + kv_caches: dict[str, torch.Tensor], + forward_context: dict[str, "Attention"], + runner_kv_caches: list[torch.Tensor], ) -> None: """ Bind the allocated KV cache to both ModelRunner and forward context so diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 830cca10..7d4082b7 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List - import numpy as np import torch @@ -40,7 +38,7 @@ class BlockTable: def append_row( self, - block_ids: List[int], + block_ids: list[int], row_idx: int, ) -> None: if not block_ids: @@ -50,7 +48,7 @@ class BlockTable: self.num_blocks_per_row[row_idx] += num_blocks self.block_table_np[row_idx, start:start + num_blocks] = block_ids - def add_row(self, block_ids: List[int], row_idx: int) -> None: + def add_row(self, block_ids: list[int], row_idx: int) -> None: self.num_blocks_per_row[row_idx] = 0 self.append_row(block_ids, row_idx) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 788a3522..b0b218d9 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -2,7 +2,7 @@ # Datastructures defining an input batch from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, cast +from typing import TYPE_CHECKING, Optional, cast import numpy as np import torch @@ -24,16 +24,16 @@ if TYPE_CHECKING: class CachedRequestState: req_id: str - prompt_token_ids: List[int] + prompt_token_ids: list[int] prompt: Optional[str] - mm_inputs: List[MultiModalKwargs] - mm_positions: List["PlaceholderRange"] + mm_inputs: list[MultiModalKwargs] + mm_positions: list["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: List[int] + block_ids: list[int] num_computed_tokens: int - output_token_ids: List[int] + output_token_ids: list[int] mrope_positions: Optional[torch.Tensor] = None mrope_position_delta: Optional[int] = None @@ -63,8 +63,8 @@ class InputBatch: self.pin_memory = pin_memory self.vocab_size = vocab_size - self._req_ids: List[Optional[str]] = [] - self.req_id_to_index: Dict[str, int] = {} + self._req_ids: list[Optional[str]] = [] + self.req_id_to_index: dict[str, int] = {} # TODO(woosuk): This buffer could be too large if max_model_len is big. # Find a way to reduce the CPU memory usage. 
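
Note: `ConstantList` in `vllm/v1/utils.py` above is a read-only `Sequence` view over a plain `list`, with `@overload`-typed indexing that now returns `list[T]` for slices. A toy re-creation of the intent is sketched below (`ReadOnlyList` is an invented name; the real class also stubs out `append`, `extend`, and the other mutators):

```python
from collections.abc import Sequence
from typing import Generic, TypeVar, Union, overload

T = TypeVar("T")


class ReadOnlyList(Generic[T], Sequence):
    """Toy version of ConstantList: a Sequence view that rejects mutation."""

    def __init__(self, x: list[T]) -> None:
        self._x = x

    @overload
    def __getitem__(self, i: int, /) -> T: ...

    @overload
    def __getitem__(self, s: slice, /) -> list[T]: ...

    def __getitem__(self, item: Union[int, slice]) -> Union[T, list[T]]:
        return self._x[item]

    def __setitem__(self, item, value) -> None:
        raise TypeError("Cannot set item in a constant list")

    def __len__(self) -> int:
        return len(self._x)


block_ids = ReadOnlyList([0, 4, 9])
print(block_ids[1], block_ids[0:2], len(block_ids))  # 4 [4, 9] 3
# block_ids[0] = 7 would raise TypeError
```
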
@@ -106,8 +106,8 @@ class InputBatch: device="cpu", pin_memory=pin_memory) self.temperature_cpu = self.temperature_cpu_tensor.numpy() - self.greedy_reqs: Set[str] = set() - self.random_reqs: Set[str] = set() + self.greedy_reqs: set[str] = set() + self.random_reqs: set[str] = set() self.top_p = torch.empty((max_num_reqs, ), dtype=torch.float32, @@ -117,7 +117,7 @@ class InputBatch: device="cpu", pin_memory=pin_memory) self.top_p_cpu = self.top_p_cpu_tensor.numpy() - self.top_p_reqs: Set[str] = set() + self.top_p_reqs: set[str] = set() self.top_k = torch.empty((max_num_reqs, ), dtype=torch.int32, @@ -127,7 +127,7 @@ class InputBatch: device="cpu", pin_memory=pin_memory) self.top_k_cpu = self.top_k_cpu_tensor.numpy() - self.top_k_reqs: Set[str] = set() + self.top_k_reqs: set[str] = set() self.min_p = torch.empty((max_num_reqs, ), dtype=torch.float32, @@ -137,7 +137,7 @@ class InputBatch: device="cpu", pin_memory=pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.min_p_reqs: Set[str] = set() + self.min_p_reqs: set[str] = set() # Frequency penalty related data structures self.frequency_penalties = torch.empty((max_num_reqs, ), @@ -150,7 +150,7 @@ class InputBatch: pin_memory=pin_memory) self.frequency_penalties_cpu = \ self.frequency_penalties_cpu_tensor.numpy() - self.frequency_penalties_reqs: Set[str] = set() + self.frequency_penalties_reqs: set[str] = set() # Presence penalty related data structures self.presence_penalties = torch.empty((max_num_reqs, ), @@ -162,7 +162,7 @@ class InputBatch: pin_memory=pin_memory) self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy( ) - self.presence_penalties_reqs: Set[str] = set() + self.presence_penalties_reqs: set[str] = set() # Repetition penalty related data structures self.repetition_penalties = torch.empty((max_num_reqs, ), @@ -175,43 +175,43 @@ class InputBatch: pin_memory=pin_memory) self.repetition_penalties_cpu = \ self.repetition_penalties_cpu_tensor.numpy() - self.repetition_penalties_reqs: Set[str] = set() + self.repetition_penalties_reqs: set[str] = set() # req_index -> (min_tokens, stop_token_ids) - self.min_tokens: Dict[int, Tuple[int, Set[int]]] = {} + self.min_tokens: dict[int, tuple[int, set[int]]] = {} # lora related self.request_lora_mapping = np.zeros((self.max_num_reqs, ), dtype=np.int32) - self.lora_id_to_request_ids: Dict[int, Set[str]] = {} - self.lora_id_to_lora_request: Dict[int, LoRARequest] = {} + self.lora_id_to_request_ids: dict[int, set[str]] = {} + self.lora_id_to_lora_request: dict[int, LoRARequest] = {} # req_index -> generator # NOTE(woosuk): The indices of the requests that do not have their own # generator should not be included in the dictionary. - self.generators: Dict[int, torch.Generator] = {} + self.generators: dict[int, torch.Generator] = {} - self.num_logprobs: Dict[str, int] = {} + self.num_logprobs: dict[str, int] = {} # NOTE(rob): num_prompt_logprobs only includes reqs # that are currently in the prefill phase. 
- self.num_prompt_logprobs: Dict[str, int] = {} + self.num_prompt_logprobs: dict[str, int] = {} - self.logit_bias: List[Optional[Dict[int, + self.logit_bias: list[Optional[dict[int, float]]] = [None] * max_num_reqs - self.has_allowed_token_ids: Set[str] = set() + self.has_allowed_token_ids: set[str] = set() self.allowed_token_ids_mask: Optional[torch.Tensor] = None self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None - self.req_output_token_ids: List[Optional[List[int]]] = [] + self.req_output_token_ids: list[Optional[list[int]]] = [] # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() @property - def req_ids(self) -> List[str]: + def req_ids(self) -> list[str]: # None elements should only be present transiently # while performing state updates to the batch. - return cast(List[str], self._req_ids) + return cast(list[str], self._req_ids) def add_request( self, @@ -417,7 +417,7 @@ class InputBatch: self.logit_bias[i2], self.logit_bias[i1] self.block_table.swap_row(i1, i2) - def condense(self, empty_req_indices: List[int]) -> None: + def condense(self, empty_req_indices: list[int]) -> None: num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. @@ -550,7 +550,7 @@ class InputBatch: frequency_penalties=self.frequency_penalties[:num_reqs], presence_penalties=self.presence_penalties[:num_reqs], repetition_penalties=self.repetition_penalties[:num_reqs], - output_token_ids=cast(List[List[int]], self.req_output_token_ids), + output_token_ids=cast(list[list[int]], self.req_output_token_ids), min_tokens=self.min_tokens, no_penalties=self.no_penalties, logit_bias=self.logit_bias[:num_reqs], @@ -577,7 +577,7 @@ class InputBatch: def make_lora_inputs( self, num_scheduled_tokens: np.ndarray - ) -> Tuple[Tuple[int, ...], Tuple[int, ...], Set[LoRARequest]]: + ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]: """ Given the num_scheduled_tokens for each request in the batch, return datastructures used to activate the current LoRAs. @@ -593,7 +593,7 @@ class InputBatch: prompt_lora_mapping = tuple(req_lora_mapping) token_lora_mapping = tuple( req_lora_mapping.repeat(num_scheduled_tokens)) - active_lora_requests: Set[LoRARequest] = set( + active_lora_requests: set[LoRARequest] = set( self.lora_id_to_lora_request.values()) return prompt_lora_mapping, token_lora_mapping, active_lora_requests diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6785d668..4a1fb051 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3,7 +3,7 @@ import gc import time import weakref -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Union import numpy as np import torch @@ -135,9 +135,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Lazy initialization # self.model: nn.Module # Set after load_model - self.kv_caches: List[torch.Tensor] = [] + self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) - self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} + self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} # Set up speculative decoding. self.use_spec_decode = False @@ -158,7 +158,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ) # Request states. - self.requests: Dict[str, CachedRequestState] = {} + self.requests: dict[str, CachedRequestState] = {} # Persistent batch. 
self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, @@ -274,7 +274,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # then resubmitted with the same ID. In this case, we treat them as two # distinct requests - clearing the cached states for the first request # and handling the second as a new request. - removed_req_indices: List[int] = [] + removed_req_indices: list[int] = [] for req_id in scheduler_output.finished_req_ids: req_index = self.input_batch.remove_request(req_id) if req_index is not None: @@ -305,7 +305,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): assert req_index is not None removed_req_indices.append(req_index) - req_ids_to_add: List[str] = [] + req_ids_to_add: list[str] = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id @@ -446,7 +446,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> Tuple[FlashAttentionMetadata, torch.Tensor]: + ) -> tuple[FlashAttentionMetadata, torch.Tensor]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -774,8 +774,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): return # Batch the multi-modal inputs. - mm_inputs: List[MultiModalKwargs] = [] - req_input_ids: List[Tuple[str, int]] = [] + mm_inputs: list[MultiModalKwargs] = [] + req_input_ids: list[tuple[str, int]] = [] for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] for input_id in encoder_input_ids: @@ -819,8 +819,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): def _gather_encoder_outputs( self, scheduler_output: "SchedulerOutput", - ) -> List[torch.Tensor]: - encoder_outputs: List[torch.Tensor] = [] + ) -> list[torch.Tensor]: + encoder_outputs: list[torch.Tensor] = [] for req_id in self.input_batch.req_ids: num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] @@ -1022,10 +1022,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): def generate_draft_token_ids( self, - sampled_token_ids: List[List[int]], - ) -> List[List[int]]: + sampled_token_ids: list[list[int]], + ) -> list[list[int]]: # TODO(woosuk): Optimize. - draft_token_ids: List[List[int]] = [] + draft_token_ids: list[list[int]] = [] for i, sampled_ids in enumerate(sampled_token_ids): num_sampled_ids = len(sampled_ids) if not num_sampled_ids: @@ -1069,12 +1069,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): self, hidden_states: torch.Tensor, scheduler_output: "SchedulerOutput", - ) -> Dict[str, Optional[LogprobsTensors]]: + ) -> dict[str, Optional[LogprobsTensors]]: num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs if not num_prompt_logprobs_dict: return {} - prompt_logprobs_dict: Dict[str, Optional[LogprobsTensors]] = {} + prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {} # Since prompt logprobs are a rare feature, prioritize simple, # maintainable loop over optimal performance. 
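Two recurring cases in the hunks above and below are variadic tuples and typing.cast: Tuple[int, ...] becomes tuple[int, ...], and cast(List[str], x) becomes cast(list[str], x). Unlike a pure annotation, the type written inside a cast(...) call is an ordinary runtime expression, so list[str] there requires Python 3.9 or newer regardless of any from __future__ import annotations. The sketch below is illustrative only; make_lora_inputs_like and req_ids are made-up names, not vLLM functions.

# Illustrative sketch, not part of the patch: variadic tuples and typing.cast
# written with built-in generics, mirroring the nearby hunks.
from typing import Optional, cast


def make_lora_inputs_like(
    req_lora_ids: list[int],
    tokens_per_req: list[int],
) -> tuple[tuple[int, ...], tuple[int, ...], set[int]]:
    # was: Tuple[Tuple[int, ...], Tuple[int, ...], Set[int]]
    prompt_mapping = tuple(req_lora_ids)
    token_mapping = tuple(
        lora_id
        for lora_id, n in zip(req_lora_ids, tokens_per_req)
        for _ in range(n)
    )
    active = set(req_lora_ids)
    return prompt_mapping, token_mapping, active


def req_ids(raw: list[Optional[str]]) -> list[str]:
    # cast() only informs the type checker and returns the value unchanged at
    # runtime; None entries are assumed to be absent, as with _req_ids above.
    return cast(list[str], raw)


if __name__ == "__main__":
    print(make_lora_inputs_like([1, 2], [3, 1]))
    print(req_ids(["a", "b"]))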
@@ -1365,7 +1365,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): "Hybrid models with more than one KV cache type are not " "supported yet.") - kv_caches: Dict[str, torch.Tensor] = {} + kv_caches: dict[str, torch.Tensor] = {} for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): tensor_config = kv_cache_config.tensors[layer_name] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f681925f..cc6268d6 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional, Set +from typing import TYPE_CHECKING, Optional import torch import torch.distributed @@ -243,7 +243,7 @@ class Worker(WorkerBase): def remove_lora(self, lora_id: int) -> bool: return self.model_runner.remove_lora(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: return self.model_runner.list_loras() def pin_lora(self, lora_id: int) -> bool: diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 731e758e..f34aacac 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -4,7 +4,6 @@ Define LoRA functionality mixin for model runners. """ from contextlib import contextmanager -from typing import Set, Tuple import numpy as np import torch.nn as nn @@ -57,9 +56,9 @@ class LoRAModelRunnerMixin: ) return self.lora_manager.create_lora_manager(model) - def _set_active_loras(self, prompt_lora_mapping: Tuple[int, ...], - token_lora_mapping: Tuple[int, ...], - lora_requests: Set[LoRARequest]) -> None: + def _set_active_loras(self, prompt_lora_mapping: tuple[int, ...], + token_lora_mapping: tuple[int, ...], + lora_requests: set[LoRARequest]) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -74,10 +73,10 @@ class LoRAModelRunnerMixin: def set_active_loras(self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray) -> None: - prompt_lora_mapping: Tuple[int, ...] # of size input_batch.num_reqs - token_lora_mapping: Tuple[int, + prompt_lora_mapping: tuple[int, ...] # of size input_batch.num_reqs + token_lora_mapping: tuple[int, ...] 
# of size np.sum(num_scheduled_tokens) - lora_requests: Set[LoRARequest] + lora_requests: set[LoRARequest] prompt_lora_mapping, token_lora_mapping, lora_requests = \ input_batch.make_lora_inputs(num_scheduled_tokens) return self._set_active_loras(prompt_lora_mapping, token_lora_mapping, @@ -105,7 +104,7 @@ class LoRAModelRunnerMixin: num_scheduled_tokens) # Make dummy lora requests - lora_requests: Set[LoRARequest] = { + lora_requests: set[LoRARequest] = { LoRARequest(lora_name=f"warmup_{lora_id}", lora_int_id=lora_id, lora_path="/not/a/real/path") @@ -143,7 +142,7 @@ class LoRAModelRunnerMixin: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.pin_adapter(lora_id) - def list_loras(self) -> Set[int]: + def list_loras(self) -> set[int]: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() \ No newline at end of file diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 2c6a0371..104e5a3d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import time -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast +from typing import TYPE_CHECKING, Optional, cast from unittest.mock import patch import numpy as np @@ -95,13 +95,13 @@ class TPUModelRunner: ) # Request states. - self.requests: Dict[str, CachedRequestState] = {} + self.requests: dict[str, CachedRequestState] = {} # req_id -> (input_id -> encoder_output) - self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} + self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} # KV caches for forward pass - self.kv_caches: List[Tuple[torch.Tensor, torch.Tensor]] = [] + self.kv_caches: list[tuple[torch.Tensor, torch.Tensor]] = [] # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. @@ -171,7 +171,7 @@ class TPUModelRunner: # then resubmitted with the same ID. In this case, we treat them as two # distinct requests - clearing the cached states for the first request # and handling the second as a new request. - removed_req_indices: List[int] = [] + removed_req_indices: list[int] = [] for req_id in scheduler_output.finished_req_ids: req_index = self.input_batch.remove_request(req_id) if req_index is not None: @@ -194,7 +194,7 @@ class TPUModelRunner: assert req_index is not None removed_req_indices.append(req_index) - req_ids_to_add: List[str] = [] + req_ids_to_add: list[str] = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id @@ -453,7 +453,7 @@ class TPUModelRunner: selected_token_ids = torch.argmax(logits, dim=-1, keepdim=True) # Then, let's update the cache state. 
- request_seq_lens: List[Tuple[int, CachedRequestState, int]] = [] + request_seq_lens: list[tuple[int, CachedRequestState, int]] = [] for i, req_id in zip(range(num_reqs), self.input_batch.req_ids): assert req_id is not None req_state = self.requests[req_id] @@ -473,9 +473,9 @@ class TPUModelRunner: assert all( req_id is not None for req_id in self.input_batch.req_ids[:num_reqs]), "req_ids contains None" - req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + req_ids = cast(list[str], self.input_batch.req_ids[:num_reqs]) - prompt_logprobs_dict: Dict[str, Optional[LogprobsTensors]] = {} + prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {} for req_id in self.input_batch.req_ids[:num_reqs]: prompt_logprobs_dict[req_id] = None @@ -612,7 +612,7 @@ class TPUModelRunner: "Hybrid models with more than one KV cache type are not " "supported yet.") - kv_caches: Dict[str, torch.Tensor] = {} + kv_caches: dict[str, torch.Tensor] = {} for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): tensor_config = kv_cache_config.tensors[layer_name] @@ -649,7 +649,7 @@ class ModelWrapperV1(nn.Module): self, token_ids: torch.Tensor, position_ids: torch.Tensor, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: list[tuple[torch.Tensor, torch.Tensor]], ) -> torch.Tensor: """Executes the forward pass of the model and samples the next token. @@ -667,7 +667,7 @@ class ModelWrapperV1(nn.Module): # [num_kv_heads, num_blocks, block_size, head_size]. To make it # work, we need to flatten the first three dimensions and modify # the slot_mapping accordingly. - # kv_caches: List[Tuple[torch.Tensor, torch.Tensor]] + # kv_caches: list[tuple[torch.Tensor, torch.Tensor]] num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape slot_mapping = attn_metadata.slot_mapping slot_mapping = slot_mapping.flatten() diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 405dc628..cbd2fe6e 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A TPU worker class.""" import os -from typing import Dict, List, Optional +from typing import Optional import torch import torch.distributed @@ -103,7 +103,7 @@ class TPUWorker: self.model_runner = TPUModelRunner(self.vllm_config, self.device) def determine_available_memory(self) -> int: - kv_caches: Dict[str, torch.Tensor] = {} + kv_caches: dict[str, torch.Tensor] = {} kv_cache_spec = self.model_runner.get_kv_cache_spec() for layer_name, layer_spec in kv_cache_spec.items(): if isinstance(layer_spec, FullAttentionSpec): @@ -118,7 +118,7 @@ class TPUWorker: else: raise NotImplementedError - runner_kv_caches: List[torch.Tensor] = [] + runner_kv_caches: list[torch.Tensor] = [] bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context,