diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index ad428bd1..bab57025 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -27,8 +27,8 @@ class RequestFuncInput:
 class RequestFuncOutput:
     generated_text: str = ""
     success: bool = False
-    latency: float = 0
-    ttft: float = 0  # Time to first token
+    latency: float = 0.0
+    ttft: float = 0.0  # Time to first token
     itl: List[float] = field(
         default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
@@ -58,23 +58,24 @@ async def async_request_tgi(
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
 
-        ttft = 0
+        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
             async with session.post(url=api_url, json=payload) as response:
                 if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
                         # First token
-                        if ttft == 0:
+                        if ttft == 0.0:
                             ttft = time.perf_counter() - st
                             output.ttft = ttft
 
@@ -119,23 +119,24 @@ async def async_request_trt_llm(
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
 
-        ttft = 0
+        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
             async with session.post(url=api_url, json=payload) as response:
                 if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")
 
                         data = json.loads(chunk)
                         timestamp = time.perf_counter()
                         # First token
-                        if ttft == 0:
+                        if ttft == 0.0:
                             ttft = time.perf_counter() - st
                             output.ttft = ttft
 
@@ -151,7 +153,7 @@ async def async_request_trt_llm(
                     output.success = True
 
                 else:
-                    output.error = response.reason
+                    output.error = response.reason or ""
                     output.success = False
         except Exception:
             output.success = False
@@ -195,7 +197,7 @@ async def async_request_deepspeed_mii(
                     output.generated_text = parsed_resp["text"][0]
                     output.success = True
                 else:
-                    output.error = response.reason
+                    output.error = response.reason or ""
                     output.success = False
         except Exception:
             output.success = False
@@ -234,19 +236,20 @@ async def async_request_openai_completions(
         output.prompt_len = request_func_input.prompt_len
 
         generated_text = ""
-        ttft = 0
+        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
             async with session.post(url=api_url, json=payload,
                                     headers=headers) as response:
                 if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
@@ -255,7 +258,7 @@ async def async_request_openai_completions(
                             if data["choices"][0]["text"]:
                                 timestamp = time.perf_counter()
                                 # First token
-                                if ttft == 0:
+                                if ttft == 0.0:
                                     ttft = time.perf_counter() - st
                                     output.ttft = ttft
 
@@ -315,19 +318,20 @@ async def async_request_openai_chat_completions(
         output.prompt_len = request_func_input.prompt_len
 
         generated_text = ""
-        ttft = 0
+        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
             async with session.post(url=api_url, json=payload,
                                     headers=headers) as response:
                 if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                             continue
 
-                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
                         if chunk == "[DONE]":
                             latency = time.perf_counter() - st
                         else:
@@ -337,7 +341,7 @@ async def async_request_openai_chat_completions(
                             delta = data["choices"][0]["delta"]
                             if delta.get("content", None):
                                 # First token
-                                if ttft == 0:
+                                if ttft == 0.0:
                                     ttft = time.perf_counter() - st
                                     output.ttft = ttft
 
@@ -354,7 +358,7 @@ async def async_request_openai_chat_completions(
                     output.success = True
                     output.latency = latency
                 else:
-                    output.error = response.reason
+                    output.error = response.reason or ""
                     output.success = False
         except Exception:
             output.success = False
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 44cda7c9..7a8c365f 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -12,6 +12,7 @@
 
 import logging
 import sys
+from typing import List
 
 from sphinx.ext import autodoc
 
@@ -45,7 +46,7 @@ templates_path = ['_templates']
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
+exclude_patterns: List[str] = []
 
 # Exclude the prompt "$" when copying code
 copybutton_prompt_text = r"\$ "
diff --git a/setup.py b/setup.py
index 98c92f91..9f0814e9 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import re
 import subprocess
 import sys
 from shutil import which
-from typing import List
+from typing import Dict, List
 
 import torch
 from packaging.version import Version, parse
@@ -52,7 +52,7 @@ class CMakeExtension(Extension):
 
 class cmake_build_ext(build_ext):
     # A dict of extension directories that have been configured.
-    did_config = {}
+    did_config: Dict[str, bool] = {}
 
     #
     # Determine number of compilation jobs and optionally nvcc compile threads.
@@ -261,6 +261,7 @@ def get_nvcc_cuda_version() -> Version:
 
     Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
     """
+    assert CUDA_HOME is not None, "CUDA_HOME is not set"
     nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
                                           universal_newlines=True)
     output = nvcc_output.split()
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 9f466566..fbceacf0 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -1,5 +1,5 @@
-from abc import ABC, abstractmethod, abstractproperty
-from typing import Dict, List, Optional, Protocol
+from abc import ABC, abstractmethod
+from typing import Dict, FrozenSet, List, Optional, Protocol
 
 from vllm.utils import Device
 
@@ -10,23 +10,28 @@ class Block(ABC):
     def append_token_ids(self, token_ids: List[int]) -> None:
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
    def block_id(self) -> Optional[int]:
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def token_ids(self) -> List[int]:
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def num_empty_slots(self) -> int:
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def is_full(self) -> bool:
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def prev_block(self) -> Optional["Block"]:
         pass
 
@@ -47,12 +52,13 @@ class Block(ABC):
 class BlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block],
+                         device: Device) -> Block:
         pass
 
     @abstractmethod
     def allocate_immutable(self, prev_block: Optional[Block],
-                           token_ids: List[int]) -> Block:
+                           token_ids: List[int], device: Device) -> Block:
         pass
 
     @abstractmethod
@@ -64,11 +70,12 @@ class Block(ABC):
         pass
 
     @abstractmethod
-    def get_num_free_blocks(self) -> int:
+    def get_num_free_blocks(self, device: Device) -> int:
         pass
 
-    @abstractproperty
-    def all_block_ids(self) -> frozenset[int]:
+    @property
+    @abstractmethod
+    def all_block_ids(self) -> FrozenSet[int]:
         pass
 
     @abstractmethod
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 905db52a..02560907 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,6 +1,6 @@
 import time
 from dataclasses import dataclass
-from typing import Dict, List
+from typing import Dict, List, Protocol
 
 import numpy as np
 from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
@@ -119,6 +119,12 @@ class Stats:
     time_e2e_requests: List[float]
 
 
+class SupportsMetricsInfo(Protocol):
+
+    def metrics_info(self) -> Dict[str, str]:
+        ...
+
+
 class StatLogger:
     """StatLogger is used LLMEngine to log to Promethus and Stdout."""
 
@@ -135,7 +141,7 @@ class StatLogger:
         self.labels = labels
         self.metrics = Metrics(labelnames=list(labels.keys()))
 
-    def info(self, type: str, obj: object) -> None:
+    def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         if type == "cache_config":
             self.metrics.info_cache_config.info(obj.metrics_info())
 
diff --git a/vllm/logger.py b/vllm/logger.py
index e5e46f5c..af957508 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import sys
+from typing import Optional
 
 VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
 
@@ -26,7 +27,7 @@ class NewLineFormatter(logging.Formatter):
 
 
 _root_logger = logging.getLogger("vllm")
-_default_handler = None
+_default_handler: Optional[logging.Handler] = None
 
 
 def _setup_logger():
@@ -55,7 +56,12 @@ def init_logger(name: str):
     # Use the same settings as above for root logger
     logger = logging.getLogger(name)
     logger.setLevel(os.getenv("LOG_LEVEL", "DEBUG"))
+
     if VLLM_CONFIGURE_LOGGING:
+        if _default_handler is None:
+            raise ValueError(
+                "_default_handler is not set up. This should never happen!"
+                " Please open an issue on Github.")
         logger.addHandler(_default_handler)
         logger.propagate = False
     return logger
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index eb8d5f6d..6519781c 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -247,11 +247,12 @@ def _yarn_find_correction_dim(num_rotations: int,
 
 
 # Find dim range bounds based on rotations
-def _yarn_find_correction_range(low_rot: int,
-                                high_rot: int,
-                                dim: int,
-                                base: float = 10000,
-                                max_position_embeddings: int = 2048) -> int:
+def _yarn_find_correction_range(
+        low_rot: int,
+        high_rot: int,
+        dim: int,
+        base: float = 10000,
+        max_position_embeddings: int = 2048) -> Tuple[int, int]:
     low = math.floor(
         _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
     high = math.ceil(
@@ -293,8 +294,8 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
         *,
         extrapolation_factor: float = 1,
         attn_factor: float = 1,
-        beta_fast: float = 32,
-        beta_slow: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.extrapolation_factor = extrapolation_factor
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 8a6ba6c5..ce7a30dc 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,10 +1,10 @@
-from typing import Optional
+from typing import Dict, Optional
 
 from transformers import AutoConfig, PretrainedConfig
 
 from vllm.transformers_utils.configs import *
 
-_CONFIG_REGISTRY = {
+_CONFIG_REGISTRY: Dict[str, PretrainedConfig] = {
     "chatglm": ChatGLMConfig,
     "dbrx": DbrxConfig,
     "mpt": MPTConfig,
diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py
index 3a19af71..1d2724f2 100644
--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -12,7 +12,7 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # type: ignore
 
 
 class DbrxAttentionConfig(PretrainedConfig):
diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py
index 02045bdc..79894035 100644
--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
@@ -16,11 +16,11 @@ logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
 
-PRETRAINED_VOCAB_FILES_MAP = {
+PRETRAINED_VOCAB_FILES_MAP = {  # type: ignore
     "vocab_file": {},
     "tokenizer_file": {},
 }
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}  # type: ignore
 
 
 class BaichuanTokenizer(PreTrainedTokenizer):
@@ -148,9 +148,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
             `Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) "
-                         "should be a directory")
-            return
+            raise ValueError(f"Vocabulary path ({save_directory}) "
+                             "should be a directory")
+
         out_vocab_file = os.path.join(
             save_directory,
             (filename_prefix + "-" if filename_prefix else "") +
diff --git a/vllm/utils.py b/vllm/utils.py
index fdb0a376..669b6589 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -294,7 +294,7 @@ def create_kv_caches_with_random(
     head_size: int,
     cache_dtype: Optional[Union[str, torch.dtype]],
     model_dtype: Optional[Union[str, torch.dtype]] = None,
-    seed: Optional[int] = 0,
+    seed: int = 0,
     device: Optional[str] = "cuda",
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
     torch.random.manual_seed(seed)
@@ -400,7 +400,7 @@ class CudaMemoryProfiler:
         gc.collect()
 
 
-def str_to_int_tuple(s: str) -> Tuple[int]:
+def str_to_int_tuple(s: str) -> Tuple[int, ...]:
     """Convert a string to a tuple of integers."""
     try:
         return tuple(map(int, s.split(",")))
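
Reviewer note, not part of the patch: the two typing patterns this change standardizes on can be exercised in isolation -- stacking @property on @abstractmethod in place of the deprecated abc.abstractproperty, and typing a parameter as a typing.Protocol (here SupportsMetricsInfo) instead of a bare object. A minimal, self-contained sketch follows; AbstractBlock, ToyBlock, and log_info are made-up names for illustration and do not exist in vLLM.

# Illustrative sketch only -- AbstractBlock, ToyBlock, and log_info are
# hypothetical names, not identifiers from the patch above.
from abc import ABC, abstractmethod
from typing import Dict, Protocol


class SupportsMetricsInfo(Protocol):
    # Structural type: anything with a matching metrics_info() conforms,
    # no inheritance from this class required.

    def metrics_info(self) -> Dict[str, str]:
        ...


class AbstractBlock(ABC):

    @property
    @abstractmethod
    def is_full(self) -> bool:
        # @property stacked on @abstractmethod is the supported replacement
        # for the deprecated abc.abstractproperty.
        ...


class ToyBlock(AbstractBlock):

    def __init__(self, num_tokens: int, capacity: int) -> None:
        self._num_tokens = num_tokens
        self._capacity = capacity

    @property
    def is_full(self) -> bool:
        return self._num_tokens >= self._capacity

    def metrics_info(self) -> Dict[str, str]:
        return {"capacity": str(self._capacity)}


def log_info(obj: SupportsMetricsInfo) -> None:
    # ToyBlock satisfies SupportsMetricsInfo structurally, so mypy accepts
    # this call even though ToyBlock never subclasses the protocol.
    print(obj.metrics_info())


log_info(ToyBlock(num_tokens=16, capacity=16))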