diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index c50125b7..431adb8e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -921,7 +921,7 @@ if __name__ == "__main__": "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Default value is \"ttft,tpot,itl\".") @@ -929,7 +929,7 @@ if __name__ == "__main__": "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 71cb420a..6d3ba6c0 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -963,7 +963,7 @@ if __name__ == "__main__": "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Default value is \"ttft,tpot,itl\".") @@ -971,7 +971,7 @@ if __name__ == "__main__": "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index f0e5533b..98daf1a1 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); // in case the final state is separated between the last "smem_exchange" and // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), - // (which occurs when `final_state_position` is a non-positivie index) + // (which occurs when `final_state_position` is a non-positive index) // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){ input_t vals_load[kNElts] = {0}; diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index c0a572b4..f9c5ad4d 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata): assert self.use_cuda_graph if turn_prefills_into_decodes: - # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # When Multi-Step is enabled with Chunked-Prefill, prefills and # decodes are scheduled together. In the first step, all the # prefills turn into decodes. This update reflects that # conversion.
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index cede9915..15625612 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): logger.warning("Could not import HPU FusedSDPA kernel. " "vLLM will use native implementation.") - suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() - if head_size not in suppored_head_sizes: + supported_head_sizes = HPUPagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: raise ValueError( f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {suppored_head_sizes}.") + f"Supported head sizes are: {supported_head_sizes}.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 8d70afe2..5a47c0f6 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention( return spda_o @ W_O NOTE: in the actual code, - `kv_b_proj` is [W_UK; W_UV] concatnated per head - `q_b_proj` is [W_UQ; W_QR] concatnated per head + `kv_b_proj` is [W_UK; W_UV] concatenated per head + `q_b_proj` is [W_UQ; W_QR] concatenated per head `out_proj` is W_O @@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata): assert num_seqs > num_queries if turn_prefills_into_decodes: - # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # When Multi-Step is enabled with Chunked-Prefill, prefills and # decodes are scheduled together. In the first step, all the # prefills turn into decodes. This update reflects that # conversion. diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index cd152e57..a9d4a70b 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - suppored_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in suppored_head_sizes: + supported_head_sizes = PagedAttention.get_supported_head_sizes() + if head_size not in supported_head_sizes: raise ValueError( f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {suppored_head_sizes}.") + f"Supported head sizes are: {supported_head_sizes}.") self.attn_type = attn_type diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 6bce5879..8c9145bb 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -446,7 +446,7 @@ def flash_paged_attention( IO tensor dtypes: - This kernel assumes all IO tensors have the same dtype except for block_tables (int32) and mask (int32) - - If mixed_percision is True, then all Tensor Engine operation will be + - If mixed_precision is True, then all Tensor Engine operations will be performed in bfloat16 and accumulation will be performed in float32. Otherwise the intermediates will be in the same type as the inputs.
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 813556f9..dc0ec321 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser): "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-seperated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ") parser.add_argument( "--metric-percentiles", type=str, default="99", - help="Comma-seperated list of percentiles for selected metrics. " + help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "Use \"--percentile-metrics\" to select metrics.", ) diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 4c5d78a4..5f126c75 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): externally (before the next schedule() call) """ # Sequences can be in RUNNING or FINISHED_ABORTED state - # once scheduled, as a sequence is moved to FINSIHED_ABORTED + # once scheduled, as a sequence is moved to FINISHED_ABORTED # if a client disconnects from the api server. seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) if seqs is None: diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index 7997629d..acbff325 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]: # partial_json_parser doesn't support extra data and -# JSONDecorder.raw_decode doesn't support partial JSON +# JSONDecoder.raw_decode doesn't support partial JSON def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]: try: return (partial_json_parser.loads(input_str, flags), len(input_str)) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index bedda4c2..014108e6 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel( compute_capability: Optional[int] = None ) -> Type[ScaledMMLinearKernel]: """ - Choose an ScalledMMLinearKernel that can implement the given config for the + Choose a ScaledMMLinearKernel that can implement the given config for the given compute capability. Attempts to choose the best kernel in terms of performance.
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cfd7bc2a..3c8aecc0 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -69,12 +69,12 @@ class CpuPlatform(Platform): cache_config = vllm_config.cache_config - ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None + ipex_available = find_spec("intel_extension_for_pytorch") is not None if cache_config and cache_config.block_size is None: - cache_config.block_size = 128 if ipex_avaliable else 16 + cache_config.block_size = 128 if ipex_available else 16 - if not ipex_avaliable and cache_config.block_size != 16: + if not ipex_available and cache_config.block_size != 16: raise RuntimeError( f"--block-size={cache_config.block_size} requires" " intel_extension_for_pytorch") diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 2bb543bd..f788d90b 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -231,7 +231,7 @@ class Platform: parser: Optional[FlexibleArgumentParser] = None ) -> None: """ - Do some pre-registeration or update action for the current platform. + Do some pre-registration or update action for the current platform. This function is called before global VllmConfig is initialized or cli arguments are parsed. It's used for out-of-tree platforms to register or diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 249ace1f..0dae02d3 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser): Args: model_output (str): Output of the model to be parsed. - request (ChatCompletionReqest): Request being processed. + request (ChatCompletionRequest): Request being processed. Returns: tuple[Optional[str], Optional[str]]: Tuple pair containing the diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 584320e7..75cf09e0 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -101,7 +101,7 @@ class RequestOutputKind(Enum): CUMULATIVE = 0 # Return only deltas in each RequestOutput DELTA = 1 - # Do not return intermediate RequestOuputs + # Do not return intermediate RequestOutputs FINAL_ONLY = 2 diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py index 0a4be23a..7ed9ced0 100644 --- a/vllm/third_party/pynvml.py +++ b/vllm/third_party/pynvml.py @@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure): e.g. class that has _field_ 'hex_value', c_uint could be formatted with _fmt_ = {"hex_value" : "%08X"} to produce nicer output. - Default fomratting string for all fields can be set with key "" like: + Default formatting string for all fields can be set with key "" like: _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz.
If not set it's assumed to be just "%s" diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 1437db7e..e6c4ebc7 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention( return spda_o @ W_O NOTE: in the actual code, - `kv_b_proj` is [W_UK; W_UV] concatnated per head - `q_b_proj` is [W_UQ; W_QR] concatnated per head + `kv_b_proj` is [W_UK; W_UV] concatenated per head + `q_b_proj` is [W_UQ; W_QR] concatenated per head `out_proj` is W_O diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d79bce19..e854c2a4 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -326,7 +326,7 @@ class WorkerProc: logger.debug("Worker interrupted.") except Exception: - # worker_busy_loop sends exceptions exceptons to Executor + # worker_busy_loop sends exceptions to Executor # for shutdown, but if there is an error in startup or an # error with IPC itself, we need to alert the parent. psutil.Process().parent().send_signal(signal.SIGUSR1) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a83409a7..debb7072 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ) -> Union[ModelRunnerOutput, torch.Tensor]: self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: - # Return empty ModelRunnerOuptut if there's no work to do. + # Return empty ModelRunnerOutput if there's no work to do. return EMPTY_MODEL_RUNNER_OUTPUT if self.is_multimodal_model: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7360c876..c99c6cb7 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -652,7 +652,7 @@ class TPUModelRunner: # Update cached state self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: - # Return empty ModelRunnerOuptut if there's no work to do. + # Return empty ModelRunnerOutput if there's no work to do. return EMPTY_MODEL_RUNNER_OUTPUT if self.is_multimodal_model: