[BugFix] fix some typos found by typos. (#16314)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Authored by yihong on 2025-04-09 18:43:59 +08:00; committed by GitHub
parent 24834f4894
commit 04149cce27
21 changed files with 33 additions and 33 deletions

View File

@ -921,7 +921,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@ -929,7 +929,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",

View File

@ -963,7 +963,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@ -971,7 +971,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",

View File

@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
// in case the final state is separated between the last "smem_exchange" and
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
- // (which occurs when `final_state_position` is a non-positivie index)
+ // (which occurs when `final_state_position` is a non-positive index)
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
input_t vals_load[kNElts] = {0};
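
A worked example of the index arithmetic in the comment above (Python with made-up sizes; n_chunks is assumed here to be the ceiling division of seqlen by kChunkSize):

# Toy numbers that mirror the kernel's variable names, not a real launch configuration.
seqlen, kWidth, kChunkSize = 130, 4, 64
n_chunks = (seqlen + kChunkSize - 1) // kChunkSize                             # 3
final_state_position = (seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize   # 127 - 128 = -1
# A non-positive result means the final (kWidth - 1) inputs straddle the boundary between
# the last chunk and the one before it, so both chunks' smem_exchange data must be read.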

View File

@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
assert self.use_cuda_graph
if turn_prefills_into_decodes:
- # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+ # When Multi-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# conversion.
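
A conceptual illustration of that conversion (toy Python, not the actual metadata update): after the first multi-step iteration every scheduled prefill has produced its first token, so the remaining steps treat it as a decode with a query length of 1.

# Hypothetical bookkeeping only.
query_lens = [512, 64, 1, 1]          # two chunked prefills followed by two decodes
query_lens = [1] * len(query_lens)    # after step 1, every request is a decode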

View File

@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
logger.warning("Could not import HPU FusedSDPA kernel. "
"vLLM will use native implementation.")
- suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
- if head_size not in suppored_head_sizes:
+ supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+ if head_size not in supported_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {suppored_head_sizes}.")
f"Supported head sizes are: {supported_head_sizes}.")
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "

View File

@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
return spda_o @ W_O
NOTE: in the actual code,
- `kv_b_proj` is [W_UK; W_UV] concatnated per head
- `q_b_proj` is [W_UQ; W_QR] concatnated per head
+ `kv_b_proj` is [W_UK; W_UV] concatenated per head
+ `q_b_proj` is [W_UQ; W_QR] concatenated per head
`out_proj` is W_O
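
A minimal sketch of what "concatenated per head" means for these projection weights (PyTorch; the dimension names and sizes below are assumptions for illustration, not vLLM's actual shapes):

import torch

num_heads, kv_lora_rank = 16, 512
qk_nope_head_dim, v_head_dim = 128, 128

# kv_b_proj maps the compressed KV latent (kv_lora_rank) to per-head K-nope and V parts.
kv_b_weight = torch.randn(num_heads * (qk_nope_head_dim + v_head_dim), kv_lora_rank)
kv_b = kv_b_weight.view(num_heads, qk_nope_head_dim + v_head_dim, kv_lora_rank)
W_UK, W_UV = kv_b.split([qk_nope_head_dim, v_head_dim], dim=1)
# W_UK: (num_heads, qk_nope_head_dim, kv_lora_rank); W_UV: (num_heads, v_head_dim, kv_lora_rank)
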
@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
assert num_seqs > num_queries
if turn_prefills_into_decodes:
- # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+ # When Multi-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# conversion.

View File

@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
- suppored_head_sizes = PagedAttention.get_supported_head_sizes()
- if head_size not in suppored_head_sizes:
+ supported_head_sizes = PagedAttention.get_supported_head_sizes()
+ if head_size not in supported_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {suppored_head_sizes}.")
f"Supported head sizes are: {supported_head_sizes}.")
self.attn_type = attn_type

View File

@ -446,7 +446,7 @@ def flash_paged_attention(
IO tensor dtypes:
- This kernel assumes all IO tensors have the same dtype except for
block_tables (int32) and mask (int32)
- - If mixed_percision is True, then all Tensor Engine operation will be
+ - If mixed_precision is True, then all Tensor Engine operation will be
performed in bfloat16 and accumulation will be performed in float32.
Otherwise the intermediates will be in the same type as the inputs.
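
The mixed-precision behaviour described above can be illustrated outside the kernel (plain PyTorch sketch, not NKI code): keep the operands and the matmul in bfloat16 while accumulating in float32.

import torch

a = torch.randn(128, 64, dtype=torch.bfloat16)
b = torch.randn(64, 128, dtype=torch.bfloat16)
acc = torch.zeros(128, 128, dtype=torch.float32)
acc += (a @ b).float()   # intermediate product in bf16, accumulation in fp32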

View File

@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. "
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-seperated list of percentiles for selected metrics. "
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Use \"--percentile-metrics\" to select metrics.",
)

View File

@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
externally (before the next schedule() call)
"""
# Sequences can be in RUNNING or FINISHED_ABORTED state
- # once scheduled, as a sequence is moved to FINSIHED_ABORTED
+ # once scheduled, as a sequence is moved to FINISHED_ABORTED
# if a client disconnects from the api server.
seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
if seqs is None:

View File

@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]:
# partial_json_parser doesn't support extra data and
- # JSONDecorder.raw_decode doesn't support partial JSON
+ # JSONDecoder.raw_decode doesn't support partial JSON
def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
try:
return (partial_json_parser.loads(input_str, flags), len(input_str))
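
One way to combine the two libraries named in the comment, sketched here rather than quoted from vLLM, and assuming partial_json_parser surfaces a json.JSONDecodeError when a complete value is followed by extra data:

import json
import partial_json_parser
from partial_json_parser.core.options import Allow

def partial_json_loads_sketch(input_str: str, flags: Allow):
    try:
        # Parse a possibly truncated JSON document.
        return (partial_json_parser.loads(input_str, flags), len(input_str))
    except json.JSONDecodeError as e:
        if "Extra data" in e.msg:
            # A complete value with trailing text: raw_decode also reports how
            # many characters it consumed.
            return json.JSONDecoder().raw_decode(input_str)
        raise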

View File

@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel(
compute_capability: Optional[int] = None
) -> Type[ScaledMMLinearKernel]:
"""
- Choose an ScalledMMLinearKernel that can implement the given config for the
+ Choose an ScaledMMLinearKernel that can implement the given config for the
given compute capability. Attempts to choose the best kernel in terms of
performance.
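
The selection logic the docstring describes can be sketched generically; everything below, including the candidate list and the can_implement helper, is hypothetical and not vLLM's actual API:

from typing import Optional

class _ToyKernel:
    # Stand-in for a kernel class; real candidates would wrap actual scaled-mm implementations.
    min_capability = 75

    @classmethod
    def can_implement(cls, config) -> tuple[bool, Optional[str]]:
        return True, None

_PREFERENCE_ORDER = [_ToyKernel]   # best-performing candidates first

def choose_kernel_sketch(config, compute_capability: Optional[int]):
    failures = []
    for kernel in _PREFERENCE_ORDER:
        if compute_capability is not None and kernel.min_capability > compute_capability:
            failures.append(f"{kernel.__name__} needs capability {kernel.min_capability}")
            continue
        ok, why = kernel.can_implement(config)
        if ok:
            return kernel
        failures.append(f"{kernel.__name__}: {why}")
    raise ValueError("No compatible kernel: " + "; ".join(failures))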

View File

@ -69,12 +69,12 @@ class CpuPlatform(Platform):
cache_config = vllm_config.cache_config
- ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None
+ ipex_available = find_spec("intel_extension_for_pytorch") is not None
if cache_config and cache_config.block_size is None:
- cache_config.block_size = 128 if ipex_avaliable else 16
+ cache_config.block_size = 128 if ipex_available else 16
- if not ipex_avaliable and cache_config.block_size != 16:
+ if not ipex_available and cache_config.block_size != 16:
raise RuntimeError(
f"--block-size={cache_config.block_size} requires"
" intel_extension_for_pytorch")

View File

@ -231,7 +231,7 @@ class Platform:
parser: Optional[FlexibleArgumentParser] = None
) -> None:
"""
- Do some pre-registeration or update action for the current platform.
+ Do some pre-registration or update action for the current platform.
This function is called before global VllmConfig is initialized or cli
arguments are parsed. It's used for out-of-tree platforms to register or

View File

@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser):
Args:
model_output (str): Output of the model to be parsed.
- request (ChatCompletionReqest): Request being processed.
+ request (ChatCompletionRequest): Request being processed.
Returns:
tuple[Optional[str], Optional[str]]: Tuple pair containing the

View File

@ -101,7 +101,7 @@ class RequestOutputKind(Enum):
CUMULATIVE = 0
# Return only deltas in each RequestOutput
DELTA = 1
- # Do not return intermediate RequestOuputs
+ # Do not return intermediate RequestOutput
FINAL_ONLY = 2
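
A small illustration (not vLLM code) of what the three modes mean to a consumer of streamed outputs: with DELTA each output carries only the newly generated piece, while CUMULATIVE outputs (and the single FINAL_ONLY output) already contain everything generated so far.

def accumulate(chunks, kind):
    # DELTA: concatenate the pieces; CUMULATIVE / FINAL_ONLY: the latest chunk is complete.
    text = ""
    for chunk in chunks:
        text = text + chunk if kind == "DELTA" else chunk
    return text

assert accumulate(["Hel", "lo"], "DELTA") == accumulate(["Hel", "Hello"], "CUMULATIVE") == "Hello"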

View File

@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure):
e.g. class that has _field_ 'hex_value', c_uint could be formatted with
_fmt_ = {"hex_value" : "%08X"}
to produce nicer output.
- Default fomratting string for all fields can be set with key "<default>" like:
+ Default formatting string for all fields can be set with key "<default>" like:
_fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
If not set it's assumed to be just "%s"
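
The lookup rule described above (field-specific format string, then the "<default>" entry, then plain "%s") can be shown in isolation; this is an illustration, not the vendored class:

_fmt_ = {"hex_value": "%08X", "<default>": "%d MHz"}

def format_field(name, value, fmt=_fmt_):
    # Prefer a field-specific format, then the "<default>" entry, then plain "%s".
    return fmt.get(name, fmt.get("<default>", "%s")) % value

print(format_field("hex_value", 0xDEADBEEF))   # DEADBEEF
print(format_field("clock", 1410))             # 1410 MHz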

View File

@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
return spda_o @ W_O
NOTE: in the actual code,
- `kv_b_proj` is [W_UK; W_UV] concatnated per head
- `q_b_proj` is [W_UQ; W_QR] concatnated per head
+ `kv_b_proj` is [W_UK; W_UV] concatenated per head
+ `q_b_proj` is [W_UQ; W_QR] concatenated per head
`out_proj` is W_O

View File

@ -326,7 +326,7 @@ class WorkerProc:
logger.debug("Worker interrupted.")
except Exception:
- # worker_busy_loop sends exceptions exceptons to Executor
+ # worker_busy_loop sends exceptions to Executor
# for shutdown, but if there is an error in startup or an
# error with IPC itself, we need to alert the parent.
psutil.Process().parent().send_signal(signal.SIGUSR1)

View File

@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
) -> Union[ModelRunnerOutput, torch.Tensor]:
self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens:
- # Return empty ModelRunnerOuptut if there's no work to do.
+ # Return empty ModelRunnerOutput if there's no work to do.
return EMPTY_MODEL_RUNNER_OUTPUT
if self.is_multimodal_model:

View File

@ -652,7 +652,7 @@ class TPUModelRunner:
# Update cached state
self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens:
- # Return empty ModelRunnerOuptut if there's no work to do.
+ # Return empty ModelRunnerOutput if there's no work to do.
return EMPTY_MODEL_RUNNER_OUTPUT
if self.is_multimodal_model: