[BugFix] fix some typos found by typos. (#16314)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
parent 24834f4894
commit 04149cce27
@@ -921,7 +921,7 @@ if __name__ == "__main__":
     "--percentile-metrics",
     type=str,
     default="ttft,tpot,itl",
-    help="Comma-seperated list of selected metrics to report percentils. "
+    help="Comma-separated list of selected metrics to report percentils. "
     "This argument specifies the metrics to report percentiles. "
     "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
     "Default value is \"ttft,tpot,itl\".")
@@ -929,7 +929,7 @@ if __name__ == "__main__":
     "--metric-percentiles",
     type=str,
     default="99",
-    help="Comma-seperated list of percentiles for selected metrics. "
+    help="Comma-separated list of percentiles for selected metrics. "
    "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
    "Default value is \"99\". "
    "Use \"--percentile-metrics\" to select metrics.",
@@ -963,7 +963,7 @@ if __name__ == "__main__":
     "--percentile-metrics",
     type=str,
     default="ttft,tpot,itl",
-    help="Comma-seperated list of selected metrics to report percentils. "
+    help="Comma-separated list of selected metrics to report percentils. "
     "This argument specifies the metrics to report percentiles. "
     "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
     "Default value is \"ttft,tpot,itl\".")
@@ -971,7 +971,7 @@ if __name__ == "__main__":
     "--metric-percentiles",
     type=str,
     default="99",
-    help="Comma-seperated list of percentiles for selected metrics. "
+    help="Comma-separated list of percentiles for selected metrics. "
    "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
    "Default value is \"99\". "
    "Use \"--percentile-metrics\" to select metrics.",
@@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
     int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
     // in case the final state is separated between the last "smem_exchange" and
     // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
-    // (which occurs when `final_state_position` is a non-positivie index)
+    // (which occurs when `final_state_position` is a non-positive index)
     // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
     if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
         input_t vals_load[kNElts] = {0};
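For context on the corrected comment above: `final_state_position` goes non-positive when the tail of the sequence barely spills into the last chunk. A quick numeric illustration with made-up values (kWidth and kChunkSize are template parameters in the real kernel, and n_chunks is assumed here to be ceil(seqlen / kChunkSize)):

# Illustrative only; not the kernel's actual parameters.
kWidth, kChunkSize = 4, 1024
for seqlen in (1025, 1026, 1027, 1030):
    n_chunks = (seqlen + kChunkSize - 1) // kChunkSize  # assumed ceil division
    final_state_position = (seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize
    print(seqlen, n_chunks, final_state_position)
# 1025 -> -2, 1026 -> -1, 1027 -> 0, 1030 -> 3: the non-positive values are the
# boundary cases the comment describes, where the final state straddles the
# last two chunks.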
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
     assert self.use_cuda_graph

     if turn_prefills_into_decodes:
-        # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+        # When Multi-Step is enabled with Chunked-Prefill, prefills and
         # decodes are scheduled together. In the first step, all the
         # prefills turn into decodes. This update reflects that
         # conversion.
@@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
     logger.warning("Could not import HPU FusedSDPA kernel. "
                    "vLLM will use native implementation.")

-    suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
-    if head_size not in suppored_head_sizes:
+    supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+    if head_size not in supported_head_sizes:
         raise ValueError(
             f"Head size {head_size} is not supported by PagedAttention. "
-            f"Supported head sizes are: {suppored_head_sizes}.")
+            f"Supported head sizes are: {supported_head_sizes}.")

     if attn_type != AttentionType.DECODER:
         raise NotImplementedError("Encoder self-attention and "
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O

 NOTE: in the actual code,
-`kv_b_proj` is [W_UK; W_UV] concatnated per head
-`q_b_proj` is [W_UQ; W_QR] concatnated per head
+`kv_b_proj` is [W_UK; W_UV] concatenated per head
+`q_b_proj` is [W_UQ; W_QR] concatenated per head
 `out_proj` is W_O

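To make the "concatenated per head" layout in the note concrete, here is a toy illustration with made-up sizes (not vLLM's actual shapes or parameter names): the packed projection can be viewed per head and split back into its two sub-projections.

import torch

# Toy sizes, purely illustrative.
num_heads, rank, uk_dim, uv_dim = 4, 16, 8, 8

# kv_b_proj packs W_UK and W_UV head by head along the output dimension.
kv_b_proj = torch.randn(num_heads * (uk_dim + uv_dim), rank)

per_head = kv_b_proj.view(num_heads, uk_dim + uv_dim, rank)
W_UK, W_UV = per_head.split([uk_dim, uv_dim], dim=1)
print(W_UK.shape, W_UV.shape)  # torch.Size([4, 8, 16]) torch.Size([4, 8, 16])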
@@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
     assert num_seqs > num_queries

     if turn_prefills_into_decodes:
-        # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+        # When Multi-Step is enabled with Chunked-Prefill, prefills and
         # decodes are scheduled together. In the first step, all the
         # prefills turn into decodes. This update reflects that
         # conversion.
@@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
     assert self.num_heads % self.num_kv_heads == 0
     self.num_queries_per_kv = self.num_heads // self.num_kv_heads

-    suppored_head_sizes = PagedAttention.get_supported_head_sizes()
-    if head_size not in suppored_head_sizes:
+    supported_head_sizes = PagedAttention.get_supported_head_sizes()
+    if head_size not in supported_head_sizes:
         raise ValueError(
             f"Head size {head_size} is not supported by PagedAttention. "
-            f"Supported head sizes are: {suppored_head_sizes}.")
+            f"Supported head sizes are: {supported_head_sizes}.")

     self.attn_type = attn_type
@@ -446,7 +446,7 @@ def flash_paged_attention(
     IO tensor dtypes:
       - This kernel assumes all IO tensors have the same dtype except for
         block_tables (int32) and mask (int32)
-      - If mixed_percision is True, then all Tensor Engine operation will be
+      - If mixed_precision is True, then all Tensor Engine operation will be
        performed in bfloat16 and accumulation will be performed in float32.
        Otherwise the intermediates will be in the same type as the inputs.
@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
     "--percentile-metrics",
     type=str,
     default="ttft,tpot,itl",
-    help="Comma-seperated list of selected metrics to report percentils. "
+    help="Comma-separated list of selected metrics to report percentils. "
     "This argument specifies the metrics to report percentiles. "
     "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
 parser.add_argument(
     "--metric-percentiles",
     type=str,
     default="99",
-    help="Comma-seperated list of percentiles for selected metrics. "
+    help="Comma-separated list of percentiles for selected metrics. "
     "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
     "Use \"--percentile-metrics\" to select metrics.",
 )
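The two flags above work as a pair: --percentile-metrics selects which metrics get percentile rows, and --metric-percentiles selects which percentiles are computed for them. A minimal self-contained sketch of the same argparse pattern and how such comma-separated values are typically split afterwards (illustrative only, not vLLM's parsing code):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--percentile-metrics", type=str, default="ttft,tpot,itl",
                    help="Comma-separated list of metrics to report percentiles for.")
parser.add_argument("--metric-percentiles", type=str, default="99",
                    help="Comma-separated list of percentiles, e.g. \"25,50,75\".")

args = parser.parse_args(["--percentile-metrics", "ttft,tpot",
                          "--metric-percentiles", "50,99"])

# The values stay plain strings until they are split into lists.
selected_metrics = args.percentile_metrics.split(",")                          # ['ttft', 'tpot']
selected_percentiles = [float(p) for p in args.metric_percentiles.split(",")]  # [50.0, 99.0]
print(selected_metrics, selected_percentiles)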
@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
         externally (before the next schedule() call)
     """
     # Sequences can be in RUNNING or FINISHED_ABORTED state
-    # once scheduled, as a sequence is moved to FINSIHED_ABORTED
+    # once scheduled, as a sequence is moved to FINISHED_ABORTED
     # if a client disconnects from the api server.
     seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
     if seqs is None:
@@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]:


 # partial_json_parser doesn't support extra data and
-# JSONDecorder.raw_decode doesn't support partial JSON
+# JSONDecoder.raw_decode doesn't support partial JSON
 def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
     try:
         return (partial_json_parser.loads(input_str, flags), len(input_str))
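The corrected comment explains why both parsers are combined in this helper: the stdlib json.JSONDecoder.raw_decode tolerates trailing data but rejects truncated input, while partial_json_parser handles the truncated case. A stdlib-only illustration of the raw_decode half (hypothetical input strings, not vLLM code):

import json

decoder = json.JSONDecoder()

# raw_decode supports extra data: it returns the parsed object and the index
# where parsing stopped.
obj, end = decoder.raw_decode('{"a": 1} trailing text')
print(obj, end)  # {'a': 1} 8

# ...but it cannot parse partial JSON, which is where partial_json_parser
# comes in.
try:
    decoder.raw_decode('{"a": 1, "b": ')
except json.JSONDecodeError as exc:
    print("partial JSON rejected:", exc.msg)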
@@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel(
     compute_capability: Optional[int] = None
 ) -> Type[ScaledMMLinearKernel]:
     """
-    Choose an ScalledMMLinearKernel that can implement the given config for the
+    Choose an ScaledMMLinearKernel that can implement the given config for the
     given compute capability. Attempts to choose the best kernel in terms of
     performance.
@@ -69,12 +69,12 @@ class CpuPlatform(Platform):

     cache_config = vllm_config.cache_config

-    ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None
+    ipex_available = find_spec("intel_extension_for_pytorch") is not None

     if cache_config and cache_config.block_size is None:
-        cache_config.block_size = 128 if ipex_avaliable else 16
+        cache_config.block_size = 128 if ipex_available else 16

-    if not ipex_avaliable and cache_config.block_size != 16:
+    if not ipex_available and cache_config.block_size != 16:
         raise RuntimeError(
             f"--block-size={cache_config.block_size} requires"
             " intel_extension_for_pytorch")
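The renamed variable above uses a common pattern for optional dependencies: probing with importlib.util.find_spec detects the package without importing it. A small standalone sketch of the same pattern (the block-size default mirrors the hunk, but this is not the vLLM code path):

from importlib.util import find_spec

# find_spec returns None when the package is not installed, so the optional
# dependency is detected without importing it.
ipex_available = find_spec("intel_extension_for_pytorch") is not None

# Default mirrors the hunk above: 128 with IPEX, 16 otherwise.
block_size = 128 if ipex_available else 16
print(f"ipex available: {ipex_available}, block size: {block_size}")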
@@ -231,7 +231,7 @@ class Platform:
         parser: Optional[FlexibleArgumentParser] = None
     ) -> None:
         """
-        Do some pre-registeration or update action for the current platform.
+        Do some pre-registration or update action for the current platform.

         This function is called before global VllmConfig is initialized or cli
         arguments are parsed. It's used for out-of-tree platforms to register or
@@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser):

         Args:
             model_output (str): Output of the model to be parsed.
-            request (ChatCompletionReqest): Request being processed.
+            request (ChatCompletionRequest): Request being processed.

         Returns:
             tuple[Optional[str], Optional[str]]: Tuple pair containing the
@@ -101,7 +101,7 @@ class RequestOutputKind(Enum):
     CUMULATIVE = 0
     # Return only deltas in each RequestOutput
     DELTA = 1
-    # Do not return intermediate RequestOuputs
+    # Do not return intermediate RequestOutput
     FINAL_ONLY = 2

vllm/third_party/pynvml.py (vendored)
@@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure):
     e.g. class that has _field_ 'hex_value', c_uint could be formatted with
     _fmt_ = {"hex_value" : "%08X"}
     to produce nicer output.
-    Default fomratting string for all fields can be set with key "<default>" like:
+    Default formatting string for all fields can be set with key "<default>" like:
     _fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
     If not set it's assumed to be just "%s"
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O

 NOTE: in the actual code,
-`kv_b_proj` is [W_UK; W_UV] concatnated per head
-`q_b_proj` is [W_UQ; W_QR] concatnated per head
+`kv_b_proj` is [W_UK; W_UV] concatenated per head
+`q_b_proj` is [W_UQ; W_QR] concatenated per head
 `out_proj` is W_O

@@ -326,7 +326,7 @@ class WorkerProc:
     logger.debug("Worker interrupted.")

 except Exception:
-    # worker_busy_loop sends exceptions exceptons to Executor
+    # worker_busy_loop sends exceptions to Executor
     # for shutdown, but if there is an error in startup or an
     # error with IPC itself, we need to alert the parent.
     psutil.Process().parent().send_signal(signal.SIGUSR1)
@@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     ) -> Union[ModelRunnerOutput, torch.Tensor]:
         self._update_states(scheduler_output)
         if not scheduler_output.total_num_scheduled_tokens:
-            # Return empty ModelRunnerOuptut if there's no work to do.
+            # Return empty ModelRunnerOutput if there's no work to do.
             return EMPTY_MODEL_RUNNER_OUTPUT

         if self.is_multimodal_model:
@@ -652,7 +652,7 @@ class TPUModelRunner:
         # Update cached state
         self._update_states(scheduler_output)
         if not scheduler_output.total_num_scheduled_tokens:
-            # Return empty ModelRunnerOuptut if there's no work to do.
+            # Return empty ModelRunnerOutput if there's no work to do.
             return EMPTY_MODEL_RUNNER_OUTPUT

         if self.is_multimodal_model: