[BugFix] fix some typos found by typos. (#16314)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
parent 24834f4894
commit 04149cce27
@@ -921,7 +921,7 @@ if __name__ == "__main__":
 "--percentile-metrics",
 type=str,
 default="ttft,tpot,itl",
-help="Comma-seperated list of selected metrics to report percentils. "
+help="Comma-separated list of selected metrics to report percentils. "
 "This argument specifies the metrics to report percentiles. "
 "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
 "Default value is \"ttft,tpot,itl\".")
@@ -929,7 +929,7 @@ if __name__ == "__main__":
 "--metric-percentiles",
 type=str,
 default="99",
-help="Comma-seperated list of percentiles for selected metrics. "
+help="Comma-separated list of percentiles for selected metrics. "
 "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
 "Default value is \"99\". "
 "Use \"--percentile-metrics\" to select metrics.",
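For context, the two options above take plain comma-separated strings. A minimal sketch of how such values might be split and reported, using numpy (the function name, dictionary layout, and sample latency numbers below are hypothetical, not the benchmark's actual internals):

import numpy as np

def report_percentiles(percentile_metrics: str, metric_percentiles: str,
                       samples: dict[str, list[float]]) -> None:
    # e.g. percentile_metrics="ttft,tpot,itl", metric_percentiles="25,50,75"
    selected = [m.strip() for m in percentile_metrics.split(",") if m.strip()]
    percentiles = [float(p) for p in metric_percentiles.split(",")]
    for metric in selected:
        values = samples.get(metric, [])
        if not values:
            continue
        for p in percentiles:
            print(f"P{p:g} {metric}: {np.percentile(values, p):.2f} ms")

# Hypothetical usage with made-up latency samples (milliseconds):
report_percentiles("ttft,itl", "50,99",
                   {"ttft": [12.0, 15.5, 40.2], "itl": [3.1, 3.4, 9.8]})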
@@ -963,7 +963,7 @@ if __name__ == "__main__":
 "--percentile-metrics",
 type=str,
 default="ttft,tpot,itl",
-help="Comma-seperated list of selected metrics to report percentils. "
+help="Comma-separated list of selected metrics to report percentils. "
 "This argument specifies the metrics to report percentiles. "
 "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
 "Default value is \"ttft,tpot,itl\".")
@@ -971,7 +971,7 @@ if __name__ == "__main__":
 "--metric-percentiles",
 type=str,
 default="99",
-help="Comma-seperated list of percentiles for selected metrics. "
+help="Comma-separated list of percentiles for selected metrics. "
 "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
 "Default value is \"99\". "
 "Use \"--percentile-metrics\" to select metrics.",
@@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
 int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
 // in case the final state is separated between the last "smem_exchange" and
 // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
-// (which occurs when `final_state_position` is a non-positivie index)
+// (which occurs when `final_state_position` is a non-positive index)
 // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
 if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
 input_t vals_load[kNElts] = {0};
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
 assert self.use_cuda_graph

 if turn_prefills_into_decodes:
-# When Mutli-Step is enabled with Chunked-Prefill, prefills and
+# When Multi-Step is enabled with Chunked-Prefill, prefills and
 # decodes are scheduled together. In the first step, all the
 # prefills turn into decodes. This update reflects that
 # conversion.
@@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
 logger.warning("Could not import HPU FusedSDPA kernel. "
 "vLLM will use native implementation.")

-suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
-if head_size not in suppored_head_sizes:
+supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+if head_size not in supported_head_sizes:
 raise ValueError(
 f"Head size {head_size} is not supported by PagedAttention. "
-f"Supported head sizes are: {suppored_head_sizes}.")
+f"Supported head sizes are: {supported_head_sizes}.")

 if attn_type != AttentionType.DECODER:
 raise NotImplementedError("Encoder self-attention and "
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O

 NOTE: in the actual code,
-`kv_b_proj` is [W_UK; W_UV] concatnated per head
-`q_b_proj` is [W_UQ; W_QR] concatnated per head
+`kv_b_proj` is [W_UK; W_UV] concatenated per head
+`q_b_proj` is [W_UQ; W_QR] concatenated per head
 `out_proj` is W_O

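To make the concatenated layout in that note concrete, here is a small sketch of splitting a combined projection back into its two halves per head with torch.split (the dimensions below are illustrative placeholders, not the model's real sizes):

import torch

num_heads, rank, nope_dim, v_dim = 8, 64, 32, 32
# kv_b_proj-style weight: per head, W_UK and W_UV stacked along the output dim.
kv_b_weight = torch.randn(num_heads * (nope_dim + v_dim), rank)

per_head = kv_b_weight.view(num_heads, nope_dim + v_dim, rank)
w_uk, w_uv = per_head.split([nope_dim, v_dim], dim=1)
print(w_uk.shape, w_uv.shape)  # [8, 32, 64] and [8, 32, 64]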
@@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
 assert num_seqs > num_queries

 if turn_prefills_into_decodes:
-# When Mutli-Step is enabled with Chunked-Prefill, prefills and
+# When Multi-Step is enabled with Chunked-Prefill, prefills and
 # decodes are scheduled together. In the first step, all the
 # prefills turn into decodes. This update reflects that
 # conversion.
@@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
 assert self.num_heads % self.num_kv_heads == 0
 self.num_queries_per_kv = self.num_heads // self.num_kv_heads

-suppored_head_sizes = PagedAttention.get_supported_head_sizes()
-if head_size not in suppored_head_sizes:
+supported_head_sizes = PagedAttention.get_supported_head_sizes()
+if head_size not in supported_head_sizes:
 raise ValueError(
 f"Head size {head_size} is not supported by PagedAttention. "
-f"Supported head sizes are: {suppored_head_sizes}.")
+f"Supported head sizes are: {supported_head_sizes}.")

 self.attn_type = attn_type

@@ -446,7 +446,7 @@ def flash_paged_attention(
 IO tensor dtypes:
 - This kernel assumes all IO tensors have the same dtype except for
 block_tables (int32) and mask (int32)
-- If mixed_percision is True, then all Tensor Engine operation will be
+- If mixed_precision is True, then all Tensor Engine operation will be
 performed in bfloat16 and accumulation will be performed in float32.
 Otherwise the intermediates will be in the same type as the inputs.

@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
 "--percentile-metrics",
 type=str,
 default="ttft,tpot,itl",
-help="Comma-seperated list of selected metrics to report percentils. "
+help="Comma-separated list of selected metrics to report percentils. "
 "This argument specifies the metrics to report percentiles. "
 "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
 parser.add_argument(
 "--metric-percentiles",
 type=str,
 default="99",
-help="Comma-seperated list of percentiles for selected metrics. "
+help="Comma-separated list of percentiles for selected metrics. "
 "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
 "Use \"--percentile-metrics\" to select metrics.",
 )
@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
 externally (before the next schedule() call)
 """
 # Sequences can be in RUNNING or FINISHED_ABORTED state
-# once scheduled, as a sequence is moved to FINSIHED_ABORTED
+# once scheduled, as a sequence is moved to FINISHED_ABORTED
 # if a client disconnects from the api server.
 seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
 if seqs is None:
@@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]:


 # partial_json_parser doesn't support extra data and
-# JSONDecorder.raw_decode doesn't support partial JSON
+# JSONDecoder.raw_decode doesn't support partial JSON
 def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
 try:
 return (partial_json_parser.loads(input_str, flags), len(input_str))
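The comment fixed above contrasts the two parsers' limitations. The standard-library side is easy to demonstrate: json.JSONDecoder().raw_decode parses one complete value and reports where it stopped, so it tolerates trailing extra data but rejects truncated input. A minimal stdlib-only sketch of that behaviour (not vLLM's actual fallback logic):

import json

decoder = json.JSONDecoder()

obj, end = decoder.raw_decode('{"a": 1} trailing text')
print(obj, end)   # {'a': 1} 8  -> anything after index 8 is simply left alone

try:
    decoder.raw_decode('{"a": 1, "b":')  # truncated, i.e. partial JSON
except json.JSONDecodeError as exc:
    print("partial JSON is rejected:", exc.msg)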
@@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel(
 compute_capability: Optional[int] = None
 ) -> Type[ScaledMMLinearKernel]:
 """
-Choose an ScalledMMLinearKernel that can implement the given config for the
+Choose an ScaledMMLinearKernel that can implement the given config for the
 given compute capability. Attempts to choose the best kernel in terms of
 performance.

@@ -69,12 +69,12 @@ class CpuPlatform(Platform):

 cache_config = vllm_config.cache_config

-ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None
+ipex_available = find_spec("intel_extension_for_pytorch") is not None

 if cache_config and cache_config.block_size is None:
-cache_config.block_size = 128 if ipex_avaliable else 16
+cache_config.block_size = 128 if ipex_available else 16

-if not ipex_avaliable and cache_config.block_size != 16:
+if not ipex_available and cache_config.block_size != 16:
 raise RuntimeError(
 f"--block-size={cache_config.block_size} requires"
 " intel_extension_for_pytorch")
@@ -231,7 +231,7 @@ class Platform:
 parser: Optional[FlexibleArgumentParser] = None
 ) -> None:
 """
-Do some pre-registeration or update action for the current platform.
+Do some pre-registration or update action for the current platform.

 This function is called before global VllmConfig is initialized or cli
 arguments are parsed. It's used for out-of-tree platforms to register or
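The docstring above describes a hook an out-of-tree platform can use before the config is built or CLI arguments are parsed. A rough sketch of what such a subclass might look like; the class name, method name, and added flag are hypothetical, only the parameter shape follows the signature visible in the hunk:

import argparse
from typing import Optional

class MyAcceleratorPlatform:  # hypothetical out-of-tree Platform subclass
    @classmethod
    def pre_register_hook(cls, parser: Optional[argparse.ArgumentParser] = None) -> None:
        # Runs before the global config is initialized and before CLI parsing,
        # so the platform can register itself or extend the argument parser.
        if parser is not None:
            parser.add_argument("--my-accel-block-size", type=int, default=16)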
@@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser):

 Args:
 model_output (str): Output of the model to be parsed.
-request (ChatCompletionReqest): Request being processed.
+request (ChatCompletionRequest): Request being processed.

 Returns:
 tuple[Optional[str], Optional[str]]: Tuple pair containing the
@@ -101,7 +101,7 @@ class RequestOutputKind(Enum):
 CUMULATIVE = 0
 # Return only deltas in each RequestOutput
 DELTA = 1
-# Do not return intermediate RequestOuputs
+# Do not return intermediate RequestOutput
 FINAL_ONLY = 2

vllm/third_party/pynvml.py (vendored, 2 changes)
@@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure):
 e.g. class that has _field_ 'hex_value', c_uint could be formatted with
 _fmt_ = {"hex_value" : "%08X"}
 to produce nicer output.
-Default fomratting string for all fields can be set with key "<default>" like:
+Default formatting string for all fields can be set with key "<default>" like:
 _fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
 If not set it's assumed to be just "%s"

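The docstring above spells out the lookup order: a per-field entry in _fmt_, then the "<default>" key, then plain "%s". A standalone sketch of that rule on a small ctypes structure; this mimics the behaviour described in the vendored docstring rather than reusing the vendored class, and the field names are invented:

import ctypes

class ClockInfo(ctypes.Structure):
    _fields_ = [("sm_clock", ctypes.c_uint), ("hex_value", ctypes.c_uint)]
    _fmt_ = {"hex_value": "%08X", "<default>": "%d MHz"}

    def __str__(self) -> str:
        parts = []
        for name, _ in self._fields_:
            # Per-field format wins, then "<default>", then a bare "%s".
            fmt = self._fmt_.get(name, self._fmt_.get("<default>", "%s"))
            parts.append(f"{name}: {fmt % getattr(self, name)}")
        return ", ".join(parts)

print(ClockInfo(sm_clock=1410, hex_value=0xBEEF))  # sm_clock: 1410 MHz, hex_value: 0000BEEF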
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O

 NOTE: in the actual code,
-`kv_b_proj` is [W_UK; W_UV] concatnated per head
-`q_b_proj` is [W_UQ; W_QR] concatnated per head
+`kv_b_proj` is [W_UK; W_UV] concatenated per head
+`q_b_proj` is [W_UQ; W_QR] concatenated per head
 `out_proj` is W_O

@@ -326,7 +326,7 @@ class WorkerProc:
 logger.debug("Worker interrupted.")

 except Exception:
-# worker_busy_loop sends exceptions exceptons to Executor
+# worker_busy_loop sends exceptions to Executor
 # for shutdown, but if there is an error in startup or an
 # error with IPC itself, we need to alert the parent.
 psutil.Process().parent().send_signal(signal.SIGUSR1)
@@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
 ) -> Union[ModelRunnerOutput, torch.Tensor]:
 self._update_states(scheduler_output)
 if not scheduler_output.total_num_scheduled_tokens:
-# Return empty ModelRunnerOuptut if there's no work to do.
+# Return empty ModelRunnerOutput if there's no work to do.
 return EMPTY_MODEL_RUNNER_OUTPUT

 if self.is_multimodal_model:
@@ -652,7 +652,7 @@ class TPUModelRunner:
 # Update cached state
 self._update_states(scheduler_output)
 if not scheduler_output.total_num_scheduled_tokens:
-# Return empty ModelRunnerOuptut if there's no work to do.
+# Return empty ModelRunnerOutput if there's no work to do.
 return EMPTY_MODEL_RUNNER_OUTPUT

 if self.is_multimodal_model: