Correct capitalisation: VLLM -> vLLM (#14562)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-03-10 17:36:21 +01:00 (committed by GitHub)
Commit: 3b352a2f92
Parent: dea985aef0
18 changed files with 25 additions and 25 deletions

View File

@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
print(f"Naive output={output_naive}")
print(f"FlashInfer output={output_flashinfer}")
-print(f"VLLM output={output_vllm}")
+print(f"vLLM output={output_vllm}")
if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
rtol=1e-2) and torch.allclose(

View File

@@ -37,7 +37,7 @@ you may contact the following individuals:
## Slack Discussion
-You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
to discuss security-related topics. However, please do not disclose any
vulnerabilities in this channel. If you need to report a vulnerability, please
use the GitHub security advisory system or contact a VMT member privately.

View File

@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
memory. This is also known as "KV cache offloading" and is configured
with `--swap-space` and `--preemption-mode`.
-In v0, [VLLM has long supported beam
+In v0, [vLLM has long supported beam
search](gh-issue:6226). The
SequenceGroup encapsulated the idea of N Sequences which
all shared the same prompt kv blocks. This enabled KV cache block
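
A quick, hedged illustration of the offloading knobs mentioned above, using the Python entrypoint; the keyword arguments are assumed to mirror the `--swap-space` / `--preemption-mode` CLI flags, and the model name is only a placeholder:

```python
from vllm import LLM

# Sketch of enabling KV cache offloading in v0 (argument names assumed).
llm = LLM(
    model="facebook/opt-125m",   # small placeholder model
    swap_space=4,                # GiB of CPU memory per GPU for swapped-out KV blocks
    preemption_mode="swap",      # swap preempted sequences out instead of recomputing them
)
```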

View File

@@ -5,7 +5,7 @@ with LMCache.
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and launch an additional LMCache server.
KV cache is transferred in the following manner:
-VLLM prefill node -> LMCache server -> VLLM decode node.
+vLLM prefill node -> LMCache server -> vLLM decode node.
Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
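
For orientation, a heavily hedged sketch of what the prefill side of such a setup can look like; `KVTransferConfig`, its import path and field names, and the `LMCacheConnector` string are assumptions here, so defer to the example script itself for the exact configuration:

```python
import os

from vllm import LLM
from vllm.config import KVTransferConfig  # import path assumed

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # GPU 0 acts as the prefill node

# Prefill instance: produces KV cache that flows to the LMCache server,
# from which the decode instance (GPU 1, kv_role="kv_consumer") pulls it.
prefill_llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model choice
    kv_transfer_config=KVTransferConfig(
        kv_connector="LMCacheConnector",  # assumed connector name
        kv_role="kv_producer",
        kv_rank=0,
        kv_parallel_size=2,
    ),
)
```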

View File

@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
excepted_value=0.76), # no bias
-# NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# a follow up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(

View File

@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
# (default behavior if this variable is None)
#
# THIS SELECTION TAKES PRECEDENCE OVER THE
-# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
+# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
forced_attn_backend: Optional[_Backend] = None
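
For context, the environment-variable path that this global override takes precedence over can be exercised like this (the module path and the backend name are illustrative assumptions):

```python
import os

# Must be set before the engine is initialised; it is read by
# get_env_variable_attn_backend().
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"

from vllm.attention.selector import get_env_variable_attn_backend  # assumed path

backend = get_env_variable_attn_backend()  # e.g. _Backend.FLASH_ATTN, or None if unset
# If forced_attn_backend has been set programmatically, that value wins over
# the environment variable.
```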

View File

@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
class VllmBackend:
-"""The compilation backend for `torch.compile` with VLLM.
+"""The compilation backend for `torch.compile` with vLLM.
It is used for compilation level of `CompilationLevel.PIECEWISE`,
where we customize the compilation.
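
As background, a `torch.compile` backend such as `VllmBackend` is a callable that receives an FX graph plus example inputs and returns a compiled callable. A generic toy backend (not vLLM's actual implementation) looks like this:

```python
from typing import Callable, List

import torch

def toy_backend(gm: torch.fx.GraphModule,
                example_inputs: List[torch.Tensor]) -> Callable:
    # A real backend (like VllmBackend) would split and transform the graph here;
    # this toy version just returns the captured graph unchanged.
    return gm.forward

compiled = torch.compile(torch.nn.Linear(8, 8), backend=toy_backend)
out = compiled(torch.randn(2, 8))  # runs through the toy backend
```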

View File

@@ -31,7 +31,7 @@ class CompilerInterface:
def compute_hash(self, vllm_config: VllmConfig) -> str:
"""
-Gather all the relevant information from the VLLM config,
+Gather all the relevant information from the vLLM config,
to compute a hash so that we can cache the compiled model.
See :meth:`VllmConfig.compute_hash` to check what information
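
A generic illustration of the idea behind `compute_hash` (not the actual vLLM implementation): hash a stable representation of the config fields that influence compilation, so compiled artifacts can be cached and reused across runs.

```python
import hashlib

def compute_hash_sketch(relevant_fields: dict) -> str:
    # Sort the items so the hash does not depend on insertion order.
    payload = repr(sorted(relevant_fields.items())).encode()
    return hashlib.sha256(payload).hexdigest()

# e.g. compute_hash_sketch({"model": "opt-125m", "dtype": "float16", "compilation_level": 3})
```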

View File

@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None
@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
"""
-Temporarily set the current VLLM config.
+Temporarily set the current vLLM config.
Used during model initialization.
-We save the current VLLM config in a global variable,
+We save the current vLLM config in a global variable,
so that all modules can access it, e.g. custom ops
-can access the VLLM config to determine how to dispatch.
+can access the vLLM config to determine how to dispatch.
"""
global _current_vllm_config
old_vllm_config = _current_vllm_config
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
# in ci, usually when we test custom ops/modules directly,
# we don't set the vllm config. In that case, we set a default
# config.
-logger.warning("Current VLLM config is not set.")
+logger.warning("Current vLLM config is not set.")
from vllm.config import VllmConfig
return VllmConfig()
return _current_vllm_config
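
A minimal usage sketch of this pair of helpers (the import path `vllm.config` is assumed):

```python
from vllm.config import VllmConfig, get_current_vllm_config, set_current_vllm_config

config = VllmConfig()  # a default config, purely for illustration

with set_current_vllm_config(config):
    # Inside the context, any module (e.g. a custom op) can read the active config.
    assert get_current_vllm_config() is config

# Outside the context the previous value is restored; if no config was ever set,
# get_current_vllm_config() logs a warning and falls back to a default VllmConfig().
```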

View File

@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
tool_choice: Optional[Union[Literal["none"], Literal["auto"],
ChatCompletionNamedToolChoiceParam]] = "none"
-# NOTE this will be ignored by VLLM -- the model determines the behavior
+# NOTE this will be ignored by vLLM -- the model determines the behavior
parallel_tool_calls: Optional[bool] = False
user: Optional[str] = None
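
To make the note concrete, here is a hedged client-side example using the `openai` package; the served model name and the tool definition are placeholders, and `parallel_tool_calls` is accepted only for OpenAI API compatibility:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="my-served-model",  # placeholder name of the model served by vLLM
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
    tool_choice="auto",
    parallel_tool_calls=False,  # ignored by vLLM; the model determines the behaviour
)
print(resp.choices[0].message)
```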

View File

@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VERBOSE":
lambda: bool(int(os.getenv('VERBOSE', '0'))),
-# Root directory for VLLM configuration files
+# Root directory for vLLM configuration files
# Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
# Note that this not only affects how vllm finds its configuration files
# during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# ================== Runtime Env Vars ==================
-# Root directory for VLLM cache files
+# Root directory for vLLM cache files
# Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
"VLLM_CACHE_ROOT":
lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENGINE_ITERATION_TIMEOUT_S":
lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
-# API key for VLLM API server
+# API key for vLLM API server
"VLLM_API_KEY":
lambda: os.environ.get("VLLM_API_KEY", None),
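
These are ordinary environment variables; a short sketch of setting the ones shown above before importing vLLM (all values are illustrative):

```python
import os

# Set these before importing vllm so the lambdas in `environment_variables`
# pick them up.
os.environ["VLLM_CACHE_ROOT"] = os.path.expanduser("~/scratch/vllm-cache")
os.environ["VLLM_ENGINE_ITERATION_TIMEOUT_S"] = "120"  # default shown above is 60
os.environ["VLLM_API_KEY"] = "my-secret-key"           # checked by the vLLM API server
```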

View File

@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"""
-Implements the Phi-4-multimodal-instruct model in VLLM.
+Implements the Phi-4-multimodal-instruct model in vLLM.
"""
packed_modules_mapping = {
"qkv_proj": [

View File

@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform):
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Multi-step scheduling is not supported (and not "
-"needed) on VLLM V1. Please launch without "
+"needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.")
else:
parallel_config.worker_cls = \

View File

@@ -173,7 +173,7 @@ class RocmPlatform(Platform):
if envs.VLLM_USE_V1:
raise NotImplementedError(
"Multi-step scheduling is not supported (and not "
-"needed) on VLLM V1. Please launch without "
+"needed) on vLLM V1. Please launch without "
"--num-scheduler-steps.")
else:
parallel_config.worker_cls = \
@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
elif vllm_config.speculative_config:
if envs.VLLM_USE_V1:
raise NotImplementedError(
-"Speculative decoding is not yet supported on VLLM V1."
+"Speculative decoding is not yet supported on vLLM V1."
)
else:
parallel_config.worker_cls = \

View File

@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase):
revision=revision)
return tokenizer_file
-# the following attributes are set to fit VLLM's design and are used
+# the following attributes are set to fit vLLM's design and are used
# by the guided structured output backends.
@property
def all_special_tokens_extended(self) -> List[str]:

View File

@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient):
# TODO(rob): rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
-# error code to the clients calling VLLM.
+# error code to the clients calling vLLM.
def sigusr1_handler(signum, frame):
logger.fatal("Got fatal signal from worker processes, shutting "
"down. See stack trace above for root cause issue.")

View File

@@ -248,7 +248,7 @@ class OutputProcessor:
****************** NOTE FOR DEVELOPERS ******************
-VLLM V1 minimizes the number of python loops over the full
+vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.

View File

@@ -93,10 +93,10 @@ class Processor:
) -> None:
# Best of not yet supported.
if params.best_of is not None and params.best_of > 1:
-raise ValueError("VLLM V1 does not yet support best_of.")
+raise ValueError("vLLM V1 does not yet support best_of.")
# Logits processors not supported.
if params.logits_processors:
-raise ValueError("VLLM V1 does not support per request "
+raise ValueError("vLLM V1 does not support per request "
"user provided logits processors.")
def _validate_params(
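
To make the effect of this validation concrete, a hedged sketch of a request that the V1 Processor rejects (`SamplingParams` usage assumed from vLLM's public API):

```python
from vllm import SamplingParams

# With the V1 engine (VLLM_USE_V1=1) this request is rejected at submission time:
#   ValueError: vLLM V1 does not yet support best_of.
params = SamplingParams(n=1, best_of=4, temperature=0.8)
```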