diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index 010a38b7..eaf6b25e 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
     print(f"Naive output={output_naive}")
     print(f"FlashInfer output={output_flashinfer}")
-    print(f"VLLM output={output_vllm}")
+    print(f"vLLM output={output_vllm}")
 
     if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
                       rtol=1e-2) and torch.allclose(
diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md
index a9bbfde2..1842b301 100644
--- a/docs/source/contributing/vulnerability_management.md
+++ b/docs/source/contributing/vulnerability_management.md
@@ -37,7 +37,7 @@ you may contact the following individuals:
 
 ## Slack Discussion
 
-You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
 to discuss security-related topics. However, please do not disclose any
 vulnerabilities in this channel. If you need to report a vulnerability, please
 use the GitHub security advisory system or contact a VMT member privately.
diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md
index 0d74d21a..bed40516 100644
--- a/docs/source/design/v1/metrics.md
+++ b/docs/source/design/v1/metrics.md
@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to
 CPU memory. This is also known as "KV cache offloading" and is
 configured with `--swap-space` and `--preemption-mode`.
 
-In v0, [VLLM has long supported beam
+In v0, [vLLM has long supported beam
 search](gh-issue:6226). The SequenceGroup encapsulated the idea of N
 Sequences which all shared the same prompt kv blocks. This enabled KV
 cache block
diff --git a/examples/offline_inference/disaggregated_prefill_lmcache.py b/examples/offline_inference/disaggregated_prefill_lmcache.py
index 36d343c6..5c84bbfc 100644
--- a/examples/offline_inference/disaggregated_prefill_lmcache.py
+++ b/examples/offline_inference/disaggregated_prefill_lmcache.py
@@ -5,7 +5,7 @@ with LMCache.
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and launch an additional LMCache server.
 KV cache is transferred in the following manner:
-VLLM prefill node -> LMCache server -> VLLM decode node.
+vLLM prefill node -> LMCache server -> vLLM decode node.
 
 Note that `pip install lmcache` is needed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
index 3db9bc73..20f9dd77 100644
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [
     GSM8KAccuracyTestConfig(
         model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
         excepted_value=0.76),  # no bias
-    # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
     # so only one of these tests can run in a single call to pytest. As
     # a follow up, move this into the LM-EVAL section of the CI.
     # GSM8KAccuracyTestConfig(
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 26c6ac81..ebbdea27 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
 # (default behavior if this variable is None)
 #
 # THIS SELECTION TAKES PRECEDENCE OVER THE
-# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
+# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
 forced_attn_backend: Optional[_Backend] = None
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index afb63cf8..cdae42fe 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
 
 
 class VllmBackend:
-    """The compilation backend for `torch.compile` with VLLM.
+    """The compilation backend for `torch.compile` with vLLM.
 
     It is used for compilation level of `CompilationLevel.PIECEWISE`,
     where we customize the compilation.
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index d280fdfb..b45c694f 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -31,7 +31,7 @@ class CompilerInterface:
 
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         """
-        Gather all the relevant information from the VLLM config,
+        Gather all the relevant information from the vLLM config,
         to compute a hash so that we can cache the compiled model.
 
         See :meth:`VllmConfig.compute_hash` to check what information
diff --git a/vllm/config.py b/vllm/config.py
index ad436a1e..a6ac9f43 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None
 @contextmanager
 def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
     """
-    Temporarily set the current VLLM config.
+    Temporarily set the current vLLM config.
     Used during model initialization.
-    We save the current VLLM config in a global variable,
+    We save the current vLLM config in a global variable,
     so that all modules can access it, e.g. custom ops
-    can access the VLLM config to determine how to dispatch.
+    can access the vLLM config to determine how to dispatch.
     """
     global _current_vllm_config
     old_vllm_config = _current_vllm_config
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
         # in ci, usually when we test custom ops/modules directly,
         # we don't set the vllm config. In that case, we set a default
         # config.
-        logger.warning("Current VLLM config is not set.")
+        logger.warning("Current vLLM config is not set.")
         from vllm.config import VllmConfig
         return VllmConfig()
     return _current_vllm_config
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6b519e1b..90076a45 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     tool_choice: Optional[Union[Literal["none"], Literal["auto"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
 
-    # NOTE this will be ignored by VLLM -- the model determines the behavior
+    # NOTE this will be ignored by vLLM -- the model determines the behavior
     parallel_tool_calls: Optional[bool] = False
     user: Optional[str] = None
diff --git a/vllm/envs.py b/vllm/envs.py
index 187d28b2..24ee4583 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VERBOSE":
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
 
-    # Root directory for VLLM configuration files
+    # Root directory for vLLM configuration files
     # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # ================== Runtime Env Vars ==================
 
-    # Root directory for VLLM cache files
+    # Root directory for vLLM cache files
     # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
     "VLLM_CACHE_ROOT":
     lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ENGINE_ITERATION_TIMEOUT_S":
     lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
 
-    # API key for VLLM API server
+    # API key for vLLM API server
     "VLLM_API_KEY":
     lambda: os.environ.get("VLLM_API_KEY", None),
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 89abfc59..2a839f3a 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
 class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     """
-    Implements the Phi-4-multimodal-instruct model in VLLM.
+    Implements the Phi-4-multimodal-instruct model in vLLM.
     """
     packed_modules_mapping = {
         "qkv_proj": [
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 1bba9908..a986ec0a 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform):
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
                     "Multi-step scheduling is not supported (and not "
-                    "needed) on VLLM V1. Please launch without "
+                    "needed) on vLLM V1. Please launch without "
                     "--num-scheduler-steps.")
             else:
                 parallel_config.worker_cls = \
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index a4f18cbf..de4f6070 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -173,7 +173,7 @@ class RocmPlatform(Platform):
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
                     "Multi-step scheduling is not supported (and not "
-                    "needed) on VLLM V1. Please launch without "
+                    "needed) on vLLM V1. Please launch without "
                     "--num-scheduler-steps.")
             else:
                 parallel_config.worker_cls = \
@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
         elif vllm_config.speculative_config:
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
-                    "Speculative decoding is not yet supported on VLLM V1."
+                    "Speculative decoding is not yet supported on vLLM V1."
                 )
             else:
                 parallel_config.worker_cls = \
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 801597bd..40a5777f 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase):
                                              revision=revision)
         return tokenizer_file
 
-    # the following attributes are set to fit VLLM's design and are used
+    # the following attributes are set to fit vLLM's design and are used
     # by the guided structured output backends.
     @property
     def all_special_tokens_extended(self) -> List[str]:
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index d060d977..0f92adcc 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient):
         # TODO(rob): rather than killing the main process, we should
         # figure out how to raise an AsyncEngineDeadError and
         # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
+        # error code to the clients calling vLLM.
         def sigusr1_handler(signum, frame):
             logger.fatal("Got fatal signal from worker processes, shutting "
                          "down. See stack trace above for root cause issue.")
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index aca0233e..aea52618 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -248,7 +248,7 @@ class OutputProcessor:
 
         ****************** NOTE FOR DEVELOPERS ******************
 
-        VLLM V1 minimizes the number of python loops over the full
+        vLLM V1 minimizes the number of python loops over the full
         batch to ensure system overheads are minimized. This is the
         only function that should loop over EngineCoreOutputs.
 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 38638c1e..5c940cce 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -93,10 +93,10 @@ class Processor:
     ) -> None:
         # Best of not yet supported.
         if params.best_of is not None and params.best_of > 1:
-            raise ValueError("VLLM V1 does not yet support best_of.")
+            raise ValueError("vLLM V1 does not yet support best_of.")
         # Logits processors not supported.
         if params.logits_processors:
-            raise ValueError("VLLM V1 does not support per request "
+            raise ValueError("vLLM V1 does not support per request "
                              "user provided logits processors.")
 
     def _validate_params(
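As context for the two renamed V1 error strings in the final hunk, the sketch below shows one way the `best_of` check can surface through the public API. It is illustrative only: it assumes V1 is enabled via `VLLM_USE_V1=1`, that the `ValueError` raised by the `Processor` propagates out of `LLM.generate()`, and it uses `facebook/opt-125m` purely as a small placeholder model.

```python
# Illustrative sketch only (assumptions noted in the comments below).
import os

# Assumption: opting into the V1 engine via the VLLM_USE_V1 env var.
os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM, SamplingParams

# Placeholder model, chosen only because it is small.
llm = LLM(model="facebook/opt-125m")

try:
    # best_of > 1 hits the check renamed in the last hunk above.
    llm.generate("Hello, my name is",
                 SamplingParams(n=1, best_of=2, temperature=0.8))
except ValueError as err:
    print(err)  # expected message: "vLLM V1 does not yet support best_of."
```

The `logits_processors` check in the same hunk would be exercised the same way by passing `SamplingParams(logits_processors=[...])`.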