Correct capitalisation: VLLM -> vLLM (#14562)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit 3b352a2f92 (parent dea985aef0)
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
 
     print(f"Naive output={output_naive}")
     print(f"FlashInfer output={output_flashinfer}")
-    print(f"VLLM output={output_vllm}")
+    print(f"vLLM output={output_vllm}")
 
     if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
                       rtol=1e-2) and torch.allclose(
@@ -37,7 +37,7 @@ you may contact the following individuals:
 
 ## Slack Discussion
 
-You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
 to discuss security-related topics. However, please do not disclose any
 vulnerabilities in this channel. If you need to report a vulnerability, please
 use the GitHub security advisory system or contact a VMT member privately.
@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
 memory. This is also known as "KV cache offloading" and is configured
 with `--swap-space` and `--preemption-mode`.
 
-In v0, [VLLM has long supported beam
+In v0, [vLLM has long supported beam
 search](gh-issue:6226). The
 SequenceGroup encapsulated the idea of N Sequences which
 all shared the same prompt kv blocks. This enabled KV cache block
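For context on the flags mentioned above, here is a minimal sketch of enabling KV cache offloading through the offline API. The model name and sizes are placeholders, and `swap_space` / `preemption_mode` are assumed to mirror the `--swap-space` / `--preemption-mode` server flags; check the engine arguments of your vLLM version.

```python
# Hedged sketch: offline LLM that swaps preempted KV cache blocks to CPU memory.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",  # placeholder model, chosen only for illustration
    swap_space=4,               # GiB of CPU memory reserved for swapped-out KV blocks
    preemption_mode="swap",     # assumed: prefer swapping over recomputation on preemption
)
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)
```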
@@ -5,7 +5,7 @@ with LMCache.
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and launch an additional LMCache server.
 KV cache is transferred in the following manner:
-VLLM prefill node -> LMCache server -> VLLM decode node.
+vLLM prefill node -> LMCache server -> vLLM decode node.
 
 Note that `pip install lmcache` is needed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [
     GSM8KAccuracyTestConfig(
         model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
         excepted_value=0.76), # no bias
-    # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
     # so only one of these tests can run in a single call to pytest. As
     # a follow up, move this into the LM-EVAL section of the CI.
     # GSM8KAccuracyTestConfig(
@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
 # (default behavior if this variable is None)
 #
 # THIS SELECTION TAKES PRECEDENCE OVER THE
-# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
+# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
 forced_attn_backend: Optional[_Backend] = None
 
 
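To make the precedence comment concrete, here is a generic sketch (assumed names, not vLLM's actual selector code) of a programmatic override taking precedence over the `VLLM_ATTENTION_BACKEND` environment variable:

```python
# Generic precedence sketch: a backend forced in code wins over the env var.
import os
from typing import Optional

forced_attn_backend: Optional[str] = None  # set programmatically to force a backend

def resolve_attn_backend() -> Optional[str]:
    # 1. An explicit in-process override takes precedence.
    if forced_attn_backend is not None:
        return forced_attn_backend
    # 2. Otherwise fall back to the environment variable, if set.
    return os.environ.get("VLLM_ATTENTION_BACKEND")
```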
@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
 
 
 class VllmBackend:
-    """The compilation backend for `torch.compile` with VLLM.
+    """The compilation backend for `torch.compile` with vLLM.
     It is used for compilation level of `CompilationLevel.PIECEWISE`,
     where we customize the compilation.
 
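For readers unfamiliar with custom `torch.compile` backends, here is a self-contained sketch of the backend protocol that `VllmBackend` plugs into (the backend name and print are illustrative only):

```python
# A torch.compile backend receives the captured FX graph plus example inputs and
# returns a callable. Returning gm.forward runs the graph as captured; a real
# backend such as VllmBackend would rewrite or piecewise-compile it here.
import torch

def demo_backend(gm: torch.fx.GraphModule, example_inputs):
    print(f"captured {len(list(gm.graph.nodes))} FX nodes")
    return gm.forward

@torch.compile(backend=demo_backend)
def f(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) + 1

f(torch.randn(4))
```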
@@ -31,7 +31,7 @@ class CompilerInterface:
 
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         """
-        Gather all the relevant information from the VLLM config,
+        Gather all the relevant information from the vLLM config,
         to compute a hash so that we can cache the compiled model.
 
         See :meth:`VllmConfig.compute_hash` to check what information
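A hedged sketch of the caching idea behind `compute_hash`: serialize the configuration fields that affect compilation and hash them, so a compiled artifact is reused only when the relevant config matches (the field names below are placeholders, not the real `VllmConfig` fields):

```python
# Illustrative only: derive a stable cache key from configuration fields.
import hashlib
import json

def compute_cache_key(config_fields: dict) -> str:
    canonical = json.dumps(config_fields, sort_keys=True)  # stable ordering
    return hashlib.sha256(canonical.encode()).hexdigest()

print(compute_cache_key({"model": "opt-125m", "dtype": "float16", "level": "PIECEWISE"}))
```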
@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None
 @contextmanager
 def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
     """
-    Temporarily set the current VLLM config.
+    Temporarily set the current vLLM config.
     Used during model initialization.
-    We save the current VLLM config in a global variable,
+    We save the current vLLM config in a global variable,
     so that all modules can access it, e.g. custom ops
-    can access the VLLM config to determine how to dispatch.
+    can access the vLLM config to determine how to dispatch.
     """
     global _current_vllm_config
     old_vllm_config = _current_vllm_config
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
         # in ci, usually when we test custom ops/modules directly,
         # we don't set the vllm config. In that case, we set a default
         # config.
-        logger.warning("Current VLLM config is not set.")
+        logger.warning("Current vLLM config is not set.")
         from vllm.config import VllmConfig
         return VllmConfig()
     return _current_vllm_config
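The two hunks above describe a set/get pair around a module-level global. A simplified sketch of that pattern (generic names, not the actual `VllmConfig` machinery):

```python
# A context manager temporarily installs a global config and restores the
# previous value on exit; the getter falls back to a default when unset.
from contextlib import contextmanager
from typing import Optional

_current_config: Optional[dict] = None

@contextmanager
def set_current_config(config: dict):
    global _current_config
    old = _current_config
    _current_config = config
    try:
        yield
    finally:
        _current_config = old  # restore even if initialization raises

def get_current_config() -> dict:
    if _current_config is None:
        # e.g. in CI, when modules are tested directly, fall back to a default.
        return {}
    return _current_config

with set_current_config({"dtype": "float16"}):
    assert get_current_config()["dtype"] == "float16"
assert get_current_config() == {}
```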
@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     tool_choice: Optional[Union[Literal["none"], Literal["auto"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
 
-    # NOTE this will be ignored by VLLM -- the model determines the behavior
+    # NOTE this will be ignored by vLLM -- the model determines the behavior
     parallel_tool_calls: Optional[bool] = False
     user: Optional[str] = None
 
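As a usage illustration for the fields above, a hedged sketch of an OpenAI-compatible request against a vLLM server with tool calling enabled; the base URL, model name, and tool definition are placeholders, and, per the note in the diff, `parallel_tool_calls` is accepted but ignored:

```python
# Hedged sketch (placeholder values): chat completion request with tools.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # assumed local server
resp = client.chat.completions.create(
    model="my-served-model",  # placeholder served model name
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
    tool_choice="auto",
    parallel_tool_calls=False,  # ignored by vLLM; the model decides
)
print(resp.choices[0].message)
```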
@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VERBOSE":
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
 
-    # Root directory for VLLM configuration files
+    # Root directory for vLLM configuration files
     # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # ================== Runtime Env Vars ==================
 
-    # Root directory for VLLM cache files
+    # Root directory for vLLM cache files
     # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
     "VLLM_CACHE_ROOT":
     lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ENGINE_ITERATION_TIMEOUT_S":
     lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
 
-    # API key for VLLM API server
+    # API key for vLLM API server
     "VLLM_API_KEY":
     lambda: os.environ.get("VLLM_API_KEY", None),
 
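The three environment-variable hunks above follow the same lazy-lookup pattern. A small sketch of the resolution order described in the comments (the exact precedence is an assumption based on the documented defaults):

```python
# Illustrative resolution: VLLM_CONFIG_ROOT, else $XDG_CONFIG_HOME/vllm,
# else ~/.config/vllm (the cache root follows the same shape with
# XDG_CACHE_HOME / ~/.cache/vllm).
import os

def resolve_config_root() -> str:
    explicit = os.environ.get("VLLM_CONFIG_ROOT")
    if explicit:
        return os.path.expanduser(explicit)
    xdg = os.environ.get("XDG_CONFIG_HOME", "~/.config")
    return os.path.expanduser(os.path.join(xdg, "vllm"))

print(resolve_config_root())
```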
@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
 class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     """
-    Implements the Phi-4-multimodal-instruct model in VLLM.
+    Implements the Phi-4-multimodal-instruct model in vLLM.
     """
     packed_modules_mapping = {
         "qkv_proj": [
@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform):
         if envs.VLLM_USE_V1:
             raise NotImplementedError(
                 "Multi-step scheduling is not supported (and not "
-                "needed) on VLLM V1. Please launch without "
+                "needed) on vLLM V1. Please launch without "
                 "--num-scheduler-steps.")
         else:
             parallel_config.worker_cls = \
@@ -173,7 +173,7 @@ class RocmPlatform(Platform):
         if envs.VLLM_USE_V1:
             raise NotImplementedError(
                 "Multi-step scheduling is not supported (and not "
-                "needed) on VLLM V1. Please launch without "
+                "needed) on vLLM V1. Please launch without "
                 "--num-scheduler-steps.")
         else:
             parallel_config.worker_cls = \
@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
         elif vllm_config.speculative_config:
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
-                    "Speculative decoding is not yet supported on VLLM V1."
+                    "Speculative decoding is not yet supported on vLLM V1."
                 )
             else:
                 parallel_config.worker_cls = \
@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase):
             revision=revision)
         return tokenizer_file
 
-    # the following attributes are set to fit VLLM's design and are used
+    # the following attributes are set to fit vLLM's design and are used
    # by the guided structured output backends.
     @property
     def all_special_tokens_extended(self) -> List[str]:
@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient):
         # TODO(rob): rather than killing the main process, we should
         # figure out how to raise an AsyncEngineDeadError and
         # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
+        # error code to the clients calling vLLM.
         def sigusr1_handler(signum, frame):
             logger.fatal("Got fatal signal from worker processes, shutting "
                          "down. See stack trace above for root cause issue.")
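A generic sketch of the failure-propagation pattern referenced above: worker processes signal the parent, whose handler logs the failure and shuts down (names and exit behavior are illustrative, not the actual MPClient code):

```python
# The parent installs a SIGUSR1 handler; a failing worker would notify it with
# os.kill(os.getppid(), signal.SIGUSR1).
import signal
import sys

def sigusr1_handler(signum, frame):
    print("Got fatal signal from worker processes, shutting down.", file=sys.stderr)
    sys.exit(1)

signal.signal(signal.SIGUSR1, sigusr1_handler)
```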
@@ -248,7 +248,7 @@ class OutputProcessor:
 
         ****************** NOTE FOR DEVELOPERS ******************
 
-        VLLM V1 minimizes the number of python loops over the full
+        vLLM V1 minimizes the number of python loops over the full
         batch to ensure system overheads are minimized. This is the
         only function that should loop over EngineCoreOutputs.
 
@@ -93,10 +93,10 @@ class Processor:
     ) -> None:
         # Best of not yet supported.
         if params.best_of is not None and params.best_of > 1:
-            raise ValueError("VLLM V1 does not yet support best_of.")
+            raise ValueError("vLLM V1 does not yet support best_of.")
         # Logits processors not supported.
         if params.logits_processors:
-            raise ValueError("VLLM V1 does not support per request "
+            raise ValueError("vLLM V1 does not support per request "
                              "user provided logits processors.")
 
     def _validate_params(