Correct capitalisation: VLLM -> vLLM (#14562)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent dea985aef0
commit 3b352a2f92
@@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
 
     print(f"Naive output={output_naive}")
     print(f"FlashInfer output={output_flashinfer}")
-    print(f"VLLM output={output_vllm}")
+    print(f"vLLM output={output_vllm}")
 
     if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
                       rtol=1e-2) and torch.allclose(
@@ -37,7 +37,7 @@ you may contact the following individuals:
 
 ## Slack Discussion
 
-You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
+You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
 to discuss security-related topics. However, please do not disclose any
 vulnerabilities in this channel. If you need to report a vulnerability, please
 use the GitHub security advisory system or contact a VMT member privately.
@@ -509,7 +509,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
 memory. This is also known as "KV cache offloading" and is configured
 with `--swap-space` and `--preemption-mode`.
 
-In v0, [VLLM has long supported beam
+In v0, [vLLM has long supported beam
 search](gh-issue:6226). The
 SequenceGroup encapsulated the idea of N Sequences which
 all shared the same prompt kv blocks. This enabled KV cache block
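For context on the `--swap-space` / `--preemption-mode` flags quoted above, a rough offline-API sketch is shown below. It is not part of this commit; the model name and values are placeholder assumptions, and the exact argument plumbing may differ between releases.

# Sketch only: enable KV cache offloading in the v0 engine.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",   # placeholder model
    swap_space=4,                # GiB of CPU memory for swapped-out KV cache blocks
    preemption_mode="swap",      # swap preempted KV blocks to CPU instead of recomputing
)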
@@ -5,7 +5,7 @@ with LMCache.
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and launch an additional LMCache server.
 KV cache is transferred in the following manner:
-VLLM prefill node -> LMCache server -> VLLM decode node.
+vLLM prefill node -> LMCache server -> vLLM decode node.
 
 Note that `pip install lmcache` is needed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
@@ -25,7 +25,7 @@ ACCURACY_CONFIGS = [
     GSM8KAccuracyTestConfig(
         model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
         excepted_value=0.76), # no bias
-    # NOTE(rob): We cannot re-initialize VLLM in the same process for TPU,
+    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
     # so only one of these tests can run in a single call to pytest. As
     # a follow up, move this into the LM-EVAL section of the CI.
     # GSM8KAccuracyTestConfig(
@@ -51,7 +51,7 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
 # (default behavior if this variable is None)
 #
 # THIS SELECTION TAKES PRECEDENCE OVER THE
-# VLLM ATTENTION BACKEND ENVIRONMENT VARIABLE
+# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
 forced_attn_backend: Optional[_Backend] = None
 
 
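The comment above distinguishes the in-process override from the `VLLM_ATTENTION_BACKEND` environment variable. A minimal, assumed usage of the environment-variable path (the backend name is only an example, not a recommendation) would be:

import os

# Set before vLLM constructs the engine so backend selection can see it.
os.environ.setdefault("VLLM_ATTENTION_BACKEND", "FLASHINFER")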
@@ -278,7 +278,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
 
 
 class VllmBackend:
-    """The compilation backend for `torch.compile` with VLLM.
+    """The compilation backend for `torch.compile` with vLLM.
     It is used for compilation level of `CompilationLevel.PIECEWISE`,
     where we customize the compilation.
 
@@ -31,7 +31,7 @@ class CompilerInterface:
 
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         """
-        Gather all the relevant information from the VLLM config,
+        Gather all the relevant information from the vLLM config,
         to compute a hash so that we can cache the compiled model.
 
         See :meth:`VllmConfig.compute_hash` to check what information
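As a rough illustration of the idea in this docstring (hash the compile-relevant parts of the config to form a cache key), a generic sketch, not the vLLM implementation, could look like this; the field names are placeholders.

import hashlib
import json

def compute_config_hash(config_fields: dict) -> str:
    # Serialize deterministically, then hash to get a stable cache key.
    payload = json.dumps(config_fields, sort_keys=True, default=str)
    return hashlib.sha256(payload.encode()).hexdigest()

# e.g. compute_config_hash({"model": "facebook/opt-125m", "dtype": "float16"})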
@@ -3572,11 +3572,11 @@ _current_vllm_config: Optional[VllmConfig] = None
 @contextmanager
 def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
     """
-    Temporarily set the current VLLM config.
+    Temporarily set the current vLLM config.
     Used during model initialization.
-    We save the current VLLM config in a global variable,
+    We save the current vLLM config in a global variable,
     so that all modules can access it, e.g. custom ops
-    can access the VLLM config to determine how to dispatch.
+    can access the vLLM config to determine how to dispatch.
     """
     global _current_vllm_config
     old_vllm_config = _current_vllm_config
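The save/set/restore pattern described in this docstring can be sketched generically as follows; this is a simplified stand-in that omits the real function's `check_compile` handling.

from contextlib import contextmanager
from typing import Optional

_current_config: Optional[object] = None  # stands in for _current_vllm_config

@contextmanager
def set_current_config(config):
    # Stash the old value, install the new one, and always restore on exit
    # so nested uses compose correctly.
    global _current_config
    old_config = _current_config
    try:
        _current_config = config
        yield
    finally:
        _current_config = old_config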
@@ -3611,7 +3611,7 @@ def get_current_vllm_config() -> VllmConfig:
         # in ci, usually when we test custom ops/modules directly,
         # we don't set the vllm config. In that case, we set a default
         # config.
-        logger.warning("Current VLLM config is not set.")
+        logger.warning("Current vLLM config is not set.")
         from vllm.config import VllmConfig
         return VllmConfig()
     return _current_vllm_config
@@ -237,7 +237,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     tool_choice: Optional[Union[Literal["none"], Literal["auto"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
 
-    # NOTE this will be ignored by VLLM -- the model determines the behavior
+    # NOTE this will be ignored by vLLM -- the model determines the behavior
     parallel_tool_calls: Optional[bool] = False
     user: Optional[str] = None
 
@@ -164,7 +164,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VERBOSE":
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
 
-    # Root directory for VLLM configuration files
+    # Root directory for vLLM configuration files
     # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
@@ -178,7 +178,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # ================== Runtime Env Vars ==================
 
-    # Root directory for VLLM cache files
+    # Root directory for vLLM cache files
     # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
     "VLLM_CACHE_ROOT":
     lambda: os.path.expanduser(
@@ -260,7 +260,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ENGINE_ITERATION_TIMEOUT_S":
     lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
 
-    # API key for VLLM API server
+    # API key for vLLM API server
     "VLLM_API_KEY":
     lambda: os.environ.get("VLLM_API_KEY", None),
 
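Since the entries in `environment_variables` above are wrapped in lambdas and appear to be read lazily, setting these variables before the engine or API server starts should be sufficient. An assumed usage sketch (key and path are placeholders):

import os

os.environ["VLLM_API_KEY"] = "my-secret-token"     # clients must present this key to the API server
os.environ["VLLM_CACHE_ROOT"] = "/tmp/vllm-cache"  # overrides the default ~/.cache/vllm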
@@ -1414,7 +1414,7 @@ def cat_with_pad(tensors, dim, padding_value=0):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
 class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     """
-    Implements the Phi-4-multimodal-instruct model in VLLM.
+    Implements the Phi-4-multimodal-instruct model in vLLM.
     """
     packed_modules_mapping = {
         "qkv_proj": [
@@ -119,7 +119,7 @@ class CudaPlatformBase(Platform):
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
                     "Multi-step scheduling is not supported (and not "
-                    "needed) on VLLM V1. Please launch without "
+                    "needed) on vLLM V1. Please launch without "
                     "--num-scheduler-steps.")
             else:
                 parallel_config.worker_cls = \
@@ -173,7 +173,7 @@ class RocmPlatform(Platform):
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
                     "Multi-step scheduling is not supported (and not "
-                    "needed) on VLLM V1. Please launch without "
+                    "needed) on vLLM V1. Please launch without "
                     "--num-scheduler-steps.")
             else:
                 parallel_config.worker_cls = \
@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
         elif vllm_config.speculative_config:
             if envs.VLLM_USE_V1:
                 raise NotImplementedError(
-                    "Speculative decoding is not yet supported on VLLM V1."
+                    "Speculative decoding is not yet supported on vLLM V1."
                 )
             else:
                 parallel_config.worker_cls = \
@@ -249,7 +249,7 @@ class MistralTokenizer(TokenizerBase):
                                      revision=revision)
         return tokenizer_file
 
-    # the following attributes are set to fit VLLM's design and are used
+    # the following attributes are set to fit vLLM's design and are used
     # by the guided structured output backends.
     @property
     def all_special_tokens_extended(self) -> List[str]:
@@ -255,7 +255,7 @@ class MPClient(EngineCoreClient):
         # TODO(rob): rather than killing the main process, we should
         # figure out how to raise an AsyncEngineDeadError and
         # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
+        # error code to the clients calling vLLM.
         def sigusr1_handler(signum, frame):
             logger.fatal("Got fatal signal from worker processes, shutting "
                          "down. See stack trace above for root cause issue.")
@@ -248,7 +248,7 @@ class OutputProcessor:
 
         ****************** NOTE FOR DEVELOPERS ******************
 
-        VLLM V1 minimizes the number of python loops over the full
+        vLLM V1 minimizes the number of python loops over the full
         batch to ensure system overheads are minimized. This is the
         only function that should loop over EngineCoreOutputs.
 
@@ -93,10 +93,10 @@ class Processor:
     ) -> None:
         # Best of not yet supported.
         if params.best_of is not None and params.best_of > 1:
-            raise ValueError("VLLM V1 does not yet support best_of.")
+            raise ValueError("vLLM V1 does not yet support best_of.")
         # Logits processors not supported.
         if params.logits_processors:
-            raise ValueError("VLLM V1 does not support per request "
+            raise ValueError("vLLM V1 does not support per request "
                              "user provided logits processors.")
 
     def _validate_params(