[Doc] Use intersphinx and update entrypoints docs (#5125)
This commit is contained in: parent d79d9eaaff, commit a9bcc7afb2
@@ -80,7 +80,7 @@ def setup(app):
 generate_examples()
 
 
-# Mock out external dependencies here.
+# Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
     "cpuinfo",
     "torch",
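
Background on the hunk above: ``autodoc_mock_imports`` makes Sphinx substitute mock objects for the listed modules, so the documented package can be imported (and its autodoc pages rendered) on a docs build machine that lacks heavy dependencies such as ``torch``. A minimal sketch of the mechanism using only the standard library; this illustrates the idea and is neither vLLM nor Sphinx code:

    # Sketch of what module mocking achieves: importing the documented package
    # must not fail just because an optional heavy dependency is absent.
    import sys
    from unittest import mock

    sys.modules["torch"] = mock.MagicMock()  # register the stand-in up front

    import torch  # resolves to the mock, not the real library
    print(torch.cuda.is_available())  # any attribute chain returns mocks, never raises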
@@ -115,4 +115,15 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter
 
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'typing_extensions':
+    ('https://typing-extensions.readthedocs.io/en/latest', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable', None),
+    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
+}
+
+autodoc_preserve_defaults = True
+
 
 navigation_with_keys = False
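
With the ``intersphinx_mapping`` added above, Sphinx resolves cross-references to external projects against their published object inventories, so roles such as ``:class:`numpy.ndarray``` in vLLM's docstrings become links into the NumPy documentation. A hedged illustration; the function below is invented for this example, while the roles are standard Sphinx:

    def as_array(data):
        """Convert ``data`` to an array.

        :param data: any :class:`~collections.abc.Sequence` of numbers;
            this reference resolves via the ``python`` inventory above.
        :returns: a :class:`numpy.ndarray`, resolved via the ``numpy`` inventory.
        """

``autodoc_preserve_defaults = True`` is a stock Sphinx option: default argument values are rendered as written in the source rather than as their evaluated ``repr``, which keeps defaults like module-level constants readable in the generated API pages.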
@@ -307,8 +307,6 @@ class AsyncLLMEngine:
     generate method when there are requests in the waiting queue. The generate
     method yields the outputs from the :class:`LLMEngine` to the caller.
 
-    NOTE: For the comprehensive list of arguments, see :class:`LLMEngine`.
-
     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
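
For readers of the docstring above, a hedged sketch of driving ``AsyncLLMEngine`` directly; the model name and request id are illustrative, and exact signatures vary between vLLM versions:

    import asyncio

    from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

    async def main():
        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="facebook/opt-125m"))  # illustrative model
        # generate() is an async generator: it yields RequestOutput objects
        # from the underlying LLMEngine, as the docstring describes.
        async for output in engine.generate("Hello,", SamplingParams(),
                                            request_id="req-0"):
            print(output.outputs[0].text)

    asyncio.run(main())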
@@ -70,8 +70,8 @@ class LLMEngine:
     The :class:`~vllm.LLM` class wraps this class for offline batched inference
     and the :class:`AsyncLLMEngine` class wraps this class for online serving.
 
-    NOTE: The config arguments are derived from the :class:`~vllm.EngineArgs`
-    class. For the comprehensive list of arguments, see :ref:`engine_args`.
+    The config arguments are derived from :class:`~vllm.EngineArgs`. (See
+    :ref:`engine_args`)
 
     Args:
         model_config: The configuration related to the LLM model.
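
As the revised docstring says, the constructor arguments mirror :class:`~vllm.EngineArgs`; in practice the engine is usually built from an ``EngineArgs`` instance rather than from the individual config objects. A small sketch, with an illustrative model name:

    from vllm import EngineArgs, LLMEngine

    # EngineArgs carries the flat argument list; from_engine_args() derives
    # the per-topic config objects (model, cache, parallel, ...) internally.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))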
@@ -30,12 +30,6 @@ class LLM:
     this class generates texts from the model, using an intelligent batching
     mechanism and efficient memory management.
 
-    NOTE: This class is intended to be used for offline inference. For online
-    serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
-
-    NOTE: For the comprehensive list of arguments, see
-    :class:`~vllm.EngineArgs`.
-
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
@@ -84,6 +78,12 @@ class LLM:
             When a sequence has context length larger than this, we fall back
             to eager mode.
         disable_custom_all_reduce: See ParallelConfig
+        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
+            :ref:`engine_args`)
+
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
     """
 
     DEPRECATE_LEGACY: ClassVar[bool] = False
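
The new ``**kwargs`` line documents behavior that is easy to miss: any extra keyword argument to ``LLM(...)`` is forwarded to :class:`~vllm.EngineArgs`. A hedged example, with illustrative values for two real ``EngineArgs`` fields:

    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",    # illustrative model
        max_model_len=2048,           # forwarded to EngineArgs
        gpu_memory_utilization=0.90,  # forwarded to EngineArgs
    )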
@@ -253,7 +253,7 @@ class LLM:
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
         the memory constraint. For the best performance, put all of your prompts
         into a single list and pass it to this method.
 
@@ -270,6 +270,11 @@ class LLM:
         Returns:
             A list of `RequestOutput` objects containing the
             generated completions in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
            inputs = self._convert_v1_inputs(
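
To make the new Note concrete, a hedged sketch of the preferred calling convention: batch all prompts into one call and avoid the legacy ``prompts``/``prompt_token_ids`` keywords (model name illustrative):

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(temperature=0.8, max_tokens=32)

    # One batched call; outputs come back in the same order as the prompts.
    outputs = llm.generate(
        ["Hello, my name is", "The capital of France is"], params)
    for out in outputs:
        print(out.outputs[0].text)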
@@ -393,7 +398,7 @@ class LLM:
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
         the memory constraint. For the best performance, put all of your prompts
         into a single list and pass it to this method.
 
@@ -409,6 +414,11 @@ class LLM:
         Returns:
             A list of `EmbeddingRequestOutput` objects containing the
             generated embeddings in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
            inputs = self._convert_v1_inputs(
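
The same Note applies to ``encode``, which mirrors ``generate`` for embedding models and returns ``EmbeddingRequestOutput`` objects. A hedged sketch; the model name is illustrative and the output attribute layout may differ between vLLM versions:

    from vllm import LLM

    llm = LLM(model="intfloat/e5-mistral-7b-instruct")  # illustrative model
    outputs = llm.encode(["What is the capital of France?"])
    print(len(outputs[0].outputs.embedding))  # dimensionality of the embedding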