[Doc] Use intersphinx and update entrypoints docs (#5125)

Cyrus Leung authored on 2024-05-31 00:59:23 +08:00, committed by GitHub
parent d79d9eaaff
commit a9bcc7afb2
4 changed files with 32 additions and 13 deletions


@@ -80,7 +80,7 @@ def setup(app):
     generate_examples()
 
-# Mock out external dependencies here.
+# Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
     "cpuinfo",
     "torch",
@@ -115,4 +115,15 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter
 
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'typing_extensions':
+    ('https://typing-extensions.readthedocs.io/en/latest', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable', None),
+    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
+}
+
+autodoc_preserve_defaults = True
+
 navigation_with_keys = False
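
With this mapping in place, cross-references such as :class:`numpy.ndarray` or :class:`torch.Tensor` in vLLM docstrings resolve to the external projects' own documentation instead of rendering as plain text. A minimal sketch of the effect (the function below is hypothetical, for illustration only, and not part of this commit):

    import numpy as np
    import torch


    def to_tensor(arr: np.ndarray) -> torch.Tensor:
        """Convert a :class:`numpy.ndarray` into a :class:`torch.Tensor`.

        With intersphinx configured, both class references above become
        links into the NumPy and PyTorch docs when rendered by autodoc.
        """
        return torch.from_numpy(arr)

``autodoc_preserve_defaults = True`` additionally keeps default values in rendered signatures as written in the source (e.g. ``max_tokens=16``) rather than substituting their evaluated ``repr()``.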


@@ -307,8 +307,6 @@ class AsyncLLMEngine:
     generate method when there are requests in the waiting queue. The generate
     method yields the outputs from the :class:`LLMEngine` to the caller.
 
-    NOTE: For the comprehensive list of arguments, see :class:`LLMEngine`.
-
     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
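
For context, a minimal sketch of the request flow this docstring describes, assuming the post-refactor API in which ``generate`` takes prompt inputs, sampling params, and a request id (the model name and request id are illustrative):

    import asyncio

    from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


    async def main() -> None:
        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="facebook/opt-125m"))
        params = SamplingParams(max_tokens=16)
        # The engine streams partial RequestOutputs; the last one is final.
        final = None
        async for output in engine.generate("Hello, my name is", params,
                                            request_id="req-0"):
            final = output
        print(final.outputs[0].text)


    asyncio.run(main())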


@@ -70,8 +70,8 @@ class LLMEngine:
     The :class:`~vllm.LLM` class wraps this class for offline batched inference
     and the :class:`AsyncLLMEngine` class wraps this class for online serving.
 
-    NOTE: The config arguments are derived from the :class:`~vllm.EngineArgs`
-    class. For the comprehensive list of arguments, see :ref:`engine_args`.
+    The config arguments are derived from :class:`~vllm.EngineArgs`. (See
+    :ref:`engine_args`)
 
     Args:
         model_config: The configuration related to the LLM model.
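
The relationship the new wording points to, as a short sketch (the model name is illustrative): :class:`~vllm.EngineArgs` collects the flat keyword arguments documented at :ref:`engine_args` and expands them into the per-component configs that ``LLMEngine`` receives:

    from vllm import EngineArgs, LLMEngine

    # from_engine_args() turns the flat EngineArgs into the
    # model/cache/parallel/scheduler configs LLMEngine.__init__ expects.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))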


@@ -30,12 +30,6 @@ class LLM:
     this class generates texts from the model, using an intelligent batching
     mechanism and efficient memory management.
 
-    NOTE: This class is intended to be used for offline inference. For online
-    serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
-
-    NOTE: For the comprehensive list of arguments, see
-    :class:`~vllm.EngineArgs`.
-
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
@@ -84,6 +78,12 @@ class LLM:
             When a sequence has context length larger than this, we fall back
             to eager mode.
         disable_custom_all_reduce: See ParallelConfig
+        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
+            :ref:`engine_args`)
+
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
     """
 
     DEPRECATE_LEGACY: ClassVar[bool] = False
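
In practice the new ``**kwargs`` line means any :class:`~vllm.EngineArgs` field can be passed straight to the :class:`~vllm.LLM` constructor. A sketch, assuming ``max_num_seqs`` is an ``EngineArgs`` field in this version (it is not an explicit ``LLM`` parameter, so it travels through ``**kwargs``):

    from vllm import LLM

    # model= is an explicit LLM parameter; max_num_seqs is forwarded
    # through **kwargs into EngineArgs.
    llm = LLM(model="facebook/opt-125m", max_num_seqs=64)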
@@ -253,7 +253,7 @@ class LLM:
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
         the memory constraint. For the best performance, put all of your prompts
         into a single list and pass it to this method.
 
@@ -270,6 +270,11 @@ class LLM:
         Returns:
             A list of `RequestOutput` objects containing the
             generated completions in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
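
The difference the new note describes, sketched against the API at the time of this commit (prompt text and sampling values are illustrative):

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(temperature=0.8, max_tokens=32)

    # Preferred: pass the inputs positionally (strings or prompt dicts).
    outputs = llm.generate(["Hello, my name is", "The future of AI is"], params)

    # Legacy: keyword prompts= / prompt_token_ids=; still accepted, but this
    # docstring flags the form for future deprecation.
    outputs = llm.generate(prompts=["Hello, my name is"], sampling_params=params)

    for out in outputs:
        print(out.outputs[0].text)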
@@ -393,7 +398,7 @@ class LLM:
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
         into a single list and pass it to this method.
 
@@ -409,6 +414,11 @@ class LLM:
         Returns:
             A list of `EmbeddingRequestOutput` objects containing the
             generated embeddings in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
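
``encode`` follows the same convention for embedding models; a sketch (the model name is illustrative and must be an embedding model this vLLM version supports):

    from vllm import LLM

    llm = LLM(model="intfloat/e5-mistral-7b-instruct")

    # Each EmbeddingRequestOutput carries the embedding vector for one prompt.
    outputs = llm.encode(["What is the capital of France?"])
    for out in outputs:
        print(len(out.outputs.embedding))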