From a9bcc7afb23d208efaa1b47549fa93eaa1d9d6cf Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 31 May 2024 00:59:23 +0800
Subject: [PATCH] [Doc] Use intersphinx and update entrypoints docs (#5125)

---
 docs/source/conf.py             | 13 ++++++++++++-
 vllm/engine/async_llm_engine.py |  2 --
 vllm/engine/llm_engine.py       |  4 ++--
 vllm/entrypoints/llm.py         | 26 ++++++++++++++++++--------
 4 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9da5a499..cfebc2ff 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,7 +80,7 @@ def setup(app):
     generate_examples()
 
 
-# Mock out external dependencies here.
+# Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
     "cpuinfo",
     "torch",
@@ -115,4 +115,15 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 
 autodoc.ClassDocumenter = MockedClassDocumenter
 
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'typing_extensions':
+    ('https://typing-extensions.readthedocs.io/en/latest', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable', None),
+    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
+}
+
+autodoc_preserve_defaults = True
+
 navigation_with_keys = False
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index d4289c71..db4d2849 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -307,8 +307,6 @@ class AsyncLLMEngine:
     generate method when there are requests in the waiting queue. The generate
     method yields the outputs from the :class:`LLMEngine` to the caller.
 
-    NOTE: For the comprehensive list of arguments, see :class:`LLMEngine`.
-
     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 08bccf20..cb5893e7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -70,8 +70,8 @@ class LLMEngine:
     The :class:`~vllm.LLM` class wraps this class for offline batched inference
     and the :class:`AsyncLLMEngine` class wraps this class for online serving.
 
-    NOTE: The config arguments are derived from the :class:`~vllm.EngineArgs`
-    class. For the comprehensive list of arguments, see :ref:`engine_args`.
+    The config arguments are derived from :class:`~vllm.EngineArgs`. (See
+    :ref:`engine_args`)
 
     Args:
         model_config: The configuration related to the LLM model.
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9759d055..6e971ae7 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -30,12 +30,6 @@ class LLM:
     this class generates texts from the model, using an intelligent batching
     mechanism and efficient memory management.
 
-    NOTE: This class is intended to be used for offline inference. For online
-    serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
-
-    NOTE: For the comprehensive list of arguments, see
-    :class:`~vllm.EngineArgs`.
-
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
@@ -84,6 +78,12 @@ class LLM:
             When a sequence has context length larger than this, we fall back
             to eager mode.
         disable_custom_all_reduce: See ParallelConfig
+        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
+            :ref:`engine_args`)
+
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
     """
 
     DEPRECATE_LEGACY: ClassVar[bool] = False
@@ -253,7 +253,7 @@
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.
 
@@ -270,6 +270,11 @@
         Returns:
             A list of `RequestOutput` objects containing the generated
             completions in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
@@ -393,7 +398,7 @@
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.
 
@@ -409,6 +414,11 @@
         Returns:
             A list of `EmbeddingRequestOutput` objects containing the generated
             embeddings in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
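
For context, a minimal offline-inference sketch matching the updated docstrings; the model name and sampling values below are illustrative and not part of this patch:

    from vllm import LLM, SamplingParams

    # Offline batched inference; for online serving, AsyncLLMEngine (and the
    # OpenAI-compatible server built on it) is the intended entrypoint.
    llm = LLM(model="facebook/opt-125m")  # further EngineArgs fields can be passed as keyword arguments

    # Put all prompts in a single list so they are batched together.
    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(output.prompt, "->", output.outputs[0].text)

Passing the prompts positionally, rather than via the legacy ``prompts=`` / ``prompt_token_ids=`` keywords, is consistent with the ``inputs`` parameter that the new Note blocks point to.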