From a9bcc7afb23d208efaa1b47549fa93eaa1d9d6cf Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 31 May 2024 00:59:23 +0800
Subject: [PATCH] [Doc] Use intersphinx and update entrypoints docs (#5125)

---
 docs/source/conf.py             | 13 ++++++++++++-
 vllm/engine/async_llm_engine.py |  2 --
 vllm/engine/llm_engine.py       |  4 ++--
 vllm/entrypoints/llm.py         | 26 ++++++++++++++++++--------
 4 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 9da5a499..cfebc2ff 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,7 +80,7 @@ def setup(app):
     generate_examples()
 
 
-# Mock out external dependencies here.
+# Mock out external dependencies here, otherwise the autodoc pages may be blank.
 autodoc_mock_imports = [
     "cpuinfo",
     "torch",
@@ -115,4 +115,15 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 
 autodoc.ClassDocumenter = MockedClassDocumenter
 
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'typing_extensions':
+    ('https://typing-extensions.readthedocs.io/en/latest', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable', None),
+    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
+}
+
+autodoc_preserve_defaults = True
+
 navigation_with_keys = False
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index d4289c71..db4d2849 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -307,8 +307,6 @@ class AsyncLLMEngine:
     generate method when there are requests in the waiting queue. The generate
     method yields the outputs from the :class:`LLMEngine` to the caller.
 
-    NOTE: For the comprehensive list of arguments, see :class:`LLMEngine`.
-
     Args:
         worker_use_ray: Whether to use Ray for model workers. Required for
             distributed execution. Should be the same as
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 08bccf20..cb5893e7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -70,8 +70,8 @@ class LLMEngine:
     The :class:`~vllm.LLM` class wraps this class for offline batched inference
     and the :class:`AsyncLLMEngine` class wraps this class for online serving.
 
-    NOTE: The config arguments are derived from the :class:`~vllm.EngineArgs`
-    class. For the comprehensive list of arguments, see :ref:`engine_args`.
+    The config arguments are derived from :class:`~vllm.EngineArgs`. (See
+    :ref:`engine_args`)
 
     Args:
         model_config: The configuration related to the LLM model.
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9759d055..6e971ae7 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -30,12 +30,6 @@ class LLM:
     this class generates texts from the model, using an intelligent batching
     mechanism and efficient memory management.
 
-    NOTE: This class is intended to be used for offline inference. For online
-    serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
-
-    NOTE: For the comprehensive list of arguments, see
-    :class:`~vllm.EngineArgs`.
-
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
@@ -84,6 +78,12 @@ class LLM:
             When a sequence has context length larger than this, we fall back
             to eager mode.
         disable_custom_all_reduce: See ParallelConfig
+        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
+            :ref:`engine_args`)
+
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
     """
 
     DEPRECATE_LEGACY: ClassVar[bool] = False
@@ -253,7 +253,7 @@
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.
 
@@ -270,6 +270,11 @@
         Returns:
             A list of `RequestOutput` objects containing the generated
             completions in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
@@ -393,7 +398,7 @@
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
-        NOTE: This class automatically batches the given prompts, considering
+        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.
 
@@ -409,6 +414,11 @@
         Returns:
             A list of `EmbeddingRequestOutput` objects containing the generated
             embeddings in the same order as the input prompts.
+
+        Note:
+            Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the ``inputs`` parameter.
         """
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
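
For context, a minimal offline-inference sketch matching the updated docstrings; the model name and sampling values below are illustrative and not part of this patch:

    from vllm import LLM, SamplingParams

    # Offline batched inference; for online serving, AsyncLLMEngine (and the
    # OpenAI-compatible server built on it) is the intended entrypoint.
    llm = LLM(model="facebook/opt-125m")  # further EngineArgs fields can be passed as keyword arguments

    # Put all prompts in a single list so they are batched together.
    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(output.prompt, "->", output.outputs[0].text)

Passing the prompts positionally, rather than via the legacy ``prompts=`` / ``prompt_token_ids=`` keywords, is consistent with the ``inputs`` parameter that the new Note blocks point to.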