From 6cd40a5bfed24ef0ceca83b0450be6920d8ca6d4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 8 Jan 2025 21:34:44 +0800
Subject: [PATCH] [Doc][4/N] Reorganize API Reference (#11843)

Signed-off-by: DarkLight1337
---
 .buildkite/test-pipeline.yaml                 |  2 +-
 Dockerfile                                    |  4 ++--
 .../{dev => api}/engine/async_llm_engine.md   |  0
 .../engine_index.md => api/engine/index.md}   |  0
 docs/source/{dev => api}/engine/llm_engine.md |  0
 .../multimodal/index.md}                      | 10 --------
 .../offline_inference/index.md}               |  0
 .../{dev => api}/offline_inference/llm.md     |  0
 .../offline_inference/llm_inputs.md           |  0
 docs/source/api/params.md                     | 22 ++++++++++++++++++
 .../dockerfile-stages-dependency.png          | Bin
 .../contributing/dockerfile/dockerfile.md     |  2 +-
 docs/source/design/arch_overview.md           |  2 +-
 .../multimodal/adding_multimodal_plugin.md    | 16 -------------
 docs/source/dev/pooling_params.md             |  6 -----
 docs/source/dev/sampling_params.md            |  6 -----
 docs/source/getting_started/quickstart.md     |  2 +-
 docs/source/index.md                          |  9 ++++---
 docs/source/serving/offline_inference.md      |  2 +-
 .../serving/openai_compatible_server.md       |  8 +++----
 vllm/multimodal/base.py                       |  3 ---
 vllm/multimodal/inputs.py                     |  6 -----
 vllm/multimodal/registry.py                   |  3 ---
 vllm/pooling_params.py                        |  2 +-
 24 files changed, 38 insertions(+), 67 deletions(-)
 rename docs/source/{dev => api}/engine/async_llm_engine.md (100%)
 rename docs/source/{dev/engine/engine_index.md => api/engine/index.md} (100%)
 rename docs/source/{dev => api}/engine/llm_engine.md (100%)
 rename docs/source/{design/multimodal/multimodal_index.md => api/multimodal/index.md} (84%)
 rename docs/source/{dev/offline_inference/offline_index.md => api/offline_inference/index.md} (100%)
 rename docs/source/{dev => api}/offline_inference/llm.md (100%)
 rename docs/source/{dev => api}/offline_inference/llm_inputs.md (100%)
 create mode 100644 docs/source/api/params.md
 rename docs/source/assets/{dev => contributing}/dockerfile-stages-dependency.png (100%)
 delete mode 100644 docs/source/design/multimodal/adding_multimodal_plugin.md
 delete mode 100644 docs/source/dev/pooling_params.md
 delete mode 100644 docs/source/dev/sampling_params.md

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b7178b94..f883595f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -38,7 +38,7 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS="-W" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep "sig sig-object py" build/html/dev/sampling_params.html
+  - grep "sig sig-object py" build/html/api/params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
diff --git a/Dockerfile b/Dockerfile
index 808cf675..4542bc9c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,8 +2,8 @@
 # to run the OpenAI compatible server.
 
 # Please update any changes made here to
-# docs/source/dev/dockerfile/dockerfile.md and
-# docs/source/assets/dev/dockerfile-stages-dependency.png
+# docs/source/contributing/dockerfile/dockerfile.md and
+# docs/source/assets/contributing/dockerfile-stages-dependency.png
 
 ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
diff --git a/docs/source/dev/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md
similarity index 100%
rename from docs/source/dev/engine/async_llm_engine.md
rename to docs/source/api/engine/async_llm_engine.md
diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/api/engine/index.md
similarity index 100%
rename from docs/source/dev/engine/engine_index.md
rename to docs/source/api/engine/index.md
diff --git a/docs/source/dev/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md
similarity index 100%
rename from docs/source/dev/engine/llm_engine.md
rename to docs/source/api/engine/llm_engine.md
diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/api/multimodal/index.md
similarity index 84%
rename from docs/source/design/multimodal/multimodal_index.md
rename to docs/source/api/multimodal/index.md
index e4f2171e..0046b73e 100644
--- a/docs/source/design/multimodal/multimodal_index.md
+++ b/docs/source/api/multimodal/index.md
@@ -11,18 +11,8 @@ vLLM provides experimental support for multi-modal models through the {mod}`vllm
 
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
 via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
-Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
-by following [this guide](#adding-multimodal-plugin).
-
 Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
 
-## Guides
-
-```{toctree}
-:maxdepth: 1
-
-adding_multimodal_plugin
-```
 
 ## Module Contents
diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/api/offline_inference/index.md
similarity index 100%
rename from docs/source/dev/offline_inference/offline_index.md
rename to docs/source/api/offline_inference/index.md
diff --git a/docs/source/dev/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md
similarity index 100%
rename from docs/source/dev/offline_inference/llm.md
rename to docs/source/api/offline_inference/llm.md
diff --git a/docs/source/dev/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md
similarity index 100%
rename from docs/source/dev/offline_inference/llm_inputs.md
rename to docs/source/api/offline_inference/llm_inputs.md
diff --git a/docs/source/api/params.md b/docs/source/api/params.md
new file mode 100644
index 00000000..a3b4d9cb
--- /dev/null
+++ b/docs/source/api/params.md
@@ -0,0 +1,22 @@
+# Optional Parameters
+
+Optional parameters for vLLM APIs.
+
+(sampling-params)=
+
+## Sampling Parameters
+
+```{eval-rst}
+.. autoclass:: vllm.SamplingParams
+    :members:
+```
+
+(pooling-params)=
+
+## Pooling Parameters
+
+```{eval-rst}
+.. autoclass:: vllm.PoolingParams
+    :members:
+```
+
diff --git a/docs/source/assets/dev/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png
similarity index 100%
rename from docs/source/assets/dev/dockerfile-stages-dependency.png
rename to docs/source/assets/contributing/dockerfile-stages-dependency.png
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md
index 38ea956b..cb142318 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@@ -17,7 +17,7 @@ The edges of the build graph represent:
 
 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
-  > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png
+  > ```{figure} /assets/contributing/dockerfile-stages-dependency.png
   > :align: center
   > :alt: query
   > :width: 100%
diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md
index 5e0dd021..cec503ef 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/source/design/arch_overview.md
@@ -53,7 +53,7 @@ for output in outputs:
 ```
 
 More API details can be found in the {doc}`Offline Inference
-</dev/offline_inference/offline_index>` section of the API docs.
+</api/offline_inference/index>` section of the API docs.
 
 The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md
deleted file mode 100644
index bcccd284..00000000
--- a/docs/source/design/multimodal/adding_multimodal_plugin.md
+++ /dev/null
@@ -1,16 +0,0 @@
-(adding-multimodal-plugin)=
-
-# Adding a Multimodal Plugin
-
-This document teaches you how to add a new modality to vLLM.
-
-Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`.
-
-The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s.
-
-```{note}
-This article is a work in progress.
-```
-
-% TODO: Add more instructions on how to add new plugins once embeddings is in.
diff --git a/docs/source/dev/pooling_params.md b/docs/source/dev/pooling_params.md
deleted file mode 100644
index 74b2c574..00000000
--- a/docs/source/dev/pooling_params.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Pooling Parameters
-
-```{eval-rst}
-.. autoclass:: vllm.PoolingParams
-    :members:
-```
diff --git a/docs/source/dev/sampling_params.md b/docs/source/dev/sampling_params.md
deleted file mode 100644
index bdc36af5..00000000
--- a/docs/source/dev/sampling_params.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Sampling Parameters
-
-```{eval-rst}
-.. autoclass:: vllm.SamplingParams
-    :members:
-```
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 6b56918c..2808e1b3 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -42,7 +42,7 @@ The first line of this example imports the classes {class}`~vllm.LLM` and {class
 from vllm import LLM, SamplingParams
 ```
 
-The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html).
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
 
 ```python
 prompts = [
diff --git a/docs/source/index.md b/docs/source/index.md
index 11d3e24a..6747a7fc 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -137,10 +137,10 @@ community/sponsors
 :caption: API Reference
 :maxdepth: 2
 
-dev/sampling_params
-dev/pooling_params
-dev/offline_inference/offline_index
-dev/engine/engine_index
+api/offline_inference/index
+api/engine/index
+api/multimodal/index
+api/params
 ```
 
 % Design Documents: Details about vLLM internals
@@ -154,7 +154,6 @@ design/huggingface_integration
 design/plugin_system
 design/kernel/paged_attention
 design/input_processing/model_inputs_index
-design/multimodal/multimodal_index
 design/automatic_prefix_caching
 design/multiprocessing
 ```
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
index 83178f78..79092ab2 100644
--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -23,7 +23,7 @@ The available APIs depend on the type of model that is being run:
 Please refer to the above pages for more details about each API.
 
 ```{seealso}
-[API Reference](/dev/offline_inference/offline_index)
+[API Reference](/api/offline_inference/index)
 ```
 
 ## Configuration Options
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 022dd3ae..ec5a3675 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -195,7 +195,7 @@ Code example:
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
+The following [sampling parameters](#sampling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -226,7 +226,7 @@ Code example:
 
 #### Extra parameters
 
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported.
+The following [sampling parameters](#sampling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -259,7 +259,7 @@ Code example:
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
+The following [pooling parameters](#pooling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
@@ -447,7 +447,7 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported.
+The following [pooling parameters](#pooling-params) are supported.
 
 ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
 :language: python
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 7f4029e7..4941fbac 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -49,9 +49,6 @@ class MultiModalPlugin(ABC):
     process the same data differently). This registry is in turn used by
     :class:`~MultiModalRegistry` which acts at a higher level
     (i.e., the modality of the data).
-
-    See also:
-        :ref:`adding-multimodal-plugin`
     """
 
     def __init__(self) -> None:
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 8fdcc4b5..d5424618 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -99,12 +99,6 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 
 MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]]
 """
 A dictionary containing an entry for each modality type to input.
-
-Note:
-    This dictionary also accepts modality keys defined outside
-    :class:`MultiModalDataBuiltins` as long as a customized plugin
-    is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-    Read more on that :ref:`here <adding-multimodal-plugin>`.
 """
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 5f01eac4..9eceefb0 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -125,9 +125,6 @@ class MultiModalRegistry:
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
         """
         Register a multi-modal plugin so it can be recognized by vLLM.
-
-        See also:
-            :ref:`adding-multimodal-plugin`
         """
         data_type_key = plugin.get_data_key()
 
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 2635c0bc..b24b7e91 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -7,7 +7,7 @@ class PoolingParams(
     msgspec.Struct,
     omit_defaults=True,  # type: ignore[call-arg]
     array_like=True):  # type: ignore[call-arg]
-    """Pooling parameters for embeddings API.
+    """API parameters for pooling models. This is currently a placeholder.
 
     Attributes:
         additional_data: Any additional data needed for pooling.
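
Usage sketch for the consolidated parameters page (not part of the patch itself): both classes documented by the new `docs/source/api/params.md` are importable from the top-level `vllm` package, which is exactly what its `autoclass:: vllm.SamplingParams` and `autoclass:: vllm.PoolingParams` directives rely on. The model name and prompt below are illustrative.

```python
from vllm import LLM, PoolingParams, SamplingParams

# Sampling parameters, using the same values as the quickstart text
# touched by this patch (temperature 0.8, nucleus sampling 0.95).
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# PoolingParams is currently a placeholder, per the docstring change to
# vllm/pooling_params.py above; it only carries optional additional_data.
pooling_params = PoolingParams()

# Illustrative model name; any generative model supported by vLLM works here.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```

A side benefit of the new `(sampling-params)=` and `(pooling-params)=` targets is that the quickstart and OpenAI-compatible-server pages now link by anchor rather than by hard-coded `dev/...` URLs, so those links survive any future page moves.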