From 11c3b98491dcd8860d2aa6cac7b9ab2cd0bd2b53 Mon Sep 17 00:00:00 2001
From: "wang.yuqi"
Date: Thu, 17 Apr 2025 21:37:37 +0800
Subject: [PATCH] [Doc] Document Matryoshka Representation Learning support
 (#16770)

---
 docs/source/models/pooling_models.md | 74 +++++++++++++++++++
 .../openai_embedding_matryoshka_fy.py | 36 +++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 examples/online_serving/openai_embedding_matryoshka_fy.py

diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index dbcd846c..5f1c2b5b 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -141,3 +141,77 @@ Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints tha
- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models.

## Matryoshka Embeddings

[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.

:::{warning}
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.

For example, setting the `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
```json
{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
```

:::

### Manually enable Matryoshka Embeddings

There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`.

For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online).

Here is an example to serve a model with Matryoshka Embeddings enabled.

```text
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}'
```

### Offline Inference

You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the `dimensions` parameter in {class}`~vllm.PoolingParams`.

```python
from vllm import LLM, PoolingParams

model = LLM(model="jinaai/jina-embeddings-v3",
            task="embed",
            trust_remote_code=True)
outputs = model.embed(["Follow the white rabbit."],
                      pooling_params=PoolingParams(dimensions=32))
print(outputs[0].outputs)
```

A code example can be found here:

### Online Inference

Use the following command to start the vLLM server.

```text
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
```

You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the `dimensions` parameter.
```text
curl http://127.0.0.1:8000/v1/embeddings \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "input": "Follow the white rabbit.",
    "model": "jinaai/jina-embeddings-v3",
    "encoding_format": "float",
    "dimensions": 1
  }'
```

Expected output:

```json
{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}}
```

An OpenAI client example can be found here:

diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py
new file mode 100644
index 00000000..27ab8cb6
--- /dev/null
+++ b/examples/online_serving/openai_embedding_matryoshka_fy.py
@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for embedding API dimensions using vLLM API server
NOTE:
    start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
    vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
"""

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    responses = client.embeddings.create(
        input=["Follow the white rabbit."],
        model=model,
        dimensions=1,
    )

    for data in responses.data:
        print(data.embedding)  # List of float of len 1


if __name__ == "__main__":
    main()