[Doc] Add better clarity for tensorizer usage (#4090)

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Sanger Steel 2024-04-15 16:28:25 -04:00 committed by GitHub
parent eb46fbfda2
commit d619ae2d19
3 changed files with 46 additions and 22 deletions


@@ -45,7 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
* "safetensors" will load the weights in the safetensors format.
* "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
* "dummy" will initialize the weights with random values, mainly for profiling.
-* "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
+* "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_ See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ to serialize a vLLM model, and for more information.
.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
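For orientation, a hedged sketch of what serving a tensorized model with this load format can look like, assembled from the flags this commit documents (`--load-format tensorizer`, `--tensorizer-uri`, and `--vllm-tensorized`); the model name and S3 path reuse the example values from the script docstring further down and are placeholders:

python -m vllm.entrypoints.openai.api_server \
    --model EleutherAI/gpt-j-6B \
    --load-format tensorizer \
    --tensorizer-uri s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors \
    --vllm-tensorized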


@@ -23,14 +23,16 @@ from vllm.model_executor.tensorizer_loader import TensorizerArgs
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
-deserialize vLLM models. These models can be loaded using tensorizer directly
-to the GPU extremely quickly. Tensor encryption and decryption is also
-supported, although libsodium must be installed to use it. Install
-vllm with tensorizer support using `pip install vllm[tensorizer]`.
+deserialize vLLM models. These models can be loaded using tensorizer
+to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption is also supported, although
+libsodium must be installed to use it. Install vllm with tensorizer support
+using `pip install vllm[tensorizer]`.

-To serialize a model, you can run something like this:
+To serialize a model, install vLLM from source, then run something
+like this from the root level of this repository:

-python tensorize_vllm_model.py \
+python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
@@ -38,31 +40,57 @@ python tensorize_vllm_model.py \
    --suffix vllm

Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
-and saves it to your S3 bucket. A local directory can also be used.
+and saves it to your S3 bucket. A local directory can also be used. This
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this
+script.

You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.

-To deserialize a model, you can run something like this:
+To deserialize a model, you can run something like this from the root
+level of this repository:

-python tensorize_vllm_model.py \
+python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
    --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors

Which downloads the model tensors from your S3 bucket and deserializes them.
+To provide S3 credentials, you can provide `--s3-access-key-id` and
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
+the OpenAI entrypoint, as arguments for LLM(), or as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.

You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.

-For more information on the available arguments, run
-`python tensorize_vllm_model.py --help`.
+For more information on the available arguments for serializing, run
+`python -m examples.tensorize_vllm_model serialize --help`.
+
+Or for deserializing:
+`python -m examples.tensorize_vllm_model deserialize --help`.
+
+Once a model is serialized, it can be used to load the model when running the
+OpenAI inference client at `vllm/entrypoints/openai/api_server.py` by providing
+the `--tensorizer-uri` CLI argument that is functionally the same as the
+`--path-to-tensors` argument in this script, along with `--vllm-tensorized`, to
+signify that the model to be deserialized is a vLLM model, rather than a
+HuggingFace `PreTrainedModel`, which can also be deserialized using tensorizer
+in the same inference server, albeit without the speed optimizations. To
+deserialize an encrypted file, the `--encryption-keyfile` argument can be used
+to provide the path to the keyfile used to encrypt the model weights. For
+information on all the arguments that can be used to configure tensorizer's
+deserialization, check out the tensorizer options argument group in the
+`vllm/entrypoints/openai/api_server.py` script with `--help`.
+
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+
+llm = LLM(model="facebook/opt-125m",
+          load_format="tensorizer",
+          tensorizer_uri=path_to_opt_tensors,
+          num_readers=3,
+          vllm_tensorized=True)
"""


@@ -126,7 +126,6 @@ class TensorizerArgs:
            "s3_endpoint": self.s3_endpoint,
        }
-        # Omitting self.dtype and self.device as this behaves weirdly
        self.deserializer_params = {
            "verify_hash": self.verify_hash,
            "encryption": self.encryption_keyfile,
@@ -145,7 +144,7 @@ class TensorizerArgs:
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Tensorizer CLI arguments"""
-        # Create the argument group
+        # Tensorizer options arg group
        group = parser.add_argument_group(
            'tensorizer options',
            description=('Options for configuring the behavior of the'
@@ -205,9 +204,7 @@ class TensorizerArgs:
    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
-        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
-        # Set the attributes from the parsed arguments.
        tensorizer_args = cls(**{
            attr: getattr(args, attr)
            for attr in attrs if hasattr(args, attr)
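A hypothetical round trip through these two helpers, assuming `add_cli_args` can be called directly on the class and that `--tensorizer-uri` is one of the options the "tensorizer options" group registers:

import argparse

parser = argparse.ArgumentParser()
parser = TensorizerArgs.add_cli_args(parser)  # adds the "tensorizer options" group
args = parser.parse_args(["--tensorizer-uri", "s3://my-bucket/model.tensors"])  # placeholder URI
tensorizer_args = TensorizerArgs.from_cli_args(args)  # rebuild the dataclass from the namespace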
@@ -291,7 +288,6 @@ class TensorizerAgent:
            nn.Module: The deserialized model.
        """
        before_mem = get_mem_usage()
-        # Lazy load the tensors from S3 into the model.
        start = time.perf_counter()
        with open_stream(
                self.tensorizer_args.tensorizer_uri,
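To make the measure-then-load pattern in this method concrete, here is a small self-contained sketch against tensorizer's public API; the local path and tiny module are placeholders, while vLLM's actual code streams from `tensorizer_uri` and also reports memory usage:

import time
import torch
from tensorizer import TensorDeserializer, TensorSerializer
from tensorizer.stream_io import open_stream

# Serialize a tiny module to a local file so the sketch is runnable on its own.
model = torch.nn.Linear(8, 8)
serializer = TensorSerializer(open_stream("/tmp/tiny.tensors", mode="wb"))
serializer.write_module(model)
serializer.close()

# Time the deserialization back into the same module, mirroring the timing above.
start = time.perf_counter()
with open_stream("/tmp/tiny.tensors", mode="rb") as stream:
    deserializer = TensorDeserializer(stream, device="cpu")
    deserializer.load_into_module(model)
    deserializer.close()
print(f"Deserialized weights in {time.perf_counter() - start:.4f} seconds")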