[Doc] Add better clarity for tensorizer usage (#4090)

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>

parent eb46fbfda2
commit d619ae2d19
@@ -45,7 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
 * "safetensors" will load the weights in the safetensors format.
 * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
 * "dummy" will initialize the weights with random values, mainly for profiling.
-* "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
+* "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer <https://github.com/coreweave/tensorizer>`_. See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ to serialize a vLLM model, and for more information.

 .. option:: --dtype {auto,half,float16,bfloat16,float,float32}

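For context on the "tensorizer" load format documented above, a minimal, self-contained sketch of loading pre-serialized weights through the `LLM` class follows. It mirrors the example this commit adds to the script docstring further down; the S3 URI is a placeholder for tensors you have already serialized:

    from vllm import LLM

    # The URI is a placeholder; point it at weights previously written by
    # the `serialize` subcommand of examples/tensorize_vllm_model.py.
    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
              tensorizer_uri="s3://my-bucket/vllm/facebook/opt-125m/vllm/model.tensors",
              num_readers=3,
              vllm_tensorized=True)

    print(llm.generate("Hello, my name is")[0].outputs[0].text)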
@@ -23,14 +23,16 @@ from vllm.model_executor.tensorizer_loader import TensorizerArgs
 # yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and
-deserialize vLLM models. These models can be loaded using tensorizer directly
-to the GPU extremely quickly. Tensor encryption and decryption is also
-supported, although libsodium must be installed to use it. Install
-vllm with tensorizer support using `pip install vllm[tensorizer]`.
+deserialize vLLM models. These models can be loaded using tensorizer
+to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption is also supported, although
+libsodium must be installed to use it. Install vllm with tensorizer support
+using `pip install vllm[tensorizer]`.

-To serialize a model, you can run something like this:
+To serialize a model, install vLLM from source, then run something
+like this from the root level of this repository:

-python tensorize_vllm_model.py \
+python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
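Under the hood, the `serialize` subcommand boils down to tensorizer's `TensorSerializer`. A minimal sketch of that core call, assuming tensorizer's public API; the HuggingFace model here stands in for the vLLM-loaded module the script actually serializes:

    import torch
    from tensorizer import TensorSerializer
    from transformers import AutoModelForCausalLM

    # Stand-in for the vLLM-loaded model the script serializes.
    model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B",
                                                 torch_dtype=torch.float16)

    # Write every tensor in the module to the target; local paths and
    # S3/HTTP(S) URIs are both accepted.
    serializer = TensorSerializer("model.tensors")
    serializer.write_module(model)
    serializer.close()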
@@ -38,31 +40,57 @@ python tensorize_vllm_model.py \
    --suffix vllm

 Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
-and saves it to your S3 bucket. A local directory can also be used.
+and saves it to your S3 bucket. A local directory can also be used. This
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this
+script.

 You can also encrypt the model weights with a randomly-generated key by
 providing a `--keyfile` argument.

-To deserialize a model, you can run something like this:
+To deserialize a model, you can run something like this from the root
+level of this repository:

-python tensorize_vllm_model.py \
+python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
    --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors

 Which downloads the model tensors from your S3 bucket and deserializes them.
-To provide S3 credentials, you can provide `--s3-access-key-id` and
-`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
-the OpenAI entrypoint, as arguments for LLM(), or as environment variables
-in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.

 You can also provide a `--keyfile` argument to decrypt the model weights if
 they were serialized with encryption.

-For more information on the available arguments, run
-`python tensorize_vllm_model.py --help`.
+For more information on the available arguments for serializing, run
+`python -m examples.tensorize_vllm_model serialize --help`.

+Or for deserializing:
+
+`python -m examples.tensorize_vllm_model deserialize --help`.
+
+Once a model is serialized, it can be used to load the model when running the
+OpenAI inference client at `vllm/entrypoints/openai/api_server.py` by providing
+the `--tensorizer-uri` CLI argument that is functionally the same as the
+`--path-to-tensors` argument in this script, along with `--vllm-tensorized`, to
+signify that the model to be deserialized is a vLLM model, rather than a
+HuggingFace `PreTrainedModel`, which can also be deserialized using tensorizer
+in the same inference server, albeit without the speed optimizations. To
+deserialize an encrypted file, the `--encryption-keyfile` argument can be used
+to provide the path to the keyfile used to encrypt the model weights. For
+information on all the arguments that can be used to configure tensorizer's
+deserialization, check out the tensorizer options argument group in the
+`vllm/entrypoints/openai/api_server.py` script with `--help`.
+
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+
+llm = LLM(model="facebook/opt-125m",
+          load_format="tensorizer",
+          tensorizer_uri=path_to_opt_tensors,
+          num_readers=3,
+          vllm_tensorized=True)
 """

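The deserialization path the new docstring describes reduces to tensorizer's `TensorDeserializer`. A sketch under the assumption of tensorizer's public API, building the model skeleton without allocating weights so the serialized tensors can be streamed straight in; the S3 URI is a placeholder:

    import time

    from tensorizer import TensorDeserializer
    from tensorizer.utils import no_init_or_tensor
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("EleutherAI/gpt-j-6B")
    # Skip weight allocation/initialization; tensors arrive from the stream.
    model = no_init_or_tensor(lambda: AutoModelForCausalLM.from_config(config))

    start = time.perf_counter()
    # Accepts local paths as well as S3/HTTP(S) URIs.
    deserializer = TensorDeserializer(
        "s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors")
    deserializer.load_into_module(model)
    deserializer.close()
    print(f"Deserialized in {time.perf_counter() - start:.1f} s")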
@@ -126,7 +126,6 @@ class TensorizerArgs:
             "s3_endpoint": self.s3_endpoint,
         }

-        # Omitting self.dtype and self.device as this behaves weirdly
         self.deserializer_params = {
             "verify_hash": self.verify_hash,
             "encryption": self.encryption_keyfile,
@@ -145,7 +144,7 @@ class TensorizerArgs:
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         """Tensorizer CLI arguments"""

-        # Create the argument group
+        # Tensorizer options arg group
         group = parser.add_argument_group(
             'tensorizer options',
             description=('Options for configuring the behavior of the'
@@ -205,9 +204,7 @@ class TensorizerArgs:

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
-        # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
-        # Set the attributes from the parsed arguments.
         tensorizer_args = cls(**{
             attr: getattr(args, attr)
             for attr in attrs if hasattr(args, attr)
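The `from_cli_args` classmethod above pairs with the argument-group registration in the previous hunk. A sketch of the round trip, assuming the registering method is this file's `add_cli_args` classmethod (its name is not visible in the hunk) and using only the `--tensorizer-uri` flag the docstring names:

    import argparse

    from vllm.model_executor.tensorizer_loader import TensorizerArgs

    # Register the "tensorizer options" group, parse, then rebuild the
    # dataclass from whatever matching attributes the namespace carries.
    parser = argparse.ArgumentParser()
    parser = TensorizerArgs.add_cli_args(parser)
    args = parser.parse_args([
        "--tensorizer-uri", "s3://my-bucket/vllm/model.tensors",
    ])
    tensorizer_args = TensorizerArgs.from_cli_args(args)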
@@ -291,7 +288,6 @@ class TensorizerAgent:
             nn.Module: The deserialized model.
         """
         before_mem = get_mem_usage()
-        # Lazy load the tensors from S3 into the model.
         start = time.perf_counter()
         with open_stream(
                 self.tensorizer_args.tensorizer_uri,