# SPDX-License-Identifier: Apache-2.0
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""

# version.py should be an independent library, and we always import the
# version library first. This assumption is critical for some customizations.
from .version import __version__, __version_tuple__ # isort:skip
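
# A minimal sketch of how downstream code can read the version info; the
# printed value is illustrative:
#
#     import vllm
#     print(vllm.__version__)        # e.g. "0.7.2"
#     major, minor = vllm.__version_tuple__[:2]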

import os

import torch

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.llm import LLM
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
from vllm.model_executor.models import ModelRegistry
from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
                          CompletionOutput, EmbeddingOutput,
                          EmbeddingRequestOutput, PoolingOutput,
                          PoolingRequestOutput, RequestOutput, ScoringOutput,
                          ScoringRequestOutput)
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams

# Set some common config/environment variables that should be set
# for all processes created by vLLM and for all processes
# that interact with vLLM workers.
# They are executed whenever `import vllm` is called.

# Disable NCCL's cuMem allocator, which has been reported to cause problems;
# see https://github.com/NVIDIA/nccl/issues/1234
os.environ['NCCL_CUMEM_ENABLE'] = '0'

# Force TorchInductor to compile in a single thread, avoiding issues seen
# with concurrent compile workers;
# see https://github.com/vllm-project/vllm/issues/10480
os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
# The environment variable may not take effect if Inductor's config was
# already initialized, so set the config value directly as well;
# see https://github.com/vllm-project/vllm/issues/10619
torch._inductor.config.compile_threads = 1
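
# A quick sanity check that the settings above took effect (a sketch, meant
# to run after `import vllm`):
#
#     import os
#     import torch
#     import vllm  # noqa: F401  (triggers the settings above)
#
#     assert os.environ["NCCL_CUMEM_ENABLE"] == "0"
#     assert torch._inductor.config.compile_threads == 1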

__all__ = [
    "__version__",
    "__version_tuple__",
    "LLM",
    "ModelRegistry",
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
]
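
# A minimal usage sketch of the public API exported above; the model name is
# illustrative, and this is kept as a comment so that `import vllm` has no
# side effects beyond the settings above:
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="facebook/opt-125m")
#     params = SamplingParams(temperature=0.8, max_tokens=64)
#     for output in llm.generate(["Hello, my name is"], params):
#         print(output.outputs[0].text)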