[CI/Build] vLLM cache directory for images (#6444)
commit d97011512e (parent 37d776606f)
.buildkite/download-images.sh (deleted)
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-
-# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
-mkdir -p images
-cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
-
-cd -
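Note: with this change the test images are no longer pre-downloaded by a shell script; they are fetched lazily at runtime through the new vllm.assets.image.ImageAsset helper added below. A minimal sketch of the replacement flow, assuming the default cache location:

    # First use downloads the image into VLLM_ASSETS_CACHE
    # (default ~/.cache/vllm/assets); later uses hit the on-disk cache.
    from vllm.assets.image import ImageAsset

    image = ImageAsset("stop_sign").pil_image
    print(image.size)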
.buildkite/test-pipeline.yaml
@@ -12,7 +12,6 @@ steps:
   fast_check_only: true
   commands:
   - pytest -v -s async_engine # Async Engine
-  - bash ../.buildkite/download-images.sh # Inputs
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -82,7 +81,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
-  - bash ../.buildkite/download-images.sh
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -155,7 +153,6 @@ steps:
 - label: Inputs Test
   #mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
 
@@ -175,7 +172,6 @@ steps:
 - label: Vision Language Models Test
   mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
   - pytest -v -s models -m vlm
 
 - label: Prefix Caching Test
examples/llava_example.py
@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM
+from vllm.assets.image import ImageAsset
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
 
 
 def run_llava():
@@ -14,7 +7,7 @@ def run_llava():
 
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
-    image = Image.open("images/stop_sign.jpg")
+    image = ImageAsset("stop_sign").pil_image
 
     outputs = llm.generate({
         "prompt": prompt,
@@ -28,25 +21,5 @@ def run_llava():
         print(generated_text)
 
 
-def main():
-    run_llava()
-
-
 if __name__ == "__main__":
-    # Download from s3
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
-    main()
+    run_llava()
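The same migration applies to each example that follows: the manual S3 sync block disappears and the scripts become self-contained. Roughly, the updated flow looks like this sketch (the model name and the multi_modal_data key are assumptions taken from the surrounding file, not shown in these hunks):

    from vllm import LLM
    from vllm.assets.image import ImageAsset

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # assumed model name
    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
    image = ImageAsset("stop_sign").pil_image

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image},  # assumed key
    })
    for o in outputs:
        print(o.outputs[0].text)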
examples/paligemma_example.py
@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM
+from vllm.assets.image import ImageAsset
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
 
 
 def run_paligemma():
@@ -14,7 +7,7 @@ def run_paligemma():
 
     prompt = "caption es"
 
-    image = Image.open("images/stop_sign.jpg")
+    image = ImageAsset("stop_sign").pil_image
 
     outputs = llm.generate({
         "prompt": prompt,
@@ -28,25 +21,5 @@ def run_paligemma():
         print(generated_text)
 
 
-def main():
-    run_paligemma()
-
-
 if __name__ == "__main__":
-    # Download from s3
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
-    main()
+    run_paligemma()
examples/phi3v_example.py
@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
 
 
 def run_phi3v():
@@ -24,7 +17,7 @@ def run_phi3v():
         max_num_seqs=5,
     )
 
-    image = Image.open("images/cherry_blossom.jpg")
+    image = ImageAsset("cherry_blossom").pil_image
 
     # single-image prompt
     prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
@@ -44,19 +37,4 @@ def run_phi3v():
 
 
 if __name__ == "__main__":
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
     run_phi3v()
tests/conftest.py
@@ -3,11 +3,7 @@ import gc
 import os
 import sys
 from collections import UserList
-from dataclasses import dataclass
-from functools import cached_property
-from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar
 
 import pytest
 import torch
@@ -18,12 +14,12 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
                           AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
 from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal.utils import fetch_image
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
@@ -33,9 +29,6 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 
-_IMAGE_DIR = Path(_TEST_DIR) / "images"
-"""You can use `.buildkite/download-images.sh` to download the assets."""
-
 
 def _read_prompts(filename: str) -> List[str]:
     with open(filename, "r") as f:
@@ -43,20 +36,6 @@ def _read_prompts(filename: str) -> List[str]:
         return prompts
 
 
-@dataclass(frozen=True)
-class ImageAsset:
-    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
-
-    @cached_property
-    def pil_image(self) -> Image.Image:
-        if self.name == "boardwalk":
-            return fetch_image(
-                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-            )
-
-        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
-
-
 class _ImageAssetPrompts(TypedDict):
     stop_sign: str
     cherry_blossom: str
vllm/assets/__init__.py (new, empty file)
vllm/assets/base.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+from pathlib import Path
+
+import vllm.envs as envs
+
+
+def get_cache_dir():
+    """Get the path to the cache for storing downloaded assets."""
+    path = Path(envs.VLLM_ASSETS_CACHE)
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path
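get_cache_dir() is the single place where the assets cache is resolved and created; callers join subdirectories onto the returned Path. A usage sketch, mirroring how image.py below uses it:

    from vllm.assets.base import get_cache_dir

    # Returns a pathlib.Path and guarantees the directory exists.
    asset_dir = get_cache_dir() / "air-example-data-2"
    asset_dir.mkdir(parents=True, exist_ok=True)
    print(asset_dir)  # ~/.cache/vllm/assets/air-example-data-2 with default settings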
vllm/assets/image.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import shutil
+from dataclasses import dataclass
+from functools import cached_property, lru_cache
+from typing import Literal
+
+import requests
+from PIL import Image
+
+from vllm.multimodal.utils import fetch_image
+
+from .base import get_cache_dir
+
+
+@lru_cache
+def get_air_example_data_2_asset(filename: str) -> Image.Image:
+    """
+    Download and open an image from
+    ``s3://air-example-data-2/vllm_opensource_llava/``.
+    """
+    image_directory = get_cache_dir() / "air-example-data-2"
+    image_directory.mkdir(parents=True, exist_ok=True)
+
+    image_path = image_directory / filename
+    if not image_path.exists():
+        base_url = "https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava"
+
+        with requests.get(f"{base_url}/{filename}", stream=True) as response:
+            response.raise_for_status()
+
+            with image_path.open("wb") as f:
+                shutil.copyfileobj(response.raw, f)
+
+    return Image.open(image_path)
+
+
+@dataclass(frozen=True)
+class ImageAsset:
+    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
+
+    @cached_property
+    def pil_image(self) -> Image.Image:
+        if self.name == "boardwalk":
+            return fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+            )
+
+        return get_air_example_data_2_asset(f"{self.name}.jpg")
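Because get_air_example_data_2_asset is memoized with lru_cache and checks image_path.exists(), each image is downloaded at most once per cache directory and opened at most once per process. Typical usage, as exercised by the updated examples and tests:

    from vllm.assets.image import ImageAsset

    stop_sign = ImageAsset("stop_sign").pil_image  # fetched from S3 over HTTPS, then cached on disk
    boardwalk = ImageAsset("boardwalk").pil_image  # fetched from Wikimedia via fetch_image()
    print(stop_sign.size, boardwalk.size)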
vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -189,10 +189,10 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
-    path = os.path.expanduser(
-        f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+    path = os.path.join(
+        envs.VLLM_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from vllm.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)
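The P2P-access cache file thereby moves from the config tree into the new cache tree. A sketch of the resulting default paths, assuming CUDA_VISIBLE_DEVICES="0,1":

    import os

    devices = "0,1"  # example value of CUDA_VISIBLE_DEVICES
    old_path = os.path.expanduser(
        f"~/.config/vllm/gpu_p2p_access_cache_for_{devices}.json")
    new_path = os.path.join(os.path.expanduser("~/.cache/vllm"),
                            f"gpu_p2p_access_cache_for_{devices}.json")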
vllm/envs.py
@@ -17,7 +17,8 @@ if TYPE_CHECKING:
     S3_ACCESS_KEY_ID: Optional[str] = None
     S3_SECRET_ACCESS_KEY: Optional[str] = None
     S3_ENDPOINT_URL: Optional[str] = None
-    VLLM_CONFIG_ROOT: str = ""
+    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
+    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
     VLLM_NO_USAGE_STATS: bool = False
     VLLM_DO_NOT_TRACK: bool = False
@@ -31,10 +32,11 @@ if TYPE_CHECKING:
     VLLM_OPENVINO_KVCACHE_SPACE: int = 0
     VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
     VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
-    VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
+    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -45,6 +47,21 @@ if TYPE_CHECKING:
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False
 
+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
 
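These two helpers make every vLLM directory honor the XDG base-directory convention. The resolution order they produce, sketched standalone for the cache root:

    import os

    def resolve_cache_root() -> str:
        # 1. explicit VLLM_CACHE_ROOT, else 2. $XDG_CACHE_HOME/vllm, else 3. ~/.cache/vllm
        default_cache = os.getenv("XDG_CACHE_HOME",
                                  os.path.join(os.path.expanduser("~"), ".cache"))
        return os.path.expanduser(
            os.getenv("VLLM_CACHE_ROOT", os.path.join(default_cache, "vllm")))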
@@ -89,15 +106,28 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
 
     # Root directory for VLLM configuration files
+    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
     # files during **installation**.
     "VLLM_CONFIG_ROOT":
-    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
-        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "vllm"),
+        )),
 
     # ================== Runtime Env Vars ==================
 
+    # Root directory for VLLM cache files
+    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
+    "VLLM_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "vllm"),
+        )),
+
     # used in distributed environment to determine the master address
     'VLLM_HOST_IP':
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
|
|||||||
"VLLM_WORKER_MULTIPROC_METHOD":
|
"VLLM_WORKER_MULTIPROC_METHOD":
|
||||||
lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
|
lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
|
||||||
|
|
||||||
|
# Path to the cache for storing downloaded assets
|
||||||
|
"VLLM_ASSETS_CACHE":
|
||||||
|
lambda: os.path.expanduser(
|
||||||
|
os.getenv(
|
||||||
|
"VLLM_ASSETS_CACHE",
|
||||||
|
os.path.join(get_default_cache_root(), "vllm", "assets"),
|
||||||
|
)),
|
||||||
|
|
||||||
# Timeout for fetching images when serving multimodal models
|
# Timeout for fetching images when serving multimodal models
|
||||||
# Default is 5 seconds
|
# Default is 5 seconds
|
||||||
"VLLM_IMAGE_FETCH_TIMEOUT":
|
"VLLM_IMAGE_FETCH_TIMEOUT":
|
||||||
@@ -250,7 +288,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Path to the XLA persistent cache directory.
     # Only used for XLA devices such as TPUs.
     "VLLM_XLA_CACHE_PATH":
-    lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
+        )),
 
     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
@@ -262,7 +304,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
 # end-env-vars-definition
 
 
-def __getattr__(name):
+def __getattr__(name: str):
     # lazy evaluation of environment variables
     if name in environment_variables:
         return environment_variables[name]()
vllm/usage/usage_lib.py
@@ -19,9 +19,8 @@ import vllm.envs as envs
 from vllm.version import __version__ as VLLM_VERSION
 
 _config_home = envs.VLLM_CONFIG_ROOT
-_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
-_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
-                                              "vllm/do_not_track")
+_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
+_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
 _USAGE_STATS_ENABLED = None
 _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
 
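Since VLLM_CONFIG_ROOT now already ends in ".../vllm", dropping the hard-coded "vllm/" prefix leaves the default paths byte-for-byte identical:

    import os

    old = os.path.join(os.path.expanduser("~/.config"), "vllm/usage_stats.json")
    new = os.path.join(os.path.expanduser("~/.config/vllm"), "usage_stats.json")
    assert old == new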
vllm/worker/tpu_worker.py
@@ -98,8 +98,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         # Use persistent cache to avoid XLA recompilation.
         # NOTE(woosuk): This does not completely eliminate the recompilation
         # overhead because dynamo does not cache the compiled results.
-        xr.initialize_cache(os.path.expanduser(envs.VLLM_XLA_CACHE_PATH),
-                            readonly=False)
+        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH, readonly=False)
 
     def load_model(self):
         self.model_runner.load_model()