[CI/Build] vLLM cache directory for images (#6444)

Author: Cyrus Leung · 2024-07-16 14:12:25 +08:00 · committed by GitHub
parent 37d776606f
commit d97011512e
13 changed files with 123 additions and 140 deletions

.buildkite/download-images.sh (deleted)

@@ -1,14 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-
-# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
-mkdir -p images
-cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
-cd -
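
With the download script removed, examples and tests obtain the same two images through the new `vllm.assets` module introduced later in this commit. A minimal sketch of the replacement flow, using only the API added below:

```python
# Sketch: fetching a test image after this commit. The first access
# downloads stop_sign.jpg into VLLM_ASSETS_CACHE; subsequent accesses
# reuse the cached file instead of re-running wget in every CI job.
from vllm.assets.image import ImageAsset

image = ImageAsset("stop_sign").pil_image  # a PIL.Image.Image
print(image.size)
```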

.buildkite/test-pipeline.yaml

@@ -12,7 +12,6 @@ steps:
   fast_check_only: true
   commands:
   - pytest -v -s async_engine # Async Engine
-  - bash ../.buildkite/download-images.sh # Inputs
  - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -82,7 +81,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
-  - bash ../.buildkite/download-images.sh
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -155,7 +153,6 @@ steps:
 - label: Inputs Test
   #mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
@@ -175,7 +172,6 @@ steps:
 - label: Vision Language Models Test
   mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
   - pytest -v -s models -m vlm

 - label: Prefix Caching Test

examples/llava_example.py

@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM
+from vllm.assets.image import ImageAsset
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them


 def run_llava():
@@ -14,7 +7,7 @@ def run_llava():
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

-    image = Image.open("images/stop_sign.jpg")
+    image = ImageAsset("stop_sign").pil_image

     outputs = llm.generate({
         "prompt": prompt,
@@ -28,25 +21,5 @@ def run_llava():
         print(generated_text)


-def main():
-    run_llava()
-
-
 if __name__ == "__main__":
-    # Download from s3
+    run_llava()
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
-    main()

examples/paligemma_example.py

@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM
+from vllm.assets.image import ImageAsset
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them


 def run_paligemma():
@@ -14,7 +7,7 @@ def run_paligemma():
     prompt = "caption es"

-    image = Image.open("images/stop_sign.jpg")
+    image = ImageAsset("stop_sign").pil_image

     outputs = llm.generate({
         "prompt": prompt,
@@ -28,25 +21,5 @@ def run_paligemma():
         print(generated_text)


-def main():
-    run_paligemma()
-
-
 if __name__ == "__main__":
-    # Download from s3
+    run_paligemma()
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
-    main()

examples/phi3v_example.py

@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them


 def run_phi3v():
@@ -24,7 +17,7 @@ def run_phi3v():
         max_num_seqs=5,
     )

-    image = Image.open("images/cherry_blossom.jpg")
+    image = ImageAsset("cherry_blossom").pil_image

     # single-image prompt
     prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
@@ -44,19 +37,4 @@ def run_phi3v():


 if __name__ == "__main__":
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
     run_phi3v()

tests/conftest.py

@@ -3,11 +3,7 @@ import gc
 import os
 import sys
 from collections import UserList
-from dataclasses import dataclass
-from functools import cached_property
-from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar

 import pytest
 import torch
@@ -18,12 +14,12 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
                           AutoTokenizer, BatchEncoding)

 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
 from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal.utils import fetch_image
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
@@ -33,9 +29,6 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

-_IMAGE_DIR = Path(_TEST_DIR) / "images"
-"""You can use `.buildkite/download-images.sh` to download the assets."""


 def _read_prompts(filename: str) -> List[str]:
     with open(filename, "r") as f:
@@ -43,20 +36,6 @@ def _read_prompts(filename: str) -> List[str]:
     return prompts


-@dataclass(frozen=True)
-class ImageAsset:
-    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
-
-    @cached_property
-    def pil_image(self) -> Image.Image:
-        if self.name == "boardwalk":
-            return fetch_image(
-                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-            )
-        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
-
-
 class _ImageAssetPrompts(TypedDict):
     stop_sign: str
     cherry_blossom: str

vllm/assets/__init__.py (new, empty)

vllm/assets/base.py (new, 11 lines)

@@ -0,0 +1,11 @@
+from pathlib import Path
+
+import vllm.envs as envs
+
+
+def get_cache_dir():
+    """Get the path to the cache for storing downloaded assets."""
+    path = Path(envs.VLLM_ASSETS_CACHE)
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path
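
Since `envs.VLLM_ASSETS_CACHE` is resolved lazily (see the `__getattr__` hook in `vllm/envs.py` below), exporting the variable before first use redirects every asset download. A small sketch, where `/tmp/vllm-assets` is a hypothetical location chosen for illustration:

```python
# Sketch: redirecting the asset cache via the environment.
import os

os.environ["VLLM_ASSETS_CACHE"] = "/tmp/vllm-assets"  # hypothetical path

from vllm.assets.base import get_cache_dir

print(get_cache_dir())  # /tmp/vllm-assets, created if missing
```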

vllm/assets/image.py (new, 47 lines)

@@ -0,0 +1,47 @@
+import shutil
+from dataclasses import dataclass
+from functools import cached_property, lru_cache
+from typing import Literal
+
+import requests
+from PIL import Image
+
+from vllm.multimodal.utils import fetch_image
+
+from .base import get_cache_dir
+
+
+@lru_cache
+def get_air_example_data_2_asset(filename: str) -> Image.Image:
+    """
+    Download and open an image from
+    ``s3://air-example-data-2/vllm_opensource_llava/``.
+    """
+    image_directory = get_cache_dir() / "air-example-data-2"
+    image_directory.mkdir(parents=True, exist_ok=True)
+
+    image_path = image_directory / filename
+    if not image_path.exists():
+        base_url = "https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava"
+
+        with requests.get(f"{base_url}/{filename}", stream=True) as response:
+            response.raise_for_status()
+
+            with image_path.open("wb") as f:
+                shutil.copyfileobj(response.raw, f)
+
+    return Image.open(image_path)
+
+
+@dataclass(frozen=True)
+class ImageAsset:
+    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
+
+    @cached_property
+    def pil_image(self) -> Image.Image:
+        if self.name == "boardwalk":
+            return fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+            )
+
+        return get_air_example_data_2_asset(f"{self.name}.jpg")
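
Usage mirrors the `Image.open` calls it replaces in the examples; a short sketch:

```python
# Sketch: the new asset API. Each named asset is downloaded at most once
# per cache directory, and lru_cache memoizes the opened image within a
# process, so repeated lookups return the same PIL object.
from vllm.assets.image import ImageAsset

stop_sign = ImageAsset("stop_sign").pil_image
cherry_blossom = ImageAsset("cherry_blossom").pil_image
print(stop_sign.size, cherry_blossom.size)
```

Streaming the response through `shutil.copyfileobj` writes the download to disk without buffering the whole image in memory.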

vllm/distributed/device_communicators/custom_all_reduce_utils.py

@@ -189,10 +189,10 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
-    path = os.path.expanduser(
-        f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+    path = os.path.join(
+        envs.VLLM_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from vllm.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)
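
The probe result thus moves from the config tree to the cache tree. A sketch of the old and new locations under default settings (no `VLLM_CACHE_ROOT`, `VLLM_CONFIG_ROOT`, or XDG override), assuming two visible GPUs:

```python
# Sketch: old vs. new location of the GPU P2P access cache file.
import os

devices = "0,1"  # example CUDA_VISIBLE_DEVICES value
old = os.path.expanduser(
    f"~/.config/vllm/gpu_p2p_access_cache_for_{devices}.json")
new = os.path.join(os.path.expanduser("~/.cache/vllm"),
                   f"gpu_p2p_access_cache_for_{devices}.json")
print(old, new, sep="\n")
```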

vllm/envs.py

@@ -17,7 +17,8 @@ if TYPE_CHECKING:
     S3_ACCESS_KEY_ID: Optional[str] = None
     S3_SECRET_ACCESS_KEY: Optional[str] = None
     S3_ENDPOINT_URL: Optional[str] = None
-    VLLM_CONFIG_ROOT: str = ""
+    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
+    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
     VLLM_NO_USAGE_STATS: bool = False
     VLLM_DO_NOT_TRACK: bool = False
@@ -31,10 +32,11 @@ if TYPE_CHECKING:
     VLLM_OPENVINO_KVCACHE_SPACE: int = 0
     VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
     VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
-    VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
+    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -45,6 +47,21 @@ if TYPE_CHECKING:
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False

+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
@@ -89,15 +106,28 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv('VERBOSE', '0'))),

     # Root directory for VLLM configuration files
+    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
     # files during **installation**.
     "VLLM_CONFIG_ROOT":
-    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
-        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "vllm"),
+        )),

     # ================== Runtime Env Vars ==================

+    # Root directory for VLLM cache files
+    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
+    "VLLM_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "vllm"),
+        )),
+
     # used in distributed environment to determine the master address
     'VLLM_HOST_IP':
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
@@ -242,6 +272,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_WORKER_MULTIPROC_METHOD":
     lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),

+    # Path to the cache for storing downloaded assets
+    "VLLM_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "vllm", "assets"),
+        )),
+
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
     "VLLM_IMAGE_FETCH_TIMEOUT":
@@ -250,7 +288,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Path to the XLA persistent cache directory.
     # Only used for XLA devices such as TPUs.
     "VLLM_XLA_CACHE_PATH":
-    lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_XLA_CACHE_PATH",
+            os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
+        )),

     "VLLM_FUSED_MOE_CHUNK_SIZE":
     lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
@@ -262,7 +304,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
 # end-env-vars-definition


-def __getattr__(name):
+def __getattr__(name: str):
     # lazy evaluation of environment variables
     if name in environment_variables:
         return environment_variables[name]()
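
All of the new variables go through the same lazy-resolution mechanism: a module-level `__getattr__` (PEP 562) dispatches to a dict of zero-argument lambdas, so the environment is re-read on every attribute access instead of being frozen at import time. A self-contained sketch of the pattern, condensed to one variable:

```python
# Sketch of the lazy env-var pattern used by vllm/envs.py.
import os
from typing import Any, Callable, Dict

environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_CACHE_ROOT":
    lambda: os.path.expanduser(
        os.getenv(
            "VLLM_CACHE_ROOT",
            os.path.join(
                os.getenv("XDG_CACHE_HOME",
                          os.path.join(os.path.expanduser("~"), ".cache")),
                "vllm"),
        )),
}


def __getattr__(name: str):
    # Called only for attributes not found in the module, so each access
    # to e.g. envs.VLLM_CACHE_ROOT re-evaluates the lambda.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```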

vllm/usage/usage_lib.py

@@ -19,9 +19,8 @@ import vllm.envs as envs
 from vllm.version import __version__ as VLLM_VERSION

 _config_home = envs.VLLM_CONFIG_ROOT
-_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
-_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
-                                              "vllm/do_not_track")
+_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json")
+_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")

 _USAGE_STATS_ENABLED = None
 _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER

vllm/worker/tpu_worker.py

@@ -98,8 +98,7 @@ class TPUWorker(LoraNotSupportedWorkerBase):
         # Use persistent cache to avoid XLA recompilation.
         # NOTE(woosuk): This does not completely eliminate the recompilation
         # overhead because dynamo does not cache the compiled results.
-        xr.initialize_cache(os.path.expanduser(envs.VLLM_XLA_CACHE_PATH),
-                            readonly=False)
+        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH, readonly=False)

     def load_model(self):
         self.model_runner.load_model()