From d97011512e5a816acbdb5bd8ffbf691dd227fe27 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 16 Jul 2024 14:12:25 +0800
Subject: [PATCH] [CI/Build] vLLM cache directory for images (#6444)

---
 .buildkite/download-images.sh  | 14 -----
 .buildkite/test-pipeline.yaml  |  4 --
 examples/llava_example.py      | 33 ++----------
 examples/paligemma_example.py  | 33 ++----------
 examples/phi3v_example.py      | 26 +--------
 tests/conftest.py              | 25 +--------
 vllm/assets/__init__.py        |  0
 vllm/assets/base.py            | 11 ++++
 vllm/assets/image.py           | 47 ++++++++++++++++
 .../custom_all_reduce_utils.py |  8 +--
 vllm/envs.py                   | 54 ++++++++++++++++---
 vllm/usage/usage_lib.py        |  5 +-
 vllm/worker/tpu_worker.py      |  3 +-
 13 files changed, 123 insertions(+), 140 deletions(-)
 delete mode 100644 .buildkite/download-images.sh
 create mode 100644 vllm/assets/__init__.py
 create mode 100644 vllm/assets/base.py
 create mode 100644 vllm/assets/image.py

diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
deleted file mode 100644
index 360a7584..00000000
--- a/.buildkite/download-images.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-
-# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
-mkdir -p images
-cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
-
-cd -
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4019cc00..cd3a5e80 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -12,7 +12,6 @@ steps:
   fast_check_only: true
   commands:
   - pytest -v -s async_engine # Async Engine
-  - bash ../.buildkite/download-images.sh # Inputs
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -82,7 +81,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
-  - bash ../.buildkite/download-images.sh
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -155,7 +153,6 @@ steps:
 - label: Inputs Test
   #mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
 
@@ -175,7 +172,6 @@ steps:
 - label: Vision Language Models Test
   mirror_hardwares: [amd]
   commands:
-  - bash ../.buildkite/download-images.sh
   - pytest -v -s models -m vlm
 
 - label: Prefix Caching Test
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 382d153c..4c9eabd2 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
+from vllm.assets.image import ImageAsset
 
 
 def run_llava():
@@ -14,7 +7,7 @@ def run_llava():
 
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
-    image = Image.open("images/stop_sign.jpg")
+    image = ImageAsset("stop_sign").pil_image
 
     outputs = llm.generate({
         "prompt": prompt,
@@ -28,25 +21,5 @@ def run_llava():
         print(generated_text)
 
 
-def main():
-    run_llava()
-
-
 if __name__ == "__main__":
-    # Download from s3
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
-    main()
+    run_llava()
diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py
index b315eafe..92a3cb3a 100644
--- a/examples/paligemma_example.py
+++ b/examples/paligemma_example.py
@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
+from vllm.assets.image import ImageAsset
 
 
 def run_paligemma():
@@ -14,7 +7,7 @@ def run_paligemma():
 
     prompt = "caption es"
 
-    image = Image.open("images/stop_sign.jpg")
+    image = ImageAsset("stop_sign").pil_image
 
     outputs = llm.generate({
         "prompt": prompt,
@@ -28,25 +21,5 @@ def run_paligemma():
         print(generated_text)
 
 
-def main():
-    run_paligemma()
-
-
 if __name__ == "__main__":
-    # Download from s3
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
-    main()
+    run_paligemma()
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index b605d4c6..ae8c38d8 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -1,12 +1,5 @@
-import os
-import subprocess
-
-from PIL import Image
-
 from vllm import LLM, SamplingParams
-
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
+from vllm.assets.image import ImageAsset
 
 
 def run_phi3v():
@@ -24,7 +17,7 @@ def run_phi3v():
         max_num_seqs=5,
     )
 
-    image = Image.open("images/cherry_blossom.jpg")
+    image = ImageAsset("cherry_blossom").pil_image
 
     # single-image prompt
     prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
@@ -44,19 +37,4 @@ def run_phi3v():
 
 
 if __name__ == "__main__":
-    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
-    local_directory = "images"
-
-    # Make sure the local directory exists or create it
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Use AWS CLI to sync the directory, assume anonymous access
-    subprocess.check_call([
-        "aws",
-        "s3",
-        "sync",
-        s3_bucket_path,
-        local_directory,
-        "--no-sign-request",
-    ])
     run_phi3v()
diff --git a/tests/conftest.py b/tests/conftest.py
index 608a5f49..17f75d94 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,11 +3,7 @@ import gc
 import os
 import sys
 from collections import UserList
-from dataclasses import dataclass
-from functools import cached_property
-from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar
 
 import pytest
 import torch
@@ -18,12 +14,12 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
                           AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
 from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal.utils import fetch_image
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
@@ -33,9 +29,6 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 
-_IMAGE_DIR = Path(_TEST_DIR) / "images"
-"""You can use `.buildkite/download-images.sh` to download the assets."""
-
 
 def _read_prompts(filename: str) -> List[str]:
     with open(filename, "r") as f:
@@ -43,20 +36,6 @@ def _read_prompts(filename: str) -> List[str]:
     return prompts
 
 
-@dataclass(frozen=True)
-class ImageAsset:
-    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
-
-    @cached_property
-    def pil_image(self) -> Image.Image:
-        if self.name == "boardwalk":
-            return fetch_image(
-                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-            )
-
-        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
-
-
 class _ImageAssetPrompts(TypedDict):
     stop_sign: str
     cherry_blossom: str
diff --git a/vllm/assets/__init__.py b/vllm/assets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
new file mode 100644
index 00000000..18ca2fe6
--- /dev/null
+++ b/vllm/assets/base.py
@@ -0,0 +1,11 @@
+from pathlib import Path
+
+import vllm.envs as envs
+
+
+def get_cache_dir():
+    """Get the path to the cache for storing downloaded assets."""
+    path = Path(envs.VLLM_ASSETS_CACHE)
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
new file mode 100644
index 00000000..a526db73
--- /dev/null
+++ b/vllm/assets/image.py
@@ -0,0 +1,47 @@
+import shutil
+from dataclasses import dataclass
+from functools import cached_property, lru_cache
+from typing import Literal
+
+import requests
+from PIL import Image
+
+from vllm.multimodal.utils import fetch_image
+
+from .base import get_cache_dir
+
+
+@lru_cache
+def get_air_example_data_2_asset(filename: str) -> Image.Image:
+    """
+    Download and open an image from
+    ``s3://air-example-data-2/vllm_opensource_llava/``.
+    """
+    image_directory = get_cache_dir() / "air-example-data-2"
+    image_directory.mkdir(parents=True, exist_ok=True)
+
+    image_path = image_directory / filename
+    if not image_path.exists():
+        base_url = "https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava"
+
+        with requests.get(f"{base_url}/{filename}", stream=True) as response:
+            response.raise_for_status()
+
+            with image_path.open("wb") as f:
+                shutil.copyfileobj(response.raw, f)
+
+    return Image.open(image_path)
+
+
+@dataclass(frozen=True)
+class ImageAsset:
+    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
+
+    @cached_property
+    def pil_image(self) -> Image.Image:
+        if self.name == "boardwalk":
+            return fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+            )
+
+        return get_air_example_data_2_asset(f"{self.name}.jpg")
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 6f1aaed9..d27d7ee9 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -189,10 +189,10 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
-    VLLM_CONFIG_ROOT = envs.VLLM_CONFIG_ROOT
-    path = os.path.expanduser(
-        f"{VLLM_CONFIG_ROOT}/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
-    )
+
+    path = os.path.join(
+        envs.VLLM_CACHE_ROOT,
+        f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
     os.makedirs(os.path.dirname(path), exist_ok=True)
     from vllm.distributed.parallel_state import get_world_group
     if ((not is_distributed or get_world_group().local_rank == 0)
diff --git a/vllm/envs.py b/vllm/envs.py
index 85d60f32..f3b6d278 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -17,7 +17,8 @@ if TYPE_CHECKING:
     S3_ACCESS_KEY_ID: Optional[str] = None
     S3_SECRET_ACCESS_KEY: Optional[str] = None
     S3_ENDPOINT_URL: Optional[str] = None
-    VLLM_CONFIG_ROOT: str = ""
+    VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
+    VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
     VLLM_NO_USAGE_STATS: bool = False
     VLLM_DO_NOT_TRACK: bool = False
@@ -31,10 +32,11 @@ if TYPE_CHECKING:
     VLLM_OPENVINO_KVCACHE_SPACE: int = 0
     VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
     VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
-    VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
+    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
     VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
+    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
@@ -45,6 +47,21 @@ if TYPE_CHECKING:
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False
 
+
+def get_default_cache_root():
+    return os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache"),
+    )
+
+
+def get_default_config_root():
+    return os.getenv(
+        "XDG_CONFIG_HOME",
+        os.path.join(os.path.expanduser("~"), ".config"),
+    )
+
+
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
 
@@ -89,15 +106,28 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
 
     # Root directory for VLLM configuration files
+    # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set
     # Note that this not only affects how vllm finds its configuration files
     # during runtime, but also affects how vllm installs its configuration
     # files during **installation**.
     "VLLM_CONFIG_ROOT":
-    lambda: os.environ.get("VLLM_CONFIG_ROOT", None) or os.getenv(
-        "XDG_CONFIG_HOME", None) or os.path.expanduser("~/.config"),
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CONFIG_ROOT",
+            os.path.join(get_default_config_root(), "vllm"),
+        )),
 
     # ================== Runtime Env Vars ==================
 
+    # Root directory for VLLM cache files
+    # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set
+    "VLLM_CACHE_ROOT":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_CACHE_ROOT",
+            os.path.join(get_default_cache_root(), "vllm"),
+        )),
+
     # used in distributed environment to determine the master address
     'VLLM_HOST_IP':
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
 
@@ -242,6 +272,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_WORKER_MULTIPROC_METHOD":
     lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
+    # Path to the cache for storing downloaded assets
+    "VLLM_ASSETS_CACHE":
+    lambda: os.path.expanduser(
+        os.getenv(
+            "VLLM_ASSETS_CACHE",
+            os.path.join(get_default_cache_root(), "vllm", "assets"),
+        )),
+
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
     "VLLM_IMAGE_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
 
@@ -250,7 +288,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Path to the XLA persistent cache directory.
     # Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH": - lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"), + lambda: os.path.expanduser( + os.getenv( + "VLLM_ASSETS_CACHE", + os.path.join(get_default_cache_root(), "vllm", "xla_cache"), + )), "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")), @@ -262,7 +304,7 @@ environment_variables: Dict[str, Callable[[], Any]] = { # end-env-vars-definition -def __getattr__(name): +def __getattr__(name: str): # lazy evaluation of environment variables if name in environment_variables: return environment_variables[name]() diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 6907d8b9..fb6a6d85 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -19,9 +19,8 @@ import vllm.envs as envs from vllm.version import __version__ as VLLM_VERSION _config_home = envs.VLLM_CONFIG_ROOT -_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json") -_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, - "vllm/do_not_track") +_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "usage_stats.json") +_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track") _USAGE_STATS_ENABLED = None _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 60fee989..9bf764f0 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -98,8 +98,7 @@ class TPUWorker(LoraNotSupportedWorkerBase): # Use persistent cache to avoid XLA recompilation. # NOTE(woosuk): This does not completely eliminate the recompilation # overhead because dynamo does not cache the compiled results. - xr.initialize_cache(os.path.expanduser(envs.VLLM_XLA_CACHE_PATH), - readonly=False) + xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH, readonly=False) def load_model(self): self.model_runner.load_model()