[MISC] Consolidate cleanup() and refactor offline_inference_with_prefix.py (#9510)
commit d11bf435a0
parent 9bb10a7d27
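
This commit exposes a single cleanup_dist_env_and_memory() helper from vllm.distributed and removes the per-directory cleanup() copies in the test conftests. For orientation before the hunks below, here is a condensed sketch of the refactored prefix-caching example flow; the prompt is illustrative and the full script's prompt construction and output printing are elided, so treat it as a sketch of the new pattern rather than the exact file contents.

    from vllm import LLM, SamplingParams
    from vllm.distributed import cleanup_dist_env_and_memory

    prompts = ["Hello, my name is"]  # illustrative prompt only
    sampling_params = SamplingParams(temperature=0.0)

    # Baseline LLM without prefix caching.
    regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
    print(regular_llm.generate(prompts, sampling_params))

    # Tear down the first engine so its GPU memory is actually released.
    del regular_llm
    cleanup_dist_env_and_memory()

    # The second engine can now reuse the same 40% gpu_memory_utilization
    # budget instead of requesting a larger share.
    prefix_cached_llm = LLM(model="facebook/opt-125m",
                            enable_prefix_caching=True,
                            gpu_memory_utilization=0.4)
    print(prefix_cached_llm.generate(prompts, sampling_params))
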
@@ -1,4 +1,5 @@
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 
 # NOTE: This is just a running example. For benchmarking purpose,
 # please see benchmarks/benchmark_prefix_caching.py
@@ -28,14 +29,9 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)
 
-# Create an LLM.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3)
+# Create an LLM without prefix caching as a baseline.
+regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
 
-# The second LLM needs to request a higher gpu_memory_utilization because
-# the first LLM has already allocated a full 30% of the gpu memory.
-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.6)
 print("Results without `enable_prefix_caching`")
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
@@ -52,6 +48,15 @@ for output in outputs:
 
 print("-" * 80)
 
+# Destroy the LLM object and free up the GPU memory.
+del regular_llm
+cleanup_dist_env_and_memory()
+
+# Create an LLM with prefix caching enabled.
+prefix_cached_llm = LLM(model="facebook/opt-125m",
+                        enable_prefix_caching=True,
+                        gpu_memory_utilization=0.4)
+
 # Warmup so that the shared prompt's KV cache is computed.
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
@@ -12,11 +12,11 @@ import torch
 
 from vllm import SamplingParams
 from vllm.config import ParallelConfig
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 from vllm.outputs import RequestOutput as RealRequestOutput
 from vllm.sampling_params import RequestOutputKind
 
-from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
 
 
@@ -157,7 +157,7 @@ async def async_engine():
         engine.shutdown_background_loop()
         del engine
         await asyncio.sleep(0.1)
-        cleanup()
+        cleanup_dist_env_and_memory()
 
 
 @pytest.fixture()
@@ -1,5 +1,3 @@
-import contextlib
-import gc
 import json
 import os
 import sys
@@ -27,8 +25,7 @@ from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel,
+from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
                               initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
@@ -140,17 +137,7 @@ def dist_init():
     )
     initialize_model_parallel(1, 1)
     yield
-    cleanup()
-
-
-def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    gc.collect()
-    if not is_cpu():
-        torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
 
 
 @pytest.fixture()
@@ -167,7 +154,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
 def cleanup_fixture(should_do_global_cleanup_after_test: bool):
     yield
     if should_do_global_cleanup_after_test:
-        cleanup()
+        cleanup_dist_env_and_memory()
 
 
 @pytest.fixture(autouse=True)
@@ -606,7 +593,7 @@ class HfRunner:
 
     def __exit__(self, exc_type, exc_value, traceback):
         del self.model
-        cleanup()
+        cleanup_dist_env_and_memory()
 
 
 @pytest.fixture(scope="session")
@@ -861,7 +848,7 @@ class VllmRunner:
 
     def __exit__(self, exc_type, exc_value, traceback):
         del self.model
-        cleanup()
+        cleanup_dist_env_and_memory()
 
 
 @pytest.fixture(scope="session")
@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional
 import pytest
 
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.utils import set_random_seed
 
-from ....conftest import cleanup
-
 
 @pytest.fixture
 def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
 
         yield llm
         del llm
-        cleanup()
+        cleanup_dist_env_and_memory()
 
     for llm in generator_inner():
         yield llm
@@ -4,8 +4,7 @@ from typing import List
 import pytest
 
 from vllm import LLM, EmbeddingRequestOutput, PoolingParams
-
-from ...conftest import cleanup
+from vllm.distributed import cleanup_dist_env_and_memory
 
 MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 
@@ -41,7 +40,7 @@ def llm():
 
         del llm
 
-    cleanup()
+    cleanup_dist_env_and_memory()
 
 
 def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
@@ -4,8 +4,7 @@ from typing import List
 import pytest
 
 from vllm import LLM, RequestOutput, SamplingParams
-
-from ...conftest import cleanup
+from vllm.distributed import cleanup_dist_env_and_memory
 
 MODEL_NAME = "facebook/opt-125m"
 
@@ -39,7 +38,7 @@ def llm():
 
         del llm
 
-    cleanup()
+    cleanup_dist_env_and_memory()
 
 
 def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
@@ -5,10 +5,9 @@ import pytest
 from huggingface_hub import snapshot_download
 
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 
-from ...conftest import cleanup
-
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 PROMPTS = [
@@ -39,7 +38,7 @@ def llm():
 
         del llm
 
-    cleanup()
+    cleanup_dist_env_and_memory()
 
 
 @pytest.fixture(scope="module")
@@ -5,12 +5,11 @@ import weakref
 import jsonschema
 import pytest
 
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
-from ...conftest import cleanup
-
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
@@ -23,7 +22,7 @@ def llm():
     with llm.deprecate_legacy_api():
         yield weakref.proxy(llm)
         del llm
-        cleanup()
+        cleanup_dist_env_and_memory()
 
 
 @pytest.mark.skip_global_cleanup
@@ -1,6 +1,7 @@
 import sys
 
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 
 
 def test_lazy_outlines(sample_regex):
@@ -14,6 +15,7 @@ def test_lazy_outlines(sample_regex):
     ]
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
+    # Create an LLM without guided decoding as a baseline.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               gpu_memory_utilization=0.3)
@@ -26,8 +28,11 @@ def test_lazy_outlines(sample_regex):
     # make sure outlines is not imported
     assert 'outlines' not in sys.modules
 
-    # The second LLM needs to request a higher gpu_memory_utilization because
-    # the first LLM has already allocated a full 30% of the gpu memory.
+    # Destroy the LLM object and free up the GPU memory.
+    del llm
+    cleanup_dist_env_and_memory()
+
+    # Create an LLM with guided decoding enabled.
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               guided_decoding_backend="lm-format-enforcer",
@@ -6,8 +6,7 @@ import weakref
 import pytest
 
 from vllm import LLM
-
-from ...conftest import cleanup
+from vllm.distributed import cleanup_dist_env_and_memory
 
 MODEL_NAME = "facebook/opt-125m"
 
@@ -27,7 +26,7 @@ def llm():
 
         del llm
 
-    cleanup()
+    cleanup_dist_env_and_memory()
 
 
 @pytest.mark.skip_global_cleanup
@@ -1,20 +1,16 @@
-import contextlib
-import gc
 import tempfile
 from collections import OrderedDict
 from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch
 
 import pytest
-import ray
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download
 
 import vllm
 from vllm.config import LoRAConfig
-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel,
+from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
                               initialize_model_parallel)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -48,16 +44,6 @@ LONG_LORA_INFOS: List[ContextIDInfo] = [{
 }]
 
 
-def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    gc.collect()
-    torch.cuda.empty_cache()
-    ray.shutdown()
-
-
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
     """Allow subdirectories to skip global cleanup by overriding this fixture.
@@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
 def cleanup_fixture(should_do_global_cleanup_after_test: bool):
     yield
     if should_do_global_cleanup_after_test:
-        cleanup()
+        cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
 @pytest.fixture
@@ -87,7 +73,7 @@ def dist_init():
     )
     initialize_model_parallel(1, 1)
     yield
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
 @pytest.fixture
@@ -238,7 +224,7 @@ def long_context_lora_files_32k():
 def long_context_infos(long_context_lora_files_16k_1,
                        long_context_lora_files_16k_2,
                        long_context_lora_files_32k):
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
     infos: Dict[int, ContextInfo] = {}
     for lora_checkpoint_info in LONG_LORA_INFOS:
         lora_id = lora_checkpoint_info["lora_id"]
@@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1,
 
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings():
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
     get_model_old = get_model
 
     def get_model_patched(*, model_config, device_config, **kwargs):
@@ -272,7 +258,7 @@ def llama_2_7b_engine_extra_embeddings():
         engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
     yield engine.llm_engine
     del engine
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
 @pytest.fixture
@@ -3,10 +3,9 @@ from typing import List
 import pytest
 
 import vllm
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 
-from .conftest import cleanup
-
 MODEL_PATH = "baichuan-inc/Baichuan-7B"
 
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
@@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
     output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
 
     del llm_tp1
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     llm_tp2 = vllm.LLM(MODEL_PATH,
                        enable_lora=True,
@@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
     output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
 
     del llm_tp2
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     assert output_tp1 == output_tp2
 
@@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
     output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
 
     del llm_tp4
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     assert output_tp1 == output_tp4
@@ -4,10 +4,9 @@ import pytest
 import ray
 
 import vllm
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 
-from .conftest import cleanup
-
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 
 
@@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
     output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
 
     del llm_tp1
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     llm_tp2 = vllm.LLM(MODEL_PATH,
                        enable_lora=True,
@@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
     output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
 
     del llm_tp2
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     assert output_tp1 == output_tp2
 
@@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
     output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
 
     del llm_tp4
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     assert output_tp1 == output_tp4
 
@@ -6,11 +6,10 @@ from typing import List
 import pytest
 
 import vllm
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 from vllm.utils import is_hip
 
-from .conftest import cleanup
-
 
 @dataclass
 class ModelWithQuantization:
@@ -160,7 +159,7 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
     print("removing lora")
 
     del llm
-    cleanup()
+    cleanup_dist_env_and_memory()
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
     output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
 
     del llm_tp1
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     llm_tp2 = vllm.LLM(
         model=model.model_path,
@@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
     output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
 
     del llm_tp2
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     assert output_tp1 == output_tp2
@@ -6,13 +6,12 @@ import ray
 from prometheus_client import REGISTRY
 
 from vllm import EngineArgs, LLMEngine
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.metrics import RayPrometheusStatLogger
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import cleanup
-
 MODELS = [
     "facebook/opt-125m",
 ]
@@ -307,7 +306,7 @@ def test_metric_spec_decode_interval(
 
     finally:
         del engine
-        cleanup()
+        cleanup_dist_env_and_memory()
 
 
 def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
@@ -6,7 +6,7 @@ import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
-from ....conftest import _ImageAssets, cleanup
+from ....conftest import _ImageAssets
 
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
@@ -45,12 +45,13 @@ def run_intern_vit_test(
         for pixel_value in pixel_values
     ]
 
+    from vllm.distributed import cleanup_dist_env_and_memory
    from vllm.model_executor.models.intern_vit import InternVisionModel
     vllm_model = InternVisionModel(config)
     vllm_model.load_weights(hf_model.state_dict().items())
 
     del hf_model
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     vllm_model = vllm_model.to("cuda", dtype)
     vllm_outputs_per_image = [
@@ -58,7 +59,7 @@ def run_intern_vit_test(
         for pixel_value in pixel_values
     ]
     del vllm_model
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     cos_similar = nn.CosineSimilarity(dim=-1)
     for vllm_output, hf_output in zip(vllm_outputs_per_image,
@@ -4,8 +4,8 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
 import pytest
 
-from tests.conftest import cleanup
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 
 MODEL_LEN_LEN = [
     # Example models with sliding window.
@@ -31,7 +31,7 @@ def test_disable_sliding_window(model_len_len, ):
         model_config.max_model_len)
 
     del vllm_disabled_model
-    cleanup()
+    cleanup_dist_env_and_memory()
 
     vllm_enabled_model = LLM(model, disable_sliding_window=False)
     vllm_enabled_model.generate("Hi my name is")
@@ -41,4 +41,4 @@ def test_disable_sliding_window(model_len_len, ):
         model_config.max_model_len)
 
     del vllm_enabled_model
-    cleanup()
+    cleanup_dist_env_and_memory()
@@ -4,10 +4,10 @@ from typing import List, Optional, Sequence, Tuple, Union
 import pytest
 
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import PromptLogprobs, SampleLogprobs
 
-from ...conftest import cleanup
 from ...models.utils import (TokensTextLogprobs,
                              TokensTextLogprobsPromptLogprobs,
                              check_logprobs_close, check_outputs_equal)
@@ -44,7 +44,7 @@ def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
         yield llm
 
         del llm
-        cleanup()
+        cleanup_dist_env_and_memory()
 
     return generate
 
@@ -1,27 +1,18 @@
-import contextlib
 import functools
 import gc
 from typing import Callable, TypeVar
 
 import pytest
-import ray
 import torch
 from typing_extensions import ParamSpec
 
-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel)
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
 
 @pytest.fixture(autouse=True)
 def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    ray.shutdown()
-    gc.collect()
-    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
 _P = ParamSpec("_P")
@@ -20,6 +20,7 @@ If you only need to use the distributed environment without model/pipeline
 steps.
 """
 import contextlib
+import gc
 import pickle
 import weakref
 from collections import namedtuple
@@ -36,7 +37,7 @@ from torch.distributed import Backend, ProcessGroup
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import supports_custom_op
+from vllm.utils import is_cpu, supports_custom_op
 
 
 @dataclass
@@ -1129,6 +1130,19 @@ def destroy_distributed_environment():
         torch.distributed.destroy_process_group()
 
 
+def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    if shutdown_ray:
+        import ray  # Lazy import Ray
+        ray.shutdown()
+    gc.collect()
+    if not is_cpu():
+        torch.cuda.empty_cache()
+
+
 def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
     """
     This is a collective operation that returns if each rank is in the same node