vllm/tests/test_regression.py

"""Containing tests that check for regressions in vLLM's behavior.

It should include tests for issues reported by users, to make sure they
never happen again.

"""
import gc

import torch

from vllm import LLM, SamplingParams


def test_duplicated_ignored_sequence_group():
    """Regression test for https://github.com/vllm-project/vllm/issues/1655.

    Submits one short prompt together with one very long prompt (a phrase
    repeated 1000 times) and checks that exactly one output comes back per
    prompt.
    """
    # NOTE(review): the long prompt presumably exceeds what the engine will
    # accept and ends up ignored -- confirm against the linked issue.
    short_prompt = "This is a short prompt"
    long_prompt = "This is a very long prompt " * 1000
    prompts = [short_prompt, long_prompt]

    params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=256)
    engine = LLM(
        model="facebook/opt-125m",
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
    )

    results = engine.generate(prompts, sampling_params=params)

    # One result per submitted prompt: no more, no fewer.
    assert len(results) == len(prompts)


def test_max_tokens_none():
    """Generate with ``max_tokens=None`` and expect one output per prompt."""
    params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
    engine = LLM(
        model="facebook/opt-125m",
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
    )

    prompts = ["Just say hello!"]
    results = engine.generate(prompts, sampling_params=params)

    # Generation must complete and yield a result for the single prompt.
    assert len(results) == len(prompts)


def test_gc():
    """Check that CUDA memory is released once an ``LLM`` is deleted.

    Builds an ``LLM``, drops the only reference to it, forces a garbage
    collection pass and empties the CUDA cache, then asserts that the memory
    still allocated through PyTorch is below 50 MiB.
    """
    engine = LLM("facebook/opt-125m", enforce_eager=True)
    del engine

    # Collect any reference cycles first so the cached blocks become free
    # before the CUDA caching allocator is asked to release them.
    gc.collect()
    torch.cuda.empty_cache()

    # Model weights and KV cache should be gone by now. Whatever PyTorch and
    # others still hold is expected to stay under this budget (usually it is
    # around 10MB).
    mib = 1024 * 1024
    budget_bytes = 50 * mib
    assert torch.cuda.memory_allocated() < budget_bytes


if __name__ == "__main__":
    # Allow running this file directly as a script: hand it to pytest as the
    # only test target.
    import pytest

    pytest.main(args=[__file__])
[Minor] Fix duplication of ignored seq group in engine step (#1666) 2023-11-16 13:11:41 -08:00			`"""Containing tests that check for regressions in vLLM's behavior.`

			`It should include tests that are reported by users and making sure they`
			`will never happen again.`

			`"""`
[BugFix] Fix GC bug for `LLM` class (#2882) 2024-02-14 22:17:44 -08:00			`import gc`

			`import torch`

[Minor] Fix duplication of ignored seq group in engine step (#1666) 2023-11-16 13:11:41 -08:00			`from vllm import LLM, SamplingParams`


			`def test_duplicated_ignored_sequence_group():`
			`"""https://github.com/vllm-project/vllm/issues/1655"""`

			`sampling_params = SamplingParams(temperature=0.01,`
			`top_p=0.1,`
			`max_tokens=256)`
			`llm = LLM(model="facebook/opt-125m",`
			`max_num_batched_tokens=4096,`
			`tensor_parallel_size=1)`
			`prompts = ["This is a short prompt", "This is a very long prompt " * 1000]`
			`outputs = llm.generate(prompts, sampling_params=sampling_params)`

			`assert len(prompts) == len(outputs)`


[Bugfix] fix crash if max_tokens=None (#2570) 2024-01-23 22:38:55 -08:00			`def test_max_tokens_none():`
			`sampling_params = SamplingParams(temperature=0.01,`
			`top_p=0.1,`
			`max_tokens=None)`
			`llm = LLM(model="facebook/opt-125m",`
			`max_num_batched_tokens=4096,`
			`tensor_parallel_size=1)`
			`prompts = ["Just say hello!"]`
			`outputs = llm.generate(prompts, sampling_params=sampling_params)`

			`assert len(prompts) == len(outputs)`


[BugFix] Fix GC bug for `LLM` class (#2882) 2024-02-14 22:17:44 -08:00			`def test_gc():`
			`llm = LLM("facebook/opt-125m", enforce_eager=True)`
			`del llm`

			`gc.collect()`
			`torch.cuda.empty_cache()`

			`# The memory allocated for model and KV cache should be released.`
			`# The memory allocated for PyTorch and others should be less than 50MB.`
			`# Usually, it's around 10MB.`
			`allocated = torch.cuda.memory_allocated()`
			`assert allocated < 50 * 1024 * 1024`


[Minor] Fix duplication of ignored seq group in engine step (#1666) 2023-11-16 13:11:41 -08:00			`if __name__ == "__main__":`
			`import pytest`
			`pytest.main([__file__])`