vllm/tests/conftest.py

import contextlib
import gc
import os
from typing import List, Optional, Tuple

import pytest
import torch
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoProcessor,
                          LlavaForConditionalGeneration)

from vllm import LLM, SamplingParams
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.distributed import destroy_model_parallel
from vllm.sequence import MultiModalData
from vllm.transformers_utils.tokenizer import get_tokenizer

_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

# Multi modal related
_PIXEL_VALUES_FILES = [
    os.path.join(_TEST_DIR, "images", filename) for filename in
    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
]
_IMAGE_FEATURES_FILES = [
    os.path.join(_TEST_DIR, "images", filename) for filename in
    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
]
_IMAGE_FILES = [
    os.path.join(_TEST_DIR, "images", filename)
    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
]
_IMAGE_PROMPTS = [
    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
    "<image>\nUSER: What is the season?\nASSISTANT:"
]
assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
    _IMAGE_FILES) == len(_IMAGE_PROMPTS)


def _read_prompts(filename: str) -> List[str]:
    with open(filename, "r") as f:
        prompts = f.readlines()
        return prompts


def cleanup():
    destroy_model_parallel()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()


@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Allow subdirectories to skip global cleanup by overriding this fixture.
    This can provide a ~10x speedup for non-GPU unit tests since they don't need
    to initialize torch.
    """

    if request.node.get_closest_marker("skip_global_cleanup"):
        return False

    return True


@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
    if should_do_global_cleanup_after_test:
        cleanup()


@pytest.fixture(scope="session")
def hf_image_prompts() -> List[str]:
    return _IMAGE_PROMPTS


@pytest.fixture(scope="session")
def hf_images() -> List[Image.Image]:
    return [Image.open(filename) for filename in _IMAGE_FILES]


@pytest.fixture()
def vllm_images(request) -> "torch.Tensor":
    vision_language_config = request.getfixturevalue("model_and_config")[1]
    all_images = []
    if vision_language_config.image_input_type == (
            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
        filenames = _IMAGE_FEATURES_FILES
    else:
        filenames = _PIXEL_VALUES_FILES
    for filename in filenames:
        all_images.append(torch.load(filename))
    return torch.concat(all_images, dim=0)


@pytest.fixture()
def vllm_image_prompts(request) -> List[str]:
    vision_language_config = request.getfixturevalue("model_and_config")[1]
    return [
        "<image>" * (vision_language_config.image_feature_size - 1) + p
        for p in _IMAGE_PROMPTS
    ]


@pytest.fixture
def example_prompts() -> List[str]:
    prompts = []
    for filename in _TEST_PROMPTS:
        prompts += _read_prompts(filename)
    return prompts


@pytest.fixture
def example_long_prompts() -> List[str]:
    prompts = []
    for filename in _LONG_PROMPTS:
        prompts += _read_prompts(filename)
    return prompts


_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.half,
    "bfloat16": torch.bfloat16,
    "float": torch.float,
}

_VISION_LANGUAGE_MODELS = {
    "llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,
}


class HfRunner:

    def __init__(
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
    ) -> None:
        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        self.model_name = model_name
        if model_name not in _VISION_LANGUAGE_MODELS:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            ).cuda()
            self.processor = None
        else:
            self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            ).cuda()
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
            )
        if tokenizer_name is None:
            tokenizer_name = model_name
        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)

    def generate(
        self,
        prompts: List[str],
        images: Optional[List[Image.Image]] = None,
        **kwargs,
    ) -> List[Tuple[List[int], str]]:
        outputs: List[Tuple[List[int], str]] = []
        if images:
            assert len(prompts) == len(images)
        for i, prompt in enumerate(prompts):
            if self.model_name not in _VISION_LANGUAGE_MODELS:
                input_ids = self.tokenizer(prompt,
                                           return_tensors="pt").input_ids
                inputs = {"input_ids": input_ids.cuda()}
            else:
                image = images[i] if images else None
                inputs = self.processor(text=prompt,
                                        images=image,
                                        return_tensors="pt")
                inputs = {
                    key: value.cuda() if value is not None else None
                    for key, value in inputs.items()
                }
            output_ids = self.model.generate(
                **inputs,
                use_cache=True,
                **kwargs,
            )
            output_str = self.tokenizer.batch_decode(
                output_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            output_ids = output_ids.cpu().tolist()
            outputs.append((output_ids, output_str))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                images=images)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            outputs[i] = (output_ids[0], output_str[0])
        return outputs

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
                                num_return_sequences=beam_width)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            for j in range(len(output_ids)):
                output_ids[j] = [
                    x for x in output_ids[j]
                    if x != self.tokenizer.pad_token_id
                ]
            outputs[i] = (output_ids, output_str)
        return outputs

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[List[torch.Tensor]]:
        all_logprobs = []
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output = self.model.generate(
                input_ids.cuda(),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
            )
            seq_logprobs = []
            for hidden_states in output.hidden_states:
                last_hidden_states = hidden_states[-1][0]
                logits = torch.matmul(
                    last_hidden_states,
                    self.model.get_output_embeddings().weight.t(),
                )
                if self.model.get_output_embeddings().bias is not None:
                    logits += self.model.get_output_embeddings(
                    ).bias.unsqueeze(0)
                logprobs = torch.nn.functional.log_softmax(logits,
                                                           dim=-1,
                                                           dtype=torch.float32)
                seq_logprobs.append(logprobs)
            all_logprobs.append(seq_logprobs)
        return all_logprobs

    def __del__(self):
        del self.model
        cleanup()


@pytest.fixture
def hf_runner():
    return HfRunner


class VllmRunner:

    def __init__(
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len=1024,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        swap_space=4,
        **kwargs,
    ) -> None:
        self.model = LLM(
            model=model_name,
            tokenizer=tokenizer_name,
            trust_remote_code=True,
            dtype=dtype,
            swap_space=swap_space,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def generate(
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
        if images is not None:
            assert len(prompts) == images.shape[0]
        req_outputs = self.model.generate(
            prompts,
            sampling_params=sampling_params,
            multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
                                            data=images)
            if images is not None else None)
        outputs = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids = []
            req_sample_output_strs = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = sample.token_ids
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append(prompt_str + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    def generate_w_logprobs(
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
    ) -> List[Tuple[List[int], str]]:
        assert sampling_params.logprobs is not None

        req_outputs = self.model.generate(prompts,
                                          sampling_params=sampling_params)
        outputs = []
        for req_output in req_outputs:
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = sample.token_ids
                output_logprobs = sample.logprobs
            outputs.append((output_ids, output_str, output_logprobs))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional[torch.Tensor] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
    ) -> List[Tuple[List[int], str]]:
        greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                max_tokens=max_tokens,
                                                logprobs=num_logprobs)
        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)

        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
                                            max_tokens=max_tokens)
        outputs = self.generate(prompts, beam_search_params)
        return outputs

    def __del__(self):
        del self.model
        cleanup()


@pytest.fixture(scope="session")
def vllm_runner():
    return VllmRunner


def get_tokenizer_pool_config(tokenizer_group_type):
    if tokenizer_group_type is None:
        return None
    if tokenizer_group_type == "ray":
        return TokenizerPoolConfig(pool_size=1,
                                   pool_type="ray",
                                   extra_config={})
    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`import contextlib`
			`import gc`
[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00			`import os`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`from typing import List, Optional, Tuple`

			`import pytest`
			`import torch`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`from PIL import Image`
			`from transformers import (AutoModelForCausalLM, AutoProcessor,`
			`LlavaForConditionalGeneration)`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00
			`from vllm import LLM, SamplingParams`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`from vllm.config import TokenizerPoolConfig, VisionLanguageConfig`
[Core][Refactor] move parallel_utils into vllm/distributed (#3950) [WIP][Core][Refactor] move vllm/model_executor/parallel_utils into vllm/distributed and vllm/device_communicators (#3950) 2024-04-10 15:33:30 -07:00			`from vllm.distributed import destroy_model_parallel`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`from vllm.sequence import MultiModalData`
[CI] Try introducing isort. (#3495) 2024-03-25 23:59:47 +09:00			`from vllm.transformers_utils.tokenizer import get_tokenizer`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00
[BUGFIX] Fix the path of test prompts (#2273) 2023-12-27 02:37:21 +08:00			`_TEST_DIR = os.path.dirname(__file__)`
			`_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]`
			`_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]`
[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`# Multi modal related`
			`_PIXEL_VALUES_FILES = [`
			`os.path.join(_TEST_DIR, "images", filename) for filename in`
			`["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]`
			`]`
			`_IMAGE_FEATURES_FILES = [`
			`os.path.join(_TEST_DIR, "images", filename) for filename in`
			`["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]`
			`]`
			`_IMAGE_FILES = [`
			`os.path.join(_TEST_DIR, "images", filename)`
			`for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]`
			`]`
			`_IMAGE_PROMPTS = [`
			`"<image>\nUSER: What's the content of the image?\nASSISTANT:",`
			`"<image>\nUSER: What is the season?\nASSISTANT:"`
			`]`
			`assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(`
			`_IMAGE_FILES) == len(_IMAGE_PROMPTS)`

[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00
Fix `vllm:prompt_tokens_total` metric calculation (#2869) 2024-02-19 09:55:41 +02:00			`def _read_prompts(filename: str) -> List[str]:`
[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00			`with open(filename, "r") as f:`
Fix `vllm:prompt_tokens_total` metric calculation (#2869) 2024-02-19 09:55:41 +02:00			`prompts = f.readlines()`
			`return prompts`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00

[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`def cleanup():`
			`destroy_model_parallel()`
			`with contextlib.suppress(AssertionError):`
			`torch.distributed.destroy_process_group()`
			`gc.collect()`
			`torch.cuda.empty_cache()`


[Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by opting-out of GPU cleanup (#3783) 2024-04-01 17:49:51 -07:00			`@pytest.fixture()`
[Misc] Add pytest marker to opt-out of global test cleanup (#3863) 2024-04-04 21:54:16 -07:00			`def should_do_global_cleanup_after_test(request) -> bool:`
[Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by opting-out of GPU cleanup (#3783) 2024-04-01 17:49:51 -07:00			`"""Allow subdirectories to skip global cleanup by overriding this fixture.`
			`This can provide a ~10x speedup for non-GPU unit tests since they don't need`
			`to initialize torch.`
			`"""`
[Misc] Add pytest marker to opt-out of global test cleanup (#3863) 2024-04-04 21:54:16 -07:00
			`if request.node.get_closest_marker("skip_global_cleanup"):`
			`return False`

[Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by opting-out of GPU cleanup (#3783) 2024-04-01 17:49:51 -07:00			`return True`


[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`@pytest.fixture(autouse=True)`
[Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by opting-out of GPU cleanup (#3783) 2024-04-01 17:49:51 -07:00			`def cleanup_fixture(should_do_global_cleanup_after_test: bool):`
[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`yield`
[Misc] [CI/Build] Speed up block manager CPU-only unit tests ~10x by opting-out of GPU cleanup (#3783) 2024-04-01 17:49:51 -07:00			`if should_do_global_cleanup_after_test:`
			`cleanup()`
[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00

[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`@pytest.fixture(scope="session")`
			`def hf_image_prompts() -> List[str]:`
			`return _IMAGE_PROMPTS`


			`@pytest.fixture(scope="session")`
			`def hf_images() -> List[Image.Image]:`
			`return [Image.open(filename) for filename in _IMAGE_FILES]`


			`@pytest.fixture()`
			`def vllm_images(request) -> "torch.Tensor":`
			`vision_language_config = request.getfixturevalue("model_and_config")[1]`
			`all_images = []`
			`if vision_language_config.image_input_type == (`
			`VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):`
			`filenames = _IMAGE_FEATURES_FILES`
			`else:`
			`filenames = _PIXEL_VALUES_FILES`
			`for filename in filenames:`
			`all_images.append(torch.load(filename))`
			`return torch.concat(all_images, dim=0)`


			`@pytest.fixture()`
			`def vllm_image_prompts(request) -> List[str]:`
			`vision_language_config = request.getfixturevalue("model_and_config")[1]`
			`return [`
			`"<image>" * (vision_language_config.image_feature_size - 1) + p`
			`for p in _IMAGE_PROMPTS`
			`]`


Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`@pytest.fixture`
			`def example_prompts() -> List[str]:`
[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00			`prompts = []`
			`for filename in _TEST_PROMPTS:`
[BUGFIX] Fix the path of test prompts (#2273) 2023-12-27 02:37:21 +08:00			`prompts += _read_prompts(filename)`
[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00			`return prompts`


			`@pytest.fixture`
			`def example_long_prompts() -> List[str]:`
			`prompts = []`
			`for filename in _LONG_PROMPTS:`
[BUGFIX] Fix the path of test prompts (#2273) 2023-12-27 02:37:21 +08:00			`prompts += _read_prompts(filename)`
[BugFix] Fix input positions for long context with sliding window (#2088) 2023-12-13 12:28:13 -08:00			`return prompts`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00

			`_STR_DTYPE_TO_TORCH_DTYPE = {`
			`"half": torch.half,`
			`"bfloat16": torch.bfloat16,`
			`"float": torch.float,`
			`}`

[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`_VISION_LANGUAGE_MODELS = {`
			`"llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,`
			`}`

Add tests for models (#922) 2023-09-01 11:19:43 +09:00
			`class HfRunner:`

			`def __init__(`
			`self,`
			`model_name: str,`
			`tokenizer_name: Optional[str] = None,`
			`dtype: str = "half",`
			`) -> None:`
			`assert dtype in _STR_DTYPE_TO_TORCH_DTYPE`
			`torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`self.model_name = model_name`
			`if model_name not in _VISION_LANGUAGE_MODELS:`
			`self.model = AutoModelForCausalLM.from_pretrained(`
			`model_name,`
			`torch_dtype=torch_dtype,`
			`trust_remote_code=True,`
			`).cuda()`
			`self.processor = None`
			`else:`
			`self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(`
			`model_name,`
			`torch_dtype=torch_dtype,`
			`trust_remote_code=True,`
			`).cuda()`
			`self.processor = AutoProcessor.from_pretrained(`
			`model_name,`
			`torch_dtype=torch_dtype,`
			`)`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`if tokenizer_name is None:`
			`tokenizer_name = model_name`
			`self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)`

			`def generate(`
			`self,`
			`prompts: List[str],`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`images: Optional[List[Image.Image]] = None,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`**kwargs,`
			`) -> List[Tuple[List[int], str]]:`
			`outputs: List[Tuple[List[int], str]] = []`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`if images:`
			`assert len(prompts) == len(images)`
			`for i, prompt in enumerate(prompts):`
			`if self.model_name not in _VISION_LANGUAGE_MODELS:`
			`input_ids = self.tokenizer(prompt,`
			`return_tensors="pt").input_ids`
			`inputs = {"input_ids": input_ids.cuda()}`
			`else:`
			`image = images[i] if images else None`
			`inputs = self.processor(text=prompt,`
			`images=image,`
			`return_tensors="pt")`
			`inputs = {`
			`key: value.cuda() if value is not None else None`
			`for key, value in inputs.items()`
			`}`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`output_ids = self.model.generate(`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`**inputs,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`use_cache=True,`
			`**kwargs,`
			`)`
			`output_str = self.tokenizer.batch_decode(`
			`output_ids,`
			`skip_special_tokens=True,`
			`clean_up_tokenization_spaces=False,`
Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00			`)`
			`output_ids = output_ids.cpu().tolist()`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`outputs.append((output_ids, output_str))`
			`return outputs`

			`def generate_greedy(`
			`self,`
			`prompts: List[str],`
			`max_tokens: int,`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`images: Optional["torch.Tensor"] = None,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`) -> List[Tuple[List[int], str]]:`
Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00			`outputs = self.generate(prompts,`
			`do_sample=False,`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`max_new_tokens=max_tokens,`
			`images=images)`
Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00			`for i in range(len(outputs)):`
			`output_ids, output_str = outputs[i]`
			`outputs[i] = (output_ids[0], output_str[0])`
			`return outputs`

			`def generate_beam_search(`
			`self,`
			`prompts: List[str],`
			`beam_width: int,`
			`max_tokens: int,`
			`) -> List[Tuple[List[int], str]]:`
			`outputs = self.generate(prompts,`
			`do_sample=False,`
			`max_new_tokens=max_tokens,`
			`num_beams=beam_width,`
			`num_return_sequences=beam_width)`
			`for i in range(len(outputs)):`
			`output_ids, output_str = outputs[i]`
			`for j in range(len(output_ids)):`
			`output_ids[j] = [`
			`x for x in output_ids[j]`
			`if x != self.tokenizer.pad_token_id`
			`]`
			`outputs[i] = (output_ids, output_str)`
			`return outputs`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00
Implement prompt logprobs & Batched topk for computing logprobs (#1328) Co-authored-by: Yunmo Chen <16273544+wanmok@users.noreply.github.com> 2023-10-16 10:56:50 -07:00			`def generate_greedy_logprobs(`
			`self,`
			`prompts: List[str],`
			`max_tokens: int,`
			`) -> List[List[torch.Tensor]]:`
			`all_logprobs = []`
			`for prompt in prompts:`
			`input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids`
			`output = self.model.generate(`
			`input_ids.cuda(),`
			`use_cache=True,`
			`do_sample=False,`
			`max_new_tokens=max_tokens,`
			`output_hidden_states=True,`
			`return_dict_in_generate=True,`
			`)`
			`seq_logprobs = []`
			`for hidden_states in output.hidden_states:`
			`last_hidden_states = hidden_states[-1][0]`
			`logits = torch.matmul(`
			`last_hidden_states,`
			`self.model.get_output_embeddings().weight.t(),`
			`)`
			`if self.model.get_output_embeddings().bias is not None:`
			`logits += self.model.get_output_embeddings(`
			`).bias.unsqueeze(0)`
			`logprobs = torch.nn.functional.log_softmax(logits,`
			`dim=-1,`
			`dtype=torch.float32)`
			`seq_logprobs.append(logprobs)`
			`all_logprobs.append(seq_logprobs)`
			`return all_logprobs`

[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`def __del__(self):`
			`del self.model`
			`cleanup()`

Add tests for models (#922) 2023-09-01 11:19:43 +09:00
			`@pytest.fixture`
			`def hf_runner():`
			`return HfRunner`


			`class VllmRunner:`

			`def __init__(`
			`self,`
			`model_name: str,`
			`tokenizer_name: Optional[str] = None,`
[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`# Use smaller max model length, otherwise bigger model cannot run due`
			`# to kv cache size limit.`
			`max_model_len=1024,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`dtype: str = "half",`
Fix `vllm:prompt_tokens_total` metric calculation (#2869) 2024-02-19 09:55:41 +02:00			`disable_log_stats: bool = True,`
[Test] Add basic correctness test (#2908) 2024-02-18 16:44:50 -08:00			`tensor_parallel_size: int = 1,`
[2/N] Chunked prefill data update (#3538) 2024-03-29 02:06:01 +09:00			`block_size: int = 16,`
			`enable_chunked_prefill: bool = False,`
[Bug fix][Core] assert num_new_tokens == 1 fails when SamplingParams.n is not 1 and max_tokens is large & Add tests for preemption (#4451) 2024-05-02 11:24:13 +09:00			`swap_space=4,`
Port metrics from `aioprometheus` to `prometheus_client` (#2730) 2024-02-25 19:54:00 +00:00			`**kwargs,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`) -> None:`
			`self.model = LLM(`
			`model=model_name,`
			`tokenizer=tokenizer_name,`
			`trust_remote_code=True,`
			`dtype=dtype,`
[Bug fix][Core] assert num_new_tokens == 1 fails when SamplingParams.n is not 1 and max_tokens is large & Add tests for preemption (#4451) 2024-05-02 11:24:13 +09:00			`swap_space=swap_space,`
Fix `vllm:prompt_tokens_total` metric calculation (#2869) 2024-02-19 09:55:41 +02:00			`disable_log_stats=disable_log_stats,`
[Test] Add basic correctness test (#2908) 2024-02-18 16:44:50 -08:00			`tensor_parallel_size=tensor_parallel_size,`
[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`max_model_len=max_model_len,`
[2/N] Chunked prefill data update (#3538) 2024-03-29 02:06:01 +09:00			`block_size=block_size,`
			`enable_chunked_prefill=enable_chunked_prefill,`
Port metrics from `aioprometheus` to `prometheus_client` (#2730) 2024-02-25 19:54:00 +00:00			`**kwargs,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`)`

			`def generate(`
			`self,`
			`prompts: List[str],`
			`sampling_params: SamplingParams,`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`images: Optional["torch.Tensor"] = None,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`) -> List[Tuple[List[int], str]]:`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`if images is not None:`
			`assert len(prompts) == images.shape[0]`
			`req_outputs = self.model.generate(`
			`prompts,`
			`sampling_params=sampling_params,`
			`multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,`
			`data=images)`
			`if images is not None else None)`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`outputs = []`
			`for req_output in req_outputs:`
			`prompt_str = req_output.prompt`
			`prompt_ids = req_output.prompt_token_ids`
Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00			`req_sample_output_ids = []`
			`req_sample_output_strs = []`
			`for sample in req_output.outputs:`
			`output_str = sample.text`
			`output_ids = sample.token_ids`
			`req_sample_output_ids.append(prompt_ids + output_ids)`
			`req_sample_output_strs.append(prompt_str + output_str)`
			`outputs.append((req_sample_output_ids, req_sample_output_strs))`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`return outputs`

Integrate Marlin Kernels for Int4 GPTQ inference (#2497) Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com> Co-authored-by: alexm <alexm@neuralmagic.com> 2024-03-01 14:47:51 -06:00			`def generate_w_logprobs(`
			`self,`
			`prompts: List[str],`
			`sampling_params: SamplingParams,`
			`) -> List[Tuple[List[int], str]]:`
			`assert sampling_params.logprobs is not None`

			`req_outputs = self.model.generate(prompts,`
			`sampling_params=sampling_params)`
			`outputs = []`
			`for req_output in req_outputs:`
			`for sample in req_output.outputs:`
			`output_str = sample.text`
			`output_ids = sample.token_ids`
			`output_logprobs = sample.logprobs`
			`outputs.append((output_ids, output_str, output_logprobs))`
			`return outputs`

Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`def generate_greedy(`
			`self,`
			`prompts: List[str],`
			`max_tokens: int,`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`images: Optional[torch.Tensor] = None,`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`) -> List[Tuple[List[int], str]]:`
			`greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)`
[Feature] Add vision language model support. (#3042) 2024-03-25 14:16:30 -07:00			`outputs = self.generate(prompts, greedy_params, images=images)`
Use queue for finished requests (#957) 2023-09-05 19:27:23 -07:00			`return [(output_ids[0], output_str[0])`
			`for output_ids, output_str in outputs]`
Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00
Integrate Marlin Kernels for Int4 GPTQ inference (#2497) Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com> Co-authored-by: alexm <alexm@neuralmagic.com> 2024-03-01 14:47:51 -06:00			`def generate_greedy_logprobs(`
			`self,`
			`prompts: List[str],`
			`max_tokens: int,`
			`num_logprobs: int,`
			`) -> List[Tuple[List[int], str]]:`
			`greedy_logprobs_params = SamplingParams(temperature=0.0,`
			`max_tokens=max_tokens,`
			`logprobs=num_logprobs)`
			`outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)`

			`return [(output_ids, output_str, output_logprobs)`
			`for output_ids, output_str, output_logprobs in outputs]`

Align vLLM's beam search implementation with HF generate (#857) 2023-09-04 17:29:42 -07:00			`def generate_beam_search(`
			`self,`
			`prompts: List[str],`
			`beam_width: int,`
			`max_tokens: int,`
			`) -> List[Tuple[List[int], str]]:`
			`beam_search_params = SamplingParams(n=beam_width,`
			`use_beam_search=True,`
			`temperature=0.0,`
			`max_tokens=max_tokens)`
			`outputs = self.generate(prompts, beam_search_params)`
			`return outputs`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00
[Test] Make model tests run again and remove --forked from pytest (#3631) Co-authored-by: Simon Mo <simon.mo@hey.com> 2024-03-29 13:06:40 +09:00			`def __del__(self):`
			`del self.model`
			`cleanup()`

Add tests for models (#922) 2023-09-01 11:19:43 +09:00
[BugFix] Fix handling of stop strings and stop token ids (#3672) 2024-04-11 23:34:12 +01:00			`@pytest.fixture(scope="session")`
Add tests for models (#922) 2023-09-01 11:19:43 +09:00			`def vllm_runner():`
			`return VllmRunner`
Asynchronous tokenization (#2879) 2024-03-15 16:37:01 -07:00

			`def get_tokenizer_pool_config(tokenizer_group_type):`
			`if tokenizer_group_type is None:`
			`return None`
			`if tokenizer_group_type == "ray":`
			`return TokenizerPoolConfig(pool_size=1,`
			`pool_type="ray",`
			`extra_config={})`
			`raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")`