# vllm/tests/test_config.py

import pytest

from vllm.config import ModelConfig


@pytest.mark.parametrize(("model_id", "expected_task"), [
    ("facebook/opt-125m", "generate"),
    ("intfloat/e5-mistral-7b-instruct", "embedding"),
])
def test_auto_task(model_id, expected_task):
    """Build a ModelConfig with ``task="auto"`` and check its ``task``."""
    init_kwargs = {
        "task": "auto",
        "tokenizer": model_id,
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "seed": 0,
        "dtype": "float16",
    }
    resolved_task = ModelConfig(model_id, **init_kwargs).task

    assert resolved_task == expected_task


@pytest.mark.parametrize(("model_id", "bad_task"), [
    ("facebook/opt-125m", "embedding"),
    ("intfloat/e5-mistral-7b-instruct", "generate"),
])
def test_incorrect_task(model_id, bad_task):
    """Expect ModelConfig to raise ValueError for a mismatched task."""
    init_kwargs = {
        "task": bad_task,
        "tokenizer": model_id,
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "seed": 0,
        "dtype": "float16",
    }

    # Only the constructor call sits inside the context manager, so the
    # expected error can only come from ModelConfig itself.
    with pytest.raises(ValueError, match=r"does not support the .* task"):
        ModelConfig(model_id, **init_kwargs)


# (model id, expected ``max_model_len``) pairs used to parametrize
# test_disable_sliding_window, which builds each ModelConfig with
# ``disable_sliding_window=True``.
MODEL_IDS_EXPECTED = [
    ("Qwen/Qwen1.5-7B", 32_768),
    ("mistralai/Mistral-7B-v0.1", 4_096),
    ("mistralai/Mistral-7B-Instruct-v0.2", 32_768),
]


@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)
def test_disable_sliding_window(model_id_expected):
    """Check ``max_model_len`` when ``disable_sliding_window=True``.

    ``model_id_expected`` is a ``(model_id, expected_max_model_len)`` pair
    taken from ``MODEL_IDS_EXPECTED``.
    """
    model_id, expected_max_len = model_id_expected
    init_kwargs = {
        "task": "auto",
        "tokenizer": model_id,
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "seed": 0,
        "dtype": "float16",
        "revision": None,
        "disable_sliding_window": True,
    }
    observed_max_len = ModelConfig(model_id, **init_kwargs).max_model_len
    assert observed_max_len == expected_max_len


def test_get_sliding_window():
    """Check ``get_sliding_window()`` after editing ``hf_config`` fields.

    Covers a Qwen1.5 config (toggled through ``use_sliding_window``) and a
    Mistral config (toggled through ``sliding_window`` being ``None`` or set).
    """
    window = 4096
    shared_kwargs = {
        "task": "auto",
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "seed": 0,
        "dtype": "float16",
        "revision": None,
    }

    # For Qwen1.5/Qwen2, get_sliding_window() should be None
    # when use_sliding_window is False.
    qwen_id = "Qwen/Qwen1.5-7B"
    qwen_config = ModelConfig(qwen_id, tokenizer=qwen_id, **shared_kwargs)

    qwen_config.hf_config.use_sliding_window = False
    qwen_config.hf_config.sliding_window = window
    assert qwen_config.get_sliding_window() is None

    # Same window value, but now with the flag switched on.
    qwen_config.hf_config.use_sliding_window = True
    assert qwen_config.get_sliding_window() == window

    mistral_id = "mistralai/Mistral-7B-v0.1"
    mistral_config = ModelConfig(mistral_id,
                                 tokenizer=mistral_id,
                                 **shared_kwargs)

    # No window configured on the HF config -> None is expected back.
    mistral_config.hf_config.sliding_window = None
    assert mistral_config.get_sliding_window() is None

    mistral_config.hf_config.sliding_window = window
    assert mistral_config.get_sliding_window() == window


def test_rope_customization():
    """Check that ``rope_scaling`` / ``rope_theta`` overrides reach the config.

    Each model is built once with defaults and once with overrides; the
    resulting ``hf_config`` fields and ``max_model_len`` are asserted.
    """
    custom_scaling = {"rope_type": "dynamic", "factor": 2.0}
    custom_theta = 16_000_000.0
    longchat_default_scaling = {"rope_type": "linear", "factor": 8.0}

    def build_config(model_id, **overrides):
        # Every config in this test shares these settings; only the RoPE
        # overrides differ between the calls.
        return ModelConfig(
            model_id,
            task="auto",
            tokenizer=model_id,
            tokenizer_mode="auto",
            trust_remote_code=False,
            dtype="float16",
            seed=0,
            **overrides,
        )

    llama_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    longchat_id = "lmsys/longchat-13b-16k"

    # Llama 3 without overrides.
    llama_default = build_config(llama_id)
    assert getattr(llama_default.hf_config, "rope_scaling", None) is None
    assert getattr(llama_default.hf_config, "rope_theta", None) == 500_000
    assert llama_default.max_model_len == 8192

    # Llama 3 with both scaling and theta overridden.
    llama_custom = build_config(
        llama_id,
        rope_scaling=custom_scaling,
        rope_theta=custom_theta,
    )
    llama_custom_hf = llama_custom.hf_config
    assert getattr(llama_custom_hf, "rope_scaling", None) == custom_scaling
    assert getattr(llama_custom_hf, "rope_theta", None) == custom_theta
    assert llama_custom.max_model_len == 16384

    # LongChat without overrides.
    longchat_default = build_config(longchat_id)
    # Check if longchat_default_scaling entries are in the loaded config
    for key, value in longchat_default_scaling.items():
        assert longchat_default.hf_config.rope_scaling.get(key) == value
    assert longchat_default.max_model_len == 16384

    # LongChat with the scaling overridden.
    longchat_custom = build_config(longchat_id, rope_scaling=custom_scaling)
    assert getattr(longchat_custom.hf_config, "rope_scaling",
                   None) == custom_scaling
    assert longchat_custom.max_model_len == 4096


@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
    ("facebook/opt-125m", False),
    ("facebook/bart-base", True),
    ("meta-llama/Llama-3.2-1B", False),
    ("meta-llama/Llama-3.2-11B-Vision", True),
])
def test_is_encoder_decoder(model_id, is_encoder_decoder):
    """Compare ``ModelConfig.is_encoder_decoder`` with the expected flag."""
    init_kwargs = {
        "task": "auto",
        "tokenizer": model_id,
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "dtype": "float16",
        "seed": 0,
    }
    observed_flag = ModelConfig(model_id, **init_kwargs).is_encoder_decoder

    assert observed_flag == is_encoder_decoder


@pytest.mark.parametrize(("model_id", "uses_mrope"), [
    ("facebook/opt-125m", False),
    ("Qwen/Qwen2-VL-2B-Instruct", True),
])
def test_uses_mrope(model_id, uses_mrope):
    """Compare ``ModelConfig.uses_mrope`` with the expected flag."""
    init_kwargs = {
        "task": "auto",
        "tokenizer": model_id,
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "dtype": "float16",
        "seed": 0,
    }
    observed_flag = ModelConfig(model_id, **init_kwargs).uses_mrope

    assert observed_flag == uses_mrope
[Bugfix / Core] Prefix Caching Guards (merged with main) (#4846) Co-authored-by: rsnm2 <rshaw@neuralmagic.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> 2024-05-27 15:18:17 -07:00			`import pytest`

Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373) Co-authored-by: Cade Daniel <edacih@gmail.com> 2024-03-15 04:56:57 +08:00			`from vllm.config import ModelConfig`

[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00
			`@pytest.mark.parametrize(("model_id", "expected_task"), [`
			`("facebook/opt-125m", "generate"),`
			`("intfloat/e5-mistral-7b-instruct", "embedding"),`
			`])`
			`def test_auto_task(model_id, expected_task):`
			`config = ModelConfig(`
			`model_id,`
			`task="auto",`
			`tokenizer=model_id,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="float16",`
			`)`

			`assert config.task == expected_task`


			`@pytest.mark.parametrize(("model_id", "bad_task"), [`
			`("facebook/opt-125m", "embedding"),`
			`("intfloat/e5-mistral-7b-instruct", "generate"),`
			`])`
			`def test_incorrect_task(model_id, bad_task):`
			`with pytest.raises(ValueError, match=r"does not support the .* task"):`
			`ModelConfig(`
			`model_id,`
			`task=bad_task,`
			`tokenizer=model_id,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="float16",`
			`)`


[Bugfix / Core] Prefix Caching Guards (merged with main) (#4846) Co-authored-by: rsnm2 <rshaw@neuralmagic.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> 2024-05-27 15:18:17 -07:00			`MODEL_IDS_EXPECTED = [`
			`("Qwen/Qwen1.5-7B", 32768),`
			`("mistralai/Mistral-7B-v0.1", 4096),`
			`("mistralai/Mistral-7B-Instruct-v0.2", 32768),`
			`]`


			`@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)`
			`def test_disable_sliding_window(model_id_expected):`
			`model_id, expected = model_id_expected`
			`model_config = ModelConfig(`
			`model_id,`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer=model_id,`
[Bugfix / Core] Prefix Caching Guards (merged with main) (#4846) Co-authored-by: rsnm2 <rshaw@neuralmagic.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> 2024-05-27 15:18:17 -07:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="float16",`
			`revision=None,`
			`disable_sliding_window=True,`
			`)`
			`assert model_config.max_model_len == expected`

Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373) Co-authored-by: Cade Daniel <edacih@gmail.com> 2024-03-15 04:56:57 +08:00
			`def test_get_sliding_window():`
			`TEST_SLIDING_WINDOW = 4096`
			`# Test that the sliding window is correctly computed.`
			`# For Qwen1.5/Qwen2, get_sliding_window() should be None`
			`# when use_sliding_window is False.`
			`qwen2_model_config = ModelConfig(`
			`"Qwen/Qwen1.5-7B",`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer="Qwen/Qwen1.5-7B",`
Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373) Co-authored-by: Cade Daniel <edacih@gmail.com> 2024-03-15 04:56:57 +08:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="float16",`
			`revision=None,`
			`)`

			`qwen2_model_config.hf_config.use_sliding_window = False`
			`qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW`
			`assert qwen2_model_config.get_sliding_window() is None`

			`qwen2_model_config.hf_config.use_sliding_window = True`
			`assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW`

			`mistral_model_config = ModelConfig(`
			`"mistralai/Mistral-7B-v0.1",`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer="mistralai/Mistral-7B-v0.1",`
Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373) Co-authored-by: Cade Daniel <edacih@gmail.com> 2024-03-15 04:56:57 +08:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`seed=0,`
			`dtype="float16",`
			`revision=None,`
			`)`
			`mistral_model_config.hf_config.sliding_window = None`
			`assert mistral_model_config.get_sliding_window() is None`

			`mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00			`assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW`


[Frontend] Customizable RoPE theta (#5197) 2024-06-11 17:42:26 +00:00			`def test_rope_customization():`
[Misc] Standardize RoPE handling for Qwen2-VL (#9250) 2024-10-16 13:56:17 +08:00			`TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}`
[Frontend] Customizable RoPE theta (#5197) 2024-06-11 17:42:26 +00:00			`TEST_ROPE_THETA = 16_000_000.0`
[Misc] Standardize RoPE handling for Qwen2-VL (#9250) 2024-10-16 13:56:17 +08:00			`LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00
			`llama_model_config = ModelConfig(`
			`"meta-llama/Meta-Llama-3-8B-Instruct",`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`dtype="float16",`
			`seed=0,`
			`)`
			`assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None`
[Frontend] Customizable RoPE theta (#5197) 2024-06-11 17:42:26 +00:00			`assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00			`assert llama_model_config.max_model_len == 8192`

			`llama_model_config = ModelConfig(`
			`"meta-llama/Meta-Llama-3-8B-Instruct",`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`dtype="float16",`
			`seed=0,`
			`rope_scaling=TEST_ROPE_SCALING,`
[Frontend] Customizable RoPE theta (#5197) 2024-06-11 17:42:26 +00:00			`rope_theta=TEST_ROPE_THETA,`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00			`)`
			`assert getattr(llama_model_config.hf_config, "rope_scaling",`
			`None) == TEST_ROPE_SCALING`
[Frontend] Customizable RoPE theta (#5197) 2024-06-11 17:42:26 +00:00			`assert getattr(llama_model_config.hf_config, "rope_theta",`
			`None) == TEST_ROPE_THETA`
[Frontend] Dynamic RoPE scaling (#4638) 2024-05-22 05:32:35 +00:00			`assert llama_model_config.max_model_len == 16384`

[Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 16:22:16 -04:00			`longchat_model_config = ModelConfig(`
			`"lmsys/longchat-13b-16k",`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer="lmsys/longchat-13b-16k",`
[Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 16:22:16 -04:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`dtype="float16",`
			`seed=0,`
			`)`
			`# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config`
			`assert all(`
			`longchat_model_config.hf_config.rope_scaling.get(key) == value`
			`for key, value in LONGCHAT_ROPE_SCALING.items())`
			`assert longchat_model_config.max_model_len == 16384`

			`longchat_model_config = ModelConfig(`
			`"lmsys/longchat-13b-16k",`
[Model] Add user-configurable task for models that support both generation and embedding (#9424) 2024-10-19 02:31:58 +08:00			`task="auto",`
			`tokenizer="lmsys/longchat-13b-16k",`
[Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 16:22:16 -04:00			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`dtype="float16",`
			`seed=0,`
			`rope_scaling=TEST_ROPE_SCALING,`
			`)`
			`assert getattr(longchat_model_config.hf_config, "rope_scaling",`
			`None) == TEST_ROPE_SCALING`
			`assert longchat_model_config.max_model_len == 4096`
[Misc] Consolidate ModelConfig code related to HF config (#10104) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-11-07 14:00:21 +08:00

			`@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [`
			`("facebook/opt-125m", False),`
			`("facebook/bart-base", True),`
			`("meta-llama/Llama-3.2-1B", False),`
			`("meta-llama/Llama-3.2-11B-Vision", True),`
			`])`
			`def test_is_encoder_decoder(model_id, is_encoder_decoder):`
			`config = ModelConfig(`
			`model_id,`
			`task="auto",`
			`tokenizer=model_id,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`dtype="float16",`
			`seed=0,`
			`)`

			`assert config.is_encoder_decoder == is_encoder_decoder`


			`@pytest.mark.parametrize(("model_id", "uses_mrope"), [`
			`("facebook/opt-125m", False),`
			`("Qwen/Qwen2-VL-2B-Instruct", True),`
			`])`
			`def test_uses_mrope(model_id, uses_mrope):`
			`config = ModelConfig(`
			`model_id,`
			`task="auto",`
			`tokenizer=model_id,`
			`tokenizer_mode="auto",`
			`trust_remote_code=False,`
			`dtype="float16",`
			`seed=0,`
			`)`

			`assert config.uses_mrope == uses_mrope`