vllm/tests/models/decoder_only/language/test_mistral.py

"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.

Run `pytest tests/models/test_mistral.py`.
"""
import pytest

from vllm import SamplingParams

from ...utils import check_logprobs_close

MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.1",
]

MISTRAL_FORMAT_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    # uses the v3-Tekken tokenizer
    "mistralai/Ministral-8B-Instruct-2410",
    # Mistral-Nemo is to big for CI, but passes locally
    # "mistralai/Mistral-Nemo-Instruct-2407"
]

SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [
    "勇敢な船乗りについての詩を書く",  # japanese
    "寫一首關於勇敢的水手的詩",  # chinese
    "ပုံပြင်လေးပြောပြပါ်:\n",  # burmese
    "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n",  # see https://github.com/vllm-project/vllm/pull/9625
]

# for function calling
TOOLS = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type":
                    "string",
                    "description":
                    "The city to find the weather for, e.g. 'San Francisco'"
                },
                "state": {
                    "type":
                    "string",
                    "description":
                    "the two-letter abbreviation for the state that the city is"
                    " in, e.g. 'CA' which would mean 'California'"
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"]
                }
            },
            "required": ["city", "state", "unit"]
        }
    }
}]
MSGS = [{
    "role":
    "user",
    "content": ("Can you tell me what the temperate"
                " will be in Dallas, in fahrenheit?")
}]
EXPECTED_FUNC_CALL = (
    '[{"name": "get_current_weather", "arguments": '
    '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]')


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    # TODO(sang): Sliding window should be tested separately.
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model, dtype=dtype,
                     tokenizer_mode="mistral") as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="auto",
            load_format="safetensors",
            config_format="hf",
    ) as hf_format_model:
        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(
            model,
            dtype=dtype,
            tokenizer_mode="mistral",
            load_format="mistral",
            config_format="mistral",
    ) as mistral_format_model:
        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_format_outputs,
        outputs_1_lst=mistral_format_outputs,
        name_0="hf",
        name_1="mistral",
    )


@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    with vllm_runner(model,
                     dtype=dtype,
                     max_model_len=8192,
                     tokenizer_mode="mistral",
                     config_format="mistral",
                     load_format="mistral") as vllm_model:
        for prompt in SYMBOLIC_LANG_PROMPTS:
            msg = {"role": "user", "content": prompt}
            outputs = vllm_model.model.chat([msg],
                                            sampling_params=SAMPLING_PARAMS)
            assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()


@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("model",
                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
def test_mistral_function_calling(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    with vllm_runner(model,
                     dtype=dtype,
                     tokenizer_mode="mistral",
                     config_format="mistral",
                     load_format="mistral") as vllm_model:
        outputs = vllm_model.model.chat(MSGS,
                                        tools=TOOLS,
                                        sampling_params=SAMPLING_PARAMS)

        assert outputs[0].outputs[0].text.strip() == EXPECTED_FUNC_CALL
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
+								"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
-												[Test] Make model tests run again and remove --forked from pytest (#3631)

Co-authored-by: Simon Mo <simon.mo@hey.com>
											
										
										
											2024-03-29 13:06:40 +09:00
+								Run `pytest tests/models/test_mistral.py`.
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
+								"""
 								import pytest
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								from vllm import SamplingParams
-												[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-09-17 19:50:37 +02:00
-												[CI/Build] Reorganize models tests (#7820)


											
										
										
											2024-09-14 01:20:06 +08:00
+								from ...utils import check_logprobs_close
-												[CI] Make mistral tests pass (#4596)


											
										
										
											2024-05-09 00:44:35 +09:00
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
+								MODELS = [
 								    "mistralai/Mistral-7B-Instruct-v0.1",
 								]
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								MISTRAL_FORMAT_MODELS = [
 								    "mistralai/Mistral-7B-Instruct-v0.3",
-												[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
											
										
										
											2024-11-05 21:28:29 -07:00
+								    # uses the v3-Tekken tokenizer
 								    "mistralai/Ministral-8B-Instruct-2410",
 								    # Mistral-Nemo is to big for CI, but passes locally
 								    # "mistralai/Mistral-Nemo-Instruct-2407"
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								]
-												[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-09-17 19:50:37 +02:00
+								SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
-												[Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640)


											
										
										
											2024-09-20 23:33:03 +02:00
+								SYMBOLIC_LANG_PROMPTS = [
 								    "勇敢な船乗りについての詩を書く",  # japanese
 								    "寫一首關於勇敢的水手的詩",  # chinese
-												[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
											
										
										
											2024-11-05 21:28:29 -07:00
+								    "ပုံပြင်လေးပြောပြပါ်:\n",  # burmese
 								    "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n",  # see https://github.com/vllm-project/vllm/pull/9625
-												[Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640)


											
										
										
											2024-09-20 23:33:03 +02:00
+								]
-												[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-09-17 19:50:37 +02:00
 								# for function calling
 								TOOLS = [{
 								    "type": "function",
 								    "function": {
 								        "name": "get_current_weather",
 								        "description": "Get the current weather in a given location",
 								        "parameters": {
 								            "type": "object",
 								            "properties": {
 								                "city": {
 								                    "type":
 								                    "string",
 								                    "description":
 								                    "The city to find the weather for, e.g. 'San Francisco'"
 								                },
 								                "state": {
 								                    "type":
 								                    "string",
 								                    "description":
 								                    "the two-letter abbreviation for the state that the city is"
 								                    " in, e.g. 'CA' which would mean 'California'"
 								                },
 								                "unit": {
 								                    "type": "string",
 								                    "description": "The unit to fetch the temperature in",
 								                    "enum": ["celsius", "fahrenheit"]
 								                }
 								            },
 								            "required": ["city", "state", "unit"]
 								        }
 								    }
 								}]
 								MSGS = [{
 								    "role":
 								    "user",
 								    "content": ("Can you tell me what the temperate"
 								                " will be in Dallas, in fahrenheit?")
 								}]
 								EXPECTED_FUNC_CALL = (
 								    '[{"name": "get_current_weather", "arguments": '
 								    '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]')
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
 								@pytest.mark.parametrize("model", MODELS)
 								@pytest.mark.parametrize("dtype", ["bfloat16"])
-												[CI] Make mistral tests pass (#4596)


											
										
										
											2024-05-09 00:44:35 +09:00
+								@pytest.mark.parametrize("max_tokens", [64])
 								@pytest.mark.parametrize("num_logprobs", [5])
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
+								def test_models(
 								    hf_runner,
 								    vllm_runner,
-												[CI] Make mistral tests pass (#4596)


											
										
										
											2024-05-09 00:44:35 +09:00
+								    example_prompts,
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
+								    model: str,
 								    dtype: str,
 								    max_tokens: int,
-												[CI] Make mistral tests pass (#4596)


											
										
										
											2024-05-09 00:44:35 +09:00
+								    num_logprobs: int,
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
+								) -> None:
-												[CI] Make mistral tests pass (#4596)


											
										
										
											2024-05-09 00:44:35 +09:00
+								    # TODO(sang): Sliding window should be tested separately.
-												[CI/Test] improve robustness of test (hf_runner) (#5347)

[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)
											
										
										
											2024-06-07 22:31:32 -07:00
+								    with hf_runner(model, dtype=dtype) as hf_model:
 								        hf_outputs = hf_model.generate_greedy_logprobs_limit(
 								            example_prompts, max_tokens, num_logprobs)
-												[BugFix] Fix input positions for long context with sliding window (#2088)


											
										
										
											2023-12-13 12:28:13 -08:00
-												[Model] Add Mistral Tokenization to improve robustness and chat encoding (#7739)


											
										
										
											2024-08-27 14:40:02 +02:00
+								    with vllm_runner(model, dtype=dtype,
 								                     tokenizer_mode="mistral") as vllm_model:
-												[CI/Test] improve robustness of test (vllm_runner) (#5357)

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
											
										
										
											2024-06-08 01:59:20 -07:00
+								        vllm_outputs = vllm_model.generate_greedy_logprobs(
 								            example_prompts, max_tokens, num_logprobs)
-												[Model] Add Mistral Tokenization to improve robustness and chat encoding (#7739)


											
										
										
											2024-08-27 14:40:02 +02:00
-												[CI] Make mistral tests pass (#4596)


											
										
										
											2024-05-09 00:44:35 +09:00
+								    check_logprobs_close(
 								        outputs_0_lst=hf_outputs,
 								        outputs_1_lst=vllm_outputs,
 								        name_0="hf",
 								        name_1="vllm",
 								    )
-												[Model] Allow loading from original Mistral format (#8168)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
											
										
										
											2024-09-07 01:02:05 +02:00
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
-												[Model] Allow loading from original Mistral format (#8168)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
											
										
										
											2024-09-07 01:02:05 +02:00
+								@pytest.mark.parametrize("dtype", ["bfloat16"])
 								@pytest.mark.parametrize("max_tokens", [64])
 								@pytest.mark.parametrize("num_logprobs", [5])
 								def test_mistral_format(
 								    vllm_runner,
 								    example_prompts,
 								    model: str,
 								    dtype: str,
 								    max_tokens: int,
 								    num_logprobs: int,
 								) -> None:
 								    with vllm_runner(
 								            model,
 								            dtype=dtype,
 								            tokenizer_mode="auto",
 								            load_format="safetensors",
 								            config_format="hf",
 								    ) as hf_format_model:
 								        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
 								            example_prompts, max_tokens, num_logprobs)
 								    with vllm_runner(
 								            model,
 								            dtype=dtype,
 								            tokenizer_mode="mistral",
 								            load_format="mistral",
 								            config_format="mistral",
 								    ) as mistral_format_model:
 								        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
 								            example_prompts, max_tokens, num_logprobs)
 								    check_logprobs_close(
 								        outputs_0_lst=hf_format_outputs,
 								        outputs_1_lst=mistral_format_outputs,
 								        name_0="hf",
 								        name_1="mistral",
 								    )
-												[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-09-17 19:50:37 +02:00
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
-												[Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640)


											
										
										
											2024-09-20 23:33:03 +02:00
+								@pytest.mark.parametrize("dtype", ["bfloat16"])
 								def test_mistral_symbolic_languages(
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								    vllm_runner,
-												[Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640)


											
										
										
											2024-09-20 23:33:03 +02:00
+								    model: str,
 								    dtype: str,
 								) -> None:
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								    with vllm_runner(model,
 								                     dtype=dtype,
 								                     max_model_len=8192,
 								                     tokenizer_mode="mistral",
 								                     config_format="mistral",
 								                     load_format="mistral") as vllm_model:
 								        for prompt in SYMBOLIC_LANG_PROMPTS:
 								            msg = {"role": "user", "content": prompt}
 								            outputs = vllm_model.model.chat([msg],
 								                                            sampling_params=SAMPLING_PARAMS)
 								            assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
-												[Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640)


											
										
										
											2024-09-20 23:33:03 +02:00
-												[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-09-17 19:50:37 +02:00
+								@pytest.mark.parametrize("dtype", ["bfloat16"])
-												[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
											
										
										
											2024-11-05 16:02:23 -05:00
+								@pytest.mark.parametrize("model",
 								                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
-												[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515)

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-09-17 19:50:37 +02:00
+								def test_mistral_function_calling(
 								    vllm_runner,
 								    model: str,
 								    dtype: str,
 								) -> None:
 								    with vllm_runner(model,
 								                     dtype=dtype,
 								                     tokenizer_mode="mistral",
 								                     config_format="mistral",
 								                     load_format="mistral") as vllm_model:
 								        outputs = vllm_model.model.chat(MSGS,
 								                                        tools=TOOLS,
 								                                        sampling_params=SAMPLING_PARAMS)
 								        assert outputs[0].outputs[0].text.strip() == EXPECTED_FUNC_CALL