vllm/tests/lora/test_llama_tp.py

# SPDX-License-Identifier: Apache-2.0

import pytest
import ray

import vllm
from vllm.lora.request import LoRARequest

from ..utils import create_new_process_for_each_test, multi_gpu_test

# Base model identifier passed as the first argument to vllm.LLM by every
# test in this module.
MODEL_PATH = "meta-llama/Llama-2-7b-hf"

# Completions that generate_and_test expects do_sample to return when it is
# called with lora_id=0 (no LoRA request attached). There is one entry per
# prompt in do_sample, and the lists are compared with ==, so both the text
# and the position of every entry must match exactly.
EXPECTED_NO_LORA_OUTPUT = [
    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
    "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
]
# Completions that generate_and_test expects do_sample to return when a LoRA
# request is attached (lora_id=1 and lora_id=2 are both checked against this
# same list). There is one entry per prompt in do_sample; the comparison is
# an exact list equality, so leading/trailing spaces are significant.
EXPECTED_LORA_OUTPUT = [
    "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
]


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Generate completions for the fixed SQL prompts and return their texts.

    Args:
        llm: Engine used to run generation.
        lora_path: Path handed to ``LoRARequest`` when an adapter is used.
        lora_id: Adapter id. A truthy value attaches a ``LoRARequest`` named
            ``str(lora_id)``; a falsy value (e.g. 0) sends no LoRA request.

    Returns:
        The text of the first output of every result, in the order the
        results come back from ``llm.generate``.
    """
    sql_prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]
    sampling = vllm.SamplingParams(temperature=0,
                                   max_tokens=256,
                                   stop=["[/assistant]"])

    # Only build an adapter request when a non-zero id was asked for.
    if lora_id:
        adapter_request = LoRARequest(str(lora_id), lora_id, lora_path)
    else:
        adapter_request = None

    results = llm.generate(sql_prompts,
                           sampling,
                           lora_request=adapter_request)

    # Collect the completions, echoing each prompt/completion pair so it
    # shows up in the test log.
    completions: list[str] = []
    for result in results:
        shown_prompt = result.prompt
        completion = result.outputs[0].text
        completions.append(completion)
        print(f"Prompt: {shown_prompt!r}, Generated text: {completion!r}")
    return completions


def generate_and_test(llm, sql_lora_files):
    """Assert exact do_sample output while alternating LoRA on and off.

    The steps run in this order: no LoRA, LoRA id 1, no LoRA again, LoRA
    id 2. Each step prints its banner and then requires the completions to
    equal the matching expected list.
    """
    # (banner printed before the step, lora_id to sample with, expected list)
    steps = (
        ("lora adapter created", 0, EXPECTED_NO_LORA_OUTPUT),
        ("lora 1", 1, EXPECTED_LORA_OUTPUT),
        ("no lora", 0, EXPECTED_NO_LORA_OUTPUT),
        ("lora 2", 2, EXPECTED_LORA_OUTPUT),
    )
    for banner, adapter_id, expected in steps:
        print(banner)
        assert do_sample(llm, sql_lora_files, lora_id=adapter_id) == expected

    print("removing lora")


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse wrapper that requests ``run_with_both_engines_lora``.

    Being autouse, it applies that fixture to every test in this module so
    each test runs with both engines. The body intentionally does nothing.
    This could be promoted to conftest.py to cover every test in a package.
    """


# Skipped on V1: this test currently fails there due to numerics.
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):
    """Run the LoRA / no-LoRA output checks with tensor_parallel_size=1."""
    engine_options = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 1,
        "enable_chunked_prefill": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)


# Skipped on V1: V1 does not yet offer a good way to expose the
# num_gpu_blocks used by the engine.
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_llama_lora_warmup(sql_lora_files):
    """Check that enabling LoRA leaves fewer GPU blocks for the KV cache.

    Two engines are built, one with ``enable_lora=True`` and one without,
    each inside its own Ray task that requests one GPU. The LoRA-enabled
    engine must report strictly fewer ``num_gpu_blocks``.
    """

    @ray.remote(num_gpus=1)
    def get_num_gpu_blocks_lora():
        engine = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
        return engine.llm_engine.cache_config.num_gpu_blocks

    @ray.remote(num_gpus=1)
    def get_num_gpu_blocks_no_lora():
        engine = vllm.LLM(MODEL_PATH, max_num_seqs=16)
        return engine.llm_engine.cache_config.num_gpu_blocks

    # Run the tasks one after the other: the first result is fetched before
    # the second task is even submitted.
    blocks_with_lora = ray.get(get_num_gpu_blocks_lora.remote())
    blocks_without_lora = ray.get(get_num_gpu_blocks_no_lora.remote())

    assert blocks_with_lora < blocks_without_lora, (
        "The warmup with lora should be more "
        "conservative than without lora, therefore the number of "
        "memory blocks for the KV cache should be "
        "less when using lora than when not using lora")


# Skipped on V1: this test currently fails there due to numerics.
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):
    """Run the LoRA / no-LoRA output checks with tensor_parallel_size=4."""
    engine_options = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "enable_chunked_prefill": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
    """Run the output checks with tensor_parallel_size=4 and
    fully_sharded_loras=True."""
    engine_options = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": True,
        "enable_chunked_prefill": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
    """Run the output checks with tensor_parallel_size=4,
    fully_sharded_loras=True and enable_lora_bias=True."""
    engine_options = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": True,
        "enable_lora_bias": True,
        "enable_chunked_prefill": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)
-												[Misc] Add SPDX-License-Identifier headers to python source files (#12628)

- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files
    
    This commit adds SPDX license headers to python source files as
    recommended to the project by the Linux Foundation. These headers
    provide a concise way that is both human and machine readable for
    communicating license information for each source file. It helps
    avoid any ambiguity about the license of the code and can also be
    easily used by tools to help manage license compliance.

    The Linux Foundation runs license scans against the codebase to help
    ensure we are in compliance with the licenses of the code we use,
    including dependencies. Having these headers in place helps that tool
    do its job.
    
    More information can be found on the SPDX site:
    
    - https://spdx.dev/learn/handling-license-info/
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
											
										
										
											2025-02-02 14:58:18 -05:00
+								# SPDX-License-Identifier: Apache-2.0
-												[V1] LoRA Support (#10957)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
											
										
										
											2025-02-06 23:02:51 +05:30
+								import pytest
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								import ray
 								import vllm
 								from vllm.lora.request import LoRARequest
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								from ..utils import create_new_process_for_each_test, multi_gpu_test
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 								EXPECTED_NO_LORA_OUTPUT = [
 								    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
 								    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
 								    "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
 								    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
 								    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
 								    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
 								]
 								EXPECTED_LORA_OUTPUT = [
 								    "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
 								    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
 								    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
 								    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
 								    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
 								    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
 								]
-												Update deprecated Python 3.8 typing (#13971)


											
										
										
											2025-03-03 01:34:51 +00:00
+								def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    prompts = [
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
 								    ]
 								    sampling_params = vllm.SamplingParams(temperature=0,
 								                                          max_tokens=256,
 								                                          stop=["[/assistant]"])
 								    outputs = llm.generate(
 								        prompts,
 								        sampling_params,
 								        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
 								        if lora_id else None)
 								    # Print the outputs.
-												Update deprecated Python 3.8 typing (#13971)


											
										
										
											2025-03-03 01:34:51 +00:00
+								    generated_texts: list[str] = []
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    for output in outputs:
 								        prompt = output.prompt
 								        generated_text = output.outputs[0].text
 								        generated_texts.append(generated_text)
 								        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 								    return generated_texts
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								def generate_and_test(llm, sql_lora_files):
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    print("lora adapter created")
 								    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 								    print("lora 1")
 								    assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
 								    print("no lora")
 								    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 								    print("lora 2")
 								    assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
 								    print("removing lora")
-												[V1] LoRA Support (#10957)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
											
										
										
											2025-02-06 23:02:51 +05:30
+								@pytest.fixture(autouse=True)
 								def v1(run_with_both_engines_lora):
 								    # Simple autouse wrapper to run both engines for each test
 								    # This can be promoted up to conftest.py to run for every
 								    # test in a package
 								    pass
-												[V1] V1 Enablement Oracle  (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
											
										
										
											2025-03-15 01:02:20 -04:00
+								# V1 Test: Failing due to numerics on V1.
 								@pytest.mark.skip_v1
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								def test_llama_lora(sql_lora_files):
 								    llm = vllm.LLM(MODEL_PATH,
 								                   enable_lora=True,
 								                   max_num_seqs=16,
 								                   max_loras=4,
-												[Misc] LoRA + Chunked Prefill (#9057)


											
										
										
											2024-12-10 21:09:20 -05:00
+								                   tensor_parallel_size=1,
 								                   enable_chunked_prefill=True)
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[V1] LoRA Support (#10957)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
											
										
										
											2025-02-06 23:02:51 +05:30
+								# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
 								# used by the engine yet.
 								@pytest.mark.skip_v1
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								def test_llama_lora_warmup(sql_lora_files):
 								    """Test that the LLM initialization works with a warmup LORA path and
 								    is more conservative"""
 								    @ray.remote(num_gpus=1)
 								    def get_num_gpu_blocks_lora():
 								        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
 								        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
 								        return num_gpu_blocks_lora_warmup
 								    @ray.remote(num_gpus=1)
 								    def get_num_gpu_blocks_no_lora():
 								        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
 								        num_gpu_blocks_no_lora_warmup = (
 								            llm.llm_engine.cache_config.num_gpu_blocks)
 								        return num_gpu_blocks_no_lora_warmup
 								    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
 								    num_gpu_blocks_no_lora_warmup = ray.get(
 								        get_num_gpu_blocks_no_lora.remote())
 								    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
 								        "The warmup with lora should be more "
 								        "conservative than without lora, therefore the number of "
 								        "memory blocks for the KV cache should be "
 								        "less when using lora than when not using lora")
-												[V1] V1 Enablement Oracle  (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
											
										
										
											2025-03-15 01:02:20 -04:00
+								# V1 Test: Failing due to numerics on V1.
 								@pytest.mark.skip_v1
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								@multi_gpu_test(num_gpus=4)
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								def test_llama_lora_tp4(sql_lora_files):
 								    llm = vllm.LLM(
 								        MODEL_PATH,
 								        enable_lora=True,
 								        max_num_seqs=16,
 								        max_loras=4,
 								        tensor_parallel_size=4,
-												[Misc] LoRA + Chunked Prefill (#9057)


											
										
										
											2024-12-10 21:09:20 -05:00
+								        enable_chunked_prefill=True,
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    )
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								@multi_gpu_test(num_gpus=4)
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
 								    llm = vllm.LLM(
 								        MODEL_PATH,
 								        enable_lora=True,
 								        max_num_seqs=16,
 								        max_loras=4,
 								        tensor_parallel_size=4,
 								        fully_sharded_loras=True,
-												[Misc] LoRA + Chunked Prefill (#9057)


											
										
										
											2024-12-10 21:09:20 -05:00
+								        enable_chunked_prefill=True,
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    )
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								@multi_gpu_test(num_gpus=4)
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    llm = vllm.LLM(
 								        MODEL_PATH,
 								        enable_lora=True,
 								        max_num_seqs=16,
 								        max_loras=4,
 								        tensor_parallel_size=4,
 								        fully_sharded_loras=True,
 								        enable_lora_bias=True,
-												[Misc] LoRA + Chunked Prefill (#9057)


											
										
										
											2024-12-10 21:09:20 -05:00
+								        enable_chunked_prefill=True,
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    )
 								    generate_and_test(llm, sql_lora_files)