[CI/Build] Replaced some models on tests for smaller ones (#9570)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
This commit is contained in:
parent 74692421f7
commit c0292211ce
@@ -19,7 +19,7 @@ from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B",
 ]
 
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -16,7 +16,7 @@ from ..utils import multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B",
 ]
 
 
@@ -2,5 +2,5 @@ from ..utils import compare_two_settings
 
 
 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
-                         ["--cpu-offload-gb", "4"])
+    compare_two_settings("meta-llama/Llama-3.2-1B", [],
+                         ["--cpu-offload-gb", "1"])
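For reference, a rough, self-contained sketch of what this comparison exercises through the vLLM Python API (the test helper itself drives two launch configurations of the same model and checks that their outputs agree); cpu_offload_gb is assumed here to mirror the --cpu-offload-gb flag:

from vllm import LLM, SamplingParams

prompts = ["The capital of France is"]
params = SamplingParams(temperature=0.0, max_tokens=8)  # greedy decoding

# Baseline: all weights resident on the GPU.
baseline = LLM(model="meta-llama/Llama-3.2-1B")
out_a = baseline.generate(prompts, params)[0].outputs[0].text

# Same model with 1 GiB of weights offloaded to CPU memory.
offloaded = LLM(model="meta-llama/Llama-3.2-1B", cpu_offload_gb=1)
out_b = offloaded.generate(prompts, params)[0].outputs[0].text

assert out_a == out_b  # offloading should not change greedy outputs

The smaller model also lets the offload budget shrink from 4 GiB to 1 GiB: Llama-3.2-1B is roughly 2.5 GB of fp16 weights, so a 1 GiB budget still pushes a meaningful fraction of the model off the GPU while keeping the test fast.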
@@ -13,8 +13,7 @@ from ..utils import compare_all_settings
 @pytest.mark.parametrize(
     "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
     [
-        ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
-         True),
+        ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True),
         ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
          ["--quantization", "compressed-tensors"
           ], 1, 1, "FLASH_ATTN", "generate", True),
@@ -8,7 +8,7 @@ from ..openai.test_vision import TEST_IMAGE_URLS
 
 
 def test_chat():
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -26,7 +26,7 @@ def test_chat():
 
 
 def test_multi_chat():
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
     prompt1 = "Explain the concept of entropy."
     prompt2 = "Explain what among us is."
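Both chat tests now load the 1B instruct checkpoint. A minimal sketch of the entrypoint they exercise, assuming a recent vLLM where LLM exposes a chat() method that accepts OpenAI-style messages (only prompt1 below comes from the diff; the rest is illustrative):

from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
messages = [
    {"role": "user", "content": "Explain the concept of entropy."},
]
# chat() applies the model's chat template and returns a list of RequestOutput.
outputs = llm.chat(messages)
print(outputs[0].outputs[0].text)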
@@ -16,9 +16,6 @@ from .test_completion import zephyr_lora_files  # noqa: F401
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
 
 @pytest.fixture(scope="module")
@@ -6,7 +6,7 @@ import pytest
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "meta-llama/Llama-3.2-1B"
 
 
 @pytest.mark.asyncio
@@ -46,9 +46,10 @@ def test_filter_subtensors():
 @pytest.fixture(scope="module")
 def llama_2_7b_files():
     with TemporaryDirectory() as cache_dir:
-        input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
+        input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
                                       cache_dir=cache_dir,
-                                      ignore_patterns="*.bin*")
+                                      ignore_patterns=["*.bin*", "original/*"])
+
         yield input_dir
 
 
@@ -58,9 +59,12 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     # Dump worker states to output directory
     llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
         path=output_dir)
+
     # Copy metadata files to output directory
     for file in os.listdir(input_dir):
-        if not any(file.endswith(ext) for ext in weights_patterns):
+        if not any(
+                file.endswith(ext) and not os.path.isdir(file)
+                for ext in weights_patterns):
             shutil.copy(f"{input_dir}/{file}", output_dir)
 
 
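The extra not os.path.isdir(file) clause keeps a directory entry from being matched as a weight file when the metadata is copied next to the sharded checkpoint, and the new ignore_patterns list above skips the "original/" consolidated-weights folder that Llama 3.2 repositories ship. Below is a standalone sketch of a stricter variant of that filter (hypothetical helper name and default patterns, not part of the diff), which skips directories outright instead of passing them to shutil.copy:

import os
import shutil

def copy_metadata_files(input_dir: str, output_dir: str,
                        weights_patterns=(".bin", ".pt", ".safetensors")) -> None:
    # Copy top-level metadata (configs, tokenizer files); weight shards are
    # written separately by save_sharded_state, so they are not copied here.
    for name in os.listdir(input_dir):
        src = os.path.join(input_dir, name)
        if os.path.isdir(src):
            continue  # skip subdirectories, e.g. a checkpoint's "original/" folder
        if any(name.endswith(ext) for ext in weights_patterns):
            continue  # skip weight shards
        shutil.copy(src, output_dir)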