import os

import pytest

from vllm import LLM, SamplingParams

# NOTE: the order of the tests is important
# the first test does not load any plugins
# the second test loads the plugin
# they share the same process, so the plugin is loaded for the second test
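#
# for reference: the ``register_dummy_model`` plugin enabled in the second
# test is assumed to register the out-of-tree architecture through vLLM's
# model registry, roughly along the lines of the sketch below; the class and
# architecture names are placeholders rather than the real plugin code
#
#     from vllm import ModelRegistry
#
#     def register():
#         ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
#
# VLLM_PLUGINS selects which installed plugins are loaded; the first test
# sets it to an empty string so that no plugin (and hence no extra
# architecture) is available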


def test_plugin(dummy_opt_path):
    # with all plugins disabled, the dummy OPT architecture is unknown to
    # vLLM, so engine construction should fail with an "architectures are
    # not supported for now" style error
    os.environ["VLLM_PLUGINS"] = ""
    with pytest.raises(Exception) as excinfo:
        LLM(model=dummy_opt_path, load_format="dummy")
    assert "are not supported for now" in str(excinfo.value)


def test_oot_registration(dummy_opt_path):
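    # enable only the plugin under test; it is expected to register the dummy
    # OPT architecture with vLLM, so the same LLM(...) call that failed in
    # test_plugin should now succeed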
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
    prompts = ["Hello, my name is", "The text does not matter"]
    sampling_params = SamplingParams(temperature=0)
    llm = LLM(model=dummy_opt_path, load_format="dummy")
    # the decoded text of token id 0, used to check the generated output below
    first_token = llm.get_tokenizer().decode(0)
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        generated_text = output.outputs[0].text
        # make sure only the first token is generated: the dummy model is
        # expected to always predict token id 0, so with greedy sampling
        # (temperature=0) the output should decode to repetitions of
        # ``first_token``
        rest = generated_text.replace(first_token, "")
        assert rest == ""