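"""Offline inference with a GGUF-quantized model in vLLM.

The quantized TinyLlama-1.1B-Chat weights are downloaded from the Hugging Face
Hub and paired with the tokenizer of the original (unquantized) model.
"""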
from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path):
    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
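    # Wrap each raw prompt in the chat template so the system and user turns
    # are formatted the way the chat-tuned model expects.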
    prompts = [
        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
        for prompt in prompts
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, max_tokens=128)
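    # temperature=0 selects greedy decoding, so the outputs are deterministic;
    # max_tokens caps each completion at 128 tokens.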
    # Create an LLM.
    llm = LLM(model=model_path,
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              gpu_memory_utilization=0.95)
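    # Run all prompts through the engine; one RequestOutput is returned per
    # prompt, in the same order as the input list.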
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
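# Download the quantized GGUF weights from the Hugging Face Hub, then run the
# example. The repo typically hosts other quantization levels (e.g. Q8_0) that
# can be swapped in by changing `filename`.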
if __name__ == "__main__":
    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model)