# SPDX-License-Identifier: Apache-2.0
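"""
Offline inference with a GGUF-quantized model in vLLM.

Downloads a GGUF quantization of Phi-3-medium from the Hugging Face Hub and
runs batched chat inference on it, using the tokenizer of the original
(unquantized) model.
"""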
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams


def run_gguf_inference(model_path, tokenizer):
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
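    # Wrap each prompt as a single-turn chat (a list of one user message) so
    # it can be passed to llm.chat(), which applies the model's chat template.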
    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
    # Create a sampling params object: greedy decoding (temperature=0),
    # generating up to 128 new tokens per prompt.
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM. The GGUF file supplies the quantized weights, while the
    # tokenizer comes from the original Hugging Face model.
    llm = LLM(model=model_path, tokenizer=tokenizer)
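    # llm.chat() formats each conversation with the chat template and returns
    # one RequestOutput per prompt.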
    outputs = llm.chat(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
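    # GGUF quantization to download, plus the original model whose tokenizer
    # (and chat template) will be used.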
    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
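    # hf_hub_download fetches the GGUF file into the local Hugging Face cache
    # and returns its path; that local path is passed to vLLM as the model.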
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model, tokenizer)