
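"""
Example of offline inference with a GGUF-quantized model in vLLM.

Downloads a quantized Phi-3-medium checkpoint from the Hugging Face Hub
and pairs it with the tokenizer of the original (unquantized) model.
"""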
from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path, tokenizer):
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
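    # Wrap each prompt as a single-turn chat conversation so that
    # llm.chat() can apply the model's chat template.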
    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
    # Create a sampling params object (greedy decoding, up to 128 new tokens).
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM from the local GGUF checkpoint.
    llm = LLM(model=model_path, tokenizer=tokenizer)

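    # Run batched chat generation over all conversations.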
    outputs = llm.chat(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
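    # Use the base model's tokenizer rather than converting the one
    # embedded in the GGUF checkpoint.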
    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
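    # Download only the quantized checkpoint file from the Hub.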
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model, tokenizer)