vllm/examples/offline_inference/gguf_inference.py

from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path, tokenizer):
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
    # Wrap each prompt as a single-turn chat conversation for LLM.chat().
    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM. The tokenizer is loaded from the original base-model
    # repo rather than converted from the GGUF checkpoint, since GGUF
    # tokenizer conversion can be slow and error-prone.
    llm = LLM(model=model_path, tokenizer=tokenizer)

    outputs = llm.chat(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
    # Download the quantized GGUF file from the Hugging Face Hub.
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model, tokenizer)
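
    # --- Optional variant (a sketch, not part of the upstream example) ---
    # The same GGUF checkpoint can also be queried without the chat wrapper
    # via LLM.generate(), which takes plain string prompts and skips the
    # chat-template formatting. This reuses the `model` and `tokenizer`
    # values defined above; uncomment to try it.
    #
    # llm = LLM(model=model, tokenizer=tokenizer)
    # outputs = llm.generate(
    #     ["What's the future of AI?"],
    #     SamplingParams(temperature=0, max_tokens=128),
    # )
    # for output in outputs:
    #     print(output.outputs[0].text)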