[Doc] Replace ibm-fms with ibm-ai-platform (#12709)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
parent 5d98d56089
commit bb392af434
@@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
     model="meta-llama/Meta-Llama-3.1-70B-Instruct",
     tensor_parallel_size=4,
-    speculative_model="ibm-fms/llama3-70b-accelerator",
+    speculative_model="ibm-ai-platform/llama3-70b-accelerator",
     speculative_draft_tensor_parallel_size=1,
 )
 outputs = llm.generate(prompts, sampling_params)
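For context, the updated documentation snippet corresponds to a complete offline-inference script along the following lines (a minimal sketch; the prompt list and the final output loop are illustrative additions, not part of this commit):

from vllm import LLM, SamplingParams

# Illustrative prompt; the surrounding doc defines its own prompts.
prompts = ["The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Target model sharded across 4 GPUs; the MLP speculator draft model
# runs unsharded, with the renamed repo id from this commit.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
    speculative_model="ibm-ai-platform/llama3-70b-accelerator",
    speculative_draft_tensor_parallel_size=1,
)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)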
@@ -149,11 +149,11 @@ limitation will be fixed in a future release.
 
 A variety of speculative models of this type are available on HF hub:
 
-- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator)
-- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator)
-- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator)
-- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator)
-- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator)
+- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator)
+- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator)
+- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator)
+- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator)
+- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator)
 - [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator)
 - [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator)
 - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)
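To confirm that the renamed repo ids resolve on the Hub, one option is a quick check with huggingface_hub (a sketch, not part of this commit; assumes huggingface_hub is installed):

from huggingface_hub import HfApi

api = HfApi()

# Renamed repo ids taken from the list above.
for repo_id in [
    "ibm-ai-platform/llama-13b-accelerator",
    "ibm-ai-platform/llama3-8b-accelerator",
    "ibm-ai-platform/codellama-34b-accelerator",
    "ibm-ai-platform/llama2-70b-accelerator",
    "ibm-ai-platform/llama3-70b-accelerator",
]:
    # model_info raises if the repo id does not resolve on the Hub.
    info = api.model_info(repo_id)
    print(repo_id, "->", info.sha[:10])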
@@ -51,7 +51,7 @@ if __name__ == "__main__":
     # Create an LLM with spec decoding
     llm = LLM(
         model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-fms/llama-13b-accelerator",
+        speculative_model="ibm-ai-platform/llama-13b-accelerator",
     )
 
     print("With speculation")
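The surrounding example appears to time a run with the speculator enabled (per the print statement above); a minimal sketch of that pattern, with illustrative prompt and sampling settings that are not taken from the diff:

import time

from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]  # illustrative prompt
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

# Create an LLM with spec decoding, using the renamed speculator repo.
llm = LLM(
    model="meta-llama/Llama-2-13b-chat-hf",
    speculative_model="ibm-ai-platform/llama-13b-accelerator",
)

print("With speculation")
start = time.time()
llm.generate(prompts, sampling_params)
print(f"elapsed: {time.time() - start:.2f}s")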
@@ -278,7 +278,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
     "MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
                                    speculative_model="abhigoyal/vllm-medusa-llama-68m-random"),  # noqa: E501
     "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
-                                                    speculative_model="ibm-fms/llama-160m-accelerator"),  # noqa: E501
+                                                    speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
 }
 
 _FALLBACK_MODEL = {
@@ -33,7 +33,7 @@ from .conftest import run_equality_correctness_test
 MAIN_MODEL = "JackFram/llama-160m"
 
 # speculative model
-SPEC_MODEL = "ibm-fms/llama-160m-accelerator"
+SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator"
 
 # max. number of speculative tokens: this corresponds to
 # n_predict in the config.json of the speculator model.
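The comment above ties the maximum number of speculative tokens to n_predict in the speculator's config.json; one way to inspect that value is (a sketch assuming huggingface_hub is available, not part of this commit):

import json

from huggingface_hub import hf_hub_download

# Download the speculator's config.json and read n_predict, the value
# the test comment above refers to.
path = hf_hub_download("ibm-ai-platform/llama-160m-accelerator",
                       "config.json")
with open(path) as f:
    config = json.load(f)

print("n_predict =", config.get("n_predict"))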
@@ -64,7 +64,7 @@ class MLPSpeculator(nn.Module):
     https://arxiv.org/pdf/2404.19124
 
     Trained speculators of this type are available on HF hub at:
-    https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite
+    https://huggingface.co/ibm-ai-platform and https://huggingface.co/ibm-granite
     """
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: