2025-02-02 14:58:18 -05:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
2024-07-11 13:28:38 -07:00
|
|
|
import argparse
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
def main(model, cachedir):
    """Load the tokenizer for ``model`` and write it out to ``cachedir``.

    Args:
        model: Identifier handed to ``AutoTokenizer.from_pretrained``.
        cachedir: Destination handed to the tokenizer's ``save_pretrained``.
    """
    # Resolve the tokenizer and persist it in one step; the loaded object is
    # not needed afterwards.
    AutoTokenizer.from_pretrained(model).save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: both options are required strings that are
    # forwarded to main() unchanged.
    cli = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer",
    )

    # (flag, help text) pairs, registered in the order they are listed.
    option_specs = (
        ("--model", "Name of the model"),
        ("--cachedir", "Directory to save the tokenizer"),
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    options = cli.parse_args()
    main(options.model, options.cachedir)
|