diff --git a/requirements-test.in b/requirements-test.in index ecf874ec..53c53136 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -37,3 +37,5 @@ genai_perf==0.0.8 tritonclient==2.51.0 numpy < 2.0.0 +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt index 648a2626..f9158641 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -171,6 +171,8 @@ huggingface-hub==0.26.2 # tokenizers # transformers # vocos +humanize==4.11.0 + # via runai-model-streamer idna==3.10 # via # anyio @@ -290,6 +292,7 @@ numpy==1.26.4 # patsy # peft # rouge-score + # runai-model-streamer # sacrebleu # scikit-learn # scipy @@ -514,6 +517,10 @@ rpds-py==0.20.1 # referencing rsa==4.7.2 # via awscli +runai-model-streamer==0.11.0 + # via -r requirements-test.in +runai-model-streamer-s3==0.11.0 + # via -r requirements-test.in s3transfer==0.10.3 # via # awscli @@ -594,6 +601,7 @@ torch==2.5.1 # encodec # lm-eval # peft + # runai-model-streamer # sentence-transformers # tensorizer # timm diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index bd97dd94..cc25c879 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -9,6 +9,7 @@ import weakref import pytest from vllm import LLM +from vllm.config import LoadFormat from vllm.platforms import current_platform from ..conftest import VllmRunner @@ -33,7 +34,7 @@ def v1(run_with_both_engines): def test_vllm_gc_ed(): """Verify vllm instance is GC'ed when it is deleted""" - llm = LLM("facebook/opt-125m") + llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER) weak_llm = weakref.ref(llm) del llm # If there's any circular reference to vllm, this fails @@ -94,14 +95,14 @@ def test_models( @pytest.mark.parametrize( "model, distributed_executor_backend, attention_backend, " "test_suite", [ - ("facebook/opt-125m", "ray", "", "L4"), - ("facebook/opt-125m", "mp", "", "L4"), - ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"), - ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"), - ("facebook/opt-125m", "ray", "", "A100"), - ("facebook/opt-125m", "mp", "", "A100"), - ("facebook/opt-125m", "mp", "FLASHINFER", "A100"), - ("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"), + ("distilbert/distilgpt2", "ray", "", "L4"), + ("distilbert/distilgpt2", "mp", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), + ("distilbert/distilgpt2", "ray", "", "A100"), + ("distilbert/distilgpt2", "mp", "", "A100"), + ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"), + ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) def test_models_distributed( hf_runner, diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index f16b8007..24ed5d39 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -4,9 +4,11 @@ import pytest import torch from vllm import LLM, SamplingParams +from vllm.config import LoadFormat from vllm.device_allocator.cumem import CuMemAllocator from vllm.utils import GiB_bytes +from ..conftest import MODEL_WEIGHTS_S3_BUCKET from ..utils import fork_new_process_for_each_test @@ -118,13 +120,18 @@ def test_cumem_with_cudagraph(): @pytest.mark.parametrize( "model", [ - "meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors - "facebook/opt-125m" # sleep 
mode with pytorch checkpoint + # sleep mode with safetensors + f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", + # sleep mode with pytorch checkpoint + "facebook/opt-125m" ]) def test_end_to_end(model): free, total = torch.cuda.mem_get_info() used_bytes_baseline = total - free # in case other process is running - llm = LLM(model, enable_sleep_mode=True) + load_format = LoadFormat.AUTO + if "Llama" in model: + load_format = LoadFormat.RUNAI_STREAMER + llm = LLM(model, load_format=load_format, enable_sleep_mode=True) prompt = "How are you?" sampling_params = SamplingParams(temperature=0, max_tokens=10) output = llm.generate(prompt, sampling_params) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 6aaec6ee..a32b7cac 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -17,7 +17,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, from ..models.utils import check_outputs_equal MODELS = [ - "facebook/opt-125m", + "distilbert/distilgpt2", ] diff --git a/tests/conftest.py b/tests/conftest.py index 02105900..74219e40 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs, from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import TaskOption, TokenizerPoolConfig +from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, @@ -46,6 +46,21 @@ _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") + +MODELS_ON_S3 = [ + "distilbert/distilgpt2", + "meta-llama/Llama-2-7b-hf", + "meta-llama/Meta-Llama-3-8B", + "meta-llama/Llama-3.2-1B", + "meta-llama/Llama-3.2-1B-Instruct", + "openai-community/gpt2", + "ArthurZ/Ilama-3.2-1B", + "llava-hf/llava-1.5-7b-hf", + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", +] + +MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights" + _PromptMultiModalInput = Union[List[_M], List[List[_M]]] PromptImageInput = _PromptMultiModalInput[Image.Image] @@ -677,8 +692,15 @@ class VllmRunner: enable_chunked_prefill: bool = False, swap_space: int = 4, enforce_eager: Optional[bool] = False, + load_format: Optional[LoadFormat] = None, **kwargs, ) -> None: + if model_name in MODELS_ON_S3 and not load_format: + model_name = (f"s3://vllm-ci-model-weights/" + f"{model_name.split('/')[-1]}") + load_format = LoadFormat.RUNAI_STREAMER + if not load_format: + load_format = LoadFormat.AUTO self.model = LLM( model=model_name, task=task, @@ -693,6 +715,7 @@ class VllmRunner: max_model_len=max_model_len, block_size=block_size, enable_chunked_prefill=enable_chunked_prefill, + load_format=load_format, **kwargs, ) diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index dca8fa60..93907eca 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -2,12 +2,15 @@ import pytest +from vllm.config import LoadFormat from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams +from ..conftest import MODEL_WEIGHTS_S3_BUCKET -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) + 
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) @pytest.mark.parametrize("block_size", [16]) def test_computed_prefix_blocks(model: str, block_size: int): # This test checks if we are able to run the engine to completion @@ -24,6 +27,7 @@ def test_computed_prefix_blocks(model: str, block_size: int): "decoration.") engine_args = EngineArgs(model=model, + load_format=LoadFormat.RUNAI_STREAMER, block_size=block_size, enable_prefix_caching=True) diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index 742176ea..ab594aee 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -2,11 +2,14 @@ import pytest +from vllm.config import LoadFormat from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +from ..conftest import MODEL_WEIGHTS_S3_BUCKET -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) + +@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) def test_computed_prefix_blocks(model: str): # This test checks if the engine generates completions both with and # without optional detokenization, that detokenization includes text @@ -17,7 +20,7 @@ def test_computed_prefix_blocks(model: str): "paper clips? Is there an easy to follow video tutorial available " "online for free?") - llm = LLM(model=model) + llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER) sampling_params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False) diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index 84cc3ed6..31c07e70 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -6,12 +6,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pytest +from vllm.config import LoadFormat from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine from vllm.executor.uniproc_executor import UniProcExecutor from vllm.sampling_params import SamplingParams +from ..conftest import MODEL_WEIGHTS_S3_BUCKET + +RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER + class Mock: ... 
@@ -33,10 +38,11 @@ class CustomUniExecutor(UniProcExecutor): CustomUniExecutorAsync = CustomUniExecutor -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) def test_custom_executor_type_checking(model): with pytest.raises(ValueError): engine_args = EngineArgs(model=model, + load_format=RUNAI_STREAMER_LOAD_FORMAT, distributed_executor_backend=Mock) LLMEngine.from_engine_args(engine_args) with pytest.raises(ValueError): @@ -45,7 +51,7 @@ def test_custom_executor_type_checking(model): AsyncLLMEngine.from_engine_args(engine_args) -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) def test_custom_executor(model, tmp_path): cwd = os.path.abspath(".") os.chdir(tmp_path) @@ -54,6 +60,7 @@ def test_custom_executor(model, tmp_path): engine_args = EngineArgs( model=model, + load_format=RUNAI_STREAMER_LOAD_FORMAT, distributed_executor_backend=CustomUniExecutor, enforce_eager=True, # reduce test time ) @@ -68,7 +75,7 @@ def test_custom_executor(model, tmp_path): os.chdir(cwd) -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) def test_custom_executor_async(model, tmp_path): cwd = os.path.abspath(".") os.chdir(tmp_path) @@ -77,6 +84,7 @@ def test_custom_executor_async(model, tmp_path): engine_args = AsyncEngineArgs( model=model, + load_format=RUNAI_STREAMER_LOAD_FORMAT, distributed_executor_backend=CustomUniExecutorAsync, enforce_eager=True, # reduce test time ) @@ -95,7 +103,7 @@ def test_custom_executor_async(model, tmp_path): os.chdir(cwd) -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) def test_respect_ray(model): # even for TP=1 and PP=1, # if users specify ray, we should use ray. @@ -104,6 +112,7 @@ def test_respect_ray(model): engine_args = EngineArgs( model=model, distributed_executor_backend="ray", + load_format=RUNAI_STREAMER_LOAD_FORMAT, enforce_eager=True, # reduce test time ) engine = LLMEngine.from_engine_args(engine_args) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 655c8232..fee7fd3f 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -2,16 +2,21 @@ import pytest +from vllm.config import LoadFormat from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +from ..conftest import MODEL_WEIGHTS_S3_BUCKET -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) + +@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"]) def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization # of tokenizer and detokenizer. The generated output is expected to contain # token ids. 
- llm = LLM(model=model, skip_tokenizer_init=True) + llm = LLM(model=model, + skip_tokenizer_init=True, + load_format=LoadFormat.RUNAI_STREAMER) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) with pytest.raises(ValueError, match="cannot pass text prompts when"): diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index a50b3880..4b1e4f5c 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -12,7 +12,7 @@ import transformers from vllm import SamplingParams -MODEL = "facebook/opt-350m" +MODEL = "distilbert/distilgpt2" STOP_STR = "." SEED = 42 MAX_TOKENS = 1024 diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 77c80b2f..f6fda512 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -5,12 +5,17 @@ from typing import List import pytest from vllm import LLM +from vllm.config import LoadFormat +from ...conftest import MODEL_WEIGHTS_S3_BUCKET from ..openai.test_vision import TEST_IMAGE_URLS +RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER + def test_chat(): - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") + llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct", + load_format=RUNAI_STREAMER_LOAD_FORMAT) prompt1 = "Explain the concept of entropy." messages = [ @@ -28,7 +33,8 @@ def test_chat(): def test_multi_chat(): - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") + llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct", + load_format=RUNAI_STREAMER_LOAD_FORMAT) prompt1 = "Explain the concept of entropy." prompt2 = "Explain what among us is." @@ -65,7 +71,8 @@ def test_multi_chat(): [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) def test_chat_multi_image(image_urls: List[str]): llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", + model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct", + load_format=RUNAI_STREAMER_LOAD_FORMAT, dtype="bfloat16", max_model_len=4096, max_num_seqs=5, diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 39d4810d..69c60bbe 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend): def echo_rank(self): return self.rank - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct", enforce_eager=True, load_format="dummy", tensor_parallel_size=tp_size, diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index ebec8bab..61085bf4 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -6,9 +6,10 @@ from typing import List import pytest from vllm import LLM, PoolingParams, PoolingRequestOutput +from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory -MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct" PROMPTS = [ "Hello, my name is", @@ -32,6 +33,7 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, + load_format=LoadFormat.RUNAI_STREAMER, max_num_batched_tokens=32768, tensor_parallel_size=1, gpu_memory_utilization=0.75, diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 4c78c2c8..f1bad876 100644 --- a/tests/entrypoints/llm/test_generate.py +++ 
b/tests/entrypoints/llm/test_generate.py @@ -6,9 +6,10 @@ from typing import List import pytest from vllm import LLM, RequestOutput, SamplingParams +from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory -MODEL_NAME = "facebook/opt-125m" +MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2" PROMPTS = [ "Hello, my name is", @@ -30,6 +31,7 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, + load_format=LoadFormat.RUNAI_STREAMER, max_num_batched_tokens=4096, tensor_parallel_size=1, gpu_memory_utilization=0.10, diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 90e1d581..487c0046 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -7,10 +7,11 @@ import pytest from huggingface_hub import snapshot_download from vllm import LLM +from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta" PROMPTS = [ "Hello, my name is", @@ -27,6 +28,7 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM(model=MODEL_NAME, + load_format=LoadFormat.RUNAI_STREAMER, tensor_parallel_size=1, max_model_len=8192, enable_lora=True, diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 01d2c170..70252471 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -7,12 +7,13 @@ import weakref import jsonschema import pytest +from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct" +MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] @@ -20,7 +21,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, max_model_len=1024) + llm = LLM(model=MODEL_NAME, + load_format=LoadFormat.RUNAI_STREAMER, + max_model_len=1024) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index b1f9ae14..07608e15 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -6,10 +6,11 @@ from contextlib import nullcontext from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams +from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory -def run_normal(): +def run_normal_opt125m(): prompts = [ "Hello, my name is", "The president of the United States is", @@ -33,9 +34,35 @@ def run_normal(): cleanup_dist_env_and_memory() +def run_normal(): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM without guided decoding as a 
baseline. + llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2", + load_format=LoadFormat.RUNAI_STREAMER, + enforce_eager=True, + gpu_memory_utilization=0.3) + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Destroy the LLM object and free up the GPU memory. + del llm + cleanup_dist_env_and_memory() + + def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. - llm = LLM(model="facebook/opt-125m", + llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2", + load_format=LoadFormat.RUNAI_STREAMER, enforce_eager=True, guided_decoding_backend="lm-format-enforcer", gpu_memory_utilization=0.3) diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index f2c145fa..04848131 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -3,6 +3,7 @@ import pytest from vllm import LLM +from vllm.config import LoadFormat @pytest.fixture(autouse=True) @@ -14,13 +15,17 @@ def v1(run_with_both_engines): def test_empty_prompt(): - llm = LLM(model="gpt2", enforce_eager=True) + llm = LLM(model="s3://vllm-ci-model-weights/gpt2", + load_format=LoadFormat.RUNAI_STREAMER, + enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) @pytest.mark.skip_v1 def test_out_of_vocab_token(): - llm = LLM(model="gpt2", enforce_eager=True) + llm = LLM(model="s3://vllm-ci-model-weights/gpt2", + load_format=LoadFormat.RUNAI_STREAMER, + enforce_eager=True) with pytest.raises(ValueError, match='out of vocabulary'): llm.generate({"prompt_token_ids": [999999]}) diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index 4c9774a7..cf114f06 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -86,4 +86,4 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): assert rerank_response.status_code == 400 # Assert just a small fragments of the response assert "Please reduce the length of the input." in \ - rerank_response.text \ No newline at end of file + rerank_response.text diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 0942c8ee..1a9063bc 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -8,16 +8,21 @@ import ray from prometheus_client import REGISTRY from vllm import EngineArgs, LLMEngine +from vllm.config import LoadFormat from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams +from ..conftest import MODEL_WEIGHTS_S3_BUCKET + MODELS = [ - "facebook/opt-125m", + "distilbert/distilgpt2", ] +RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @@ -141,8 +146,9 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, metrics_tag_content = stat_logger.labels["model_name"] if served_model_name is None or served_model_name == []: - assert metrics_tag_content == model, ( - f"Metrics tag model_name is wrong! 
expect: {model!r}\n" + actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}" + assert metrics_tag_content == actual_model_name, ( + f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n" f"actual: {metrics_tag_content!r}") else: assert metrics_tag_content == served_model_name[0], ( @@ -170,7 +176,8 @@ async def test_async_engine_log_metrics_regression( """ engine_args = AsyncEngineArgs(model=model, dtype=dtype, - disable_log_stats=disable_log_stats) + disable_log_stats=disable_log_stats, + load_format=RUNAI_STREAMER_LOAD_FORMAT) async_engine = AsyncLLMEngine.from_engine_args(engine_args) for i, prompt in enumerate(example_prompts): results = async_engine.generate( @@ -199,7 +206,8 @@ def test_engine_log_metrics_regression( ) -> None: engine_args = EngineArgs(model=model, dtype=dtype, - disable_log_stats=disable_log_stats) + disable_log_stats=disable_log_stats, + load_format=RUNAI_STREAMER_LOAD_FORMAT) engine = LLMEngine.from_engine_args(engine_args) for i, prompt in enumerate(example_prompts): engine.add_request( @@ -283,7 +291,8 @@ def test_metric_spec_decode_interval( gpu_memory_utilization=0.4, speculative_model=model, num_speculative_tokens=k, - enforce_eager=True) + enforce_eager=True, + load_format=RUNAI_STREAMER_LOAD_FORMAT) engine = LLMEngine.from_engine_args(engine_args) diff --git a/tests/models/registry.py b/tests/models/registry.py index 17bfe1d2..8b0ece16 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,7 +173,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", trust_remote_code=True), - "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), + "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct", + extras={"2.5": "Qwen/Qwen2.5-7B-Instruct"}), # noqa: E501 "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", is_available_online=False), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index c58c6372..e0d5e003 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -7,6 +7,7 @@ from transformers import PretrainedConfig from vllm import LLM +from ..conftest import MODELS_ON_S3 from .registry import HF_EXAMPLE_MODELS @@ -42,8 +43,11 @@ def test_can_initialize(model_arch): with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", _initialize_kv_caches): + model_name = model_info.default + if model_name in MODELS_ON_S3: + model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}" LLM( - model_info.default, + model_name, tokenizer=model_info.tokenizer, tokenizer_mode=model_info.tokenizer_mode, speculative_model=model_info.speculative_model, diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py index 808346b5..b0ac0fb3 100644 --- a/tests/mq_llm_engine/test_abort.py +++ b/tests/mq_llm_engine/test_abort.py @@ -10,8 +10,8 @@ import pytest from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate from vllm.engine.arg_utils import AsyncEngineArgs -MODEL = "google/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL) +MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it" +ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer") RAISED_ERROR = KeyError RAISED_VALUE = "foo" EXPECTED_TOKENS = 250 diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 35d00178..4eac7341 100644 
--- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -21,8 +21,10 @@ from vllm.lora.request import LoRARequest from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser -MODEL = "google/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True) +MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it" +ENGINE_ARGS = AsyncEngineArgs(model=MODEL, + load_format="runai_streamer", + enforce_eager=True) RAISED_ERROR = KeyError RAISED_VALUE = "foo" diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index 2069ff98..3162d56c 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -10,12 +10,14 @@ import pytest from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate from vllm.engine.arg_utils import AsyncEngineArgs -MODEL = "google/gemma-1.1-2b-it" +MODEL = "s3://vllm-ci-model-weights/gemma-1.1-2b-it" NUM_EXPECTED_TOKENS = 10 NUM_REQUESTS = 10000 # Scenarios to test for num generated token. -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) +ENGINE_ARGS = AsyncEngineArgs(model=MODEL, + load_format="runai_streamer", + disable_log_requests=True) @pytest.fixture(scope="function") diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 459c0d9d..7bbe5c53 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -553,7 +553,8 @@ def test_find_mm_placeholders( assert result == expected -@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize( + "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("limit", "num_supported", "is_valid"), [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), @@ -592,7 +593,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): profiler.get_dummy_data(model_config.max_model_len) -@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize( + "model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize( ("num_images", "limit", "is_valid"), [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), @@ -661,7 +663,7 @@ class _ProcessorProxy: return dict(exists=exists) -@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"]) # Dummy +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy # yapf: disable @pytest.mark.parametrize( ("call_kwargs", "expected_kwargs"), diff --git a/tests/runai_model_streamer/__init__.py b/tests/runai_model_streamer_test/__init__.py similarity index 100% rename from tests/runai_model_streamer/__init__.py rename to tests/runai_model_streamer_test/__init__.py diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py similarity index 100% rename from tests/runai_model_streamer/test_runai_model_streamer_loader.py rename to tests/runai_model_streamer_test/test_runai_model_streamer_loader.py diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py similarity index 100% rename from tests/runai_model_streamer/test_weight_utils.py rename to tests/runai_model_streamer_test/test_weight_utils.py diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 9a92b08f..673d1b9a 100644 
--- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -10,7 +10,7 @@ from vllm import SamplingParams # We also test with llama because it has generation_config to specify EOS # (past regression). -MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"] +MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 3b95b038..f237b616 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -5,7 +5,7 @@ import torch from vllm import SamplingParams -MODELS = ["facebook/opt-125m"] +MODELS = ["distilbert/distilgpt2"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 59d36099..78bdd9b0 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -9,7 +9,7 @@ from vllm import SamplingParams from ..conftest import VllmRunner -MODELS = ["facebook/opt-125m"] +MODELS = ["distilbert/distilgpt2"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index cc655769..143f5299 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -76,7 +76,7 @@ class TestOneTokenBadWord: class TestTwoTokenBadWord: # Another model (with a different tokenizer behaviour) - MODEL = "openai-community/gpt2" + MODEL = "distilbert/distilgpt2" PROMPT = "How old are you? I am 10" TARGET_TOKEN1 = "years" diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index c74c1c02..66779d97 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -4,7 +4,7 @@ import pytest from vllm import SamplingParams -MODELS = ["facebook/opt-125m"] +MODELS = ["distilbert/distilgpt2"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/test_config.py b/tests/test_config.py index 746ca729..4a171861 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,14 +8,19 @@ from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform +from .conftest import MODEL_WEIGHTS_S3_BUCKET + @pytest.mark.parametrize( ("model_id", "expected_runner_type", "expected_task"), [ - ("facebook/opt-125m", "generate", "generate"), - ("intfloat/e5-mistral-7b-instruct", "pooling", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), + (f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", "generate", "generate"), + (f"{MODEL_WEIGHTS_S3_BUCKET}/e5-mistral-7b-instruct", "pooling", + "embed"), + (f"{MODEL_WEIGHTS_S3_BUCKET}/Qwen2.5-1.5B-apeach", "pooling", + "classify"), + (f"{MODEL_WEIGHTS_S3_BUCKET}/ms-marco-MiniLM-L-6-v2", "pooling", + "score"), ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), ("openai/whisper-small", "transcription", "transcription"), ], diff --git a/tests/test_regression.py b/tests/test_regression.py index f781b311..e9b21e1a 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -10,6 +10,9 @@ import gc import torch from vllm import LLM, SamplingParams +from vllm.config import LoadFormat + +from .conftest import MODEL_WEIGHTS_S3_BUCKET def test_duplicated_ignored_sequence_group(): @@ -18,7 +21,8 @@ def test_duplicated_ignored_sequence_group(): sampling_params = SamplingParams(temperature=0.01, 
top_p=0.1, max_tokens=256) - llm = LLM(model="facebook/opt-125m", + llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", + load_format=LoadFormat.RUNAI_STREAMER, max_num_batched_tokens=4096, tensor_parallel_size=1) prompts = ["This is a short prompt", "This is a very long prompt " * 1000] @@ -31,7 +35,8 @@ def test_max_tokens_none(): sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) - llm = LLM(model="facebook/opt-125m", + llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", + load_format=LoadFormat.RUNAI_STREAMER, max_num_batched_tokens=4096, tensor_parallel_size=1) prompts = ["Just say hello!"] @@ -41,7 +46,9 @@ def test_max_tokens_none(): def test_gc(): - llm = LLM("facebook/opt-125m", enforce_eager=True) + llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", + load_format=LoadFormat.RUNAI_STREAMER, + enforce_eager=True) del llm gc.collect() diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 7ae0f4bb..2c337cc9 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -10,7 +10,7 @@ from vllm.worker.worker import Worker def test_swap() -> None: # Configure the engine. - engine_args = EngineArgs(model="facebook/opt-125m", + engine_args = EngineArgs(model="s3://vllm-ci-model-weights/distilgpt2", dtype="half", load_format="dummy") engine_config = engine_args.create_engine_config() diff --git a/vllm/config.py b/vllm/config.py index 5c220ed1..54227dda 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -409,7 +409,8 @@ class ModelConfig: if is_s3(model) or is_s3(tokenizer): if is_s3(model): s3_model = S3Model() - s3_model.pull_files(model, allow_pattern=["*config.json"]) + s3_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"]) self.model_weights = self.model self.model = s3_model.dir diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 230484a3..df957cfc 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1327,6 +1327,7 @@ class RunaiModelStreamerLoader(BaseModelLoader): """Prepare weights for the model. If the model is not local, it will be downloaded.""" + is_s3_path = is_s3(model_name_or_path) is_local = os.path.isdir(model_name_or_path) safetensors_pattern = "*.safetensors" @@ -1340,7 +1341,6 @@ class RunaiModelStreamerLoader(BaseModelLoader): revision, ignore_patterns=self.load_config.ignore_patterns, )) - if is_s3_path: hf_weights_files = s3_glob(path=hf_folder, allow_pattern=[safetensors_pattern]) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 18f6f40b..ac1be383 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -27,6 +27,8 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig, from vllm.platforms import current_platform from vllm.utils import PlaceholderModule +logger = init_logger(__name__) + try: from runai_model_streamer import SafetensorsStreamer except (ImportError, OSError): @@ -37,8 +39,6 @@ except (ImportError, OSError): SafetensorsStreamer = runai_model_streamer.placeholder_attr( "SafetensorsStreamer") -logger = init_logger(__name__) - # use system-level temp directory for file locks, so that multiple users # can share the same lock without error. 
# lock files in the temp directory will be automatically deleted when the diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2fed5d74..4768226f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -144,7 +144,6 @@ def file_exists( revision: Optional[str] = None, token: Union[str, bool, None] = None, ) -> bool: - file_list = list_repo_files(repo_id, repo_type=repo_type, revision=revision, @@ -498,7 +497,7 @@ def get_sentence_transformer_tokenizer_config(model: str, if encoder_dict: break - if not encoder_dict: + if not encoder_dict and not model.startswith("/"): try: # If model is on HuggingfaceHub, get the repo files repo_files = list_repo_files(model, diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 4fe744d2..1c3520bc 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -46,6 +46,8 @@ def glob(s3=None, """ if s3 is None: s3 = boto3.client("s3") + if not path.endswith("/"): + path = path + "/" bucket_name, _, paths = list_files(s3, path=path, allow_pattern=allow_pattern) @@ -109,6 +111,7 @@ class S3Model: for sig in (signal.SIGINT, signal.SIGTERM): existing_handler = signal.getsignal(sig) signal.signal(sig, self._close_by_signal(existing_handler)) + self.dir = tempfile.mkdtemp() def __del__(self): @@ -140,6 +143,9 @@ class S3Model: ignore_pattern: A list of patterns of which files not to pull. """ + if not s3_model_path.endswith("/"): + s3_model_path = s3_model_path + "/" + bucket_name, base_dir, files = list_files(self.s3, s3_model_path, allow_pattern, ignore_pattern) @@ -147,8 +153,9 @@ class S3Model: return for file in files: - destination_file = os.path.join(self.dir, - file.removeprefix(base_dir)) + destination_file = os.path.join( + self.dir, + file.removeprefix(base_dir).lstrip("/")) local_dir = Path(destination_file).parent os.makedirs(local_dir, exist_ok=True) self.s3.download_file(bucket_name, file, destination_file)
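The pull_files changes in vllm/transformers_utils/s3_utils.py above guard against two S3 path edge cases: the prefix handed to list_files now always ends with "/", and each object key has its leading "/" stripped before it is joined with the local temp directory. A small standalone illustration (plain Python; the bucket prefix and key below are hypothetical, not taken from the patch) of why the lstrip("/") matters:

    import os

    base_dir = "distilgpt2"            # prefix reported by list_files (hypothetical)
    key = "distilgpt2/config.json"     # S3 object key (hypothetical)
    local_dir = "/tmp/model"

    # Without the fix, the relative part keeps its leading slash, and
    # os.path.join discards the temp dir because the second argument is absolute.
    rel = key.removeprefix(base_dir)              # "/config.json"
    print(os.path.join(local_dir, rel))           # /config.json   (escapes the temp dir)

    # With the fix, the file lands inside the temp directory as intended.
    rel = key.removeprefix(base_dir).lstrip("/")  # "config.json"
    print(os.path.join(local_dir, rel))           # /tmp/model/config.json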
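More broadly, the test changes converge on one loading pattern: point model at the CI bucket and select the RunAI streamer via load_format. A minimal sketch of that pattern, assuming the s3://vllm-ci-model-weights bucket is reachable and the runai-model-streamer / runai-model-streamer-s3 packages from requirements-test.in are installed:

    from vllm import LLM, SamplingParams
    from vllm.config import LoadFormat

    # Weights are streamed directly from S3 rather than downloaded from the
    # HuggingFace Hub; only config/tokenizer files (*.json, *.model, *.py)
    # are pulled into a local temp directory by S3Model.
    llm = LLM(
        model="s3://vllm-ci-model-weights/distilgpt2",
        load_format=LoadFormat.RUNAI_STREAMER,
        enforce_eager=True,
    )
    print(llm.generate("Hello, my name is",
                       SamplingParams(temperature=0.0, max_tokens=10)))

Tests that go through VllmRunner do not need to spell this out: the new MODELS_ON_S3 list in tests/conftest.py rewrites a matching HF model name (e.g. "distilbert/distilgpt2") to the bucket path and sets load_format=LoadFormat.RUNAI_STREAMER automatically, falling back to LoadFormat.AUTO otherwise.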