[CI/Build] drop support for Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Aaron Pham 2024-11-06 02:11:55 -05:00 committed by GitHub
parent 4be3a45158
commit 21063c11c7
GPG Key ID: B5690EEEBB952194
115 changed files with 239 additions and 321 deletions

View File

@ -56,7 +56,7 @@ serving_column_mapping = {
def read_markdown(file):
if os.path.exists(file):
with open(file, "r") as f:
with open(file) as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
@ -75,14 +75,14 @@ if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@ -97,7 +97,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@ -119,7 +119,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
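Note on the recurring open() change above: "r" is already open()'s default mode, so dropping it is a behavior-preserving cleanup flagged by ruff's pyupgrade ("UP") rules, which are enabled in pyproject.toml later in this diff. A minimal sketch with a throwaway file (standard library only):

import json
import tempfile

# Write a small JSON file, then read it back without the redundant "r" mode.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    tmp.write('{"ok": true}')

with open(tmp.name) as f:          # identical to open(tmp.name, "r")
    assert json.load(f) == {"ok": True}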

View File

@ -72,7 +72,7 @@ def main(args):
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file, "r") as f:
with open(test_file) as f:
results = results + json.loads(f.read())
# generate markdown table
@ -80,7 +80,7 @@ def main(args):
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description, "r") as f:
with open(args.description) as f:
description = f.read()
description = description.format(

View File

@ -36,11 +36,11 @@ if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f:
with open(test_file) as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)

View File

@ -25,7 +25,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}

View File

@ -48,7 +48,7 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

View File

@ -29,19 +29,19 @@ jobs:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only

View File

@ -23,16 +23,16 @@ jobs:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install yapf==0.32.0
pip install toml==0.10.2
- name: Running yapf
run: |
yapf --diff --recursive .

View File

@ -6,17 +6,16 @@ version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
python: '3.9'
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt
install:
- requirements: docs/requirements-docs.txt

View File

@ -128,9 +128,9 @@ endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For cuda we want to be able to control which architectures we compile for on
# For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
# the set of architectures we want to compile for and remove the from the
# the set of architectures we want to compile for and remove the from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures")
endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()
@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
#
# For the Machete kernels we automatically generate sources for various
# For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MACHETE_GEN_SCRIPT
set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif()
@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@ -392,8 +392,8 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})

View File

@ -79,7 +79,7 @@ async def async_request_tgi(
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
@ -144,8 +144,8 @@ async def async_request_trt_llm(
if not chunk_bytes:
continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
@ -261,8 +261,8 @@ async def async_request_openai_completions(
if not chunk_bytes:
continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
if not chunk_bytes:
continue
chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
return output
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix):]
return text
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
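Note on the deleted remove_prefix() helper above: its removed comment already explains why it existed, and str.removeprefix() (added in Python 3.9, PEP 616) is a drop-in replacement that only strips the prefix when it is actually present. A minimal sketch with made-up data:

# str.removeprefix() never raises and returns the string unchanged when the
# prefix is absent, matching the old helper's behavior.
chunk = 'data: {"text": "hello"}'
assert chunk.removeprefix("data: ") == '{"text": "hello"}'
assert "no prefix here".removeprefix("data: ") == "no prefix here"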

View File

@ -269,10 +269,10 @@ def run_square_bench(args):
def run_range_bench(args):
m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
m_increment, k_increment, n_increment = \
[int(x) for x in args.dim_increment.split(",")]
(int(x) for x in args.dim_increment.split(","))
Ms = list(range(m_start, m_end + 1, m_increment))
Ks = list(range(k_start, k_end + 1, k_increment))
Ns = list(range(n_start, n_end + 1, n_increment))
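Note on the bracket change above: tuple unpacking works identically from a generator expression, so the intermediate list is unnecessary and behavior is unchanged. A minimal sketch with a made-up dim_start value:

dim_start = "16,128,256"
# Unpacking consumes the generator directly; no temporary list is built.
m_start, k_start, n_start = (int(x) for x in dim_start.split(","))
assert (m_start, k_start, n_start) == (16, 128, 256)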

View File

@ -468,7 +468,7 @@ def generate():
impl_configs = []
GPTQ_kernel_type_configs = list(
(TypeConfig(
TypeConfig(
element_a=element_a,
element_b=element_b,
element_b_scale=element_a,
@ -476,7 +476,7 @@ def generate():
element_d=element_a,
accumulator=DataType.f32,
) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
for element_a in (DataType.f16, DataType.bf16)))
for element_a in (DataType.f16, DataType.bf16))
GPTQ_kernel_specializations = [
Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@ -490,7 +490,7 @@ def generate():
]
AWQ_kernel_type_configs = list(
(TypeConfig(
TypeConfig(
element_a=element_a,
element_b=element_b,
element_b_scale=element_a,
@ -498,7 +498,7 @@ def generate():
element_d=element_a,
accumulator=DataType.f32,
) for element_b in (DataType.u4, DataType.u8)
for element_a in (DataType.f16, DataType.bf16)))
for element_a in (DataType.f16, DataType.bf16))
AWQ_kernel_specializations = [
Specialization(with_C=False, with_zeropoints=True, with_scales=True)

View File

@ -10,7 +10,7 @@ Requirements
============
* OS: Linux
* Python: 3.8 - 3.12
* Python: 3.9 -- 3.12
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
Install released versions
@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
.. tip::
Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example
$ export MAX_JOBS=6
$ pip install -e .
This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
A side effect is a much slower build process.
This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
A side effect is a much slower build process.
Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
Unsupported OS build
--------------------
vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:

View File

@ -34,7 +34,7 @@ select = [
# Pyflakes
"F",
# pyupgrade
# "UP",
"UP",
# flake8-bugbear
"B",
# flake8-simplify
@ -55,7 +55,7 @@ ignore = [
]
[tool.mypy]
python_version = "3.8"
python_version = "3.9"
ignore_missing_imports = true
check_untyped_defs = true

View File

@ -1,5 +1,4 @@
import importlib.util
import io
import logging
import os
import re
@ -327,7 +326,7 @@ def get_neuronxcc_version():
"__init__.py")
# Check if the command was executed successfully
with open(version_file, "rt") as fp:
with open(version_file) as fp:
content = fp.read()
# Extract the version using a regular expression
@ -404,7 +403,8 @@ def read_readme() -> str:
"""Read the README file if present."""
p = get_path("README.md")
if os.path.isfile(p):
return io.open(get_path("README.md"), "r", encoding="utf-8").read()
with open(get_path("README.md"), encoding="utf-8") as f:
return f.read()
else:
return ""
@ -498,7 +498,6 @@ setup(
"Documentation": "https://vllm.readthedocs.io/en/latest/",
},
classifiers=[
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@ -512,7 +511,7 @@ setup(
],
packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
"tests*")),
python_requires=">=3.8",
python_requires=">=3.9",
install_requires=get_requirements(),
ext_modules=ext_modules,
extras_require={
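Note on the read_readme() change above: io.open is documented as an alias of the builtin open(), and the with-block guarantees the file handle is closed. A minimal sketch under that assumption (the helper name is hypothetical, not the repository's exact function):

import io
import os

def read_text_if_present(path: str) -> str:
    # Same pattern as the updated read_readme(): builtin open() with an
    # explicit encoding, wrapped in a context manager.
    if os.path.isfile(path):
        with open(path, encoding="utf-8") as f:
            return f.read()
    return ""

assert io.open is open   # alias per the io module documentation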

View File

@ -429,8 +429,8 @@ def benchmark():
# print in tabular format
print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
for b in cudagraph_sizes:
print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
f"\t{piecewise_cudagraph_time[b]:.3f}"))
print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
f"\t{piecewise_cudagraph_time[b]:.3f}")
if __name__ == "__main__":

View File

@ -1,6 +1,5 @@
import json
import os
import sys
import tempfile
from collections import UserList
from enum import Enum
@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def _read_prompts(filename: str) -> List[str]:
with open(filename, "r") as f:
with open(filename) as f:
prompts = f.readlines()
return prompts
@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict):
cherry_blossom: str
if sys.version_info < (3, 9):
# UserList cannot be subscripted
class _ImageAssetsBase(UserList):
pass
else:
class _ImageAssetsBase(UserList[ImageAsset]):
pass
class _ImageAssetsBase(UserList[ImageAsset]):
pass
class _ImageAssets(_ImageAssetsBase):
@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict):
sample_demo_1: str
if sys.version_info < (3, 9):
# UserList cannot be subscripted
class _VideoAssetsBase(UserList):
pass
else:
class _VideoAssetsBase(UserList[VideoAsset]):
pass
class _VideoAssetsBase(UserList[VideoAsset]):
pass
class _VideoAssets(_VideoAssetsBase):
@ -958,7 +945,7 @@ def dummy_opt_path():
"*.msgpack"
])
assert os.path.exists(json_path)
with open(json_path, "r") as f:
with open(json_path) as f:
config = json.load(f)
config["architectures"] = ["MyOPTForCausalLM"]
with open(json_path, "w") as f:
@ -977,7 +964,7 @@ def dummy_llava_path():
"*.msgpack"
])
assert os.path.exists(json_path)
with open(json_path, "r") as f:
with open(json_path) as f:
config = json.load(f)
config["architectures"] = ["MyLlava"]
with open(json_path, "w") as f:
@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
"*.msgpack"
])
assert os.path.exists(json_path)
with open(json_path, "r") as f:
with open(json_path) as f:
config = json.load(f)
config["architectures"] = ["MyGemma2Embedding"]
with open(json_path, "w") as f:
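Note on the deleted sys.version_info branches above: the removed comment states the reason ("UserList cannot be subscripted" before 3.9); on Python 3.9+ collections.UserList accepts a type parameter directly, so the fallback classes are dead code once 3.8 is dropped. A minimal sketch:

from collections import UserList

class IntList(UserList[int]):      # requires Python 3.9 or newer
    pass

assert IntList([1, 2, 3])[0] == 1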

View File

@ -99,13 +99,11 @@ class TestPrefixCachingBlock:
token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
first_chain, second_chain = [
TestPrefixCachingBlock.create_chain(
block_size=block_size,
token_ids=token_ids,
num_empty_trailing_blocks=num_empty_trailing_blocks)
for _ in range(2)
]
first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
block_size=block_size,
token_ids=token_ids,
num_empty_trailing_blocks=num_empty_trailing_blocks)
for _ in range(2))
for first_chain_block, second_chain_block in zip(
first_chain, second_chain):

View File

@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
]
for i in range(len(seqlens[0])):
u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
if padded_state_indices[i] == PAD_SLOT_ID:
continue
out_ref_s, _ = selective_scan_ref(

View File

@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
# Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
{
"image": torch.rand((5))
"image": torch.rand(5)
},
{
"image": torch.rand((5, 5, 5, 5, 5))

View File

@ -413,12 +413,10 @@ class _CorrectnessTestHelper:
def generate_probs_for_test(
self, draft_and_target_probs_equal: bool
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
draft_probs, target_probs = [
F.softmax(
torch.rand(self.vocab_size, dtype=torch.float32),
dim=-1,
) for _ in range(2)
]
draft_probs, target_probs = (F.softmax(
torch.rand(self.vocab_size, dtype=torch.float32),
dim=-1,
) for _ in range(2))
num_reference_probs = 100
reference_probs = F.softmax(

View File

@ -29,7 +29,7 @@ def test_trace_function_call():
cur_dir = os.path.dirname(__file__)
enable_trace_function_call(path, cur_dir)
f1(1)
with open(path, 'r') as f:
with open(path) as f:
content = f.read()
assert "f1" in content

View File

@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth):
def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
if "mistral" in tokenizer_name:
yield (
bool(True) if request.param else
True if request.param else
pytest.skip("mistral doesn't support skip_special_tokens=False"))
else:
yield bool(True) if request.param else bool(False)
yield bool(request.param)
@pytest.mark.parametrize("truth", TRUTH)

View File

@ -46,7 +46,7 @@ if __name__ == "__main__":
args = parser.parse_args()
with open(args.json_trace, "r") as f:
with open(args.json_trace) as f:
profile_data = json.load(f)
if args.table == "summary":

View File

@ -434,7 +434,7 @@ def main(
f"{', Sparsity ' + sparsity if sparsity else ''}")
profile_json = None
with open(json_trace, "r") as f:
with open(json_trace) as f:
profile_json = json.load(f)
assert profile_json is not None

View File

@ -81,7 +81,7 @@ class Target:
# Allow for modest floating-point errors
epsilon = 0.000002
if (self.weighted_duration > self.Duration() + epsilon):
print('%s > %s?' % (self.weighted_duration, self.Duration()))
print('{} > {}?'.format(self.weighted_duration, self.Duration()))
assert (self.weighted_duration <= self.Duration() + epsilon)
return self.weighted_duration
@ -104,7 +104,7 @@ def ReadTargets(log, show_all):
The result is a list of Target objects."""
header = log.readline()
assert header == '# ninja log v5\n', \
'unrecognized ninja log version %r' % header
'unrecognized ninja log version {!r}'.format(header)
targets_dict = {}
last_end_seen = 0.0
for line in log:
@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types):
# Warn if the sum of weighted times is off by more than half a second.
if abs(length - weighted_total) > 500:
print('Warning: Possible corrupt ninja log, results may be '
'untrustworthy. Length = %.3f, weighted total = %.3f' %
(length, weighted_total))
'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
length, weighted_total))
entries_by_ext = defaultdict(list)
for target in entries:
@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types):
entries_by_ext[extension].append(target)
for key, values in entries_by_ext.items():
print(' Longest build steps for %s:' % key)
print(' Longest build steps for {}:'.format(key))
values.sort(key=lambda x: x.WeightedDuration())
for target in values[-long_count:]:
print(' %8.1f weighted s to build %s (%.1f s elapsed time)' %
(target.WeightedDuration(), target.DescribeTargets(),
target.Duration()))
print(
' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
format(target.WeightedDuration(), target.DescribeTargets(),
target.Duration()))
print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
'parallelism)' %
(length, total_cpu_time, total_cpu_time * 1.0 / length))
print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
'parallelism)'.format(length, total_cpu_time,
total_cpu_time * 1.0 / length))
print(' %d build steps completed, average of %1.2f/s' %
(len(entries), len(entries) / (length)))
@ -298,11 +299,12 @@ def main():
long_ext_count += len(args.step_types.split(';'))
try:
with open(log_file, 'r') as log:
with open(log_file) as log:
entries = ReadTargets(log, False)
SummarizeEntries(entries, args.step_types)
except IOError:
print('Log file %r not found, no build summary created.' % log_file)
except OSError:
print('Log file {!r} not found, no build summary created.'.format(
log_file))
return errno.ENOENT
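Note on the except clause and string changes above: IOError has been an alias of OSError since Python 3.3 (PEP 3151), so catching OSError is equivalent, and the %-style messages are rewritten to equivalent str.format() calls. A minimal sketch with a made-up path:

assert IOError is OSError

log_file = "/nonexistent/.ninja_log"
try:
    with open(log_file) as log:
        pass
except OSError:
    # FileNotFoundError is a subclass of OSError, so it is caught here.
    print('Log file {!r} not found, no build summary created.'.format(log_file))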

View File

@ -4,7 +4,7 @@ requires_files = glob.glob('requirements*.txt')
requires_files += ["pyproject.toml"]
for file in requires_files:
print(f">>> cleaning {file}")
with open(file, 'r') as f:
with open(file) as f:
lines = f.readlines()
if "torch" in "".join(lines).lower():
print("removed:")

View File

@ -192,10 +192,8 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]
q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
k2, v2 = [
self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
for x in [k, v]
]
k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
for x in [k, v])
spda_output = torch.nn.functional.scaled_dot_product_attention(
q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
return self.transpose_and_unpad(spda_output, cu_seqlens)

View File

@ -668,9 +668,10 @@ class ModelConfig:
@property
def is_encoder_decoder_model(self) -> bool:
"""Extract the HF encoder/decoder model flag."""
return getattr(self.hf_config, "is_encoder_decoder", False) or (
(hasattr(self.hf_config, "text_config") and getattr(
self.hf_config.text_config, "is_encoder_decoder", False)))
return getattr(
self.hf_config, "is_encoder_decoder",
False) or (hasattr(self.hf_config, "text_config") and getattr(
self.hf_config.text_config, "is_encoder_decoder", False))
@property
def is_multimodal_model(self) -> bool:

View File

@ -52,7 +52,7 @@ class Evictor(ABC):
pass
class BlockMetaData():
class BlockMetaData:
"""Data structure for storing key data describe cached block, so that
evitor could use to make its decision which one to choose for eviction

View File

@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
if is_distributed:
get_world_group().barrier()
logger.info("reading GPU P2P access cache from %s", path)
with open(path, "r") as f:
with open(path) as f:
cache = json.load(f)
_gpu_p2p_access_cache = cache
return _gpu_p2p_access_cache[f"{src}->{tgt}"]

View File

@ -812,7 +812,7 @@ class AsyncLLMEngine(EngineClient):
async def run_engine_loop(engine_ref: ReferenceType):
"""We use a weakref to the engine so that the running loop
doesn't prevent the engine being garbage collected."""
engine: Optional["AsyncLLMEngine"] = engine_ref()
engine: Optional[AsyncLLMEngine] = engine_ref()
if not engine:
return

View File

@ -1541,8 +1541,8 @@ class LLMEngine:
seq_group.state.remaining_steps != ref_remaining_steps
for seq_group in seq_group_metadata_list[1:]
]):
raise AssertionError(("All running sequence groups should "
"have the same remaining steps."))
raise AssertionError("All running sequence groups should "
"have the same remaining steps.")
return ref_remaining_steps > 0

View File

@ -77,7 +77,7 @@ class StatLoggerBase(ABC):
self.num_generation_tokens: List[int] = []
self.last_local_log = time.time()
self.local_interval = local_interval
self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
@abstractmethod
def log(self, stats: Stats) -> None:

View File

@ -63,7 +63,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
single_step_process_prompt_logprob(self, seq_group, output)
@staticmethod
@functools.lru_cache()
@functools.lru_cache
def _log_prompt_logprob_unsupported_warning_once():
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
# If the feature combo become valid
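Note on the decorator change above: @functools.lru_cache without parentheses has been valid since Python 3.8 and behaves the same as @functools.lru_cache(), so this is a style cleanup rather than a 3.9 requirement. A minimal sketch with a hypothetical function:

import functools

@functools.lru_cache
def _warn_once() -> bool:
    print("warning issued")
    return True

_warn_once()
_warn_once()   # served from the cache; the message prints only once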

View File

@ -362,7 +362,7 @@ def load_chat_template(
if chat_template is None:
return None
try:
with open(chat_template, "r") as f:
with open(chat_template) as f:
resolved_chat_template = f.read()
except OSError as e:
if isinstance(chat_template, Path):

View File

@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str:
session.get(path_or_url) as resp:
return await resp.text()
else:
with open(path_or_url, "r", encoding="utf-8") as f:
with open(path_or_url, encoding="utf-8") as f:
return f.read()

View File

@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
uses_ray: bool = True
def _init_executor(self) -> None:
self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
self.forward_dag: Optional[ray.dag.CompiledDAG] = None
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.

View File

@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None:
raise RuntimeError(
"Could not load logging config. File does not exist: %s",
VLLM_LOGGING_CONFIG_PATH)
with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
mode="r") as file:
with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
custom_config = json.loads(file.read())
if not isinstance(custom_config, dict):

View File

@ -343,7 +343,7 @@ class LoRAModelManager(AdapterModelManager):
# text modules (e.g. ChatGLM)
and hasattr(self.model, "get_mm_mapping"))
self.packed_modules: Dict[str, List[str]] = {}
self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
self.modules: Dict[str, BaseLayerWithLoRA] = {}
# Dict instead of a Set for compatibility with LRUCache.
self._last_mapping: Optional[LoRAMapping] = None
self._create_lora_modules()
@ -548,7 +548,7 @@ class LoRAModelManager(AdapterModelManager):
else:
parts = module_name.split(".")
replacements = self.packed_modules_mapping[parts[-1]]
subloras: List[Optional["LoRALayerWeights"]] = []
subloras: List[Optional[LoRALayerWeights]] = []
for i, r in enumerate(replacements):
lora = LoRALayerWeights.create_dummy_lora_weights(
module_name + "." + r,

View File

@ -103,7 +103,7 @@ class CustomOp(nn.Module):
# On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE
# Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence.
@staticmethod
@lru_cache()
@lru_cache
def default_on() -> bool:
count_none = envs.VLLM_CUSTOM_OPS.count("none")
count_all = envs.VLLM_CUSTOM_OPS.count("all")

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
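Note on the removed "# coding=utf-8" headers here and in the model files below: UTF-8 has been the default source encoding since Python 3.0 (PEP 3120), so the declarations are no-ops. A minimal sketch:

# Non-ASCII literals work in any Python 3 source file saved as UTF-8,
# with no coding declaration required.
s = "naïve café"
assert len(s) == 10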

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -746,7 +746,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
config_file_path = self._get_config_file(qlora_adapter)
with open(config_file_path, "r") as f:
with open(config_file_path) as f:
config = json.load(f)
self.target_modules = config["target_modules"]

View File

@ -190,7 +190,7 @@ def get_model(
kv_cache_dtype: ov.Type,
**kwargs,
) -> torch.nn.Module:
lora_config = kwargs.get("lora_config", None)
lora_config = kwargs.get("lora_config")
ov_core = kwargs.get("ov_core")
if lora_config:
raise ValueError(

View File

@ -280,7 +280,7 @@ class TensorizerAgent:
self.tensorizer_args = (
self.tensorizer_config._construct_tensorizer_args())
self.extra_kwargs = extra_kwargs
if extra_kwargs.get("quant_config", None) is not None:
if extra_kwargs.get("quant_config") is not None:
self.quant_config = extra_kwargs["quant_config"]
else:
self.quant_config = quant_config
@ -380,8 +380,7 @@ def tensorizer_weights_iterator(
stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
with TensorDeserializer(stream, **deserializer_args,
device="cpu") as state:
for name, param in state.items():
yield name, param
yield from state.items()
del state
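Note on the yield-from change above: "yield from" delegates to the iterable and produces exactly the same (name, param) pairs as the removed for-loop. A minimal sketch with a hypothetical state dict:

def iter_params(state):
    # Equivalent to: for name, param in state.items(): yield name, param
    yield from state.items()

assert list(iter_params({"weight": 1, "bias": 2})) == [("weight", 1), ("bias", 2)]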

View File

@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig,
f"{quant_config_files}")
quant_config_file = quant_config_files[0]
with open(quant_config_file, "r") as f:
with open(quant_config_file) as f:
config = json.load(f)
if model_config.quantization == "bitsandbytes":
@ -306,7 +306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str],
# Iterate through the weight_map (weight_name: safetensors files)
# to identify weights that we should use.
with open(index_file_name, "r") as f:
with open(index_file_name) as f:
weight_map = json.load(f)["weight_map"]
weight_files_in_index = set()
for weight_name in weight_map:
@ -382,7 +382,7 @@ def np_cache_weights_iterator(
with open(weight_names_file, "w") as f:
json.dump(weight_names, f)
with open(weight_names_file, "r") as f:
with open(weight_names_file) as f:
weight_names = json.load(f)
for name in weight_names:
@ -423,8 +423,7 @@ def pt_weights_iterator(
bar_format=_BAR_FORMAT,
):
state = torch.load(bin_file, map_location="cpu")
for name, param in state.items():
yield name, param
yield from state.items()
del state
torch.cuda.empty_cache()

View File

@ -48,7 +48,7 @@ class ArcticMLP(nn.Module):
is_residual_mlp: bool = False,
quant_config: Optional[QuantizationConfig] = None,
reduce_results: bool = True):
super(ArcticMLP, self).__init__()
super().__init__()
self.hidden_size = config.hidden_size
self.expert_id = expert_id
self.layer_id = layer_id
@ -89,7 +89,7 @@ class ArcticMoE(nn.Module):
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
reduce_results: bool = True):
super(ArcticMoE, self).__init__()
super().__init__()
self.tp_size = tp_size or get_tensor_model_parallel_world_size()
self.hidden_size = config.hidden_size

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/THUDM/GLM-4
"""Inference-only ChatGLM model compatible with THUDM weights."""

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX

View File

@ -1,4 +1,3 @@
# coding=utf-8
from typing import Iterable, List, Optional, Tuple, Union
import torch

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 DeciAI Research Team. All rights reserved.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
# Copyright 2024 The LG U+ CTO AI Tech Lab.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
# Copyright 2023 The vLLM team.
# Copyright 2023 HuggingFace Inc. team. All rights reserved.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2023 The vLLM team.
# Copyright (c) Google Inc.
#

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2024 The vLLM team.
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
#

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/THUDM/GLM-4
"""Inference-only GLM-4v model visual encoder compatible with THUDM weights."""

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,5 +1,3 @@
# coding=utf-8
# adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
# Copyright 2024 The vLLM team.
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from functools import partial
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
from typing import List, Optional, Tuple, Union
import torch

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
"""Inference-only Jamba model."""
from typing import Iterable, List, Optional, Tuple

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
"""PyTorch MAMBA model."""
from typing import Iterable, List, Optional, Tuple

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2024 The ModelBest team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

View File

@ -37,7 +37,7 @@ class MLPSpeculatorLayerNorm(nn.Module):
eps=1e-06,
elementwise_scale_and_shift=True,
):
super(MLPSpeculatorLayerNorm, self).__init__()
super().__init__()
self.elementwise_scale_and_shift = elementwise_scale_and_shift
if self.elementwise_scale_and_shift:
self.weight = nn.Parameter(torch.empty(normalized_shape))

View File

@ -1121,9 +1121,9 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
batch_size * num_image * num_patch, -1).contiguous()
image_input_idx = image_input_idx * valid.to(image_input_idx.dtype)
offset = torch.cat(
[seq_len.new_zeros(
(1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None]
offset = torch.cat([seq_len.new_zeros(1),
seq_len.cumsum(dim=0)[:-1]],
dim=0)[:, None]
image_input_idx = image_input_idx + offset.to(image_input_idx.dtype)
image_input_idx = image_input_idx.flatten()[:, None]
mat = image_input_idx == torch.arange(

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
import math
from typing import Iterable, List, Optional, Tuple, Union

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
# Copyright 2024 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
# Copyright (c) OrionStar Inc.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
# Copyright 2023 The vLLM team.
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
# Copyright 2023 The vLLM team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from llama.py
"""Inference-only Phi3 model code inherit from Llama.py"""

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2024 The vLLM team.
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.

View File

@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
if image_token_id not in inputs['prompt_token_ids']:
raise ValueError(
(f"You've passed {inputs=} without {image_token_id=}"
" Make sure to process your input via mistral_common's"
" tokenizer or pass a chat completion request. For more"
" For more info, see: "
"https://github.com/vllm-project/vllm/issues/8411."))
f"You've passed {inputs=} without {image_token_id=}"
" Make sure to process your input via mistral_common's"
" tokenizer or pass a chat completion request. For more"
" For more info, see: "
"https://github.com/vllm-project/vllm/issues/8411.")
return inputs

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
# Copyright (c) Alibaba Cloud.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright 2024 The Qwen team.
@ -417,9 +416,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
and hasattr(config, "max_window_layers")):
raise ValueError("Sliding window for some but all layers is not "
"supported. This model uses sliding window "
"but `max_window_layers` = %s is less than "
"`num_hidden_layers` = %s. Please open an issue "
"to discuss this feature." % (
"but `max_window_layers` = {} is less than "
"`num_hidden_layers` = {}. Please open an issue "
"to discuss this feature.".format(
config.max_window_layers,
config.num_hidden_layers,
))

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
# Copyright 2024 Kakao Corp. (Kanana-X Team)
@ -60,9 +59,9 @@ class Qwen2ForSequenceClassification(nn.Module):
and hasattr(config, "max_window_layers")):
raise ValueError("Sliding window for some but all layers is not "
"supported. This model uses sliding window "
"but `max_window_layers` = %s is less than "
"`num_hidden_layers` = %s. Please open an issue "
"to discuss this feature." % (
"but `max_window_layers` = {} is less than "
"`num_hidden_layers` = {}. Please open an issue "
"to discuss this feature.".format(
config.max_window_layers,
config.num_hidden_layers,
))

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
# Copyright 2024 The Qwen team.

View File

@ -1,4 +1,3 @@
# coding=utf-8
# Adapted from
# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
# Copyright 2024 The Qwen team.
@ -71,9 +70,9 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP):
and hasattr(config, "max_window_layers")):
raise ValueError("Sliding window for some but all layers is not "
"supported. This model uses sliding window "
"but `max_window_layers` = %s is less than "
"`num_hidden_layers` = %s. Please open an issue "
"to discuss this feature." % (
"but `max_window_layers` = {} is less than "
"`num_hidden_layers` = {}. Please open an issue "
"to discuss this feature.".format(
config.max_window_layers,
config.num_hidden_layers,
))

Some files were not shown because too many files have changed in this diff.