[CI/Build] drop support for Python 3.8 EOL (#8464)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
parent 4be3a45158
commit 21063c11c7
@@ -56,7 +56,7 @@ serving_column_mapping = {
 def read_markdown(file):
     if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
             return f.read() + "\n"
     else:
         return f"{file} not found.\n"
@@ -75,14 +75,14 @@ if __name__ == "__main__":
     # collect results
     for test_file in results_folder.glob("*.json"):

-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             raw_result = json.loads(f.read())

         if "serving" in str(test_file):
             # this result is generated via `benchmark_serving.py`

             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)

@@ -97,7 +97,7 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_latency.py`

             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)

@@ -119,7 +119,7 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_throughput.py`

             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                 command = json.loads(f.read())
             raw_result.update(command)
@@ -72,7 +72,7 @@ def main(args):

     # collect results
     for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             results = results + json.loads(f.read())

     # generate markdown table
@@ -80,7 +80,7 @@ def main(args):

     md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

-    with open(args.description, "r") as f:
+    with open(args.description) as f:
         description = f.read()

     description = description.format(
@@ -36,11 +36,11 @@ if __name__ == "__main__":
     # collect results
     for test_file in results_folder.glob("*.json"):

-        with open(test_file, "r") as f:
+        with open(test_file) as f:
             raw_result = json.loads(f.read())

         # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
             command = json.loads(f.read())
         raw_result.update(command)
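The recurring open(..., "r") → open(...) edits in these benchmark scripts (and throughout the rest of this diff) are behavior-preserving: "r" is already open()'s default mode. A minimal sketch, using a hypothetical file name:

# Minimal sketch: dropping the explicit "r" mode does not change behavior,
# since text-read is open()'s default.
from pathlib import Path

path = Path("example.json")            # hypothetical file, for illustration only
path.write_text('{"ok": true}\n')

with open(path, "r") as f:             # old spelling
    a = f.read()
with open(path) as f:                  # new spelling, identical behavior
    b = f.read()
assert a == b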
.github/workflows/mypy.yaml (2 changes)
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}
.github/workflows/publish.yml (2 changes)
@@ -48,7 +48,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ['ubuntu-20.04']
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.9', '3.10', '3.11', '3.12']
         pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
         cuda-version: ['11.8', '12.1']
.github/workflows/ruff.yml (32 changes)
@@ -29,19 +29,19 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements-lint.txt
     - name: Analysing the code with ruff
       run: |
         echo "::add-matcher::.github/workflows/matchers/ruff.json"
         ruff check --output-format github .
     - name: Run isort
       run: |
         isort . --check-only
.github/workflows/yapf.yml (26 changes)
@@ -23,16 +23,16 @@ jobs:
       matrix:
         python-version: ["3.12"]
     steps:
     - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install yapf==0.32.0
         pip install toml==0.10.2
     - name: Running yapf
       run: |
         yapf --diff --recursive .
@@ -6,17 +6,16 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: '3.9'

 sphinx:
   configuration: docs/source/conf.py
   fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []

 # Optionally declare the Python requirements required to build your docs
 python:
   install:
     - requirements: docs/requirements-docs.txt
@@ -128,9 +128,9 @@ endif()

 if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
   # For cuda we want to be able to control which architectures we compile for on
   # a per-file basis in order to cut down on compile time. So here we extract
   # the set of architectures we want to compile for and remove the from the
   # CMAKE_CUDA_FLAGS so that they are not applied globally.
   #
   clear_cuda_arches(CUDA_ARCH_FLAGS)
@@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
   # Filter the target architectures by the supported supported archs
   # since for some files we will build for all CUDA_ARCHS.
   cuda_archs_loose_intersection(CUDA_ARCHS
     "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
   message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
 else()
@@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # are not supported by Machete yet.
   cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "in CUDA target architectures")
     endif()

     # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
     # build any 3x kernels
     set(SCALED_MM_3X_ARCHS)
   endif()
@@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
     "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
     # Generate sources:
     set(MACHETE_GEN_SCRIPT
       ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
     file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)

@@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
         OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
       execute_process(
         COMMAND ${CMAKE_COMMAND} -E env
         PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
         ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
         RESULT_VARIABLE machete_generation_result
         OUTPUT_VARIABLE machete_generation_output
@@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

       if (NOT machete_generation_result EQUAL 0)
         message(FATAL_ERROR "Machete generation failed."
           " Result: \"${machete_generation_result}\""
           "\nCheck the log for details: "
           "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
       else()
         set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
           CACHE STRING "Last run machete generate script hash" FORCE)
         message(STATUS "Machete generation completed successfully.")
       endif()
@@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
         "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -392,8 +392,8 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)

 # If CUTLASS is compiled on NVCC >= 12.5, it by default uses
 # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
 # driver API. This causes problems when linking with earlier versions of CUDA.
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
   return()
 endif ()

 # vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
 # arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
 # arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   foreach(_ARCH ${CUDA_ARCHS})
@@ -79,7 +79,7 @@ async def async_request_tgi(
                     # any data, we should skip it.
                     if chunk_bytes.startswith(":"):
                         continue
-                    chunk = remove_prefix(chunk_bytes, "data:")
+                    chunk = chunk_bytes.removeprefix("data:")

                     data = json.loads(chunk)
                     timestamp = time.perf_counter()
@@ -144,8 +144,8 @@ async def async_request_trt_llm(
                     if not chunk_bytes:
                         continue

-                    chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                          "data:")
+                    chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        "data:")

                     data = json.loads(chunk)
                     output.generated_text += data["text_output"]
@@ -261,8 +261,8 @@ async def async_request_openai_completions(
                     if not chunk_bytes:
                         continue

-                    chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                          "data: ")
+                    chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        "data: ")
                     if chunk == "[DONE]":
                         latency = time.perf_counter() - st
                     else:
@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
                     if not chunk_bytes:
                         continue

-                    chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                          "data: ")
+                    chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        "data: ")
                     if chunk == "[DONE]":
                         latency = time.perf_counter() - st
                     else:
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
     return output


-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
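The deleted remove_prefix helper existed only because str.removeprefix was introduced in Python 3.9 (PEP 616); with 3.9 as the minimum version the built-in method can be used directly and behaves the same way. A small sketch:

# str.removeprefix (Python 3.9+, PEP 616) matches the deleted helper: it strips
# the prefix only when present and otherwise returns the string unchanged.
def old_remove_prefix(text: str, prefix: str) -> str:   # the former 3.8-compatible helper
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

assert "data: {}".removeprefix("data: ") == old_remove_prefix("data: {}", "data: ") == "{}"
assert "keep-alive".removeprefix("data: ") == "keep-alive"   # no prefix -> unchanged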
@@ -269,10 +269,10 @@ def run_square_bench(args):


 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
     m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
     Ms = list(range(m_start, m_end + 1, m_increment))
     Ks = list(range(k_start, k_end + 1, k_increment))
     Ns = list(range(n_start, n_end + 1, n_increment))
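Unpacking directly from a generator expression, as above and in several test files below, avoids building a throwaway list; the number of produced items still has to match the number of targets. A short sketch with a hypothetical dim_start value:

# Tuple unpacking works the same from a generator expression as from a list
# comprehension; the generator simply skips the intermediate list.
dim_start = "16,256,512"   # hypothetical value standing in for args.dim_start

m_start, k_start, n_start = (int(x) for x in dim_start.split(","))
assert (m_start, k_start, n_start) == (16, 256, 512)

# A length mismatch still raises ValueError, exactly as with a list:
try:
    a, b = (int(x) for x in dim_start.split(","))
except ValueError as exc:
    print(exc)   # too many values to unpack (expected 2)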
@@ -468,7 +468,7 @@ def generate():
     impl_configs = []

     GPTQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
             element_a=element_a,
             element_b=element_b,
             element_b_scale=element_a,
@@ -476,7 +476,7 @@ def generate():
             element_d=element_a,
             accumulator=DataType.f32,
         ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-        for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))

     GPTQ_kernel_specializations = [
         Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@@ -490,7 +490,7 @@ def generate():
     ]

     AWQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
             element_a=element_a,
             element_b=element_b,
             element_b_scale=element_a,
@@ -498,7 +498,7 @@ def generate():
             element_d=element_a,
             accumulator=DataType.f32,
         ) for element_b in (DataType.u4, DataType.u8)
-        for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))

     AWQ_kernel_specializations = [
         Specialization(with_C=False, with_zeropoints=True, with_scales=True)
@@ -10,7 +10,7 @@ Requirements
 ============

 * OS: Linux
-* Python: 3.8 - 3.12
+* Python: 3.9 -- 3.12
 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)

 Install released versions
@@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
 .. tip::

     Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
     For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
     As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.

@@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example

     $ export MAX_JOBS=6
     $ pip install -e .

 This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
 A side effect is a much slower build process.

 Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.

@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
 Unsupported OS build
 --------------------

 vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.

 Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
@@ -34,7 +34,7 @@ select = [
     # Pyflakes
     "F",
     # pyupgrade
-    # "UP",
+    "UP",
     # flake8-bugbear
     "B",
     # flake8-simplify
@@ -55,7 +55,7 @@ ignore = [
 ]

 [tool.mypy]
-python_version = "3.8"
+python_version = "3.9"

 ignore_missing_imports = true
 check_untyped_defs = true
setup.py (9 changes)
@@ -1,5 +1,4 @@
 import importlib.util
-import io
 import logging
 import os
 import re
@@ -327,7 +326,7 @@ def get_neuronxcc_version():
                                 "__init__.py")

     # Check if the command was executed successfully
-    with open(version_file, "rt") as fp:
+    with open(version_file) as fp:
         content = fp.read()

     # Extract the version using a regular expression
@@ -404,7 +403,8 @@ def read_readme() -> str:
     """Read the README file if present."""
     p = get_path("README.md")
     if os.path.isfile(p):
-        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+        with open(get_path("README.md"), encoding="utf-8") as f:
+            return f.read()
     else:
         return ""
@@ -498,7 +498,6 @@ setup(
         "Documentation": "https://vllm.readthedocs.io/en/latest/",
     },
     classifiers=[
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
@@ -512,7 +511,7 @@ setup(
     ],
     packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                     "tests*")),
-    python_requires=">=3.8",
+    python_requires=">=3.9",
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     extras_require={
@@ -429,8 +429,8 @@ def benchmark():
     # print in tabular format
     print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
     for b in cudagraph_sizes:
-        print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
-               f"\t{piecewise_cudagraph_time[b]:.3f}"))
+        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+              f"\t{piecewise_cudagraph_time[b]:.3f}")


 if __name__ == "__main__":
@@ -1,6 +1,5 @@
 import json
 import os
-import sys
 import tempfile
 from collections import UserList
 from enum import Enum
@@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]


 def _read_prompts(filename: str) -> List[str]:
-    with open(filename, "r") as f:
+    with open(filename) as f:
         prompts = f.readlines()
     return prompts

@@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict):
     cherry_blossom: str


-if sys.version_info < (3, 9):
-    # UserList cannot be subscripted
-    class _ImageAssetsBase(UserList):
-        pass
-else:
-
-    class _ImageAssetsBase(UserList[ImageAsset]):
-        pass
+class _ImageAssetsBase(UserList[ImageAsset]):
+    pass


 class _ImageAssets(_ImageAssetsBase):
@@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict):
     sample_demo_1: str


-if sys.version_info < (3, 9):
-    # UserList cannot be subscripted
-    class _VideoAssetsBase(UserList):
-        pass
-else:
-
-    class _VideoAssetsBase(UserList[VideoAsset]):
-        pass
+class _VideoAssetsBase(UserList[VideoAsset]):
+    pass


 class _VideoAssets(_VideoAssetsBase):
@@ -958,7 +945,7 @@ def dummy_opt_path():
             "*.msgpack"
         ])
     assert os.path.exists(json_path)
-    with open(json_path, "r") as f:
+    with open(json_path) as f:
         config = json.load(f)
     config["architectures"] = ["MyOPTForCausalLM"]
     with open(json_path, "w") as f:
@@ -977,7 +964,7 @@ def dummy_llava_path():
             "*.msgpack"
         ])
     assert os.path.exists(json_path)
-    with open(json_path, "r") as f:
+    with open(json_path) as f:
         config = json.load(f)
     config["architectures"] = ["MyLlava"]
     with open(json_path, "w") as f:
@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
             "*.msgpack"
         ])
     assert os.path.exists(json_path)
-    with open(json_path, "r") as f:
+    with open(json_path) as f:
         config = json.load(f)
     config["architectures"] = ["MyGemma2Embedding"]
     with open(json_path, "w") as f:
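The deleted sys.version_info branch was only needed because collections.UserList became subscriptable in Python 3.9 (PEP 585 generics in the standard library). With 3.9 as the floor, the parametrized base class can be declared unconditionally; a small sketch with a stand-in element type:

# On Python 3.9+ standard-library containers such as collections.UserList accept
# subscripts directly (PEP 585), so no version guard is required.
from collections import UserList
from dataclasses import dataclass


@dataclass
class Asset:                      # hypothetical stand-in for ImageAsset / VideoAsset
    name: str


class Assets(UserList[Asset]):    # would raise TypeError on Python 3.8
    pass


assets = Assets([Asset("stop_sign"), Asset("cherry_blossom")])
print([a.name for a in assets])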
@@ -99,13 +99,11 @@ class TestPrefixCachingBlock:

         token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]

-        first_chain, second_chain = [
-            TestPrefixCachingBlock.create_chain(
-                block_size=block_size,
-                token_ids=token_ids,
-                num_empty_trailing_blocks=num_empty_trailing_blocks)
-            for _ in range(2)
-        ]
+        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
+            block_size=block_size,
+            token_ids=token_ids,
+            num_empty_trailing_blocks=num_empty_trailing_blocks)
+                                     for _ in range(2))

         for first_chain_block, second_chain_block in zip(
                 first_chain, second_chain):
@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
         for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
     ]
     for i in range(len(seqlens[0])):
-        u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
+        u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
         if padded_state_indices[i] == PAD_SLOT_ID:
             continue
         out_ref_s, _ = selective_scan_ref(
@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
 # Sad path tests for the multimodal input processor and mapper, respectively
 @pytest.mark.parametrize("mm_data", [
     {
-        "image": torch.rand((5))
+        "image": torch.rand(5)
     },
     {
         "image": torch.rand((5, 5, 5, 5, 5))
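The torch.rand((5)) → torch.rand(5) change (and the similar new_zeros((1)) → new_zeros(1) edit in the Molmo hunk below) is purely cosmetic: (5) is the integer 5, not a one-element tuple, so both spellings request the same 1-D shape. A quick sketch:

import torch

# (5) is just 5 in parentheses; only (5,) is a tuple.
assert (5) == 5 and isinstance((5,), tuple)

a = torch.rand((5))   # old spelling
b = torch.rand(5)     # new spelling
assert a.shape == b.shape == torch.Size([5])

# Multi-dimensional shapes still need the tuple form, which is why
# torch.rand((5, 5, 5, 5, 5)) in the same test is left unchanged.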
@@ -413,12 +413,10 @@ class _CorrectnessTestHelper:
     def generate_probs_for_test(
         self, draft_and_target_probs_equal: bool
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        draft_probs, target_probs = [
-            F.softmax(
-                torch.rand(self.vocab_size, dtype=torch.float32),
-                dim=-1,
-            ) for _ in range(2)
-        ]
+        draft_probs, target_probs = (F.softmax(
+            torch.rand(self.vocab_size, dtype=torch.float32),
+            dim=-1,
+        ) for _ in range(2))

         num_reference_probs = 100
         reference_probs = F.softmax(
@@ -29,7 +29,7 @@ def test_trace_function_call():
     cur_dir = os.path.dirname(__file__)
     enable_trace_function_call(path, cur_dir)
     f1(1)
-    with open(path, 'r') as f:
+    with open(path) as f:
         content = f.read()

     assert "f1" in content
@@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth):
 def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
     if "mistral" in tokenizer_name:
         yield (
-            bool(True) if request.param else
+            True if request.param else
             pytest.skip("mistral doesn't support skip_special_tokens=False"))
     else:
-        yield bool(True) if request.param else bool(False)
+        yield bool(request.param)


 @pytest.mark.parametrize("truth", TRUTH)
@@ -46,7 +46,7 @@ if __name__ == "__main__":

     args = parser.parse_args()

-    with open(args.json_trace, "r") as f:
+    with open(args.json_trace) as f:
         profile_data = json.load(f)

     if args.table == "summary":
@@ -434,7 +434,7 @@ def main(
           f"{', Sparsity ' + sparsity if sparsity else ''}")

     profile_json = None
-    with open(json_trace, "r") as f:
+    with open(json_trace) as f:
         profile_json = json.load(f)
     assert profile_json is not None
@@ -81,7 +81,7 @@ class Target:
         # Allow for modest floating-point errors
         epsilon = 0.000002
         if (self.weighted_duration > self.Duration() + epsilon):
-            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+            print('{} > {}?'.format(self.weighted_duration, self.Duration()))
         assert (self.weighted_duration <= self.Duration() + epsilon)
         return self.weighted_duration

@@ -104,7 +104,7 @@ def ReadTargets(log, show_all):
     The result is a list of Target objects."""
     header = log.readline()
     assert header == '# ninja log v5\n', \
-        'unrecognized ninja log version %r' % header
+        'unrecognized ninja log version {!r}'.format(header)
     targets_dict = {}
     last_end_seen = 0.0
     for line in log:
@@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types):
     # Warn if the sum of weighted times is off by more than half a second.
     if abs(length - weighted_total) > 500:
         print('Warning: Possible corrupt ninja log, results may be '
-              'untrustworthy. Length = %.3f, weighted total = %.3f' %
-              (length, weighted_total))
+              'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
+                  length, weighted_total))

     entries_by_ext = defaultdict(list)
     for target in entries:
@@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types):
         entries_by_ext[extension].append(target)

     for key, values in entries_by_ext.items():
-        print(' Longest build steps for %s:' % key)
+        print(' Longest build steps for {}:'.format(key))
         values.sort(key=lambda x: x.WeightedDuration())
         for target in values[-long_count:]:
-            print(' %8.1f weighted s to build %s (%.1f s elapsed time)' %
-                  (target.WeightedDuration(), target.DescribeTargets(),
-                   target.Duration()))
+            print(
+                ' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
+                format(target.WeightedDuration(), target.DescribeTargets(),
+                       target.Duration()))

-    print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
-          'parallelism)' %
-          (length, total_cpu_time, total_cpu_time * 1.0 / length))
+    print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
+          'parallelism)'.format(length, total_cpu_time,
+                                total_cpu_time * 1.0 / length))
     print(' %d build steps completed, average of %1.2f/s' %
           (len(entries), len(entries) / (length)))

@@ -298,11 +299,12 @@ def main():
         long_ext_count += len(args.step_types.split(';'))

     try:
-        with open(log_file, 'r') as log:
+        with open(log_file) as log:
             entries = ReadTargets(log, False)
             SummarizeEntries(entries, args.step_types)
-    except IOError:
-        print('Log file %r not found, no build summary created.' % log_file)
+    except OSError:
+        print('Log file {!r} not found, no build summary created.'.format(
+            log_file))
         return errno.ENOENT
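Two notes on the hunk above: IOError has been an alias of OSError since Python 3.3, so the except clause catches exactly the same exceptions, and the %-style messages are rewritten to str.format with identical output. A small sketch, using a hypothetical log path:

# IOError is literally the same class as OSError on Python 3, and the two
# formatting styles produce the same text here.
assert IOError is OSError

log_file = ".ninja_log"   # hypothetical path mirroring the script above
try:
    with open(log_file) as log:
        header = log.readline()
except OSError:
    assert ('Log file %r not found' % log_file
            == 'Log file {!r} not found'.format(log_file))
    print('Log file {!r} not found, no build summary created.'.format(log_file))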
@@ -4,7 +4,7 @@ requires_files = glob.glob('requirements*.txt')
 requires_files += ["pyproject.toml"]
 for file in requires_files:
     print(f">>> cleaning {file}")
-    with open(file, 'r') as f:
+    with open(file) as f:
         lines = f.readlines()
     if "torch" in "".join(lines).lower():
         print("removed:")
@@ -192,10 +192,8 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
         attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]

         q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
-        k2, v2 = [
-            self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
-            for x in [k, v]
-        ]
+        k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
+                  for x in [k, v])
         spda_output = torch.nn.functional.scaled_dot_product_attention(
             q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
         return self.transpose_and_unpad(spda_output, cu_seqlens)
@@ -668,9 +668,10 @@ class ModelConfig:
     @property
     def is_encoder_decoder_model(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
-        return getattr(self.hf_config, "is_encoder_decoder", False) or (
-            (hasattr(self.hf_config, "text_config") and getattr(
-                self.hf_config.text_config, "is_encoder_decoder", False)))
+        return getattr(
+            self.hf_config, "is_encoder_decoder",
+            False) or (hasattr(self.hf_config, "text_config") and getattr(
+                self.hf_config.text_config, "is_encoder_decoder", False))

     @property
     def is_multimodal_model(self) -> bool:
@@ -52,7 +52,7 @@ class Evictor(ABC):
         pass


-class BlockMetaData():
+class BlockMetaData:
     """Data structure for storing key data describe cached block, so that
     evitor could use to make its decision which one to choose for eviction
@@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     if is_distributed:
         get_world_group().barrier()
     logger.info("reading GPU P2P access cache from %s", path)
-    with open(path, "r") as f:
+    with open(path) as f:
         cache = json.load(f)
     _gpu_p2p_access_cache = cache
     return _gpu_p2p_access_cache[f"{src}->{tgt}"]
@@ -812,7 +812,7 @@ class AsyncLLMEngine(EngineClient):
     async def run_engine_loop(engine_ref: ReferenceType):
         """We use a weakref to the engine so that the running loop
         doesn't prevent the engine being garbage collected."""
-        engine: Optional["AsyncLLMEngine"] = engine_ref()
+        engine: Optional[AsyncLLMEngine] = engine_ref()
         if not engine:
             return
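Several hunks in this commit (here and in the metrics, Ray executor, and LoRA manager files below) drop the quotes around names used in type annotations. A hedged sketch, not vLLM's actual modules, of why unquoted forward references are safe once annotations are not evaluated at runtime (PEP 563 postponed evaluation):

# With postponed evaluation of annotations, no annotation expression is
# evaluated at runtime, so quoting a name that is only available to the type
# checker is unnecessary.
from __future__ import annotations

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    from decimal import Decimal   # stands in for a type-checking-only import


class Engine:                     # hypothetical class, not AsyncLLMEngine
    def __init__(self) -> None:
        # Previously this might have been written Optional["Decimal"];
        # the unquoted form is equivalent here.
        self.metric: Optional[Decimal] = None


print(Engine().metric)            # None; the annotation is never evaluated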
@@ -1541,8 +1541,8 @@ class LLMEngine:
                 seq_group.state.remaining_steps != ref_remaining_steps
                 for seq_group in seq_group_metadata_list[1:]
         ]):
-            raise AssertionError(("All running sequence groups should "
-                                  "have the same remaining steps."))
+            raise AssertionError("All running sequence groups should "
+                                 "have the same remaining steps.")

         return ref_remaining_steps > 0
@@ -77,7 +77,7 @@ class StatLoggerBase(ABC):
         self.num_generation_tokens: List[int] = []
         self.last_local_log = time.time()
         self.local_interval = local_interval
-        self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+        self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None

     @abstractmethod
     def log(self, stats: Stats) -> None:
@@ -63,7 +63,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
         single_step_process_prompt_logprob(self, seq_group, output)

     @staticmethod
-    @functools.lru_cache()
+    @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
         # Reminder: Please update docs/source/serving/compatibility_matrix.rst
         # If the feature combo become valid
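Since Python 3.8, functools.lru_cache can be applied as a bare decorator; @functools.lru_cache is equivalent to @functools.lru_cache() with the default arguments. The same cleanup appears in CustomOp.default_on below. A minimal sketch:

# @functools.lru_cache (no parentheses) has been allowed since Python 3.8 and
# is equivalent to @functools.lru_cache() with default arguments.
import functools


@functools.lru_cache
def warn_once() -> bool:
    print("warning emitted once")   # hypothetical one-shot warning
    return True


warn_once()
warn_once()                         # cached: the message is printed only once
print(warn_once.cache_info())       # hits=1, misses=1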
@@ -362,7 +362,7 @@ def load_chat_template(
     if chat_template is None:
         return None
     try:
-        with open(chat_template, "r") as f:
+        with open(chat_template) as f:
             resolved_chat_template = f.read()
     except OSError as e:
         if isinstance(chat_template, Path):
@@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str:
                 session.get(path_or_url) as resp:
             return await resp.text()
     else:
-        with open(path_or_url, "r", encoding="utf-8") as f:
+        with open(path_or_url, encoding="utf-8") as f:
             return f.read()
@@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
     uses_ray: bool = True

     def _init_executor(self) -> None:
-        self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
         # If the env var is set, it uses the Ray's compiled DAG API
         # which optimizes the control plane overhead.
         # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
@@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None:
         raise RuntimeError(
             "Could not load logging config. File does not exist: %s",
             VLLM_LOGGING_CONFIG_PATH)
-    with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
-              mode="r") as file:
+    with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
         custom_config = json.loads(file.read())

     if not isinstance(custom_config, dict):
@@ -343,7 +343,7 @@ class LoRAModelManager(AdapterModelManager):
             # text modules (e.g. ChatGLM)
             and hasattr(self.model, "get_mm_mapping"))
         self.packed_modules: Dict[str, List[str]] = {}
-        self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
+        self.modules: Dict[str, BaseLayerWithLoRA] = {}
         # Dict instead of a Set for compatibility with LRUCache.
         self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
@@ -548,7 +548,7 @@ class LoRAModelManager(AdapterModelManager):
         else:
             parts = module_name.split(".")
             replacements = self.packed_modules_mapping[parts[-1]]
-            subloras: List[Optional["LoRALayerWeights"]] = []
+            subloras: List[Optional[LoRALayerWeights]] = []
             for i, r in enumerate(replacements):
                 lora = LoRALayerWeights.create_dummy_lora_weights(
                     module_name + "." + r,
@@ -103,7 +103,7 @@ class CustomOp(nn.Module):
     # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE
     # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence.
     @staticmethod
-    @lru_cache()
+    @lru_cache
     def default_on() -> bool:
         count_none = envs.VLLM_CUSTOM_OPS.count("none")
         count_all = envs.VLLM_CUSTOM_OPS.count("all")
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
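The same one-line removal repeats across most of the model files below: the # coding=utf-8 (or # -*- coding: utf-8 -*-) declaration is redundant because UTF-8 has been the default source encoding since Python 3 (PEP 3120). A trivial sketch:

# A Python 3 source file may contain non-ASCII text without any encoding
# declaration; UTF-8 is the default source encoding (PEP 3120).
MODEL_CREDIT = "Adapted from Qwen · 通义千问"   # hypothetical string; non-ASCII is fine
print(MODEL_CREDIT)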
@@ -746,7 +746,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):

         config_file_path = self._get_config_file(qlora_adapter)

-        with open(config_file_path, "r") as f:
+        with open(config_file_path) as f:
             config = json.load(f)
             self.target_modules = config["target_modules"]
@@ -190,7 +190,7 @@ def get_model(
     kv_cache_dtype: ov.Type,
     **kwargs,
 ) -> torch.nn.Module:
-    lora_config = kwargs.get("lora_config", None)
+    lora_config = kwargs.get("lora_config")
     ov_core = kwargs.get("ov_core")
     if lora_config:
         raise ValueError(
@@ -280,7 +280,7 @@ class TensorizerAgent:
         self.tensorizer_args = (
             self.tensorizer_config._construct_tensorizer_args())
         self.extra_kwargs = extra_kwargs
-        if extra_kwargs.get("quant_config", None) is not None:
+        if extra_kwargs.get("quant_config") is not None:
             self.quant_config = extra_kwargs["quant_config"]
         else:
             self.quant_config = quant_config
@@ -380,8 +380,7 @@ def tensorizer_weights_iterator(
     stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
     with TensorDeserializer(stream, **deserializer_args,
                             device="cpu") as state:
-        for name, param in state.items():
-            yield name, param
+        yield from state.items()
     del state
@@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig,
             f"{quant_config_files}")

     quant_config_file = quant_config_files[0]
-    with open(quant_config_file, "r") as f:
+    with open(quant_config_file) as f:
         config = json.load(f)

     if model_config.quantization == "bitsandbytes":
@@ -306,7 +306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str],

     # Iterate through the weight_map (weight_name: safetensors files)
     # to identify weights that we should use.
-    with open(index_file_name, "r") as f:
+    with open(index_file_name) as f:
         weight_map = json.load(f)["weight_map"]
     weight_files_in_index = set()
     for weight_name in weight_map:
@@ -382,7 +382,7 @@ def np_cache_weights_iterator(
             with open(weight_names_file, "w") as f:
                 json.dump(weight_names, f)

-        with open(weight_names_file, "r") as f:
+        with open(weight_names_file) as f:
             weight_names = json.load(f)

     for name in weight_names:
@@ -423,8 +423,7 @@ def pt_weights_iterator(
             bar_format=_BAR_FORMAT,
     ):
         state = torch.load(bin_file, map_location="cpu")
-        for name, param in state.items():
-            yield name, param
+        yield from state.items()
         del state
         torch.cuda.empty_cache()
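yield from state.items(), used here and in the tensorizer iterator above, delegates to the sub-iterator and yields exactly the same (name, param) pairs as the explicit loop it replaces. A minimal sketch with a plain dict standing in for the loaded checkpoint state:

# `yield from iterable` is equivalent to `for item in iterable: yield item`
# for simple delegation like the weight iterators in this diff.
from typing import Dict, Iterator, Tuple

state: Dict[str, int] = {"layer.weight": 1, "layer.bias": 2}   # stand-in for torch.load(...)


def explicit() -> Iterator[Tuple[str, int]]:
    for name, param in state.items():
        yield name, param


def delegating() -> Iterator[Tuple[str, int]]:
    yield from state.items()


assert list(explicit()) == list(delegating())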
@@ -48,7 +48,7 @@ class ArcticMLP(nn.Module):
                  is_residual_mlp: bool = False,
                  quant_config: Optional[QuantizationConfig] = None,
                  reduce_results: bool = True):
-        super(ArcticMLP, self).__init__()
+        super().__init__()
         self.hidden_size = config.hidden_size
         self.expert_id = expert_id
         self.layer_id = layer_id
@@ -89,7 +89,7 @@ class ArcticMoE(nn.Module):
                  params_dtype: Optional[torch.dtype] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  reduce_results: bool = True):
-        super(ArcticMoE, self).__init__()
+        super().__init__()

         self.tp_size = tp_size or get_tensor_model_parallel_world_size()
         self.hidden_size = config.hidden_size
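The super(ClassName, self).__init__() → super().__init__() rewrite here (and in MLPSpeculatorLayerNorm below) uses Python 3's zero-argument super(), which fills in the enclosing class and instance automatically and survives class renames. A small sketch:

# Zero-argument super() (Python 3) resolves to the same call as the explicit
# two-argument form.
class Module:                      # hypothetical stand-in for torch.nn.Module
    def __init__(self) -> None:
        self.initialized = True


class ArcticStyleMLP(Module):      # hypothetical stand-in for ArcticMLP
    def __init__(self) -> None:
        super().__init__()         # same as super(ArcticStyleMLP, self).__init__()


assert ArcticStyleMLP().initialized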
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only ChatGLM model compatible with THUDM weights."""
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -1,4 +1,3 @@
-# coding=utf-8
 from typing import Iterable, List, Optional, Tuple, Union

 import torch
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 DeciAI Research Team. All rights reserved.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
 # Copyright 2024 The LG U+ CTO AI Tech Lab.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 HuggingFace Inc. team. All rights reserved.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2023 The vLLM team.
 # Copyright (c) Google Inc.
 #
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only GLM-4v model visual encoder compatible with THUDM weights."""
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,5 +1,3 @@
-# coding=utf-8
-
 # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
 # Copyright 2024 The vLLM team.
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 from typing import List, Optional, Tuple, Union

 import torch
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 """Inference-only Jamba model."""
 from typing import Iterable, List, Optional, Tuple

@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 """PyTorch MAMBA model."""
 from typing import Iterable, List, Optional, Tuple

@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2024 The ModelBest team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -37,7 +37,7 @@ class MLPSpeculatorLayerNorm(nn.Module):
             eps=1e-06,
             elementwise_scale_and_shift=True,
     ):
-        super(MLPSpeculatorLayerNorm, self).__init__()
+        super().__init__()
         self.elementwise_scale_and_shift = elementwise_scale_and_shift
         if self.elementwise_scale_and_shift:
             self.weight = nn.Parameter(torch.empty(normalized_shape))
@@ -1121,9 +1121,9 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             batch_size * num_image * num_patch, -1).contiguous()

         image_input_idx = image_input_idx * valid.to(image_input_idx.dtype)
-        offset = torch.cat(
-            [seq_len.new_zeros(
-                (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None]
+        offset = torch.cat([seq_len.new_zeros(1),
+                            seq_len.cumsum(dim=0)[:-1]],
+                           dim=0)[:, None]
         image_input_idx = image_input_idx + offset.to(image_input_idx.dtype)
         image_input_idx = image_input_idx.flatten()[:, None]
         mat = image_input_idx == torch.arange(
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
 from typing import Iterable, List, Optional, Tuple, Union
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
 # Copyright 2024 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
 # Copyright (c) OrionStar Inc.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
 # Copyright 2023 The vLLM team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from llama.py
 """Inference-only Phi3 model code inherit from Llama.py"""

@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
@@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):

     if image_token_id not in inputs['prompt_token_ids']:
         raise ValueError(
-            (f"You've passed {inputs=} without {image_token_id=}"
-             " Make sure to process your input via mistral_common's"
-             " tokenizer or pass a chat completion request. For more"
-             " For more info, see: "
-             "https://github.com/vllm-project/vllm/issues/8411."))
+            f"You've passed {inputs=} without {image_token_id=}"
+            " Make sure to process your input via mistral_common's"
+            " tokenizer or pass a chat completion request. For more"
+            " For more info, see: "
+            "https://github.com/vllm-project/vllm/issues/8411.")

     return inputs
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
 # Copyright (c) Alibaba Cloud.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
 # Copyright 2024 The Qwen team.
@@ -417,9 +416,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 and hasattr(config, "max_window_layers")):
             raise ValueError("Sliding window for some but all layers is not "
                              "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
-                             "to discuss this feature." % (
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
                                  config.max_window_layers,
                                  config.num_hidden_layers,
                              ))
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 Kakao Corp. (Kanana-X Team)
@@ -60,9 +59,9 @@ class Qwen2ForSequenceClassification(nn.Module):
                 and hasattr(config, "max_window_layers")):
             raise ValueError("Sliding window for some but all layers is not "
                              "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
-                             "to discuss this feature." % (
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
                                  config.max_window_layers,
                                  config.num_hidden_layers,
                              ))
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
 # Copyright 2024 The Qwen team.
@@ -1,4 +1,3 @@
-# coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 The Qwen team.
@@ -71,9 +70,9 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP):
                 and hasattr(config, "max_window_layers")):
             raise ValueError("Sliding window for some but all layers is not "
                              "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
-                             "to discuss this feature." % (
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
                                  config.max_window_layers,
                                  config.num_hidden_layers,
                              ))
Some files were not shown because too many files have changed in this diff.