[CI] Try introducing isort. (#3495)
This commit is contained in: parent e67c295b0c, commit 01bfb22b41

.github/workflows/ruff.yml (vendored, 5 changes)
@@ -25,10 +25,13 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
     - name: Analysing the code with ruff
       run: |
        ruff .
     - name: Spelling check with codespell
       run: |
        codespell --toml pyproject.toml
+    - name: Run isort
+      run: |
+        isort . --check-only
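The new step fails the workflow when any file's imports are not already in isort order. As a rough sketch only (not part of the commit, and assuming isort 5.x's Python API `isort.check_code` / `isort.code`), the same check can be reproduced from Python; the sample source string below is hypothetical:

# Hypothetical illustration of what `isort . --check-only` verifies per file.
import isort

messy = (
    "import time\n"
    "from vllm import SamplingParams\n"
    "import argparse\n"
    "from vllm import LLM\n"
)

# check_code() mirrors --check-only: False means the imports are not yet
# in isort order, i.e. this file would fail the new CI step.
print(isort.check_code(messy))

# code() returns the reordered source: roughly, stdlib imports first,
# then the two vllm imports merged and alphabetised.
print(isort.code(messy))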
@@ -1,8 +1,7 @@
 import argparse
 import time
 
-from vllm import LLM
-from vllm import SamplingParams
+from vllm import LLM, SamplingParams
 
 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
 
@@ -25,15 +25,12 @@ from datetime import datetime
 from typing import AsyncGenerator, List, Tuple
 
 import numpy as np
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-
-from backend_request_func import (
-    ASYNC_REQUEST_FUNCS,
-    RequestFuncInput,
-    RequestFuncOutput,
-)
 
 
 @dataclass
@@ -6,9 +6,9 @@ import time
 from typing import List, Optional, Tuple
 
 import torch
+from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
-from tqdm import tqdm
 
 
 def sample_requests(
@@ -2,11 +2,13 @@ import json
 import os
 import sys
 
-from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name
 import torch
 import torch.nn.functional as F
 import triton
 
+from vllm.model_executor.layers.fused_moe import (fused_moe,
+                                                  get_config_file_name)
+
 os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 
@@ -1,12 +1,12 @@
-from typing import Optional
 import argparse
 import random
 import time
+from typing import Optional
 
 import torch
 
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
 from vllm._C import ops
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
 
 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@@ -1,9 +1,10 @@
+import argparse
+from itertools import accumulate
 from typing import Optional
 
-import argparse
-import torch
 import nvtx
-from itertools import accumulate
+import torch
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
 
@@ -9,8 +9,8 @@
 #
 
 import argparse
-import shutil
 import os
+import shutil
 
 from torch.utils.hipify.hipify_python import hipify
 
@@ -6,10 +6,10 @@
 # Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
 import datetime
 import locale
+import os
 import re
 import subprocess
 import sys
-import os
 from collections import namedtuple
 
 try:
@@ -10,10 +10,11 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 
+import logging
 import os
 import sys
 
 from sphinx.ext import autodoc
-import logging
 
 sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
 
@@ -1,6 +1,7 @@
 import argparse
-from openai import OpenAI
+
 import gradio as gr
+from openai import OpenAI
 
 # Argument parser setup
 parser = argparse.ArgumentParser(
@@ -1,7 +1,7 @@
 import argparse
 from typing import List, Tuple
 
-from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
+from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 
 
 def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
@@ -5,11 +5,11 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """
 
-from typing import Optional, List, Tuple
+from typing import List, Optional, Tuple
 
 from huggingface_hub import snapshot_download
 
-from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
+from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.lora.request import LoRARequest
 
 
@@ -5,11 +5,13 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """
 
-from vllm import LLM, SamplingParams
 from typing import Dict
 
 import numpy as np
 import ray
 
+from vllm import LLM, SamplingParams
+
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
format.sh (42 changes)
@@ -25,6 +25,7 @@ YAPF_VERSION=$(yapf --version | awk '{print $2}')
 RUFF_VERSION=$(ruff --version | awk '{print $2}')
 MYPY_VERSION=$(mypy --version | awk '{print $2}')
 CODESPELL_VERSION=$(codespell --version)
+ISORT_VERSION=$(isort --vn)
 
 # # params: tool name, tool version, required version
 tool_version_check() {
@@ -37,6 +38,7 @@ tool_version_check() {
 tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
 
 YAPF_FLAGS=(
@@ -178,6 +180,46 @@ else
    lint_changed
 fi
 
+# check spelling of specified files
+isort_check() {
+    isort "$@"
+}
+
+isort_check_all(){
+  isort .
+}
+
+# Spelling check of files that differ from main branch.
+isort_check_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause ruff to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
+        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
+            isort
+    fi
+}
+
+# Run Isort
+# This flag runs spell check of individual files. --files *must* be the first command line
+# arg to use this option.
+if [[ "$1" == '--files' ]]; then
+   isort_check "${@:2}"
+   # If `--all` is passed, then any further arguments are ignored and the
+   # entire python directory is linted.
+elif [[ "$1" == '--all' ]]; then
+   isort_check_all
+else
+   # Check spelling only of the files that changed in last commit.
+   isort_check_changed
+fi
+echo 'vLLM isort: Done'
+
 if ! git diff --quiet &>/dev/null; then
     echo 'Reformatted files. Please review and stage the changes.'
     echo 'Changes not staged for commit:'
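The helpers added above mirror the existing ruff and codespell wrappers in format.sh: `--files` sorts only the named files, `--all` runs isort over the whole tree, and the default path sorts only Python files that differ from the merge-base with origin/main. A rough Python equivalent of that default path (illustration only, not part of the commit):

# Hypothetical sketch of the isort_check_changed flow in Python.
import subprocess

merge_base = subprocess.check_output(
    ["git", "merge-base", "origin/main", "HEAD"], text=True).strip()
changed = subprocess.check_output(
    ["git", "diff", "--name-only", "--diff-filter=ACM", merge_base,
     "--", "*.py", "*.pyi"], text=True).split()
# Guard against passing zero filenames, as the shell `if` above does.
if changed:
    subprocess.run(["isort", *changed], check=True)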
@@ -51,3 +51,7 @@ exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
 [tool.codespell]
 ignore-words-list = "dout, te, indicies"
 skip = "./tests/prompts"
+
+[tool.isort]
+use_parentheses = true
+skip_gitignore = true
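`use_parentheses = true` makes isort wrap long from-imports with parentheses rather than backslash continuations, matching the wrapped import style seen throughout this diff, and `skip_gitignore = true` keeps isort away from files that git ignores. A small illustration (not part of the commit, assuming isort 5.x's Python API) of the wrapping behaviour, using names taken from the diff above:

# Illustration only: how use_parentheses affects wrapping of a long import.
import isort

long_import = ("from vllm.model_executor.layers.linear import "
               "ColumnParallelLinear, MergedColumnParallelLinear, "
               "RowParallelLinear\n")
print(isort.code(long_import, line_length=79, use_parentheses=True))
# Wraps roughly as:
# from vllm.model_executor.layers.linear import (ColumnParallelLinear,
#                                                MergedColumnParallelLinear,
#                                                RowParallelLinear)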
@@ -4,6 +4,7 @@ toml==0.10.2
 tomli==2.0.1
 ruff==0.1.5
 codespell==2.2.6
+isort==5.13.2
 
 # type checking
 mypy==0.991
setup.py (10 changes)
@@ -1,16 +1,16 @@
 import io
+import logging
 import os
 import re
-import logging
 import subprocess
 import sys
+from shutil import which
 from typing import List
 
-from packaging.version import parse, Version
-from setuptools import setup, find_packages, Extension
-from setuptools.command.build_ext import build_ext
-from shutil import which
 import torch
+from packaging.version import Version, parse
+from setuptools import Extension, find_packages, setup
+from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME
 
 ROOT_DIR = os.path.dirname(__file__)
@@ -1,12 +1,12 @@
-from dataclasses import dataclass
 import os
 import pathlib
+from dataclasses import dataclass
 
 import pytest
 
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.transformers_utils.tokenizer import get_tokenizer
 
 chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
     __file__))).parent.parent / "examples/template_chatml.jinja"
@@ -6,8 +6,8 @@ import torch
 from transformers import AutoModelForCausalLM
 
 from vllm import LLM, SamplingParams
-from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.config import TokenizerPoolConfig
+from vllm.transformers_utils.tokenizer import get_tokenizer
 
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
@@ -1,13 +1,14 @@
-import pytest
 import time
 from typing import List
 
+import pytest
+
 from vllm import SamplingParams
 from vllm.block import PhysicalTokenBlock
-from vllm.core.block_manager import (UncachedBlockAllocator, BlockSpaceManager,
-                                     AllocStatus)
+from vllm.core.block_manager import (AllocStatus, BlockSpaceManager,
+                                     UncachedBlockAllocator)
+from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
-from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
 
 from .utils import create_dummy_prompt
 
@@ -1,10 +1,11 @@
-from typing import List
-import pytest  # noqa
 import time
+from typing import List
 
+import pytest  # noqa
 
 from vllm.config import CacheConfig, SchedulerConfig
 from vllm.core.scheduler import Scheduler
-from vllm.sequence import SequenceGroup, Logprob
+from vllm.sequence import Logprob, SequenceGroup
 
 from .utils import create_dummy_prompt
 
@@ -3,14 +3,12 @@
 Run `pytest tests/distributed/test_comm_ops.py --forked`.
 """
 import pytest
-import torch
 import ray
+import torch
 
 from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce,
-    tensor_model_parallel_all_gather,
-    broadcast_tensor_dict,
-)
+    broadcast_tensor_dict, tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce)
 from vllm.test_utils import (init_test_distributed_environment,
                              multi_process_tensor_parallel)
 
@@ -1,6 +1,6 @@
+import os
 import random
 
-import os
 import pytest
 import ray
 import torch
@@ -1,11 +1,11 @@
 # This unit test should be moved to a new
 # tests/test_guided_decoding directory.
 
-from transformers import AutoTokenizer
 import torch
+from transformers import AutoTokenizer
 
-from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor,
-                                                          JSONLogitsProcessor)
+from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
+                                                          RegexLogitsProcessor)
 
 TEST_SCHEMA = {
     "type": "object",
@@ -1,22 +1,21 @@
+# imports for guided decoding tests
+import json
 import os
+import re
 import subprocess
+import sys
 import time
 
-import sys
+import jsonschema
+import openai  # use the official client for correctness check
 import pytest
-import requests
 # using Ray for overall ease of process management, parallel requests,
 # and debugging.
 import ray
-import openai  # use the official client for correctness check
+import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 
-# imports for guided decoding tests
-import json
-import jsonschema
-import re
-
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
@@ -1,4 +1,5 @@
 import pytest
+
 from vllm.utils import create_kv_caches_with_random
 
 
@@ -2,10 +2,10 @@ from typing import Type
 
 import pytest
 import torch
+from allclose_default import get_default_atol, get_default_rtol
 
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
                                                    NewGELU, SiluAndMul)
-from allclose_default import get_default_atol, get_default_rtol
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
@@ -3,13 +3,12 @@ from typing import List, Optional, Tuple
 
 import pytest
 import torch
+from allclose_default import get_default_atol, get_default_rtol
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 
-from vllm._C import ops, cache_ops
-from vllm.utils import get_max_shared_memory_bytes
-from vllm.utils import is_hip
-from allclose_default import get_default_atol, get_default_rtol
+from vllm._C import cache_ops, ops
+from vllm.utils import get_max_shared_memory_bytes, is_hip
 
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
@@ -1,10 +1,9 @@
 import random
+from typing import Tuple
 
 import pytest
 import torch
 
-from typing import Tuple
-
 from vllm._C import cache_ops
 
 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
@@ -7,8 +7,8 @@ import torch
 from transformers import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 
-from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.models.mixtral import MixtralMoE
 
 
@@ -1,9 +1,10 @@
+from itertools import accumulate
 from typing import List, Optional
 
 import pytest
 import torch
 from allclose_default import get_default_atol, get_default_rtol
-from itertools import accumulate
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
 
 IS_NEOX_STYLE = [True, False]
@@ -1,12 +1,13 @@
 import random
-import pytest
 import time
 
+import pytest
 import torch
-from vllm.attention.ops.prefix_prefill import context_attention_fwd
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
 
+from vllm.attention.ops.prefix_prefill import context_attention_fwd
+
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 8, 64]
 HEAD_SIZES = [128]
@@ -1,7 +1,8 @@
-import torch
-import pytest
 import random
 
+import pytest
+import torch
+
 from vllm.model_executor.layers.ops.rand import seeded_uniform
 from vllm.model_executor.utils import set_random_seed
 
@@ -1,15 +1,15 @@
 import gc
 
-import torch
 import pytest
+import torch
 import triton
 import triton.language as tl
 
 from vllm.model_executor.layers.ops.sample import (
-    _uniform_to_exponential, sample, get_num_triton_sampler_splits,
-    MAX_TRITON_N_COLS)
-from vllm.model_executor.utils import set_random_seed
+    MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits,
+    sample)
 from vllm.model_executor.sampling_metadata import SamplingTensors
+from vllm.model_executor.utils import set_random_seed
 
 SINGLE_SPLIT_VOCAB_SIZE = 32000  # llama/mistral/mixtral vocab size
 MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100
@@ -2,7 +2,7 @@ import contextlib
 import gc
 import tempfile
 from collections import OrderedDict
-from unittest.mock import patch, MagicMock
+from unittest.mock import MagicMock, patch
 
 import pytest
 import ray
@@ -12,13 +12,13 @@ from huggingface_hub import snapshot_download
 
 import vllm
 from vllm.config import LoRAConfig
-from vllm.model_executor.layers.sampler import Sampler
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
                                                RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.parallel_utils.parallel_state import (
     destroy_model_parallel, initialize_model_parallel)
 
@@ -1,12 +1,14 @@
+import tempfile
+from random import sample
 from typing import List, Optional
 
 import peft
 import pytest
-from random import sample
-import tempfile
 from transformers import AutoModelForCausalLM
 
 import vllm
 from vllm.lora.request import LoRARequest
 
 from .conftest import cleanup
 
 MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
@@ -1,32 +1,28 @@
-import pytest
 import random
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import List, Optional, Dict, Tuple
+from typing import Dict, List, Optional, Tuple
 
+import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.lora.layers import (
-    ColumnParallelLinearWithLoRA,
-    MergedColumnParallelLinearWithLoRA,
-    QKVParallelLinearWithLora,
-    VocabParallelEmbeddingWithLoRA,
-    RowParallelLinearWithLoRA,
-    LogitsProcessorWithLoRA,
-    LoRAMapping,
-    BaseLayerWithLoRA,
-)
-from vllm.lora.models import (LoRALayerWeights, convert_mapping,
-                              PackedLoRALayerWeights)
 from vllm.config import LoRAConfig
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
+                              LogitsProcessorWithLoRA, LoRAMapping,
+                              MergedColumnParallelLinearWithLoRA,
+                              QKVParallelLinearWithLora,
+                              RowParallelLinearWithLoRA,
+                              VocabParallelEmbeddingWithLoRA)
+from vllm.lora.models import (LoRALayerWeights, PackedLoRALayerWeights,
+                              convert_mapping)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
-                                               RowParallelLinear,
-                                               QKVParallelLinear)
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.utils import set_random_seed
 
 from .utils import DummyLoRAManager
@@ -3,6 +3,7 @@ import ray
 
 import vllm
 from vllm.lora.request import LoRARequest
 
 from .conftest import cleanup
+
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
@@ -8,11 +8,11 @@ from torch import nn
 
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
-                              RowParallelLinearWithLoRA,
-                              MergedColumnParallelLinearWithLoRA)
+                              MergedColumnParallelLinearWithLoRA,
+                              RowParallelLinearWithLoRA)
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.models import (LoRAModel, LoRAModelManager,
-                              LRUCacheLoRAModelManager, LoRAMapping)
+from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
+                              LRUCacheLoRAModelManager)
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
                                       WorkerLoRAManager)
@@ -1,8 +1,10 @@
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 from vllm.transformers_utils.tokenizer import get_lora_tokenizer
+from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+
 from ..conftest import get_tokenizer_pool_config
 
 
@@ -2,8 +2,8 @@ from collections import OrderedDict
 
 from torch import nn
 
+from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
 from vllm.utils import LRUCache
-from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule)
 
 
 def test_parse_fine_tuned_lora_name():
@@ -3,10 +3,10 @@ import random
 import tempfile
 from unittest.mock import patch
 
+from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         DeviceConfig, LoRAConfig)
 from vllm.worker.worker import Worker
 
 
@@ -11,9 +11,11 @@ up to 3 times to see if we pass.
 Run `pytest tests/models/test_marlin.py --forked`.
 """
+from dataclasses import dataclass
 
 import pytest
 import torch
-from dataclasses import dataclass
+
 from vllm.model_executor.layers.quantization import (
     _QUANTIZATION_CONFIG_REGISTRY)
 
@@ -1,7 +1,7 @@
 import pytest
 import torch
-from tests.conftest import VllmRunner
 
+from tests.conftest import VllmRunner
 from vllm import SamplingParams
 
 MODELS = ["facebook/opt-125m"]
@@ -1,13 +1,12 @@
 """Tests for rejection sampling."""
-import pytest
 from typing import List, Tuple
 
+import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.model_executor.utils import set_random_seed
-
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.utils import set_random_seed
 
 CUDA_DEVICES = [
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
@@ -1,11 +1,10 @@
 import random
-from typing import Tuple, List
+from typing import List, Optional, Tuple
 from unittest.mock import patch
 
 import pytest
 import torch
 from transformers import GenerationConfig, GenerationMixin
-from typing import Optional
 
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.utils import set_random_seed
@@ -8,8 +8,8 @@ from itertools import combinations
 
 import pytest
 
-from vllm.model_executor.utils import set_random_seed
 from vllm import SamplingParams
+from vllm.model_executor.utils import set_random_seed
 
 MODEL = "facebook/opt-125m"
 RANDOM_SEEDS = list(range(5))
@@ -1,9 +1,9 @@
-import torch
 import pytest
+import torch
 
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
 
-from .utils import mock_worker, create_seq_group_metadata_from_prompts
+from .utils import create_seq_group_metadata_from_prompts, mock_worker
 
 
 @pytest.mark.parametrize('num_target_seq_ids', [100])
@@ -1,9 +1,9 @@
-import torch
 import math
-import pytest
-
 from unittest.mock import MagicMock
 
+import pytest
+import torch
+
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 
 
@@ -1,18 +1,19 @@
-import torch
 import random
-import pytest
 from unittest.mock import MagicMock
 
-from vllm.spec_decode.multi_step_worker import (MultiStepWorker,
-                                                DraftModelTop1Proposer)
-from vllm.worker.worker import Worker
+import pytest
+import torch
+
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplerOutput
+from vllm.spec_decode.multi_step_worker import (DraftModelTop1Proposer,
+                                                MultiStepWorker)
+from vllm.worker.worker import Worker
 
-from .utils import (create_execute_model_data, create_worker,
-                    create_seq_group_metadata_from_prompts, zero_kv_cache,
-                    patch_execute_model_with_seeds,
-                    assert_logprobs_dict_allclose, create_batch)
+from .utils import (assert_logprobs_dict_allclose, create_batch,
+                    create_execute_model_data,
+                    create_seq_group_metadata_from_prompts, create_worker,
+                    patch_execute_model_with_seeds, zero_kv_cache)
 
 
 @pytest.mark.parametrize('num_steps', list(range(1, 17)))
@@ -1,18 +1,20 @@
-import torch
 import random
-import pytest
 from unittest.mock import MagicMock
 
+import pytest
+import torch
+
+from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.model_executor.utils import set_random_seed
+from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.metrics import (AsyncMetricsCollector,
+                                      SpecDecodeWorkerMetrics)
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
                                                  split_num_cache_blocks_evenly)
-from vllm.spec_decode.interfaces import SpeculativeProposals
-from vllm.model_executor.utils import set_random_seed
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from .utils import (mock_worker, create_batch, ExecuteModelData,
-                    create_sampler_output_list)
-from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics,
-                                      AsyncMetricsCollector)
+
+from .utils import (ExecuteModelData, create_batch, create_sampler_output_list,
+                    mock_worker)
 
 
 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -1,9 +1,9 @@
-from vllm.spec_decode.util import get_all_seq_ids
-from vllm.sequence import SequenceGroupMetadata
-from vllm.spec_decode.util import split_batch_by_proposal_len
+from unittest.mock import MagicMock
 
 import pytest
-from unittest.mock import MagicMock
+
+from vllm.sequence import SequenceGroupMetadata
+from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len
 
 
 def test_get_all_seq_ids():
@@ -1,17 +1,19 @@
-import torch
-from typing import List, Optional, Dict, Iterable, Union
+from dataclasses import dataclass, fields
+from itertools import count
+from typing import Dict, Iterable, List, Optional, Union
 from unittest.mock import MagicMock
 
-from vllm.worker.worker import Worker
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+import torch
+
 from vllm.engine.arg_utils import EngineArgs
-from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData,
-                           SamplerOutput, SequenceGroupOutput, SequenceOutput)
-from vllm.sampling_params import SamplingParams
-from vllm.worker.cache_engine import CacheEngine
 from vllm.model_executor.utils import set_random_seed
-from itertools import count
-from dataclasses import dataclass, fields
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, SamplerOutput, SequenceData,
+                           SequenceGroupMetadata, SequenceGroupOutput,
+                           SequenceOutput)
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.cache_engine import CacheEngine
+from vllm.worker.worker import Worker
 
 
 @dataclass
@@ -7,8 +7,8 @@ from typing import List, Optional
 import pytest
 
 from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.sequence import Sequence
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
 # Make two prefixes with different first blocks.
 prefix_start = [("You are an expert"), ("You are a")]
@@ -1,6 +1,6 @@
 import pytest
 
-from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput
+from vllm.sequence import SamplerOutput, SequenceGroupOutput, SequenceOutput
 
 
 @pytest.fixture
@@ -1,7 +1,9 @@
 from copy import deepcopy
-from vllm.transformers_utils.tokenizer import get_cached_tokenizer
+
 from transformers import AutoTokenizer
 
+from vllm.transformers_utils.tokenizer import get_cached_tokenizer
+
 
 def test_cached_tokenizer():
     reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
@@ -1,12 +1,12 @@
+from typing import Dict, List
+
 import pytest
-
 from transformers import AutoTokenizer
-from typing import List, Dict
 
-from vllm.sequence import Sequence, Logprob, SamplingParams, SequenceGroup
-from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
-from vllm.transformers_utils.tokenizer import detokenize_incrementally
+from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.transformers_utils.tokenizer import detokenize_incrementally
+from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 
 TRUTH = [
     "Hello here, this is a simple test",
@@ -1,14 +1,16 @@
-import os
-import pytest
 import asyncio
+import os
 from unittest.mock import patch
 
+import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
 from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
 from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
     RayTokenizerGroupPool)
 from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
     TokenizerGroup)
 
 from ..conftest import get_tokenizer_pool_config
 
 
@@ -1,8 +1,8 @@
 import torch
 
 from vllm.engine.arg_utils import EngineArgs
-from vllm.worker.worker import Worker
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.worker import Worker
 
 
 def test_swap() -> None:
@@ -1,4 +1,5 @@
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
+from vllm.attention.backends.abstract import (AttentionBackend,
+                                              AttentionMetadata)
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 
@@ -7,12 +7,13 @@ flashinfer for all the attention operations.
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Type
 
-from flash_attn import flash_attn_varlen_func
 import torch
+from flash_attn import flash_attn_varlen_func
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata)
-from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata
+from vllm.attention.ops.paged_attn import (PagedAttention,
+                                           PagedAttentionMetadata)
 
 
 class FlashAttentionBackend(AttentionBackend):
@@ -11,7 +11,8 @@ from xformers.ops.fmha.attn_bias import (AttentionBias,
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata)
-from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata
+from vllm.attention.ops.paged_attn import (PagedAttention,
+                                           PagedAttentionMetadata)
 from vllm.logger import init_logger
 from vllm.utils import is_hip
 
@@ -3,8 +3,7 @@ from typing import Dict, List, Optional, Tuple
 
 import torch
 
-from vllm._C import cache_ops
-from vllm._C import ops
+from vllm._C import cache_ops, ops
 from vllm.attention.ops.prefix_prefill import context_attention_fwd
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
@@ -13,11 +13,13 @@ logger = init_logger(__name__)
 def get_attn_backend(dtype: torch.dtype) -> AttentionBackend:
     if _can_use_flash_attn(dtype):
         logger.info("Using FlashAttention backend.")
-        from vllm.attention.backends.flash_attn import FlashAttentionBackend  # noqa: F401
+        from vllm.attention.backends.flash_attn import (  # noqa: F401
+            FlashAttentionBackend)
         return FlashAttentionBackend
     else:
         logger.info("Using XFormers backend.")
-        from vllm.attention.backends.xformers import XFormersBackend  # noqa: F401
+        from vllm.attention.backends.xformers import (  # noqa: F401
+            XFormersBackend)
         return XFormersBackend
 
 
@@ -1,15 +1,15 @@
-from typing import TYPE_CHECKING, Optional, Union, ClassVar
-from dataclasses import dataclass
-import os
-from packaging.version import Version
-
 import json
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, ClassVar, Optional, Union
 
 import torch
+from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import get_config
-from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version
+from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -103,7 +103,8 @@ class ModelConfig:
         if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
-            from modelscope.hub.snapshot_download import snapshot_download  # pylint: disable=C
+            # pylint: disable=C.
+            from modelscope.hub.snapshot_download import snapshot_download
 
             if not os.path.exists(model):
                 model_path = snapshot_download(model_id=model,
@@ -1,15 +1,15 @@
 """A block manager that manages token blocks."""
 import enum
+from abc import ABC, abstractmethod
 from itertools import count, takewhile
 from os.path import commonprefix
 from typing import Dict, List, Optional, Set, Tuple
-from abc import ABC, abstractmethod
 
 from vllm.block import BlockTable, PhysicalTokenBlock
+from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
+from vllm.logger import init_logger
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
-from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor
-from vllm.logger import init_logger
 
 logger = init_logger(__name__)
 
@ -1,6 +1,6 @@
 import enum
-from typing import OrderedDict
 from abc import ABC, abstractmethod, abstractproperty
+from typing import OrderedDict

 from vllm.block import PhysicalTokenBlock
@ -1,13 +1,13 @@
-from collections import deque
 import enum
 import time
-from typing import Deque, Dict, Iterable, List, Optional, Tuple, Union, Set
+from collections import deque
+from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union

 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.core.block_manager import AllocStatus, BlockSpaceManager
 from vllm.core.policy import PolicyFactory
-from vllm.lora.request import LoRARequest
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
                            SequenceGroupMetadata, SequenceStatus)
@ -3,9 +3,8 @@ import dataclasses
 from dataclasses import dataclass
 from typing import Optional, Tuple

-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig,
-                         TokenizerPoolConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, TokenizerPoolConfig)


 @dataclass
@ -2,17 +2,17 @@ import asyncio
 import os
 import time
 from functools import partial
-from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type,
-                    Union, AsyncIterator)
+from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
+                    Set, Tuple, Type, Union)

 from transformers import PreTrainedTokenizer

-from vllm.lora.request import LoRARequest
 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.ray_utils import initialize_ray_cluster, ray
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
@ -4,22 +4,22 @@ from typing import Iterable, List, Optional, Tuple, Type, Union
 from transformers import PreTrainedTokenizer

 import vllm
-from vllm.lora.request import LoRARequest
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
-from vllm.executor.executor_base import ExecutorBase
 from vllm.engine.metrics import StatLogger, Stats
 from vllm.engine.ray_utils import initialize_ray_cluster
+from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
                            SequenceGroupOutput, SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                      get_tokenizer_group)
-from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.utils import Counter

 logger = init_logger(__name__)
@ -1,11 +1,12 @@
-from vllm.logger import init_logger
-from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY,
+import time
+from dataclasses import dataclass
+from typing import Dict, List
+
+import numpy as np
+from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
                                disable_created_metrics)

-import time
-import numpy as np
-from typing import Dict, List
-from dataclasses import dataclass
+from vllm.logger import init_logger

 logger = init_logger(__name__)
@ -1,10 +1,9 @@
 import pickle
+from typing import List, Optional, Tuple

-from typing import Optional, List, Tuple
-
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import is_hip, set_cuda_visible_devices, get_ip
+from vllm.utils import get_ip, is_hip, set_cuda_visible_devices

 logger = init_logger(__name__)
@ -11,9 +11,9 @@ import json
 import ssl
 from typing import AsyncGenerator

+import uvicorn
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
-import uvicorn

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -3,9 +3,9 @@ from typing import List, Optional, Union
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

-from vllm.lora.request import LoRARequest
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
+from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.utils import Counter
@ -1,28 +1,27 @@
 import asyncio
-from contextlib import asynccontextmanager
-import os
 import importlib
 import inspect
+import os
+from contextlib import asynccontextmanager
+from http import HTTPStatus

-from prometheus_client import make_asgi_app
 import fastapi
 import uvicorn
-from http import HTTPStatus
 from fastapi import Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse, Response
+from fastapi.responses import JSONResponse, Response, StreamingResponse
+from prometheus_client import make_asgi_app

 import vllm
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.entrypoints.openai.protocol import (CompletionRequest,
-                                              ChatCompletionRequest,
-                                              ErrorResponse)
-from vllm.logger import init_logger
 from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest, ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from vllm.logger import init_logger

 TIMEOUT_KEEP_ALIVE = 5  # seconds
@ -3,12 +3,11 @@
 import time
 from typing import Dict, List, Literal, Optional, Union

+import torch
 from pydantic import BaseModel, Field, model_validator

-from vllm.utils import random_uuid
 from vllm.sampling_params import SamplingParams
-
-import torch
+from vllm.utils import random_uuid


 class ErrorResponse(BaseModel):
@ -1,19 +1,21 @@
-import time
 import codecs
+import time
+from typing import AsyncGenerator, AsyncIterator, List, Optional, Union

 from fastapi import Request
-from typing import AsyncGenerator, AsyncIterator, Optional, List, Union

-from vllm.logger import init_logger
-from vllm.utils import random_uuid
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest, ChatCompletionResponse,
     ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
     UsageInfo)
-from vllm.outputs import RequestOutput
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
+from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
+from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
+from vllm.outputs import RequestOutput
+from vllm.utils import random_uuid

 logger = init_logger(__name__)
@ -1,24 +1,23 @@
 import asyncio
 import time
+from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
+                    Optional, Tuple)

 from fastapi import Request
-from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional,
-                    Dict, Tuple)

-from vllm.logger import init_logger
-from vllm.utils import random_uuid
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.entrypoints.openai.protocol import (
-    CompletionRequest,
-    CompletionResponse,
-    CompletionResponseChoice,
-    CompletionResponseStreamChoice,
-    CompletionStreamResponse,
-    LogProbs,
-    UsageInfo,
-)
-from vllm.outputs import RequestOutput
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
+from vllm.entrypoints.openai.protocol import (CompletionRequest,
+                                              CompletionResponse,
+                                              CompletionResponseChoice,
+                                              CompletionResponseStreamChoice,
+                                              CompletionStreamResponse,
+                                              LogProbs, UsageInfo)
+from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
+from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
+from vllm.outputs import RequestOutput
+from vllm.utils import random_uuid

 logger = init_logger(__name__)
@ -3,16 +3,16 @@ import json
 from dataclasses import dataclass
 from http import HTTPStatus
 from typing import Dict, List, Optional, Union

-from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.entrypoints.openai.protocol import (CompletionRequest,
-                                              ChatCompletionRequest,
-                                              ErrorResponse, LogProbs,
-                                              ModelCard, ModelList,
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest, ErrorResponse,
+                                              LogProbs, ModelCard, ModelList,
                                               ModelPermission)
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
+from vllm.transformers_utils.tokenizer import get_tokenizer

 logger = init_logger(__name__)
@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Dict, List, Optional

-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@ -1,13 +1,13 @@
 from typing import Dict, List, Optional

-from vllm.lora.request import LoRARequest
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.utils import check_block_size_valid
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import (get_ip, get_open_port, get_distributed_init_method,
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)

 logger = init_logger(__name__)
@ -1,10 +1,10 @@
 from typing import Dict, List, Optional

-from vllm.lora.request import LoRARequest
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata

 logger = init_logger(__name__)
@ -1,20 +1,20 @@
 import asyncio
 import copy
-from collections import defaultdict
 import os
 import pickle
+from collections import defaultdict
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.engine.ray_utils import RayWorkerVllm, ray
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.utils import check_block_size_valid
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port,
-                        get_distributed_init_method, make_async)
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async, set_cuda_visible_devices)

 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@ -343,7 +343,7 @@ class RayGPUExecutor(ExecutorBase):
             raise ValueError(f"Ray version {required_version} or greater is "
                              f"required, but found {current_version}")

-        from ray.dag import MultiOutputNode, InputNode
+        from ray.dag import InputNode, MultiOutputNode
         assert self.parallel_config.worker_use_ray

         # Right now, compiled DAG requires at least 1 arg. We send
@ -2,8 +2,8 @@
 # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
 """Logging configuration for vLLM."""
 import logging
-import sys
 import os
+import sys

 VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
@ -10,18 +10,16 @@ from transformers import PretrainedConfig

 from vllm.config import LoRAConfig
 from vllm.lora.punica import add_lora, add_lora_slice, bgmv
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_gather,
-    tensor_model_parallel_all_reduce,
-    tensor_model_parallel_gather,
-)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear,
+                                               MergedColumnParallelLinear,
                                                QKVParallelLinear,
-                                               MergedColumnParallelLinear)
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce,
+    tensor_model_parallel_gather)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.utils import (
@ -1,6 +1,7 @@
 from typing import List, Optional
+
 import torch

 from vllm.utils import is_pin_memory_available
@ -4,19 +4,18 @@ import logging
 import math
 import os
 import re
-from typing import (Callable, Dict, Hashable, List, Optional, Tuple, Type)
+from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type

 import safetensors.torch
 import torch
 from torch import nn

 from vllm.config import LoRAConfig
-from vllm.utils import LRUCache, is_pin_memory_available
-
 from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer,
                               from_layer_logits_processor)
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
+from vllm.utils import LRUCache, is_pin_memory_available

 logger = logging.getLogger(__name__)
@ -4,11 +4,11 @@ from typing import Any, Dict, List, Optional, Set, Type

 import torch

+from vllm.config import LoRAConfig
+from vllm.lora.layers import LoRAMapping
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
 from vllm.lora.request import LoRARequest
-from vllm.lora.layers import LoRAMapping
-from vllm.config import LoRAConfig

 logger = logging.getLogger(__name__)
@ -5,16 +5,16 @@ from enum import Enum
 from functools import lru_cache
 from json import dumps as json_dumps
 from re import escape as regex_escape
-from typing import Union, Tuple
+from typing import Tuple, Union

 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase

-from vllm.entrypoints.openai.protocol import (CompletionRequest,
-                                              ChatCompletionRequest)
-from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
-                                                           RegexLogitsProcessor,
-                                                           CFGLogitsProcessor)
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest)
+from vllm.model_executor.guided_logits_processors import (CFGLogitsProcessor,
+                                                           JSONLogitsProcessor,
+                                                           RegexLogitsProcessor)


 class GuidedDecodingMode(Enum):
@ -16,13 +16,13 @@
 import json
 import math
 from collections import defaultdict
-from typing import Union, DefaultDict, Dict, List, Optional, Callable
+from typing import Callable, DefaultDict, Dict, List, Optional, Union

 import torch
+from outlines.fsm.fsm import CFGFSM, RegexFSM
+from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
-from outlines.fsm.fsm import RegexFSM, CFGFSM
-from outlines.fsm.json_schema import build_regex_from_schema


 class BaseLogitsProcessor:
@ -1,7 +1,5 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    fused_moe,
-    get_config_file_name,
-)
+    fused_moe, get_config_file_name)

 __all__ = [
     "fused_moe",
@ -5,14 +5,14 @@ import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter

+from vllm.logger import init_logger
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
 from vllm.model_executor.parallel_utils.utils import (
     divide, split_tensor_along_last_dim)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.logger import init_logger

 logger = init_logger(__name__)
@ -1,9 +1,9 @@
+from typing import Optional, Union
+
 import torch
 import triton
 import triton.language as tl

-from typing import Optional, Union
-

 def seeded_uniform(
     *size,
@ -1,5 +1,5 @@
 import math
-from typing import Tuple, Optional
+from typing import Optional, Tuple

 import torch
 import triton
@ -1,11 +1,11 @@
 from typing import Type

+from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
-from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig

 _QUANTIZATION_CONFIG_REGISTRY = {
     "awq": AWQConfig,
@ -1,7 +1,7 @@
 import enum
 from enum import Enum
-from typing import Any, Dict, List, Optional
 from fractions import Fraction
+from typing import Any, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter
@ -4,7 +4,8 @@ import torch
 from torch.nn.parameter import Parameter

 from vllm._C import ops
-from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
@ -1,9 +1,9 @@
-from typing import Tuple, Optional
 from functools import cached_property
+from typing import Optional, Tuple

 import torch
-import torch.nn as nn
 import torch.jit
+import torch.nn as nn


 class RejectionSampler(nn.Module):
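The reordering shown throughout these hunks can be reproduced locally through isort's Python API. What follows is a minimal sketch, assuming isort's default settings plus known_first_party=["vllm"]; the options the project actually sets in pyproject.toml are not part of this excerpt, and the input snippet is a hypothetical example modelled on the hunks above.

import isort

# Hypothetical, deliberately unsorted snippet similar to the files touched here.
messy = (
    "from typing import Tuple, Optional\n"
    "import os\n"
    "import torch\n"
    "from vllm.logger import init_logger\n"
)

# isort.code() returns the source with imports grouped into stdlib,
# third-party and first-party sections and sorted within each group.
clean = isort.code(messy, known_first_party=["vllm"])
print(clean)

# isort.check_code() is the API counterpart of running isort with
# --check-only: it returns True when the imports already satisfy the
# configured ordering and does not modify anything.
assert isort.check_code(clean, known_first_party=["vllm"])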
Some files were not shown because too many files have changed in this diff.