[CI] Try introducing isort. (#3495)
parent e67c295b0c
commit 01bfb22b41

.github/workflows/ruff.yml (vendored, 7 changed lines)
@@ -25,10 +25,13 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
     - name: Analysing the code with ruff
       run: |
         ruff .
     - name: Spelling check with codespell
       run: |
         codespell --toml pyproject.toml
+    - name: Run isort
+      run: |
+        isort . --check-only
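The new `isort . --check-only` step fails the build whenever a file's imports are not grouped and ordered the way isort would write them. As a rough illustration using isort's Python API (a sketch only, assuming isort 5.x is installed; the sample snippet is made up, not taken from this commit):

import isort

# Imports in the order a contributor might have written them.
messy = (
    "import torch\n"
    "import os\n"
    "from vllm import SamplingParams\n"
    "import argparse\n"
    "from vllm import LLM\n"
)

# check_code() mirrors what `--check-only` does per file: it returns False
# here because the imports are neither grouped nor alphabetized.
print(isort.check_code(messy))

# code() returns the text isort would write instead: stdlib imports first,
# then the remaining groups, with the two `from vllm import ...` lines
# merged into one, as in the hunks throughout this commit.
print(isort.code(messy))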
@@ -1,8 +1,7 @@
 import argparse
 import time

-from vllm import LLM
-from vllm import SamplingParams
+from vllm import LLM, SamplingParams

PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@ -25,15 +25,12 @@ from datetime import datetime
|
||||
from typing import AsyncGenerator, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from backend_request_func import (
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
RequestFuncInput,
|
||||
RequestFuncOutput,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -6,9 +6,9 @@ import time
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def sample_requests(
|
||||
|
@ -2,11 +2,13 @@ import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import triton
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import (fused_moe,
|
||||
get_config_file_name)
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
from typing import Optional
|
||||
import argparse
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
||||
from vllm._C import ops
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
||||
|
||||
NUM_BLOCKS = 1024
|
||||
PARTITION_SIZE = 512
|
||||
|
@ -1,9 +1,10 @@
|
||||
import argparse
|
||||
from itertools import accumulate
|
||||
from typing import Optional
|
||||
|
||||
import argparse
|
||||
import torch
|
||||
import nvtx
|
||||
from itertools import accumulate
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
|
||||
|
||||
|
@ -9,8 +9,8 @@
|
||||
#
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from torch.utils.hipify.hipify_python import hipify
|
||||
|
||||
|
@ -6,10 +6,10 @@
|
||||
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
|
||||
import datetime
|
||||
import locale
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
from collections import namedtuple
|
||||
|
||||
try:
|
||||
|
@ -10,10 +10,11 @@
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from sphinx.ext import autodoc
|
||||
import logging
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
import argparse
|
||||
from openai import OpenAI
|
||||
|
||||
import gradio as gr
|
||||
from openai import OpenAI
|
||||
|
||||
# Argument parser setup
|
||||
parser = argparse.ArgumentParser(
|
||||
|
@ -1,7 +1,7 @@
|
||||
import argparse
|
||||
from typing import List, Tuple
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
|
||||
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
|
||||
|
||||
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
|
||||
|
@ -5,11 +5,11 @@ for offline inference.
|
||||
Requires HuggingFace credentials for access to Llama2.
|
||||
"""
|
||||
|
||||
from typing import Optional, List, Tuple
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
|
||||
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
|
||||
|
@ -5,11 +5,13 @@ distributively on a multi-nodes cluster.
|
||||
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
|
||||
"""
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import ray
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
|
format.sh (42 changed lines)
@@ -25,6 +25,7 @@ YAPF_VERSION=$(yapf --version | awk '{print $2}')
 RUFF_VERSION=$(ruff --version | awk '{print $2}')
 MYPY_VERSION=$(mypy --version | awk '{print $2}')
 CODESPELL_VERSION=$(codespell --version)
+ISORT_VERSION=$(isort --vn)

 # # params: tool name, tool version, required version
 tool_version_check() {
@@ -37,6 +38,7 @@ tool_version_check() {
 tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"

 YAPF_FLAGS=(
@@ -178,6 +180,46 @@ else
     lint_changed
 fi

+# check import ordering of the specified files
+isort_check() {
+    isort "$@"
+}
+
+isort_check_all(){
+    isort .
+}
+
+# Import-order check of files that differ from main branch.
+isort_check_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause isort to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
+        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
+            isort
+    fi
+}
+
+# Run isort
+# This flag runs isort on individual files. --files *must* be the first command line
+# arg to use this option.
+if [[ "$1" == '--files' ]]; then
+   isort_check "${@:2}"
+# If `--all` is passed, then any further arguments are ignored and the
+# entire python directory is linted.
+elif [[ "$1" == '--all' ]]; then
+   isort_check_all
+else
+   # Check import ordering only of the files that changed in last commit.
+   isort_check_changed
+fi
+echo 'vLLM isort: Done'
+
 if ! git diff --quiet &>/dev/null; then
     echo 'Reformatted files. Please review and stage the changes.'
     echo 'Changes not staged for commit:'
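The `isort_check_changed` helper above limits isort to Python files that were added, copied, or modified relative to origin/main. A rough Python equivalent of that flow, for readers who prefer the API over the shell pipeline (a sketch only: it assumes git is on PATH and isort 5.x is importable, and it is not part of this commit):

import subprocess

import isort

# Mirror MERGEBASE="$(git merge-base origin/main HEAD)".
merge_base = subprocess.check_output(
    ["git", "merge-base", "origin/main", "HEAD"], text=True).strip()

# Mirror `git diff --name-only --diff-filter=ACM ... '*.py' '*.pyi'`.
changed = subprocess.check_output(
    ["git", "diff", "--name-only", "--diff-filter=ACM", merge_base,
     "--", "*.py", "*.pyi"], text=True).split()

# Sort imports in place only for the files that changed.
for path in changed:
    isort.file(path)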
@@ -51,3 +51,7 @@ exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
 [tool.codespell]
 ignore-words-list = "dout, te, indicies"
 skip = "./tests/prompts"
+
+[tool.isort]
+use_parentheses = true
+skip_gitignore = true
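Both options adjust isort's defaults rather than replace them: `skip_gitignore = true` keeps isort away from git-ignored files, and `use_parentheses = true` wraps long import lists in parentheses instead of backslash continuations, matching the parenthesized multi-line imports seen throughout these hunks. A small sketch of the latter (isort 5.x assumed; the package and names are hypothetical):

import isort

long_import = ("from mypackage.subsystem import alpha, beta, gamma, "
               "delta, epsilon\n")

# With use_parentheses (and a short line length to force wrapping), the
# continuation is enclosed in ( ... ) rather than ending in a backslash.
print(isort.code(long_import, use_parentheses=True, line_length=40))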
@@ -4,6 +4,7 @@ toml==0.10.2
 tomli==2.0.1
 ruff==0.1.5
 codespell==2.2.6
+isort==5.13.2

 # type checking
 mypy==0.991
setup.py (10 changed lines)
@@ -1,16 +1,16 @@
 import io
+import logging
 import os
 import re
-import logging
 import subprocess
 import sys
+from shutil import which
 from typing import List

-from packaging.version import parse, Version
-from setuptools import setup, find_packages, Extension
-from setuptools.command.build_ext import build_ext
-from shutil import which
 import torch
+from packaging.version import Version, parse
+from setuptools import Extension, find_packages, setup
+from setuptools.command.build_ext import build_ext
 from torch.utils.cpp_extension import CUDA_HOME

 ROOT_DIR = os.path.dirname(__file__)
@ -1,12 +1,12 @@
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
import pathlib
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
|
||||
__file__))).parent.parent / "examples/template_chatml.jinja"
|
||||
|
@ -6,8 +6,8 @@ import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.config import TokenizerPoolConfig
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
_TEST_DIR = os.path.dirname(__file__)
|
||||
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
|
||||
|
@ -1,13 +1,14 @@
|
||||
import pytest
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.block import PhysicalTokenBlock
|
||||
from vllm.core.block_manager import (UncachedBlockAllocator, BlockSpaceManager,
|
||||
AllocStatus)
|
||||
from vllm.core.block_manager import (AllocStatus, BlockSpaceManager,
|
||||
UncachedBlockAllocator)
|
||||
from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
|
||||
from vllm.utils import Device
|
||||
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
|
||||
|
||||
from .utils import create_dummy_prompt
|
||||
|
||||
|
@ -1,10 +1,11 @@
|
||||
from typing import List
|
||||
import pytest # noqa
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import SequenceGroup, Logprob
|
||||
from vllm.sequence import Logprob, SequenceGroup
|
||||
|
||||
from .utils import create_dummy_prompt
|
||||
|
||||
|
@ -3,14 +3,12 @@
|
||||
Run `pytest tests/distributed/test_comm_ops.py --forked`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
import ray
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.parallel_utils.communication_op import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_all_gather,
|
||||
broadcast_tensor_dict,
|
||||
)
|
||||
broadcast_tensor_dict, tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.test_utils import (init_test_distributed_environment,
|
||||
multi_process_tensor_parallel)
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
@ -1,11 +1,11 @@
|
||||
# This unit test should be moved to a new
|
||||
# tests/test_guided_decoding directory.
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
import torch
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor,
|
||||
JSONLogitsProcessor)
|
||||
from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
|
||||
RegexLogitsProcessor)
|
||||
|
||||
TEST_SCHEMA = {
|
||||
"type": "object",
|
||||
|
@ -1,22 +1,21 @@
|
||||
# imports for guided decoding tests
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import sys
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import requests
|
||||
# using Ray for overall ease of process management, parallel requests,
|
||||
# and debugging.
|
||||
import ray
|
||||
import openai # use the official client for correctness check
|
||||
import requests
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
# imports for guided decoding tests
|
||||
import json
|
||||
import jsonschema
|
||||
import re
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
|
||||
|
@ -1,4 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from vllm.utils import create_kv_caches_with_random
|
||||
|
||||
|
||||
|
@ -2,10 +2,10 @@ from typing import Type
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from allclose_default import get_default_atol, get_default_rtol
|
||||
|
||||
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
|
||||
NewGELU, SiluAndMul)
|
||||
from allclose_default import get_default_atol, get_default_rtol
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16, torch.float]
|
||||
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
|
||||
|
@ -3,13 +3,12 @@ from typing import List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from allclose_default import get_default_atol, get_default_rtol
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
||||
|
||||
from vllm._C import ops, cache_ops
|
||||
from vllm.utils import get_max_shared_memory_bytes
|
||||
from vllm.utils import is_hip
|
||||
from allclose_default import get_default_atol, get_default_rtol
|
||||
from vllm._C import cache_ops, ops
|
||||
from vllm.utils import get_max_shared_memory_bytes, is_hip
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
# This will change depending on the compute capability.
|
||||
|
@ -1,10 +1,9 @@
|
||||
import random
|
||||
from typing import Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
from vllm._C import cache_ops
|
||||
|
||||
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
|
||||
|
@ -7,8 +7,8 @@ import torch
|
||||
from transformers import MixtralConfig
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from vllm.model_executor.models.mixtral import MixtralMoE
|
||||
|
||||
|
||||
|
@ -1,9 +1,10 @@
|
||||
from itertools import accumulate
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from allclose_default import get_default_atol, get_default_rtol
|
||||
from itertools import accumulate
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
|
||||
IS_NEOX_STYLE = [True, False]
|
||||
|
@ -1,12 +1,13 @@
|
||||
import random
|
||||
import pytest
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
|
||||
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
|
||||
NUM_HEADS = [64]
|
||||
NUM_QUERIES_PER_KV = [1, 8, 64]
|
||||
HEAD_SIZES = [128]
|
||||
|
@ -1,7 +1,8 @@
|
||||
import torch
|
||||
import pytest
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.ops.rand import seeded_uniform
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
|
@ -1,15 +1,15 @@
|
||||
import gc
|
||||
|
||||
import torch
|
||||
import pytest
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
from vllm.model_executor.layers.ops.sample import (
|
||||
_uniform_to_exponential, sample, get_num_triton_sampler_splits,
|
||||
MAX_TRITON_N_COLS)
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits,
|
||||
sample)
|
||||
from vllm.model_executor.sampling_metadata import SamplingTensors
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size
|
||||
MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100
|
||||
|
@ -2,7 +2,7 @@ import contextlib
|
||||
import gc
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from unittest.mock import patch, MagicMock
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
@ -12,13 +12,13 @@ from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||
destroy_model_parallel, initialize_model_parallel)
|
||||
|
||||
|
@ -1,12 +1,14 @@
|
||||
import tempfile
|
||||
from random import sample
|
||||
from typing import List, Optional
|
||||
|
||||
import peft
|
||||
import pytest
|
||||
from random import sample
|
||||
import tempfile
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
|
||||
|
@ -1,32 +1,28 @@
|
||||
import pytest
|
||||
import random
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.lora.layers import (
|
||||
ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
QKVParallelLinearWithLora,
|
||||
VocabParallelEmbeddingWithLoRA,
|
||||
RowParallelLinearWithLoRA,
|
||||
LogitsProcessorWithLoRA,
|
||||
LoRAMapping,
|
||||
BaseLayerWithLoRA,
|
||||
)
|
||||
from vllm.lora.models import (LoRALayerWeights, convert_mapping,
|
||||
PackedLoRALayerWeights)
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||
LogitsProcessorWithLoRA, LoRAMapping,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
QKVParallelLinearWithLora,
|
||||
RowParallelLinearWithLoRA,
|
||||
VocabParallelEmbeddingWithLoRA)
|
||||
from vllm.lora.models import (LoRALayerWeights, PackedLoRALayerWeights,
|
||||
convert_mapping)
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear,
|
||||
QKVParallelLinear)
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding, ParallelLMHead)
|
||||
ParallelLMHead, VocabParallelEmbedding)
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
from .utils import DummyLoRAManager
|
||||
|
@ -3,6 +3,7 @@ import ray
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
@ -8,11 +8,11 @@ from torch import nn
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA)
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA)
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.models import (LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager, LoRAMapping)
|
||||
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
|
||||
WorkerLoRAManager)
|
||||
|
@ -1,8 +1,10 @@
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
from vllm.transformers_utils.tokenizer import get_lora_tokenizer
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
|
||||
from ..conftest import get_tokenizer_pool_config
|
||||
|
||||
|
||||
|
@ -2,8 +2,8 @@ from collections import OrderedDict
|
||||
|
||||
from torch import nn
|
||||
|
||||
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
|
||||
from vllm.utils import LRUCache
|
||||
from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule)
|
||||
|
||||
|
||||
def test_parse_fine_tuned_lora_name():
|
||||
|
@ -3,10 +3,10 @@ import random
|
||||
import tempfile
|
||||
from unittest.mock import patch
|
||||
|
||||
from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
|
||||
SchedulerConfig)
|
||||
from vllm.lora.models import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
|
||||
DeviceConfig, LoRAConfig)
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
|
||||
|
@ -11,9 +11,11 @@ up to 3 times to see if we pass.
|
||||
Run `pytest tests/models/test_marlin.py --forked`.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from dataclasses import dataclass
|
||||
|
||||
from vllm.model_executor.layers.quantization import (
|
||||
_QUANTIZATION_CONFIG_REGISTRY)
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
import pytest
|
||||
import torch
|
||||
from tests.conftest import VllmRunner
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from vllm import SamplingParams
|
||||
|
||||
MODELS = ["facebook/opt-125m"]
|
||||
|
@ -1,13 +1,12 @@
|
||||
"""Tests for rejection sampling."""
|
||||
import pytest
|
||||
from typing import List, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
CUDA_DEVICES = [
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
|
@ -1,11 +1,10 @@
|
||||
import random
|
||||
from typing import Tuple, List
|
||||
from typing import List, Optional, Tuple
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import GenerationConfig, GenerationMixin
|
||||
from typing import Optional
|
||||
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
@ -8,8 +8,8 @@ from itertools import combinations
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm import SamplingParams
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
RANDOM_SEEDS = list(range(5))
|
||||
|
@ -1,9 +1,9 @@
|
||||
import torch
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
|
||||
|
||||
from .utils import mock_worker, create_seq_group_metadata_from_prompts
|
||||
from .utils import create_seq_group_metadata_from_prompts, mock_worker
|
||||
|
||||
|
||||
@pytest.mark.parametrize('num_target_seq_ids', [100])
|
||||
|
@ -1,9 +1,9 @@
|
||||
import torch
|
||||
import math
|
||||
import pytest
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.spec_decode.metrics import AsyncMetricsCollector
|
||||
|
||||
|
||||
|
@ -1,18 +1,19 @@
|
||||
import torch
|
||||
import random
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from vllm.spec_decode.multi_step_worker import (MultiStepWorker,
|
||||
DraftModelTop1Proposer)
|
||||
from vllm.worker.worker import Worker
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.sequence import SamplerOutput
|
||||
from vllm.spec_decode.multi_step_worker import (DraftModelTop1Proposer,
|
||||
MultiStepWorker)
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
from .utils import (create_execute_model_data, create_worker,
|
||||
create_seq_group_metadata_from_prompts, zero_kv_cache,
|
||||
patch_execute_model_with_seeds,
|
||||
assert_logprobs_dict_allclose, create_batch)
|
||||
from .utils import (assert_logprobs_dict_allclose, create_batch,
|
||||
create_execute_model_data,
|
||||
create_seq_group_metadata_from_prompts, create_worker,
|
||||
patch_execute_model_with_seeds, zero_kv_cache)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('num_steps', list(range(1, 17)))
|
||||
|
@ -1,18 +1,20 @@
|
||||
import torch
|
||||
import random
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.spec_decode.interfaces import SpeculativeProposals
|
||||
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
|
||||
SpecDecodeWorkerMetrics)
|
||||
from vllm.spec_decode.multi_step_worker import MultiStepWorker
|
||||
from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
|
||||
split_num_cache_blocks_evenly)
|
||||
from vllm.spec_decode.interfaces import SpeculativeProposals
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from .utils import (mock_worker, create_batch, ExecuteModelData,
|
||||
create_sampler_output_list)
|
||||
from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics,
|
||||
AsyncMetricsCollector)
|
||||
|
||||
from .utils import (ExecuteModelData, create_batch, create_sampler_output_list,
|
||||
mock_worker)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('k', [1, 2, 6])
|
||||
|
@ -1,9 +1,9 @@
|
||||
from vllm.spec_decode.util import get_all_seq_ids
|
||||
from vllm.sequence import SequenceGroupMetadata
|
||||
from vllm.spec_decode.util import split_batch_by_proposal_len
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from vllm.sequence import SequenceGroupMetadata
|
||||
from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len
|
||||
|
||||
|
||||
def test_get_all_seq_ids():
|
||||
|
@ -1,17 +1,19 @@
|
||||
import torch
|
||||
from typing import List, Optional, Dict, Iterable, Union
|
||||
from dataclasses import dataclass, fields
|
||||
from itertools import count
|
||||
from typing import Dict, Iterable, List, Optional, Union
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from vllm.worker.worker import Worker
|
||||
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
|
||||
import torch
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData,
|
||||
SamplerOutput, SequenceGroupOutput, SequenceOutput)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from itertools import count
|
||||
from dataclasses import dataclass, fields
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (Logprob, SamplerOutput, SequenceData,
|
||||
SequenceGroupMetadata, SequenceGroupOutput,
|
||||
SequenceOutput)
|
||||
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
|
||||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -7,8 +7,8 @@ from typing import List, Optional
|
||||
import pytest
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
from vllm.sequence import Sequence
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
|
||||
# Make two prefixes with different first blocks.
|
||||
prefix_start = [("You are an expert"), ("You are a")]
|
||||
|
@ -1,6 +1,6 @@
|
||||
import pytest
|
||||
|
||||
from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupOutput, SequenceOutput
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -1,7 +1,9 @@
|
||||
from copy import deepcopy
|
||||
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
|
||||
|
||||
|
||||
def test_cached_tokenizer():
|
||||
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
|
@ -1,12 +1,12 @@
|
||||
from typing import Dict, List
|
||||
|
||||
import pytest
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from typing import List, Dict
|
||||
|
||||
from vllm.sequence import Sequence, Logprob, SamplingParams, SequenceGroup
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
from vllm.transformers_utils.tokenizer import detokenize_incrementally
|
||||
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.transformers_utils.tokenizer import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
|
||||
TRUTH = [
|
||||
"Hello here, this is a simple test",
|
||||
|
@ -1,14 +1,16 @@
|
||||
import os
|
||||
import pytest
|
||||
import asyncio
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
|
||||
RayTokenizerGroupPool)
|
||||
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
|
||||
TokenizerGroup)
|
||||
|
||||
from ..conftest import get_tokenizer_pool_config
|
||||
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
import torch
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.worker.worker import Worker
|
||||
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
|
||||
def test_swap() -> None:
|
||||
|
@ -1,4 +1,5 @@
|
||||
from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.attention.selector import get_attn_backend
|
||||
|
||||
|
@ -7,12 +7,13 @@ flashinfer for all the attention operations.
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
import torch
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata
|
||||
from vllm.attention.ops.paged_attn import (PagedAttention,
|
||||
PagedAttentionMetadata)
|
||||
|
||||
|
||||
class FlashAttentionBackend(AttentionBackend):
|
||||
|
@ -11,7 +11,8 @@ from xformers.ops.fmha.attn_bias import (AttentionBias,
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata
|
||||
from vllm.attention.ops.paged_attn import (PagedAttention,
|
||||
PagedAttentionMetadata)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import is_hip
|
||||
|
||||
|
@ -3,8 +3,7 @@ from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from vllm._C import cache_ops
|
||||
from vllm._C import ops
|
||||
from vllm._C import cache_ops, ops
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
|
||||
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
|
||||
|
@ -13,11 +13,13 @@ logger = init_logger(__name__)
|
||||
def get_attn_backend(dtype: torch.dtype) -> AttentionBackend:
|
||||
if _can_use_flash_attn(dtype):
|
||||
logger.info("Using FlashAttention backend.")
|
||||
from vllm.attention.backends.flash_attn import FlashAttentionBackend # noqa: F401
|
||||
from vllm.attention.backends.flash_attn import ( # noqa: F401
|
||||
FlashAttentionBackend)
|
||||
return FlashAttentionBackend
|
||||
else:
|
||||
logger.info("Using XFormers backend.")
|
||||
from vllm.attention.backends.xformers import XFormersBackend # noqa: F401
|
||||
from vllm.attention.backends.xformers import ( # noqa: F401
|
||||
XFormersBackend)
|
||||
return XFormersBackend
|
||||
|
||||
|
||||
|
@ -1,15 +1,15 @@
|
||||
from typing import TYPE_CHECKING, Optional, Union, ClassVar
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
from packaging.version import Version
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, ClassVar, Optional, Union
|
||||
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version
|
||||
from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
@ -103,7 +103,8 @@ class ModelConfig:
|
||||
if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
|
||||
# download model from ModelScope hub,
|
||||
# lazy import so that modelscope is not required for normal use.
|
||||
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C
|
||||
# pylint: disable=C.
|
||||
from modelscope.hub.snapshot_download import snapshot_download
|
||||
|
||||
if not os.path.exists(model):
|
||||
model_path = snapshot_download(model_id=model,
|
||||
|
@ -1,15 +1,15 @@
|
||||
"""A block manager that manages token blocks."""
|
||||
import enum
|
||||
from abc import ABC, abstractmethod
|
||||
from itertools import count, takewhile
|
||||
from os.path import commonprefix
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from vllm.block import BlockTable, PhysicalTokenBlock
|
||||
from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
|
||||
from vllm.utils import Device
|
||||
from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import enum
|
||||
from typing import OrderedDict
|
||||
from abc import ABC, abstractmethod, abstractproperty
|
||||
from typing import OrderedDict
|
||||
|
||||
from vllm.block import PhysicalTokenBlock
|
||||
|
||||
|
@ -1,13 +1,13 @@
|
||||
from collections import deque
|
||||
import enum
|
||||
import time
|
||||
from typing import Deque, Dict, Iterable, List, Optional, Tuple, Union, Set
|
||||
from collections import deque
|
||||
from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
|
||||
|
||||
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
||||
from vllm.core.block_manager import AllocStatus, BlockSpaceManager
|
||||
from vllm.core.policy import PolicyFactory
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
||||
SequenceGroupMetadata, SequenceStatus)
|
||||
|
||||
|
@ -3,9 +3,8 @@ import dataclasses
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, LoRAConfig,
|
||||
TokenizerPoolConfig)
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, TokenizerPoolConfig)
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -2,17 +2,17 @@ import asyncio
|
||||
import os
|
||||
import time
|
||||
from functools import partial
|
||||
from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type,
|
||||
Union, AsyncIterator)
|
||||
from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
|
||||
Set, Tuple, Type, Union)
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.engine.ray_utils import initialize_ray_cluster, ray
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
@ -4,22 +4,22 @@ from typing import Iterable, List, Optional, Tuple, Type, Union
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, LoRAConfig)
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.core.scheduler import Scheduler, SchedulerOutputs
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.engine.metrics import StatLogger, Stats
|
||||
from vllm.engine.ray_utils import initialize_ray_cluster
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
|
||||
SequenceGroupOutput, SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
|
||||
get_tokenizer_group)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
@ -1,11 +1,12 @@
|
||||
from vllm.logger import init_logger
|
||||
from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY,
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
|
||||
disable_created_metrics)
|
||||
|
||||
import time
|
||||
import numpy as np
|
||||
from typing import Dict, List
|
||||
from dataclasses import dataclass
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
import pickle
|
||||
|
||||
from typing import Optional, List, Tuple
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import is_hip, set_cuda_visible_devices, get_ip
|
||||
from vllm.utils import get_ip, is_hip, set_cuda_visible_devices
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -11,9 +11,9 @@ import json
|
||||
import ssl
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import JSONResponse, Response, StreamingResponse
|
||||
import uvicorn
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
|
@ -3,9 +3,9 @@ from typing import List, Optional, Union
|
||||
from tqdm import tqdm
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import Counter
|
||||
|
@ -1,28 +1,27 @@
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
import os
|
||||
import importlib
|
||||
import inspect
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from http import HTTPStatus
|
||||
|
||||
from prometheus_client import make_asgi_app
|
||||
import fastapi
|
||||
import uvicorn
|
||||
from http import HTTPStatus
|
||||
from fastapi import Request
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, StreamingResponse, Response
|
||||
from fastapi.responses import JSONResponse, Response, StreamingResponse
|
||||
from prometheus_client import make_asgi_app
|
||||
|
||||
import vllm
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (CompletionRequest,
|
||||
ChatCompletionRequest,
|
||||
ErrorResponse)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
CompletionRequest, ErrorResponse)
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from vllm.logger import init_logger
|
||||
|
||||
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
||||
|
||||
|
@ -3,12 +3,11 @@
|
||||
import time
|
||||
from typing import Dict, List, Literal, Optional, Union
|
||||
|
||||
import torch
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
import torch
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
|
||||
class ErrorResponse(BaseModel):
|
||||
|
@ -1,19 +1,21 @@
|
||||
import time
|
||||
import codecs
|
||||
import time
|
||||
from typing import AsyncGenerator, AsyncIterator, List, Optional, Union
|
||||
|
||||
from fastapi import Request
|
||||
from typing import AsyncGenerator, AsyncIterator, Optional, List, Union
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest, ChatCompletionResponse,
|
||||
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
|
||||
ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
|
||||
UsageInfo)
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
|
||||
from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.guided_decoding import (
|
||||
get_guided_decoding_logits_processor)
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -1,24 +1,23 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
|
||||
Optional, Tuple)
|
||||
|
||||
from fastapi import Request
|
||||
from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional,
|
||||
Dict, Tuple)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
CompletionRequest,
|
||||
CompletionResponse,
|
||||
CompletionResponseChoice,
|
||||
CompletionResponseStreamChoice,
|
||||
CompletionStreamResponse,
|
||||
LogProbs,
|
||||
UsageInfo,
|
||||
)
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
|
||||
from vllm.entrypoints.openai.protocol import (CompletionRequest,
|
||||
CompletionResponse,
|
||||
CompletionResponseChoice,
|
||||
CompletionResponseStreamChoice,
|
||||
CompletionStreamResponse,
|
||||
LogProbs, UsageInfo)
|
||||
from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.guided_decoding import (
|
||||
get_guided_decoding_logits_processor)
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -3,16 +3,16 @@ import json
|
||||
from dataclasses import dataclass
|
||||
from http import HTTPStatus
|
||||
from typing import Dict, List, Optional, Union
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.entrypoints.openai.protocol import (CompletionRequest,
|
||||
ChatCompletionRequest,
|
||||
ErrorResponse, LogProbs,
|
||||
ModelCard, ModelList,
|
||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
CompletionRequest, ErrorResponse,
|
||||
LogProbs, ModelCard, ModelList,
|
||||
ModelPermission)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import Logprob
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, LoRAConfig)
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||
|
||||
|
@ -1,13 +1,13 @@
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, LoRAConfig)
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
|
||||
from vllm.executor.utils import check_block_size_valid
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||
from vllm.utils import (get_ip, get_open_port, get_distributed_init_method,
|
||||
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
|
||||
make_async)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
@ -1,10 +1,10 @@
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, LoRAConfig)
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
@ -1,20 +1,20 @@
|
||||
import asyncio
|
||||
import copy
|
||||
from collections import defaultdict
|
||||
import os
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, LoRAConfig)
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.engine.ray_utils import RayWorkerVllm, ray
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
|
||||
from vllm.executor.utils import check_block_size_valid
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||
from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port,
|
||||
get_distributed_init_method, make_async)
|
||||
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
|
||||
make_async, set_cuda_visible_devices)
|
||||
|
||||
if ray is not None:
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
@ -343,7 +343,7 @@ class RayGPUExecutor(ExecutorBase):
|
||||
raise ValueError(f"Ray version {required_version} or greater is "
|
||||
f"required, but found {current_version}")
|
||||
|
||||
from ray.dag import MultiOutputNode, InputNode
|
||||
from ray.dag import InputNode, MultiOutputNode
|
||||
assert self.parallel_config.worker_use_ray
|
||||
|
||||
# Right now, compiled DAG requires at least 1 arg. We send
|
||||
|
@ -2,8 +2,8 @@
|
||||
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
|
||||
"""Logging configuration for vLLM."""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
import sys
|
||||
|
||||
VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
|
||||
|
||||
|
@ -10,18 +10,16 @@ from transformers import PretrainedConfig
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.punica import add_lora, add_lora_slice, bgmv
|
||||
from vllm.model_executor.parallel_utils.communication_op import (
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_gather,
|
||||
)
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
RowParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
MergedColumnParallelLinear)
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding, ParallelLMHead)
|
||||
ParallelLMHead, VocabParallelEmbedding)
|
||||
from vllm.model_executor.parallel_utils.communication_op import (
|
||||
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_gather)
|
||||
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
|
||||
from vllm.model_executor.parallel_utils.utils import (
|
||||
|
@ -1,6 +1,7 @@
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.utils import is_pin_memory_available
|
||||
|
||||
|
||||
|
@ -4,19 +4,18 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
from typing import (Callable, Dict, Hashable, List, Optional, Tuple, Type)
|
||||
from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type
|
||||
|
||||
import safetensors.torch
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.utils import LRUCache, is_pin_memory_available
|
||||
|
||||
from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer,
|
||||
from_layer_logits_processor)
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
|
||||
from vllm.utils import LRUCache, is_pin_memory_available
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -4,11 +4,11 @@ from typing import Any, Dict, List, Optional, Set, Type
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
from vllm.lora.models import (LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager, create_lora_manager)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
from vllm.config import LoRAConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -5,16 +5,16 @@ from enum import Enum
|
||||
from functools import lru_cache
|
||||
from json import dumps as json_dumps
|
||||
from re import escape as regex_escape
|
||||
from typing import Union, Tuple
|
||||
from typing import Tuple, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (CompletionRequest,
|
||||
ChatCompletionRequest)
|
||||
from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
|
||||
RegexLogitsProcessor,
|
||||
CFGLogitsProcessor)
|
||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
CompletionRequest)
|
||||
from vllm.model_executor.guided_logits_processors import (CFGLogitsProcessor,
|
||||
JSONLogitsProcessor,
|
||||
RegexLogitsProcessor)
|
||||
|
||||
|
||||
class GuidedDecodingMode(Enum):
|
||||
|
@ -16,13 +16,13 @@
|
||||
import json
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from typing import Union, DefaultDict, Dict, List, Optional, Callable
|
||||
from typing import Callable, DefaultDict, Dict, List, Optional, Union
|
||||
|
||||
import torch
|
||||
from outlines.fsm.fsm import CFGFSM, RegexFSM
|
||||
from outlines.fsm.json_schema import build_regex_from_schema
|
||||
from pydantic import BaseModel
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from outlines.fsm.fsm import RegexFSM, CFGFSM
|
||||
from outlines.fsm.json_schema import build_regex_from_schema
|
||||
|
||||
|
||||
class BaseLogitsProcessor:
|
||||
|
@ -1,7 +1,5 @@
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_moe,
|
||||
get_config_file_name,
|
||||
)
|
||||
fused_moe, get_config_file_name)
|
||||
|
||||
__all__ = [
|
||||
"fused_moe",
|
||||
|
@ -5,14 +5,14 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.parallel_utils.communication_op import (
|
||||
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
|
||||
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
|
||||
from vllm.model_executor.parallel_utils.communication_op import (
|
||||
tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
|
||||
from vllm.model_executor.parallel_utils.utils import (
|
||||
divide, split_tensor_along_last_dim)
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
|
||||
def seeded_uniform(
|
||||
*size,
|
||||
|
@ -1,5 +1,5 @@
|
||||
import math
|
||||
from typing import Tuple, Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import triton
|
||||
|
@ -1,11 +1,11 @@
|
||||
from typing import Type
|
||||
|
||||
from vllm.model_executor.layers.quantization.awq import AWQConfig
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.layers.quantization.awq import AWQConfig
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
|
||||
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
|
||||
from vllm.model_executor.layers.quantization.marlin import MarlinConfig
|
||||
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
|
||||
|
||||
_QUANTIZATION_CONFIG_REGISTRY = {
|
||||
"awq": AWQConfig,
|
||||
|
@ -1,7 +1,7 @@
|
||||
import enum
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
from fractions import Fraction
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import torch
|
||||
from torch.nn.parameter import Parameter
|
||||
|
@ -4,7 +4,8 @@ import torch
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm._C import ops
|
||||
from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs
|
||||
from vllm.model_executor.layers.linear import (LinearMethodBase,
|
||||
set_weight_attrs)
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
from typing import Tuple, Optional
|
||||
from functools import cached_property
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.jit
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class RejectionSampler(nn.Module):
|
||||
|
Some files were not shown because too many files have changed in this diff.