[CI] Try introducing isort. (#3495)

SangBin Cho 2024-03-25 23:59:47 +09:00 committed by GitHub
parent e67c295b0c
commit 01bfb22b41
144 changed files with 512 additions and 465 deletions
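For context on the hunks below: isort rewrites each file's import block so that imports are grouped into sections (standard library first, then external and project packages) separated by a blank line, each group is alphabetized, plain "import x" statements come before "from x import y" statements within a group, and repeated "from" imports of the same module are merged onto one line. A minimal sketch of the effect, reusing the imports from the first example hunk below:

# Before isort: mixed ordering and two separate `from vllm` import lines.
#   import time
#   from vllm import LLM
#   import argparse
#   from vllm import SamplingParams
#
# After running `isort .`: standard library imports first, alphabetized,
# then the two vllm imports combined into a single line in their own section.
import argparse
import time

from vllm import LLM, SamplingParams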

View File

@ -25,10 +25,13 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
codespell --toml pyproject.toml
- name: Run isort
run: |
isort . --check-only
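The new workflow step fails the build rather than rewriting files, because "isort . --check-only" only reports files whose imports would be re-ordered and exits with a non-zero status. The same check can be reproduced from Python through isort's programmatic API; a minimal sketch, assuming the documented isort.code / isort.check_code helpers and an illustrative source string that is not taken from this repository:

import isort

# An intentionally unsorted import block (illustrative only).
messy = (
    "import time\n"
    "from vllm import LLM\n"
    "import argparse\n"
    "from vllm import SamplingParams\n"
)

# check_code() mirrors `isort --check-only`: True only if already sorted.
print(isort.check_code(messy))
# code() mirrors a plain `isort` run on the same text: returns sorted source.
print(isort.code(messy))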

View File

@ -1,8 +1,7 @@
import argparse
import time
from vllm import LLM
from vllm import SamplingParams
from vllm import LLM, SamplingParams
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501

View File

@ -25,15 +25,12 @@ from datetime import datetime
from typing import AsyncGenerator, List, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
from vllm.transformers_utils.tokenizer import get_tokenizer
@dataclass

View File

@ -6,9 +6,9 @@ import time
from typing import List, Optional, Tuple
import torch
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from tqdm import tqdm
def sample_requests(

View File

@ -2,11 +2,13 @@ import json
import os
import sys
from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name
import torch
import torch.nn.functional as F
import triton
from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

View File

@ -1,12 +1,12 @@
from typing import Optional
import argparse
import random
import time
from typing import Optional
import torch
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
from vllm._C import ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
NUM_BLOCKS = 1024
PARTITION_SIZE = 512

View File

@ -1,9 +1,10 @@
import argparse
from itertools import accumulate
from typing import Optional
import argparse
import torch
import nvtx
from itertools import accumulate
import torch
from vllm.model_executor.layers.rotary_embedding import get_rope

View File

@ -9,8 +9,8 @@
#
import argparse
import shutil
import os
import shutil
from torch.utils.hipify.hipify_python import hipify

View File

@ -6,10 +6,10 @@
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
import datetime
import locale
import os
import re
import subprocess
import sys
import os
from collections import namedtuple
try:

View File

@ -10,10 +10,11 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import logging
import os
import sys
from sphinx.ext import autodoc
import logging
sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))

View File

@ -1,6 +1,7 @@
import argparse
from openai import OpenAI
import gradio as gr
from openai import OpenAI
# Argument parser setup
parser = argparse.ArgumentParser(

View File

@ -1,7 +1,7 @@
import argparse
from typing import List, Tuple
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:

View File

@ -5,11 +5,11 @@ for offline inference.
Requires HuggingFace credentials for access to Llama2.
"""
from typing import Optional, List, Tuple
from typing import List, Optional, Tuple
from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest

View File

@ -5,11 +5,13 @@ distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from vllm import LLM, SamplingParams
from typing import Dict
import numpy as np
import ray
from vllm import LLM, SamplingParams
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

View File

@ -25,6 +25,7 @@ YAPF_VERSION=$(yapf --version | awk '{print $2}')
RUFF_VERSION=$(ruff --version | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}')
CODESPELL_VERSION=$(codespell --version)
ISORT_VERSION=$(isort --vn)
# # params: tool name, tool version, required version
tool_version_check() {
@ -37,6 +38,7 @@ tool_version_check() {
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
YAPF_FLAGS=(
@ -178,6 +180,46 @@ else
lint_changed
fi
# check import ordering of specified files
isort_check() {
isort "$@"
}
isort_check_all(){
isort .
}
# Import-order check of files that differ from main branch.
isort_check_changed() {
# The `if` guard ensures that the list of filenames is not empty, which
# could cause isort to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
# exist on both branches.
MERGEBASE="$(git merge-base origin/main HEAD)"
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
isort
fi
}
# Run isort
# This flag runs the isort check on individual files. --files *must* be the first command line
# arg to use this option.
if [[ "$1" == '--files' ]]; then
isort_check "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is linted.
elif [[ "$1" == '--all' ]]; then
isort_check_all
else
# Check import order only of the files that changed in the last commit.
isort_check_changed
fi
echo 'vLLM isort: Done'
if ! git diff --quiet &>/dev/null; then
echo 'Reformatted files. Please review and stage the changes.'
echo 'Changes not staged for commit:'
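The isort_check_changed function above mirrors the script's existing pattern for ruff and codespell: find the merge base with origin/main, list the Python files added, copied, or modified since then, and run the tool only on those. A rough Python sketch of that flow, assuming isort's documented check_file helper; the code is an illustration, not part of format.sh:

import subprocess

import isort

# Merge base with origin/main, as computed by MERGEBASE in format.sh.
merge_base = subprocess.run(
    ["git", "merge-base", "origin/main", "HEAD"],
    capture_output=True, text=True, check=True,
).stdout.strip()

# Python files added/copied/modified relative to that merge base.
changed = subprocess.run(
    ["git", "diff", "--name-only", "--diff-filter=ACM", merge_base,
     "--", "*.py", "*.pyi"],
    capture_output=True, text=True, check=True,
).stdout.splitlines()

# isort.check_file returns False for any file it would reformat.
unsorted = [path for path in changed if not isort.check_file(path)]
if unsorted:
    print("Files with unsorted imports:", unsorted)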

View File

@ -51,3 +51,7 @@ exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
[tool.codespell]
ignore-words-list = "dout, te, indicies"
skip = "./tests/prompts"
[tool.isort]
use_parentheses = true
skip_gitignore = true
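use_parentheses makes isort wrap imports that exceed the line length with parentheses instead of backslash continuations, which is the hanging-indent style visible throughout the rewritten hunks, and skip_gitignore tells isort to skip any path that git already ignores. A small sketch of the wrapped style this configuration produces, reusing an import that appears in one of the test hunks above:

# With use_parentheses = true, a long import is wrapped like this
# rather than with a trailing backslash:
from vllm.core.block_manager import (AllocStatus, BlockSpaceManager,
                                     UncachedBlockAllocator)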

View File

@ -4,6 +4,7 @@ toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.2.6
isort==5.13.2
# type checking
mypy==0.991

View File

@ -1,16 +1,16 @@
import io
import logging
import os
import re
import logging
import subprocess
import sys
from shutil import which
from typing import List
from packaging.version import parse, Version
from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext
from shutil import which
import torch
from packaging.version import Version, parse
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
from torch.utils.cpp_extension import CUDA_HOME
ROOT_DIR = os.path.dirname(__file__)

View File

@ -1,12 +1,12 @@
from dataclasses import dataclass
import os
import pathlib
from dataclasses import dataclass
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.transformers_utils.tokenizer import get_tokenizer
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"

View File

@ -6,8 +6,8 @@ import torch
from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.config import TokenizerPoolConfig
from vllm.transformers_utils.tokenizer import get_tokenizer
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]

View File

@ -1,13 +1,14 @@
import pytest
import time
from typing import List
import pytest
from vllm import SamplingParams
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager import (UncachedBlockAllocator, BlockSpaceManager,
AllocStatus)
from vllm.core.block_manager import (AllocStatus, BlockSpaceManager,
UncachedBlockAllocator)
from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
from .utils import create_dummy_prompt

View File

@ -1,10 +1,11 @@
from typing import List
import pytest # noqa
import time
from typing import List
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup, Logprob
from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt

View File

@ -3,14 +3,12 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
import pytest
import torch
import ray
import torch
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_reduce,
tensor_model_parallel_all_gather,
broadcast_tensor_dict,
)
broadcast_tensor_dict, tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel)

View File

@ -1,6 +1,6 @@
import os
import random
import os
import pytest
import ray
import torch

View File

@ -1,11 +1,11 @@
# This unit test should be moved to a new
# tests/test_guided_decoding directory.
from transformers import AutoTokenizer
import torch
from transformers import AutoTokenizer
from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor,
JSONLogitsProcessor)
from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
RegexLogitsProcessor)
TEST_SCHEMA = {
"type": "object",

View File

@ -1,22 +1,21 @@
# imports for guided decoding tests
import json
import os
import re
import subprocess
import sys
import time
import sys
import jsonschema
import openai # use the official client for correctness check
import pytest
import requests
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
import openai # use the official client for correctness check
import requests
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
# imports for guided decoding tests
import json
import jsonschema
import re
from vllm.transformers_utils.tokenizer import get_tokenizer
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds

View File

@ -1,4 +1,5 @@
import pytest
from vllm.utils import create_kv_caches_with_random

View File

@ -2,10 +2,10 @@ from typing import Type
import pytest
import torch
from allclose_default import get_default_atol, get_default_rtol
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
NewGELU, SiluAndMul)
from allclose_default import get_default_atol, get_default_rtol
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing

View File

@ -3,13 +3,12 @@ from typing import List, Optional, Tuple
import pytest
import torch
from allclose_default import get_default_atol, get_default_rtol
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from vllm._C import ops, cache_ops
from vllm.utils import get_max_shared_memory_bytes
from vllm.utils import is_hip
from allclose_default import get_default_atol, get_default_rtol
from vllm._C import cache_ops, ops
from vllm.utils import get_max_shared_memory_bytes, is_hip
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.

View File

@ -1,10 +1,9 @@
import random
from typing import Tuple
import pytest
import torch
from typing import Tuple
from vllm._C import cache_ops
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]

View File

@ -7,8 +7,8 @@ import torch
from transformers import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.models.mixtral import MixtralMoE

View File

@ -1,9 +1,10 @@
from itertools import accumulate
from typing import List, Optional
import pytest
import torch
from allclose_default import get_default_atol, get_default_rtol
from itertools import accumulate
from vllm.model_executor.layers.rotary_embedding import get_rope
IS_NEOX_STYLE = [True, False]

View File

@ -1,12 +1,13 @@
import random
import pytest
import time
import pytest
import torch
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
from vllm.attention.ops.prefix_prefill import context_attention_fwd
NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 8, 64]
HEAD_SIZES = [128]

View File

@ -1,7 +1,8 @@
import torch
import pytest
import random
import pytest
import torch
from vllm.model_executor.layers.ops.rand import seeded_uniform
from vllm.model_executor.utils import set_random_seed

View File

@ -1,15 +1,15 @@
import gc
import torch
import pytest
import torch
import triton
import triton.language as tl
from vllm.model_executor.layers.ops.sample import (
_uniform_to_exponential, sample, get_num_triton_sampler_splits,
MAX_TRITON_N_COLS)
from vllm.model_executor.utils import set_random_seed
MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits,
sample)
from vllm.model_executor.sampling_metadata import SamplingTensors
from vllm.model_executor.utils import set_random_seed
SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size
MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100

View File

@ -2,7 +2,7 @@ import contextlib
import gc
import tempfile
from collections import OrderedDict
from unittest.mock import patch, MagicMock
from unittest.mock import MagicMock, patch
import pytest
import ray
@ -12,13 +12,13 @@ from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.parallel_utils.parallel_state import (
destroy_model_parallel, initialize_model_parallel)

View File

@ -1,12 +1,14 @@
import tempfile
from random import sample
from typing import List, Optional
import peft
import pytest
from random import sample
import tempfile
from transformers import AutoModelForCausalLM
import vllm
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"

View File

@ -1,32 +1,28 @@
import pytest
import random
from copy import deepcopy
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple
from typing import Dict, List, Optional, Tuple
import pytest
import torch
import torch.nn.functional as F
from vllm.lora.layers import (
ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
QKVParallelLinearWithLora,
VocabParallelEmbeddingWithLoRA,
RowParallelLinearWithLoRA,
LogitsProcessorWithLoRA,
LoRAMapping,
BaseLayerWithLoRA,
)
from vllm.lora.models import (LoRALayerWeights, convert_mapping,
PackedLoRALayerWeights)
from vllm.config import LoRAConfig
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
LogitsProcessorWithLoRA, LoRAMapping,
MergedColumnParallelLinearWithLoRA,
QKVParallelLinearWithLora,
RowParallelLinearWithLoRA,
VocabParallelEmbeddingWithLoRA)
from vllm.lora.models import (LoRALayerWeights, PackedLoRALayerWeights,
convert_mapping)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear,
QKVParallelLinear)
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding, ParallelLMHead)
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.utils import set_random_seed
from .utils import DummyLoRAManager

View File

@ -3,6 +3,7 @@ import ray
import vllm
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "meta-llama/Llama-2-7b-hf"

View File

@ -8,11 +8,11 @@ from torch import nn
from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA)
MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA)
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.models import (LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, LoRAMapping)
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager)
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)

View File

@ -1,8 +1,10 @@
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from vllm.transformers_utils.tokenizer import get_lora_tokenizer
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from ..conftest import get_tokenizer_pool_config

View File

@ -2,8 +2,8 @@ from collections import OrderedDict
from torch import nn
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
from vllm.utils import LRUCache
from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule)
def test_parse_fine_tuned_lora_name():

View File

@ -3,10 +3,10 @@ import random
import tempfile
from unittest.mock import patch
from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
DeviceConfig, LoRAConfig)
from vllm.worker.worker import Worker

View File

@ -11,9 +11,11 @@ up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py --forked`.
"""
from dataclasses import dataclass
import pytest
import torch
from dataclasses import dataclass
from vllm.model_executor.layers.quantization import (
_QUANTIZATION_CONFIG_REGISTRY)

View File

@ -1,7 +1,7 @@
import pytest
import torch
from tests.conftest import VllmRunner
from tests.conftest import VllmRunner
from vllm import SamplingParams
MODELS = ["facebook/opt-125m"]

View File

@ -1,13 +1,12 @@
"""Tests for rejection sampling."""
import pytest
from typing import List, Tuple
import pytest
import torch
import torch.nn.functional as F
from vllm.model_executor.utils import set_random_seed
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)

View File

@ -1,11 +1,10 @@
import random
from typing import Tuple, List
from typing import List, Optional, Tuple
from unittest.mock import patch
import pytest
import torch
from transformers import GenerationConfig, GenerationMixin
from typing import Optional
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.utils import set_random_seed

View File

@ -8,8 +8,8 @@ from itertools import combinations
import pytest
from vllm.model_executor.utils import set_random_seed
from vllm import SamplingParams
from vllm.model_executor.utils import set_random_seed
MODEL = "facebook/opt-125m"
RANDOM_SEEDS = list(range(5))

View File

@ -1,9 +1,9 @@
import torch
import pytest
import torch
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from .utils import mock_worker, create_seq_group_metadata_from_prompts
from .utils import create_seq_group_metadata_from_prompts, mock_worker
@pytest.mark.parametrize('num_target_seq_ids', [100])

View File

@ -1,9 +1,9 @@
import torch
import math
import pytest
from unittest.mock import MagicMock
import pytest
import torch
from vllm.spec_decode.metrics import AsyncMetricsCollector

View File

@ -1,18 +1,19 @@
import torch
import random
import pytest
from unittest.mock import MagicMock
from vllm.spec_decode.multi_step_worker import (MultiStepWorker,
DraftModelTop1Proposer)
from vllm.worker.worker import Worker
import pytest
import torch
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplerOutput
from vllm.spec_decode.multi_step_worker import (DraftModelTop1Proposer,
MultiStepWorker)
from vllm.worker.worker import Worker
from .utils import (create_execute_model_data, create_worker,
create_seq_group_metadata_from_prompts, zero_kv_cache,
patch_execute_model_with_seeds,
assert_logprobs_dict_allclose, create_batch)
from .utils import (assert_logprobs_dict_allclose, create_batch,
create_execute_model_data,
create_seq_group_metadata_from_prompts, create_worker,
patch_execute_model_with_seeds, zero_kv_cache)
@pytest.mark.parametrize('num_steps', list(range(1, 17)))

View File

@ -1,18 +1,20 @@
import torch
import random
import pytest
from unittest.mock import MagicMock
import pytest
import torch
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
SpecDecodeWorkerMetrics)
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
split_num_cache_blocks_evenly)
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.model_executor.utils import set_random_seed
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from .utils import (mock_worker, create_batch, ExecuteModelData,
create_sampler_output_list)
from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics,
AsyncMetricsCollector)
from .utils import (ExecuteModelData, create_batch, create_sampler_output_list,
mock_worker)
@pytest.mark.parametrize('k', [1, 2, 6])

View File

@ -1,9 +1,9 @@
from vllm.spec_decode.util import get_all_seq_ids
from vllm.sequence import SequenceGroupMetadata
from vllm.spec_decode.util import split_batch_by_proposal_len
from unittest.mock import MagicMock
import pytest
from unittest.mock import MagicMock
from vllm.sequence import SequenceGroupMetadata
from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len
def test_get_all_seq_ids():

View File

@ -1,17 +1,19 @@
import torch
from typing import List, Optional, Dict, Iterable, Union
from dataclasses import dataclass, fields
from itertools import count
from typing import Dict, Iterable, List, Optional, Union
from unittest.mock import MagicMock
from vllm.worker.worker import Worker
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
import torch
from vllm.engine.arg_utils import EngineArgs
from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData,
SamplerOutput, SequenceGroupOutput, SequenceOutput)
from vllm.sampling_params import SamplingParams
from vllm.worker.cache_engine import CacheEngine
from vllm.model_executor.utils import set_random_seed
from itertools import count
from dataclasses import dataclass, fields
from vllm.sampling_params import SamplingParams
from vllm.sequence import (Logprob, SamplerOutput, SequenceData,
SequenceGroupMetadata, SequenceGroupOutput,
SequenceOutput)
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.worker import Worker
@dataclass

View File

@ -7,8 +7,8 @@ from typing import List, Optional
import pytest
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.sequence import Sequence
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
# Make two prefixes with different first blocks.
prefix_start = [("You are an expert"), ("You are a")]

View File

@ -1,6 +1,6 @@
import pytest
from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput
from vllm.sequence import SamplerOutput, SequenceGroupOutput, SequenceOutput
@pytest.fixture

View File

@ -1,7 +1,9 @@
from copy import deepcopy
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
def test_cached_tokenizer():
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")

View File

@ -1,12 +1,12 @@
from typing import Dict, List
import pytest
from transformers import AutoTokenizer
from typing import List, Dict
from vllm.sequence import Sequence, Logprob, SamplingParams, SequenceGroup
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from vllm.transformers_utils.tokenizer import detokenize_incrementally
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer import detokenize_incrementally
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
TRUTH = [
"Hello here, this is a simple test",

View File

@ -1,14 +1,16 @@
import os
import pytest
import asyncio
import os
from unittest.mock import patch
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
RayTokenizerGroupPool)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
TokenizerGroup)
from ..conftest import get_tokenizer_pool_config

View File

@ -1,8 +1,8 @@
import torch
from vllm.engine.arg_utils import EngineArgs
from vllm.worker.worker import Worker
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.worker import Worker
def test_swap() -> None:

View File

@ -1,4 +1,5 @@
from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
from vllm.attention.backends.abstract import (AttentionBackend,
AttentionMetadata)
from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend

View File

@ -7,12 +7,13 @@ flashinfer for all the attention operations.
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Type
from flash_attn import flash_attn_varlen_func
import torch
from flash_attn import flash_attn_varlen_func
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata)
from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata
from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata)
class FlashAttentionBackend(AttentionBackend):

View File

@ -11,7 +11,8 @@ from xformers.ops.fmha.attn_bias import (AttentionBias,
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata)
from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata
from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata)
from vllm.logger import init_logger
from vllm.utils import is_hip

View File

@ -3,8 +3,7 @@ from typing import Dict, List, Optional, Tuple
import torch
from vllm._C import cache_ops
from vllm._C import ops
from vllm._C import cache_ops, ops
from vllm.attention.ops.prefix_prefill import context_attention_fwd
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.

View File

@ -13,11 +13,13 @@ logger = init_logger(__name__)
def get_attn_backend(dtype: torch.dtype) -> AttentionBackend:
if _can_use_flash_attn(dtype):
logger.info("Using FlashAttention backend.")
from vllm.attention.backends.flash_attn import FlashAttentionBackend # noqa: F401
from vllm.attention.backends.flash_attn import ( # noqa: F401
FlashAttentionBackend)
return FlashAttentionBackend
else:
logger.info("Using XFormers backend.")
from vllm.attention.backends.xformers import XFormersBackend # noqa: F401
from vllm.attention.backends.xformers import ( # noqa: F401
XFormersBackend)
return XFormersBackend

View File

@ -1,15 +1,15 @@
from typing import TYPE_CHECKING, Optional, Union, ClassVar
from dataclasses import dataclass
import os
from packaging.version import Version
import json
import os
from dataclasses import dataclass
from typing import TYPE_CHECKING, ClassVar, Optional, Union
import torch
from packaging.version import Version
from transformers import PretrainedConfig
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version
from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@ -103,7 +103,8 @@ class ModelConfig:
if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C
# pylint: disable=C.
from modelscope.hub.snapshot_download import snapshot_download
if not os.path.exists(model):
model_path = snapshot_download(model_id=model,

View File

@ -1,15 +1,15 @@
"""A block manager that manages token blocks."""
import enum
from abc import ABC, abstractmethod
from itertools import count, takewhile
from os.path import commonprefix
from typing import Dict, List, Optional, Set, Tuple
from abc import ABC, abstractmethod
from vllm.block import BlockTable, PhysicalTokenBlock
from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
from vllm.logger import init_logger
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor
from vllm.logger import init_logger
logger = init_logger(__name__)

View File

@ -1,6 +1,6 @@
import enum
from typing import OrderedDict
from abc import ABC, abstractmethod, abstractproperty
from typing import OrderedDict
from vllm.block import PhysicalTokenBlock

View File

@ -1,13 +1,13 @@
from collections import deque
import enum
import time
from typing import Deque, Dict, Iterable, List, Optional, Tuple, Union, Set
from collections import deque
from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.block_manager import AllocStatus, BlockSpaceManager
from vllm.core.policy import PolicyFactory
from vllm.lora.request import LoRARequest
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
SequenceGroupMetadata, SequenceStatus)

View File

@ -3,9 +3,8 @@ import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig,
TokenizerPoolConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, TokenizerPoolConfig)
@dataclass

View File

@ -2,17 +2,17 @@ import asyncio
import os
import time
from functools import partial
from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type,
Union, AsyncIterator)
from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
Set, Tuple, Type, Union)
from transformers import PreTrainedTokenizer
from vllm.lora.request import LoRARequest
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.ray_utils import initialize_ray_cluster, ray
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams

View File

@ -4,22 +4,22 @@ from typing import Iterable, List, Optional, Tuple, Type, Union
from transformers import PreTrainedTokenizer
import vllm
from vllm.lora.request import LoRARequest
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.executor.executor_base import ExecutorBase
from vllm.engine.metrics import StatLogger, Stats
from vllm.engine.ray_utils import initialize_ray_cluster
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
SequenceGroupOutput, SequenceOutput, SequenceStatus)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
get_tokenizer_group)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter
logger = init_logger(__name__)

View File

@ -1,11 +1,12 @@
from vllm.logger import init_logger
from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY,
import time
from dataclasses import dataclass
from typing import Dict, List
import numpy as np
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
disable_created_metrics)
import time
import numpy as np
from typing import Dict, List
from dataclasses import dataclass
from vllm.logger import init_logger
logger = init_logger(__name__)

View File

@ -1,10 +1,9 @@
import pickle
from typing import Optional, List, Tuple
from typing import List, Optional, Tuple
from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.utils import is_hip, set_cuda_visible_devices, get_ip
from vllm.utils import get_ip, is_hip, set_cuda_visible_devices
logger = init_logger(__name__)

View File

@ -11,9 +11,9 @@ import json
import ssl
from typing import AsyncGenerator
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
import uvicorn
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

View File

@ -3,9 +3,9 @@ from typing import List, Optional, Union
from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.lora.request import LoRARequest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.utils import Counter

View File

@ -1,28 +1,27 @@
import asyncio
from contextlib import asynccontextmanager
import os
import importlib
import inspect
import os
from contextlib import asynccontextmanager
from http import HTTPStatus
from prometheus_client import make_asgi_app
import fastapi
import uvicorn
from http import HTTPStatus
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse, Response
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import make_asgi_app
import vllm
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (CompletionRequest,
ChatCompletionRequest,
ErrorResponse)
from vllm.logger import init_logger
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.logger import init_logger
TIMEOUT_KEEP_ALIVE = 5 # seconds

View File

@ -3,12 +3,11 @@
import time
from typing import Dict, List, Literal, Optional, Union
import torch
from pydantic import BaseModel, Field, model_validator
from vllm.utils import random_uuid
from vllm.sampling_params import SamplingParams
import torch
from vllm.utils import random_uuid
class ErrorResponse(BaseModel):

View File

@ -1,19 +1,21 @@
import time
import codecs
import time
from typing import AsyncGenerator, AsyncIterator, List, Optional, Union
from fastapi import Request
from typing import AsyncGenerator, AsyncIterator, Optional, List, Union
from vllm.logger import init_logger
from vllm.utils import random_uuid
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest, ChatCompletionResponse,
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
UsageInfo)
from vllm.outputs import RequestOutput
from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
from vllm.logger import init_logger
from vllm.model_executor.guided_decoding import (
get_guided_decoding_logits_processor)
from vllm.outputs import RequestOutput
from vllm.utils import random_uuid
logger = init_logger(__name__)

View File

@ -1,24 +1,23 @@
import asyncio
import time
from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
Optional, Tuple)
from fastapi import Request
from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional,
Dict, Tuple)
from vllm.logger import init_logger
from vllm.utils import random_uuid
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (
CompletionRequest,
CompletionResponse,
CompletionResponseChoice,
CompletionResponseStreamChoice,
CompletionStreamResponse,
LogProbs,
UsageInfo,
)
from vllm.outputs import RequestOutput
from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
from vllm.entrypoints.openai.protocol import (CompletionRequest,
CompletionResponse,
CompletionResponseChoice,
CompletionResponseStreamChoice,
CompletionStreamResponse,
LogProbs, UsageInfo)
from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
from vllm.logger import init_logger
from vllm.model_executor.guided_decoding import (
get_guided_decoding_logits_processor)
from vllm.outputs import RequestOutput
from vllm.utils import random_uuid
logger = init_logger(__name__)

View File

@ -3,16 +3,16 @@ import json
from dataclasses import dataclass
from http import HTTPStatus
from typing import Dict, List, Optional, Union
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (CompletionRequest,
ChatCompletionRequest,
ErrorResponse, LogProbs,
ModelCard, ModelList,
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest, ErrorResponse,
LogProbs, ModelCard, ModelList,
ModelPermission)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import get_tokenizer
logger = init_logger(__name__)

View File

@ -1,8 +1,8 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig)
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata

View File

@ -1,13 +1,13 @@
from typing import Dict, List, Optional
from vllm.lora.request import LoRARequest
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.utils import check_block_size_valid
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import (get_ip, get_open_port, get_distributed_init_method,
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
logger = init_logger(__name__)

View File

@ -1,10 +1,10 @@
from typing import Dict, List, Optional
from vllm.lora.request import LoRARequest
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
logger = init_logger(__name__)

View File

@ -1,20 +1,20 @@
import asyncio
import copy
from collections import defaultdict
import os
import pickle
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig)
from vllm.engine.ray_utils import RayWorkerVllm, ray
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.utils import check_block_size_valid
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port,
get_distributed_init_method, make_async)
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async, set_cuda_visible_devices)
if ray is not None:
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@ -343,7 +343,7 @@ class RayGPUExecutor(ExecutorBase):
raise ValueError(f"Ray version {required_version} or greater is "
f"required, but found {current_version}")
from ray.dag import MultiOutputNode, InputNode
from ray.dag import InputNode, MultiOutputNode
assert self.parallel_config.worker_use_ray
# Right now, compiled DAG requires at least 1 arg. We send

View File

@ -2,8 +2,8 @@
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
"""Logging configuration for vLLM."""
import logging
import sys
import os
import sys
VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))

View File

@ -10,18 +10,16 @@ from transformers import PretrainedConfig
from vllm.config import LoRAConfig
from vllm.lora.punica import add_lora, add_lora_slice, bgmv
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_gather,
tensor_model_parallel_all_reduce,
tensor_model_parallel_gather,
)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear,
MergedColumnParallelLinear,
QKVParallelLinear,
MergedColumnParallelLinear)
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding, ParallelLMHead)
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce,
tensor_model_parallel_gather)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.utils import (

View File

@ -1,6 +1,7 @@
from typing import List, Optional
import torch
from vllm.utils import is_pin_memory_available

View File

@ -4,19 +4,18 @@ import logging
import math
import os
import re
from typing import (Callable, Dict, Hashable, List, Optional, Tuple, Type)
from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type
import safetensors.torch
import torch
from torch import nn
from vllm.config import LoRAConfig
from vllm.utils import LRUCache, is_pin_memory_available
from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer,
from_layer_logits_processor)
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
from vllm.utils import LRUCache, is_pin_memory_available
logger = logging.getLogger(__name__)

View File

@ -4,11 +4,11 @@ from typing import Any, Dict, List, Optional, Set, Type
import torch
from vllm.config import LoRAConfig
from vllm.lora.layers import LoRAMapping
from vllm.lora.models import (LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, create_lora_manager)
from vllm.lora.request import LoRARequest
from vllm.lora.layers import LoRAMapping
from vllm.config import LoRAConfig
logger = logging.getLogger(__name__)

View File

@ -5,16 +5,16 @@ from enum import Enum
from functools import lru_cache
from json import dumps as json_dumps
from re import escape as regex_escape
from typing import Union, Tuple
from typing import Tuple, Union
from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.protocol import (CompletionRequest,
ChatCompletionRequest)
from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
RegexLogitsProcessor,
CFGLogitsProcessor)
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest)
from vllm.model_executor.guided_logits_processors import (CFGLogitsProcessor,
JSONLogitsProcessor,
RegexLogitsProcessor)
class GuidedDecodingMode(Enum):

View File

@ -16,13 +16,13 @@
import json
import math
from collections import defaultdict
from typing import Union, DefaultDict, Dict, List, Optional, Callable
from typing import Callable, DefaultDict, Dict, List, Optional, Union
import torch
from outlines.fsm.fsm import CFGFSM, RegexFSM
from outlines.fsm.json_schema import build_regex_from_schema
from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase
from outlines.fsm.fsm import RegexFSM, CFGFSM
from outlines.fsm.json_schema import build_regex_from_schema
class BaseLogitsProcessor:

View File

@ -1,7 +1,5 @@
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_moe,
get_config_file_name,
)
fused_moe, get_config_file_name)
__all__ = [
"fused_moe",

View File

@ -5,14 +5,14 @@ import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
from vllm.model_executor.parallel_utils.utils import (
divide, split_tensor_along_last_dim)
from vllm.model_executor.utils import set_weight_attrs
from vllm.logger import init_logger
logger = init_logger(__name__)

View File

@ -1,9 +1,9 @@
from typing import Optional, Union
import torch
import triton
import triton.language as tl
from typing import Optional, Union
def seeded_uniform(
*size,

View File

@ -1,5 +1,5 @@
import math
from typing import Tuple, Optional
from typing import Optional, Tuple
import torch
import triton

View File

@ -1,11 +1,11 @@
from typing import Type
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
from vllm.model_executor.layers.quantization.marlin import MarlinConfig
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
_QUANTIZATION_CONFIG_REGISTRY = {
"awq": AWQConfig,

View File

@ -1,7 +1,7 @@
import enum
from enum import Enum
from typing import Any, Dict, List, Optional
from fractions import Fraction
from typing import Any, Dict, List, Optional
import torch
from torch.nn.parameter import Parameter

View File

@ -4,7 +4,8 @@ import torch
from torch.nn.parameter import Parameter
from vllm._C import ops
from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs
from vllm.model_executor.layers.linear import (LinearMethodBase,
set_weight_attrs)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)

View File

@ -1,9 +1,9 @@
from typing import Tuple, Optional
from functools import cached_property
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.jit
import torch.nn as nn
class RejectionSampler(nn.Module):

Some files were not shown because too many files have changed in this diff.