Update pre-commit hooks (#12475)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

parent: 6116ca8cd7
commit: 823ab79633
@@ -3,18 +3,18 @@ default_stages:
   - manual # Run in CI
 repos:
 - repo: https://github.com/google/yapf
-  rev: v0.32.0
+  rev: v0.43.0
   hooks:
   - id: yapf
     args: [--in-place, --verbose]
     additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.5
+  rev: v0.9.3
   hooks:
   - id: ruff
     args: [--output-format, github]
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.3.0
+  rev: v2.4.0
   hooks:
   - id: codespell
     exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
@@ -23,7 +23,7 @@ repos:
   hooks:
   - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v18.1.5
+  rev: v19.1.7
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
   - id: pymarkdown
     files: docs/.*
 - repo: https://github.com/rhysd/actionlint
-  rev: v1.7.6
+  rev: v1.7.7
   hooks:
   - id: actionlint
 - repo: local
@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
 )

 # Traffic
-result_json["request_rate"] = (
-args.request_rate if args.request_rate < float("inf") else "inf")
+result_json["request_rate"] = (args.request_rate if args.request_rate
+< float("inf") else "inf")
 result_json["burstiness"] = args.burstiness
 result_json["max_concurrency"] = args.max_concurrency

@@ -38,9 +38,13 @@ struct Signal {
 alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };

-struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
+struct __align__(16) RankData {
+const void* __restrict__ ptrs[8];
+};

-struct __align__(16) RankSignals { Signal* signals[8]; };
+struct __align__(16) RankSignals {
+Signal* signals[8];
+};

 // like std::array, but aligned
 template <typename T, int sz>
@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);

 const int SUB = 0x64006400;
 const int MUL = 0x2c002c00;
@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {

 // Guarantee that the `(a & b) | c` operations are LOP3s.

-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 q >>= 4;
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

 typename ScalarType<nv_bfloat16>::FragB frag_b;
 static constexpr uint32_t MUL = 0x3F803F80;
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);

 const int SUB = 0x64006400;
 const int MUL = 0x2c002c00;
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {

 // Guarantee that the `(a & b) | c` operations are LOP3s.

-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
 q >>= 4;
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

 typename ScalarType<nv_bfloat16>::FragB frag_b;
 static constexpr uint32_t MUL = 0x3F803F80;
@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;
@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
 static constexpr uint32_t HI = 0x00f000f0;
 static constexpr uint32_t EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 static constexpr uint32_t SUB = 0x64086408;
@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
 const int HI = 0x00f000f0;
 const int EX = 0x64006400;
 // Guarantee that the `(a & b) | c` operations are LOP3s.
-int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
-int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
+int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
 // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
 // directly into `SUB` and `ADD`.
 const int SUB = 0x64086408;
@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
 // max_num_partitions, head_size]
 const int* __restrict__ context_lens, // [num_seqs]
-const int max_num_partitions){UNREACHABLE_CODE}
+const int max_num_partitions) {
+UNREACHABLE_CODE
+}

 #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

setup.py

@@ -417,7 +417,7 @@ def get_rocm_version():

 if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
 ctypes.byref(patch)) == 0):
-return "%d.%d.%d" % (major.value, minor.value, patch.value)
+return f"{major.value}.{minor.value}.{patch.value}"
 return None
 except Exception:
 return None
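
The change above swaps printf-style "%" formatting for an f-string; both render the same version string. A minimal sketch of the equivalence, illustration only (not part of the patch) and using made-up version numbers rather than values read from a real ROCm install:

import ctypes

# Hypothetical version components; in get_rocm_version() these would be
# filled in by the get_rocm_core_version() call shown above via ctypes.byref().
major, minor, patch = ctypes.c_uint32(6), ctypes.c_uint32(2), ctypes.c_uint32(4)

old_style = "%d.%d.%d" % (major.value, minor.value, patch.value)
new_style = f"{major.value}.{minor.value}.{patch.value}"
assert old_style == new_style == "6.2.4"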
@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
 A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
 ]
 B_tiles = [[
-B[j * block_n:min((j + 1) * block_n, N),
-i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
+B[
+j * block_n:min((j + 1) * block_n, N),
+i * block_k:min((i + 1) * block_k, K),
+] for i in range(k_tiles)
 ] for j in range(n_tiles)]
 C_tiles = [
 C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
@@ -157,9 +159,9 @@ def setup_cuda():
 torch.set_default_device("cuda")


-@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
-itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
-SEEDS))
+@pytest.mark.parametrize(
+"num_tokens,d,dtype,group_size,seed",
+itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
 @torch.inference_mode()
 def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
 torch.manual_seed(seed)
@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
 assert torch.allclose(scale, ref_scale)


-@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
-itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
-SEEDS))
+@pytest.mark.parametrize(
+"M,N,K,block_size,out_dtype,seed",
+itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 torch.manual_seed(seed)
@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 assert rel_diff < 0.001


-@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
-itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
-BLOCK_SIZE, DTYPES, SEEDS))
+@pytest.mark.parametrize(
+"M,N,K,E,topk,block_size,dtype,seed",
+itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
+SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
 torch.manual_seed(seed)
@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
 assert buffer.buffer_size == 0
 assert len(buffer.buffer) == 0

-print("My rank: %d, device: %s" % (my_rank, device))
+print(f"My rank: {my_rank}, device: {device}")

 # insert
 tokens = torch.tensor([1, 2, 3]).to(device)
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
 assert buffer.buffer_size == 0
 assert len(buffer.buffer) == 0

-print("My rank: %d, Test run passed!" % (my_rank))
+print(f"My rank: {my_rank}, Test run passed!")


 def stress_test(my_rank, buf, device):
@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
 assert torch.allclose(k, k_)
 assert torch.allclose(v, v_)
 assert torch.allclose(h, h_)
-print('Rank %d done' % my_rank)
+print(f"Rank {my_rank} done")
 torch.distributed.barrier()

 if my_rank == 0:
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
 else:
 torch.distributed.send(torch.tensor([n]), 0)

-print("My rank: %d, Passed stress test!" % (my_rank))
+print(f"My rank: {my_rank}, Passed stress test!")


 if __name__ == "__main__":
@@ -122,7 +122,7 @@ if __name__ == "__main__":
 rank=my_rank,
 )

-print("initialized! My rank is %d" % my_rank)
+print(f"initialized! My rank is {my_rank}")

 config = KVTransferConfig(
 kv_connector='PyNcclConnector',
@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
 return generated_texts


-@pytest.mark.xfail(current_platform.is_rocm(),
-reason="Qwen2-VL dependency xformers incompatible with ROCm"
-)
+@pytest.mark.xfail(
+current_platform.is_rocm(),
+reason="Qwen2-VL dependency xformers incompatible with ROCm")
 def test_qwen2vl_lora(qwen2vl_lora_files):
 llm = vllm.LLM(
 MODEL_PATH,
@@ -521,12 +521,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
 # - image embeddings
 # - video
 # - custom inputs
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.IMAGE,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.IMAGE,
+fork_new_process_for_each_test=False,
+))
 def test_single_image_models(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
@@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.MULTI_IMAGE,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.MULTI_IMAGE,
+fork_new_process_for_each_test=False,
+))
 def test_multi_image_models(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
@@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.EMBEDDING,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.EMBEDDING,
+fork_new_process_for_each_test=False,
+))
 def test_image_embedding_models(model_type: str,
 test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
@@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.VIDEO,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.VIDEO,
+fork_new_process_for_each_test=False,
+))
 def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
 video_assets: _VideoAssets):
@@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.CUSTOM_INPUTS,
-fork_new_process_for_each_test=False,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.CUSTOM_INPUTS,
+fork_new_process_for_each_test=False,
+))
 def test_custom_inputs_models(
 model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -627,12 +632,13 @@ def test_custom_inputs_models(


 #### Tests filtering for things running each test as a new process
-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.IMAGE,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.IMAGE,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.MULTI_IMAGE,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.MULTI_IMAGE,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.EMBEDDING,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.EMBEDDING,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_image_embedding_models_heavy(model_type: str,
 test_case: ExpandableVLMTestArgs,
@@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.VIDEO,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.VIDEO,
+fork_new_process_for_each_test=True,
+))
 def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
 hf_runner: Type[HfRunner],
 vllm_runner: Type[VllmRunner],
@@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
 )


-@pytest.mark.parametrize("model_type,test_case",
-get_parametrized_options(
-VLM_TEST_SETTINGS,
-test_type=VLMTestType.CUSTOM_INPUTS,
-fork_new_process_for_each_test=True,
-))
+@pytest.mark.parametrize(
+"model_type,test_case",
+get_parametrized_options(
+VLM_TEST_SETTINGS,
+test_type=VLMTestType.CUSTOM_INPUTS,
+fork_new_process_for_each_test=True,
+))
 @fork_new_process_for_each_test
 def test_custom_inputs_models_heavy(
 model_type: str,
@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
 outputs: OutputsLogprobs,
 filename: "StrPath",
 ) -> None:
-json_data = [(tokens, text,
-[{k: asdict(v)
-for k, v in token_logprobs.items()}
-for token_logprobs in (logprobs or [])])
+json_data = [(tokens, text, [{
+k: asdict(v)
+for k, v in token_logprobs.items()
+} for token_logprobs in (logprobs or [])])
 for tokens, text, logprobs in outputs]

 with open(filename, "w") as f:
@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
 with open(filename, "rb") as f:
 json_data = json.load(f)

-return [(tokens, text,
-[{int(k): Logprob(**v)
-for k, v in token_logprobs.items()}
-for token_logprobs in logprobs])
-for tokens, text, logprobs in json_data]
+return [(tokens, text, [{
+int(k): Logprob(**v)
+for k, v in token_logprobs.items()
+} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]


 @large_gpu_test(min_gb=80)
@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):


 @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
-@pytest.mark.skipif(not sparse_cutlass_supported(),
-reason="2of4 Sparse is not yet supported on this GPU type."
-)
+@pytest.mark.skipif(
+not sparse_cutlass_supported(),
+reason="2of4 Sparse is not yet supported on this GPU type.")
 @pytest.mark.parametrize(
 "args_2of4",
 [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
 """
 batch_size = last_accepted_indices.shape[0]

-accepted = (torch.arange(k).expand(batch_size, k) <=
-last_accepted_indices.unsqueeze(-1).broadcast_to(
+accepted = (torch.arange(k).expand(batch_size, k)
+<= last_accepted_indices.unsqueeze(-1).broadcast_to(
 batch_size, k))

 # Sprinkle accepted values after the contiguous initial accepted values.
 # This replicates the behavior of rejection sampling, which may "accept"
 # a token that cannot be accepted because of causality.
-sprinkle_candidates = (
-torch.arange(k).expand(batch_size, k) >
-last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
+sprinkle_candidates = (torch.arange(k).expand(
+batch_size,
+k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
+1)
 sprinkle = torch.rand(batch_size, k) > 0.5
 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
 return accepted
@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
 distance_wrt_reference)

 expected_improvement_multiplier = 20
-assert (relative_change_in_distance_wrt_target >
-relative_change_in_distance_wrt_reference *
+assert (relative_change_in_distance_wrt_target
+> relative_change_in_distance_wrt_reference *
 expected_improvement_multiplier)
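
Many hunks in this commit, including the two above, only move a line break from after a comparison operator to before it; this is the layout the updated formatter enforces and the one PEP 8 recommends, and it does not change what the expression computes. A small sketch of the two layouts, illustration only (not part of the patch), reusing the expression from mock_causal_accepted_tensor with made-up sizes:

import torch

k, batch_size = 4, 2
last_accepted_indices = torch.tensor([1, 3])

# Old layout: the comparison operator trails at the end of the first line.
accepted_old = (torch.arange(k).expand(batch_size, k) <=
                last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k))

# New layout: the operator leads the continuation line; the result is identical.
accepted_new = (torch.arange(k).expand(batch_size, k)
                <= last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k))

assert torch.equal(accepted_old, accepted_new)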
@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
 print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
 'parallelism)'.format(length, total_cpu_time,
 total_cpu_time * 1.0 / length))
-print(' %d build steps completed, average of %1.2f/s' %
-(len(entries), len(entries) / (length)))
+print(' {} build steps completed, average of {:1.2f}/s'.format(
+len(entries),
+len(entries) / (length)))


 def main():
@@ -820,8 +820,8 @@ def scaled_int8_quant(
 if scale is not None:
 # static-per-tensor quantization.
 assert symmetric == (
-azp is
-None), "azp must only be provided for asymmetric quantization."
+azp
+is None), "azp must only be provided for asymmetric quantization."
 torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
 return output, scale, azp

@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0":
 float("-inf"))
 if SLIDING_WINDOW > 0:
 qk = tl.where(
-offs_m[:, None] -
-(start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
+offs_m[:, None] - (start_n + offs_n[None, :])
+< SLIDING_WINDOW, qk, -10000)

 # -- compute m_ij, p, l_ij
 m_ij = tl.max(qk, 1)
@@ -324,10 +324,10 @@ if triton.__version__ >= "2.1.0":
 (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
 cur_head * stride_qh + offs_d[None, :] * stride_qd)

-q = tl.load(
-Q + off_q,
-mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
+q = tl.load(Q + off_q,
+mask=offs_m[:, None]
+< cur_batch_seq_len - cur_batch_ctx_len,
 other=0.0)

 # # initialize pointer to m and l
 m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0":
 # -- compute qk ----
 k = tl.load(k_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_kbs,
-mask=(start_n + offs_n[None, :]) <
-cur_batch_seq_len - cur_batch_ctx_len,
+mask=(start_n + offs_n[None, :])
+< cur_batch_seq_len - cur_batch_ctx_len,
 other=0.0)

 qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0":
 # update acc
 v = tl.load(v_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_vbs,
-mask=(start_n + offs_n[:, None]) <
-cur_batch_seq_len - cur_batch_ctx_len,
+mask=(start_n + offs_n[:, None])
+< cur_batch_seq_len - cur_batch_ctx_len,
 other=0.0)

 p = p.to(v.dtype)
@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0":
 k = tl.load(k_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_kbs,
 mask=dim_mask[:, None] &
-((start_n + offs_n[None, :]) <
-cur_batch_seq_len - cur_batch_ctx_len),
+((start_n + offs_n[None, :])
+< cur_batch_seq_len - cur_batch_ctx_len),
 other=0.0)

 qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0":
 v = tl.load(v_ptrs +
 (cur_batch_in_all_start_index + start_n) * stride_vbs,
 mask=dim_mask[None, :] &
-((start_n + offs_n[:, None]) <
-cur_batch_seq_len - cur_batch_ctx_len),
+((start_n + offs_n[:, None])
+< cur_batch_seq_len - cur_batch_ctx_len),
 other=0.0)
 p = p.to(v.dtype)

@@ -627,8 +627,8 @@ def attn_fwd(
 causal_start_idx,
 dtype=tl.int32)
 mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
-out_ptrs_mask = (mask_m_offsets[:, None] >=
-out_mask_boundary[None, :])
+out_ptrs_mask = (mask_m_offsets[:, None]
+>= out_mask_boundary[None, :])
 z = 0.0
 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
 # write back LSE
@@ -1,6 +1,6 @@
 import os
 from contextlib import contextmanager
-from functools import lru_cache
+from functools import cache
 from typing import Generator, Optional, Type

 import torch
@@ -100,7 +100,7 @@ def get_attn_backend(
 )


-@lru_cache(maxsize=None)
+@cache
 def _cached_get_attn_backend(
 head_size: int,
 dtype: torch.dtype,
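
The decorator swap above (and the matching lru_cache changes further down) relies on functools.cache, which has been available since Python 3.9 and is defined as lru_cache(maxsize=None), so the caching behaviour is unchanged. A minimal sketch of the equivalence, illustration only (not part of the patch), on a throwaway function:

from functools import cache

call_count = 0

@cache  # identical to @lru_cache(maxsize=None), just shorter
def cached_square(x: int) -> int:
    global call_count
    call_count += 1
    return x * x

assert cached_square(4) == cached_square(4) == 16
assert call_count == 1  # the second call was served from the cache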
|
||||||
|
@ -67,7 +67,8 @@ _RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = {
|
|||||||
|
|
||||||
_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = {
|
_TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = {
|
||||||
task: runner
|
task: runner
|
||||||
for runner, tasks in _RUNNER_TASKS.items() for task in tasks
|
for runner, tasks in _RUNNER_TASKS.items()
|
||||||
|
for task in tasks
|
||||||
}
|
}
|
||||||
|
|
||||||
HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig],
|
HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig],
|
||||||
@ -1976,8 +1977,8 @@ class SpeculativeConfig:
|
|||||||
"typical_acceptance_sampler.")
|
"typical_acceptance_sampler.")
|
||||||
|
|
||||||
if (self.draft_token_acceptance_method != 'rejection_sampler'
|
if (self.draft_token_acceptance_method != 'rejection_sampler'
|
||||||
and self.draft_token_acceptance_method !=
|
and self.draft_token_acceptance_method
|
||||||
'typical_acceptance_sampler'):
|
!= 'typical_acceptance_sampler'):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Expected draft_token_acceptance_method to be either "
|
"Expected draft_token_acceptance_method to be either "
|
||||||
"rejection_sampler or typical_acceptance_sampler. Instead it "
|
"rejection_sampler or typical_acceptance_sampler. Instead it "
|
||||||
|
@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol):
|
|||||||
|
|
||||||
def __init__(self, all_block_indices: Iterable[BlockId]):
|
def __init__(self, all_block_indices: Iterable[BlockId]):
|
||||||
deduped = set(all_block_indices)
|
deduped = set(all_block_indices)
|
||||||
self._refcounts: Dict[BlockId,
|
self._refcounts: Dict[BlockId, RefCount] = {
|
||||||
RefCount] = {index: 0
|
index: 0
|
||||||
for index in deduped}
|
for index in deduped
|
||||||
|
}
|
||||||
|
|
||||||
def incr(self, block_id: BlockId) -> RefCount:
|
def incr(self, block_id: BlockId) -> RefCount:
|
||||||
assert block_id in self._refcounts
|
assert block_id in self._refcounts
|
||||||
|
@ -136,8 +136,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
|
|||||||
device=Device.GPU)
|
device=Device.GPU)
|
||||||
|
|
||||||
# Use watermark to avoid frequent cache eviction.
|
# Use watermark to avoid frequent cache eviction.
|
||||||
if (self.num_total_gpu_blocks - num_required_blocks <
|
if (self.num_total_gpu_blocks - num_required_blocks
|
||||||
self.watermark_blocks):
|
< self.watermark_blocks):
|
||||||
return AllocStatus.NEVER
|
return AllocStatus.NEVER
|
||||||
if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
|
if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
|
||||||
return AllocStatus.OK
|
return AllocStatus.OK
|
||||||
|
@ -988,8 +988,8 @@ class Scheduler:
|
|||||||
waiting_queue.popleft()
|
waiting_queue.popleft()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (budget.num_batched_tokens >=
|
if (budget.num_batched_tokens
|
||||||
self.scheduler_config.max_num_batched_tokens):
|
>= self.scheduler_config.max_num_batched_tokens):
|
||||||
# We've reached the budget limit - since there might be
|
# We've reached the budget limit - since there might be
|
||||||
# continuous prefills in the running queue, we should break
|
# continuous prefills in the running queue, we should break
|
||||||
# to avoid scheduling any new prefills.
|
# to avoid scheduling any new prefills.
|
||||||
@ -1096,8 +1096,8 @@ class Scheduler:
|
|||||||
running_scheduled.swapped_out) == 0:
|
running_scheduled.swapped_out) == 0:
|
||||||
swapped_in = self._schedule_swapped(budget, curr_loras)
|
swapped_in = self._schedule_swapped(budget, curr_loras)
|
||||||
|
|
||||||
assert (budget.num_batched_tokens <=
|
assert (budget.num_batched_tokens
|
||||||
self.scheduler_config.max_num_batched_tokens)
|
<= self.scheduler_config.max_num_batched_tokens)
|
||||||
assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
|
assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
|
||||||
|
|
||||||
# Update waiting requests.
|
# Update waiting requests.
|
||||||
@ -1189,8 +1189,8 @@ class Scheduler:
|
|||||||
curr_loras,
|
curr_loras,
|
||||||
enable_chunking=True)
|
enable_chunking=True)
|
||||||
|
|
||||||
assert (budget.num_batched_tokens <=
|
assert (budget.num_batched_tokens
|
||||||
self.scheduler_config.max_num_batched_tokens)
|
<= self.scheduler_config.max_num_batched_tokens)
|
||||||
assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
|
assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
|
||||||
|
|
||||||
# Update waiting requests.
|
# Update waiting requests.
|
||||||
@ -1358,8 +1358,8 @@ class Scheduler:
|
|||||||
# NOTE: We use get_len instead of get_prompt_len because when
|
# NOTE: We use get_len instead of get_prompt_len because when
|
||||||
# a sequence is preempted, prefill includes previous generated
|
# a sequence is preempted, prefill includes previous generated
|
||||||
# output tokens.
|
# output tokens.
|
||||||
if (token_chunk_size + num_computed_tokens <
|
if (token_chunk_size + num_computed_tokens
|
||||||
seqs[0].data.get_len()):
|
< seqs[0].data.get_len()):
|
||||||
do_sample = False
|
do_sample = False
|
||||||
|
|
||||||
# It assumes the scheduled_seq_groups is ordered by
|
# It assumes the scheduled_seq_groups is ordered by
|
||||||
@ -1625,10 +1625,9 @@ class Scheduler:
|
|||||||
if self.scheduler_config.delay_factor > 0 and self.waiting:
|
if self.scheduler_config.delay_factor > 0 and self.waiting:
|
||||||
earliest_arrival_time = min(
|
earliest_arrival_time = min(
|
||||||
[e.metrics.arrival_time for e in self.waiting])
|
[e.metrics.arrival_time for e in self.waiting])
|
||||||
passed_delay = (
|
passed_delay = ((now - earliest_arrival_time)
|
||||||
(now - earliest_arrival_time) >
|
> (self.scheduler_config.delay_factor *
|
||||||
(self.scheduler_config.delay_factor * self.last_prompt_latency)
|
self.last_prompt_latency) or not self.running)
|
||||||
or not self.running)
|
|
||||||
else:
|
else:
|
||||||
passed_delay = True
|
passed_delay = True
|
||||||
return passed_delay
|
return passed_delay
|
||||||
|
@ -352,8 +352,8 @@ class MessageQueue:
|
|||||||
sched_yield()
|
sched_yield()
|
||||||
|
|
||||||
# if we wait for a long time, log a message
|
# if we wait for a long time, log a message
|
||||||
if (time.monotonic() - start_time >
|
if (time.monotonic() - start_time
|
||||||
VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
|
> VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
|
||||||
logger.debug("No available block found in %s second. ",
|
logger.debug("No available block found in %s second. ",
|
||||||
VLLM_RINGBUFFER_WARNING_INTERVAL)
|
VLLM_RINGBUFFER_WARNING_INTERVAL)
|
||||||
n_warning += 1
|
n_warning += 1
|
||||||
@ -410,8 +410,8 @@ class MessageQueue:
|
|||||||
sched_yield()
|
sched_yield()
|
||||||
|
|
||||||
# if we wait for a long time, log a message
|
# if we wait for a long time, log a message
|
||||||
if (time.monotonic() - start_time >
|
if (time.monotonic() - start_time
|
||||||
VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
|
> VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning):
|
||||||
logger.debug("No available block found in %s second. ",
|
logger.debug("No available block found in %s second. ",
|
||||||
VLLM_RINGBUFFER_WARNING_INTERVAL)
|
VLLM_RINGBUFFER_WARNING_INTERVAL)
|
||||||
n_warning += 1
|
n_warning += 1
|
||||||
|
@ -1014,8 +1014,8 @@ def initialize_model_parallel(
|
|||||||
backend = backend or torch.distributed.get_backend(
|
backend = backend or torch.distributed.get_backend(
|
||||||
get_world_group().device_group)
|
get_world_group().device_group)
|
||||||
|
|
||||||
if (world_size !=
|
if (world_size
|
||||||
tensor_model_parallel_size * pipeline_model_parallel_size):
|
!= tensor_model_parallel_size * pipeline_model_parallel_size):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"world_size ({world_size}) is not equal to "
|
f"world_size ({world_size}) is not equal to "
|
||||||
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
|
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
|
||||||
@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
|
|||||||
return
|
return
|
||||||
|
|
||||||
if all([
|
if all([
|
||||||
vllm_config.kv_transfer_config.need_kv_parallel_group,
|
vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER
|
||||||
_KV_TRANSFER is None
|
is None
|
||||||
]):
|
]):
|
||||||
_KV_TRANSFER = kv_transfer.KVTransferAgent(
|
_KV_TRANSFER = kv_transfer.KVTransferAgent(
|
||||||
rank=get_world_group().rank,
|
rank=get_world_group().rank,
|
||||||
|
@ -3,7 +3,7 @@ import codecs
|
|||||||
import json
|
import json
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from collections import defaultdict, deque
|
from collections import defaultdict, deque
|
||||||
from functools import lru_cache, partial
|
from functools import cache, lru_cache, partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
|
from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List,
|
||||||
Literal, Optional, Tuple, TypeVar, Union, cast)
|
Literal, Optional, Tuple, TypeVar, Union, cast)
|
||||||
@ -377,7 +377,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
|||||||
return self._model_config.allowed_local_media_path
|
return self._model_config.allowed_local_media_path
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=None)
|
@cache
|
||||||
def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
|
def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
|
||||||
return tokenizer.decode(token_index)
|
return tokenizer.decode(token_index)
|
||||||
|
|
||||||
|
@ -522,11 +522,10 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
out_top_logprobs.append({
|
out_top_logprobs.append({
|
||||||
# Convert float("-inf") to the
|
# Convert float("-inf") to the
|
||||||
# JSON-serializable float that OpenAI uses
|
# JSON-serializable float that OpenAI uses
|
||||||
self._get_decoded_token(
|
self._get_decoded_token(top_lp[1],
|
||||||
top_lp[1],
|
top_lp[0],
|
||||||
top_lp[0],
|
tokenizer,
|
||||||
tokenizer,
|
return_as_token_id=self.return_tokens_as_token_ids):
|
||||||
return_as_token_id=self.return_tokens_as_token_ids):
|
|
||||||
max(top_lp[1].logprob, -9999.0)
|
max(top_lp[1].logprob, -9999.0)
|
||||||
for i, top_lp in enumerate(step_top_logprobs.items())
|
for i, top_lp in enumerate(step_top_logprobs.items())
|
||||||
if num_output_top_logprobs >= i
|
if num_output_top_logprobs >= i
|
||||||
|
@ -62,8 +62,8 @@ class Granite20bFCToolParser(ToolParser):
|
|||||||
start_of_json = match.end()
|
start_of_json = match.end()
|
||||||
# end_index == the start of the next function call
|
# end_index == the start of the next function call
|
||||||
# (if exists)
|
# (if exists)
|
||||||
next_function_call_start = (matches[i + 1].start()
|
next_function_call_start = (matches[i + 1].start() if i +
|
||||||
if i + 1 < len(matches) else None)
|
1 < len(matches) else None)
|
||||||
|
|
||||||
raw_function_calls.append(
|
raw_function_calls.append(
|
||||||
dec.raw_decode(
|
dec.raw_decode(
|
||||||
|
@ -220,8 +220,10 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
|
|||||||
lora_b.T, non_blocking=True)
|
lora_b.T, non_blocking=True)
|
||||||
if embeddings_tensor is not None:
|
if embeddings_tensor is not None:
|
||||||
self.embeddings_tensors[
|
self.embeddings_tensors[
|
||||||
index, :embeddings_tensor.shape[0], :embeddings_tensor.
|
index,
|
||||||
shape[1], ].copy_(embeddings_tensor, non_blocking=True)
|
:embeddings_tensor.shape[0],
|
||||||
|
:embeddings_tensor.shape[1],
|
||||||
|
].copy_(embeddings_tensor, non_blocking=True)
|
||||||
if self.embeddings_slice is not None:
|
if self.embeddings_slice is not None:
|
||||||
# TODO(yard1): Optimize this copy, we don't need to copy
|
# TODO(yard1): Optimize this copy, we don't need to copy
|
||||||
# everything, just the modified part
|
# everything, just the modified part
|
||||||
@ -1024,8 +1026,10 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
|
|||||||
lora_b.T, non_blocking=True)
|
lora_b.T, non_blocking=True)
|
||||||
if embeddings_tensor is not None:
|
if embeddings_tensor is not None:
|
||||||
self.embeddings_tensors[
|
self.embeddings_tensors[
|
||||||
index, :embeddings_tensor.shape[0], :embeddings_tensor.
|
index,
|
||||||
shape[1], ] = embeddings_tensor
|
:embeddings_tensor.shape[0],
|
||||||
|
:embeddings_tensor.shape[1],
|
||||||
|
] = embeddings_tensor
|
||||||
|
|
||||||
def _get_logits(
|
def _get_logits(
|
||||||
self,
|
self,
|
||||||
|
@ -75,8 +75,9 @@ class LoRAModel(AdapterModel):
|
|||||||
# Scaling factor for long context lora model. None if it is not
|
# Scaling factor for long context lora model. None if it is not
|
||||||
# fine tuned for the long context.
|
# fine tuned for the long context.
|
||||||
self.scaling_factor = scaling_factor
|
self.scaling_factor = scaling_factor
|
||||||
assert (lora_model_id >
|
assert (
|
||||||
0), f"a valid lora id should be greater than 0, got {self.id}"
|
lora_model_id
|
||||||
|
> 0), f"a valid lora id should be greater than 0, got {self.id}"
|
||||||
self.rank = rank
|
self.rank = rank
|
||||||
self.loras: Dict[str, LoRALayerWeights] = loras
|
self.loras: Dict[str, LoRALayerWeights] = loras
|
||||||
|
|
||||||
|
@ -136,9 +136,8 @@ def _sgmv_expand_kernel(
|
|||||||
c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride +
|
c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride +
|
||||||
offset_cn[None, :] * output_d1_stride)
|
offset_cn[None, :] * output_d1_stride)
|
||||||
M = tl.load(seq_lens + cur_batch)
|
M = tl.load(seq_lens + cur_batch)
|
||||||
c_mask = (offset_cm[:, None] <
|
c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (
|
||||||
(cur_seq_start + M)) & (offset_cn[None, :] <
|
offset_cn[None, :] < (cur_slice_start + curr_N))
|
||||||
(cur_slice_start + curr_N))
|
|
||||||
if ADD_INPUTS:
|
if ADD_INPUTS:
|
||||||
tiled_out = tl.load(c_ptr, mask=c_mask)
|
tiled_out = tl.load(c_ptr, mask=c_mask)
|
||||||
tiled_c += tiled_out
|
tiled_c += tiled_out
|
||||||
|
@ -114,8 +114,8 @@ def _sgmv_shrink_kernel(
|
|||||||
slice_id * output_d0_stride)
|
slice_id * output_d0_stride)
|
||||||
c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[
|
c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[
|
||||||
None, :] * output_d2_stride
|
None, :] * output_d2_stride
|
||||||
c_mask = (offset_cm[:, None] <
|
c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :]
|
||||||
(cur_seq_start + M)) & (offset_cn[None, :] < N)
|
< N)
|
||||||
accumulator *= scaling
|
accumulator *= scaling
|
||||||
# handles write-back with reduction-splitting
|
# handles write-back with reduction-splitting
|
||||||
if SPLIT_K == 1:
|
if SPLIT_K == 1:
|
||||||
|
@ -73,12 +73,12 @@ class MPLinearKernel(ABC):
|
|||||||
torch.nn.Parameter(new_param.data, requires_grad=False))
|
torch.nn.Parameter(new_param.data, requires_grad=False))
|
||||||
|
|
||||||
def _get_weight_params(
|
def _get_weight_params(
|
||||||
self, layer: torch.nn.Module
|
self, layer: torch.nn.Module) -> Tuple[
|
||||||
) -> Tuple[torch.Tensor, # w_q
|
torch.Tensor, # w_q
|
||||||
torch.Tensor, # w_s
|
torch.Tensor, # w_s
|
||||||
Optional[torch.Tensor], # w_zp,
|
Optional[torch.Tensor], # w_zp,
|
||||||
Optional[torch.Tensor] # w_gidx
|
Optional[torch.Tensor] # w_gidx
|
||||||
]:
|
]:
|
||||||
return (
|
return (
|
||||||
getattr(layer, self.w_q_name),
|
getattr(layer, self.w_q_name),
|
||||||
getattr(layer, self.w_s_name),
|
getattr(layer, self.w_s_name),
|
||||||
|
@ -48,13 +48,13 @@ class ScaledMMLinearKernel(ABC):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def _get_weight_params(
|
def _get_weight_params(
|
||||||
self, layer: torch.nn.Module
|
self, layer: torch.nn.Module) -> Tuple[
|
||||||
) -> Tuple[torch.Tensor, # weight
|
torch.Tensor, # weight
|
||||||
torch.Tensor, # weight_scale
|
torch.Tensor, # weight_scale
|
||||||
Optional[torch.Tensor], # input_scale,
|
Optional[torch.Tensor], # input_scale,
|
||||||
Optional[torch.Tensor], # input_zp
|
Optional[torch.Tensor], # input_zp
|
||||||
Optional[torch.Tensor], # azp_adj
|
Optional[torch.Tensor], # azp_adj
|
||||||
]:
|
]:
|
||||||
return (
|
return (
|
||||||
getattr(layer, self.w_q_name),
|
getattr(layer, self.w_q_name),
|
||||||
getattr(layer, self.w_s_name),
|
getattr(layer, self.w_s_name),
|
||||||
|
@ -72,9 +72,10 @@ def block_quant_to_tensor_quant(
|
|||||||
x_dq_block = x_q_block.to(torch.float32)
|
x_dq_block = x_q_block.to(torch.float32)
|
||||||
|
|
||||||
x_dq_block_tiles = [[
|
x_dq_block_tiles = [[
|
||||||
x_dq_block[j * block_n:min((j + 1) * block_n, n),
|
x_dq_block[
|
||||||
i * block_k:min((i + 1) * block_k, k), ]
|
j * block_n:min((j + 1) * block_n, n),
|
||||||
for i in range(k_tiles)
|
i * block_k:min((i + 1) * block_k, k),
|
||||||
|
] for i in range(k_tiles)
|
||||||
] for j in range(n_tiles)]
|
] for j in range(n_tiles)]
|
||||||
|
|
||||||
for i in range(k_tiles):
|
for i in range(k_tiles):
|
||||||
|
@ -73,8 +73,8 @@ def requantize_with_max_scale(
|
|||||||
# from disk in this case. Skip requantization in this case (since)
|
# from disk in this case. Skip requantization in this case (since)
|
||||||
# we already are quantized with the single scale.
|
# we already are quantized with the single scale.
|
||||||
# * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
|
# * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
|
||||||
unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
|
unfused_module_in_checkpoint = (weight_scale[-1]
|
||||||
torch.float8_e4m3fn).min)
|
> torch.finfo(torch.float8_e4m3fn).min)
|
||||||
|
|
||||||
# If unfused checkpoint, need requanize with the single scale.
|
# If unfused checkpoint, need requanize with the single scale.
|
||||||
if unfused_module_in_checkpoint:
|
if unfused_module_in_checkpoint:
|
||||||
|
@ -716,9 +716,10 @@ def _sample_with_torch(
|
|||||||
tensors required for Pythonization
|
tensors required for Pythonization
|
||||||
'''
|
'''
|
||||||
|
|
||||||
categorized_seq_group_ids: Dict[SamplingType,
|
categorized_seq_group_ids: Dict[SamplingType, List[int]] = {
|
||||||
List[int]] = {t: []
|
t: []
|
||||||
for t in SamplingType}
|
for t in SamplingType
|
||||||
|
}
|
||||||
categorized_sample_indices = sampling_metadata.categorized_sample_indices
|
categorized_sample_indices = sampling_metadata.categorized_sample_indices
|
||||||
for i, seq_group in enumerate(sampling_metadata.seq_groups):
|
for i, seq_group in enumerate(sampling_metadata.seq_groups):
|
||||||
sampling_params = seq_group.sampling_params
|
sampling_params = seq_group.sampling_params
|
||||||
|
@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
 
     def __post_init__(self):
         # sanity checks
-        assert (self.padded_org_vocab_start_index <=
-                self.padded_org_vocab_end_index)
-        assert (self.padded_added_vocab_start_index <=
-                self.padded_added_vocab_end_index)
+        assert (self.padded_org_vocab_start_index
+                <= self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index
+                <= self.padded_added_vocab_end_index)
 
         assert self.org_vocab_start_index <= self.org_vocab_end_index
         assert self.added_vocab_start_index <= self.added_vocab_end_index
 
         assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
-        assert (self.added_vocab_start_index <=
-                self.padded_added_vocab_start_index)
+        assert (self.added_vocab_start_index
+                <= self.padded_added_vocab_start_index)
         assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
         assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
 
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
         added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
     # torch.compile will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
-    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
-                                                          org_vocab_end_index)
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (
+        input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
         input_ < added_vocab_end_index)
     added_offset = added_vocab_start_index - (
@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                 # from being incorrectly identified as being present in
                 # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
                 shard_pos = quant_param_name.find(shard_name)
-                can_correct_rename = (shard_pos > 0) and (
-                    quant_param_name[shard_pos - 1] == ".")
+                can_correct_rename = (shard_pos
+                                      > 0) and (quant_param_name[shard_pos - 1]
+                                                == ".")
                 # If the quant_param_name is packed, it won't occur in the
                 # param_dict before renaming.
                 new_quant_param_name = quant_param_name.replace(
@@ -298,8 +298,8 @@ class TensorizerAgent:
         to allow for adapter added tokens."""
         for child in self.model.modules():
             if (isinstance(child, VocabParallelEmbedding)
-                    and child.weight.shape[0] <
-                    child.num_embeddings_per_partition):
+                    and child.weight.shape[0]
+                    < child.num_embeddings_per_partition):
                 new_weight = torch.empty(child.num_embeddings_per_partition,
                                          child.embedding_dim,
                                          dtype=child.weight.dtype,
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Gemma model compatible with HuggingFace weights."""
-from functools import lru_cache
+from functools import cache
 from typing import Iterable, List, Optional, Set, Tuple, Union
 
 import torch
@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
 logger = init_logger(__name__)
 
 
-@lru_cache(maxsize=None)
+@cache
 def _get_gemma_act_fn(
     hidden_act: Optional[str],
     hidden_activation: Optional[str],
@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w1.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w1.weight")
                     w3_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
-                        ".block_sparse_moe.experts.%d.w3.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w3.weight")
                     w1_param, w3_param = p[e].chunk(2, dim=0)
                     assert w1_name not in new_weights
                     assert w3_name not in new_weights
@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
                         '.block_sparse_moe.output_linear.weight',
-                        ".block_sparse_moe.experts.%d.w2.weight" % e)
+                        f".block_sparse_moe.experts.{e}.w2.weight")
                     w2_param = p[e]
                     assert w2_name not in new_weights
                     new_weights[w2_name] = w2_param
@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
         # For 1) text-only prefill and decode, 2) image-present decode.
         if image_inputs is None:
             full_text_row_masked_out_mask = (
-                attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
-                    input_ids.device)
+                attn_metadata.encoder_seq_lens_tensor
+                != 0).reshape(-1, 1).to(input_ids.device)
             skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
 
         # For image-present prefill.
@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
 
         if self.tie_weights:
             assert (
-                self.n_predict >
-                1), "You cannot tie weights between stages when only 1 exists"
+                self.n_predict > 1
+            ), "You cannot tie weights between stages when only 1 exists"
             embedding = VocabParallelEmbedding(
                 config.vocab_size,
                 self.inner_dim,
@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
     # compute mask for sparsity
     mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
     factor = scores.abs().clamp(min=mask_logits_threshold)
-    mask_logits_threshold = (
-        (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+    mask_logits_threshold = ((mask_logits_threshold - scores) /
+                             factor) > (2 * jitter_eps)
 
     # apply mask
     masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
     mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
                                                        keepdim=True)
     factor = scores.abs().clamp(min=mask_logits_threshold)
-    mask_logits_threshold = (
-        (mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
+    mask_logits_threshold = ((mask_logits_threshold - scores) /
+                             factor) > (2 * jitter_eps)
 
     # apply mask
     masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,
@@ -462,7 +462,8 @@ class _ModelRegistry:
 
 
 ModelRegistry = _ModelRegistry({
-    model_arch: _LazyRegisteredModel(
+    model_arch:
+    _LazyRegisteredModel(
         module_name=f"vllm.model_executor.models.{mod_relname}",
         class_name=cls_name,
     )
@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
         return hidden_states
 
 
-@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
-                                        info=UltravoxProcessingInfo,
-                                        dummy_inputs=UltravoxDummyInputsBuilder
-                                        )
+@MULTIMODAL_REGISTRY.register_processor(
+    UltravoxMultiModalProcessor,
+    info=UltravoxProcessingInfo,
+    dummy_inputs=UltravoxDummyInputsBuilder)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     hf_to_vllm_mapper = WeightsMapper(
@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
         device: torch.device,
     ) -> IntermediateTensors:
         return IntermediateTensors({
-            key: torch.zeros((batch_size, hidden_size),
-                             dtype=dtype,
-                             device=device)
+            key:
+            torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
             for key in keys
         })
 
@@ -166,7 +166,8 @@ class SamplingMetadata:
             pin_memory=pin_memory,
         )
         categorized_sample_indices = {
-            t: async_tensor_h2d(
+            t:
+            async_tensor_h2d(
                 seq_ids,
                 dtype=torch.int,
                 target_device=device,
@@ -198,8 +199,12 @@ def _prepare_seq_groups(
     device: str,
     generators: Optional[Dict[str, torch.Generator]] = None,
     cache: Optional[SamplingMetadataCache] = None,
-) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
-                                                        List[int]], int, ]:
+) -> Tuple[
+        List[SequenceGroupToSample],
+        List[int],
+        Dict[SamplingType, List[int]],
+        int,
+]:
     """Prepare sequence groups and indices for sampling.
 
     Args:
@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
         if parallel_config.world_size > 1:
             parallel_config.distributed_executor_backend = "uni"
 
-        assert (vllm_config.lora_config is
-                None), "LoRA is not supported for Neuron backend."
+        assert (vllm_config.lora_config
+                is None), "LoRA is not supported for Neuron backend."
         assert (not vllm_config.speculative_config
                 ), "Speculative decoding not yet supported for Neuron backend."
 
@@ -121,8 +121,8 @@ class ScalarType:
             min_raw = max_raw | sign_bit_double
             return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
         else:
-            assert (not self.is_signed() or
-                    self.size_bits <= 64), "Cannot represent min as a int64_t"
+            assert (not self.is_signed() or self.size_bits
+                    <= 64), "Cannot represent min as a int64_t"
 
             if self.is_signed():
                 return -(1 << (self.size_bits - 1))
@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
             self, execute_model_req: ExecuteModelRequest) -> bool:
         # When the batch size is too large, disable speculative decoding
         # to stop trading off throughput for latency.
-        return (execute_model_req.running_queue_size >=
-                self.disable_by_batch_size)
+        return (execute_model_req.running_queue_size
+                >= self.disable_by_batch_size)
 
     def _maybe_disable_speculative_tokens(
             self, disable_all_speculation: bool,
@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
             sampler_transposed=transposed,
         )
 
-        proposals = SpeculativeProposals(
-            proposal_token_ids=proposal_tokens,
-            proposal_probs=proposal_probs,
-            proposal_lens=proposal_lens,
-            no_proposals=maybe_sampler_output is None)
+        proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
+                                         proposal_probs=proposal_probs,
+                                         proposal_lens=proposal_lens,
+                                         no_proposals=maybe_sampler_output
+                                         is None)
         return proposals
 
     def _split_by_proposal_len(
@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
     """
    num_steps, batch_size, vocab_size = logprob_tensor.shape
 
-    selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
-                                       torch.arange(batch_size),
-                                       sampled_token_ids, ]
+    selected_logprobs = logprob_tensor[
+        torch.arange(num_steps).unsqueeze(1),
+        torch.arange(batch_size),
+        sampled_token_ids,
+    ]
     expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
         -1, -1, vocab_size)
-    sampled_token_ids_ranks = (logprob_tensor >
-                               expanded_selected_logprobs).sum(-1).add_(1)
+    sampled_token_ids_ranks = (logprob_tensor
+                               > expanded_selected_logprobs).sum(-1).add_(1)
 
     return sampled_token_ids_ranks, selected_logprobs
 
@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
         if self.rope_scaling is None:
             return
 
-        if not isinstance(self.rope_scaling,
-                          dict) or len(self.rope_scaling) != 2:
+        if not isinstance(self.rope_scaling, dict) or len(
+                self.rope_scaling) != 2:
             raise ValueError(
                 "`rope_scaling` must be a dictionary with two fields, "
                 f"`type` and `factor`, got {self.rope_scaling}")
@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import OrderedDict, UserDict, defaultdict
 from collections.abc import Hashable, Iterable, Mapping
 from dataclasses import dataclass, field
-from functools import lru_cache, partial, wraps
+from functools import cache, lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generator, Generic, Iterator, List, Literal,
                     NamedTuple, Optional, Tuple, Type, TypeVar, Union,
@@ -352,7 +352,7 @@ class PyObjectCache:
         self._index = 0
 
 
-@lru_cache(maxsize=None)
+@cache
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
     from vllm import _custom_ops as ops
@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
     return key_caches, value_caches
 
 
-@lru_cache(maxsize=None)
+@cache
 def is_pin_memory_available() -> bool:
     from vllm.platforms import current_platform
     return current_platform.is_pin_memory_available()
@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
     init_hf_modules()
 
 
-@lru_cache(maxsize=None)
+@cache
 def find_library(lib_name: str) -> str:
     """
     Find the library file in the system.
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     return module
 
 
-@lru_cache(maxsize=None)
+@cache
 def get_vllm_optional_dependencies():
     metadata = importlib.metadata.metadata("vllm")
     requirements = metadata.get_all("Requires-Dist", [])
@@ -247,8 +247,8 @@ class Scheduler:
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
                 request.num_computed_tokens = num_computed_tokens
-                has_partial_request = (num_computed_tokens + num_new_tokens <
-                                       request.num_tokens)
+                has_partial_request = (num_computed_tokens + num_new_tokens
+                                       < request.num_tokens)
 
                 # Encoder-related.
                 if encoder_inputs_to_schedule:
@@ -311,8 +311,8 @@ class RequestStats:
             return []
         latency_s_lst = []
         for i in range(1, len(self.output_token_ts_s_lst)):
-            assert (self.output_token_ts_s_lst[i] >=
-                    self.output_token_ts_s_lst[i - 1])
+            assert (self.output_token_ts_s_lst[i]
+                    >= self.output_token_ts_s_lst[i - 1])
             latency_s = (self.output_token_ts_s_lst[i] -
                          self.output_token_ts_s_lst[i - 1])
             latency_s_lst.append(latency_s)
@@ -205,7 +205,7 @@ class GPUModelRunner:
 
     def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove stopped requests from the cached states.
-        # Keep the states of the pre-empted requests.
+        # Keep the states of the preempted requests.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
             self.encoder_cache.pop(req_id, None)
@@ -173,13 +173,13 @@ class HPUWorker(LocalOrDistributedWorkerBase):
                 cpu_fallback_ctx as cpu_fallback_local_metric:
             output = LocalOrDistributedWorkerBase.execute_model(
                 self, execute_model_req)
-            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
-                ) or log_graph_compilation_all:
+            if (log_graph_compilation and gc_local_metric.stats()[0][1]
+                    > 0) or log_graph_compilation_all:
                 msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
                        f"{gc_local_metric.stats()}, {input_stats}")
                 logger.warning(msg)
-            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
-                0) or log_cpu_fallbacks_all:
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1]
+                    > 0) or log_cpu_fallbacks_all:
                 msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
                        f"{cpu_fallback_local_metric.stats()}, {input_stats}")
                 logger.warning(msg)
@@ -316,8 +316,8 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
             logger.info("batch_size: %d, seq_len: %d", batch_size,
                         seq_len)
             num_tokens = batch_size * seq_len
-            if (num_tokens >=
-                    self.scheduler_config.max_num_batched_tokens):
+            if (num_tokens
+                    >= self.scheduler_config.max_num_batched_tokens):
                 break
             seq_len = seq_len * 2
         end = time.time()