Remove hardcoded device="cuda" to support more devices (#2503)
Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
parent c410f5d020
commit 96b6f475dd
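The change follows one pattern throughout the benchmark scripts below: the target device becomes a command-line flag that is threaded through to the engine instead of being hardcoded to "cuda". A minimal sketch of that pattern, using a hypothetical trimmed-down parser rather than the actual benchmark code:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--device",
    type=str,
    default="cuda",
    choices=["cuda"],  # only CUDA is accepted for now; the flag leaves room for other backends
    help="device type for vLLM execution, supporting CUDA only currently.")
args = parser.parse_args()

# The parsed value is then passed along, e.g. LLM(..., device=args.device),
# instead of the engine assuming device="cuda" internally.
print(f"running on {args.device}")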
@@ -25,6 +25,7 @@ def main(args: argparse.Namespace):
 dtype=args.dtype,
 enforce_eager=args.enforce_eager,
 kv_cache_dtype=args.kv_cache_dtype,
+device=args.device,
 )

 sampling_params = SamplingParams(

@@ -135,5 +136,11 @@ if __name__ == '__main__':
 default=None,
 help=('path to save the pytorch profiler output. Can be visualized '
 'with ui.perfetto.dev or Tensorboard.'))
+parser.add_argument(
+"--device",
+type=str,
+default="cuda",
+choices=["cuda"],
+help='device type for vLLM execution, supporting CUDA only currently.')
 args = parser.parse_args()
 main(args)

@@ -72,6 +72,7 @@ def run_vllm(
 max_model_len: Optional[int],
 enforce_eager: bool,
 kv_cache_dtype: str,
+device: str,
 ) -> float:
 from vllm import LLM, SamplingParams
 llm = LLM(

@@ -85,6 +86,7 @@ def run_vllm(
 max_model_len=max_model_len,
 enforce_eager=enforce_eager,
 kv_cache_dtype=kv_cache_dtype,
+device=device,
 )

 # Add the requests to the engine.

@@ -209,7 +211,7 @@ def main(args: argparse.Namespace):
 args.seed, args.n, args.use_beam_search,
 args.trust_remote_code, args.dtype,
 args.max_model_len, args.enforce_eager,
-args.kv_cache_dtype)
+args.kv_cache_dtype, args.device)
 elif args.backend == "hf":
 assert args.tensor_parallel_size == 1
 elapsed_time = run_hf(requests, args.model, tokenizer, args.n,

@@ -294,6 +296,12 @@ if __name__ == "__main__":
 default="auto",
 help=
 'Data type for kv cache storage. If "auto", will use model data type.')
+parser.add_argument(
+"--device",
+type=str,
+default="cuda",
+choices=["cuda"],
+help='device type for vLLM execution, supporting CUDA only currently.')
 args = parser.parse_args()
 if args.tokenizer is None:
 args.tokenizer = args.model

@@ -25,10 +25,12 @@ def main(
 dtype: torch.dtype,
 seed: int,
 do_profile: bool,
+device: str = "cuda",
 kv_cache_dtype: Optional[str] = None,
 ) -> None:
 random.seed(seed)
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)

 scale = float(1.0 / (head_size**0.5))

@@ -36,7 +38,7 @@ def main(
 num_query_heads,
 head_size,
 dtype=dtype,
-device="cuda")
+device=device)
 query.uniform_(-scale, scale)

 assert num_query_heads % num_kv_heads == 0

@@ -44,11 +46,11 @@ def main(
 if use_alibi:
 alibi_slopes = torch.randn(num_query_heads,
 dtype=torch.float,
-device="cuda")
+device=device)

 context_lens = [context_len for _ in range(num_seqs)]
 max_context_len = max(context_lens)
-context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
+context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)

 # Create the block tables.
 max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size

@@ -59,12 +61,17 @@ def main(
 for _ in range(max_num_blocks_per_seq)
 ]
 block_tables.append(block_table)
-block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
+block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)

 # Create the KV cache.
-key_caches, value_caches = create_kv_caches_with_random(
-NUM_BLOCKS, block_size, 1, num_kv_heads, head_size, kv_cache_dtype,
-dtype)
+key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
+block_size,
+1,
+num_kv_heads,
+head_size,
+kv_cache_dtype,
+dtype,
+device=device)
 key_cache, value_cache = key_caches[0], value_caches[0]

 # Prepare for the paged attention kernel.

@@ -84,7 +91,7 @@ def main(
 )
 max_logits = torch.empty_like(exp_sums)

-def run_benchmark(num_iters: int, profile: bool = False) -> float:
+def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
 torch.cuda.synchronize()
 if profile:
 torch.cuda.cudart().cudaProfilerStart()

@@ -135,6 +142,7 @@ def main(

 # Warmup.
 print("Warming up...")
+run_benchmark = run_cuda_benchmark
 run_benchmark(num_iters=3, profile=False)

 # Benchmark.

@@ -175,6 +183,7 @@ if __name__ == '__main__':
 default="auto",
 help=
 'Data type for kv cache storage. If "auto", will use model data type.')
+parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
 args = parser.parse_args()
 print(args)

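The kernel, cache, LoRA, and sampler tests below all apply the same device-parametrization recipe. A minimal self-contained sketch of it follows; only the CUDA_DEVICES list and the torch.set_default_device call mirror the diff, the test name and body are hypothetical:

import pytest
import torch

CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]


@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_device_parametrization(device: str) -> None:  # hypothetical test
    torch.set_default_device(device)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
    # Tensors no longer need an explicit device= argument; they are created
    # on the parametrized default device.
    x = torch.randn(8, 16)
    assert x.device.type == "cuda"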
@@ -7,26 +7,29 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing
 D = [512, 4096, 5120, 13824] # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_silu_and_mul(
 num_tokens: int,
 d: int,
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
-x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id)
+torch.set_default_device(device)
+x = torch.randn(num_tokens, 2 * d, dtype=dtype)
 layer = SiluAndMul()
 out = layer(x)
 ref_out = layer._forward(x)

@@ -37,19 +40,20 @@ def test_silu_and_mul(
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_gelu_new(
 num_tokens: int,
 d: int,
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
-x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
+torch.set_default_device(device)
+x = torch.randn(num_tokens, d, dtype=dtype)
 layer = NewGELU()
 out = layer(x)
 ref_out = layer._forward(x)

@@ -60,18 +64,19 @@ def test_gelu_new(
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_gelu_fast(
 num_tokens: int,
 d: int,
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
-x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
+torch.set_default_device(device)
+x = torch.randn(num_tokens, d, dtype=dtype)
 layer = FastGELU()
 out = layer(x)
 ref_out = layer._forward(x)

@@ -27,7 +27,9 @@ BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
 KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 def ref_masked_attention(

@@ -91,7 +93,7 @@ def ref_single_query_cached_kv_attention(
 alibi_bias = None
 if alibi_slopes is not None:
 # Create the ALiBi bias used in the paged attention kernel.
-position_ids = torch.arange(context_len, device=query.device).int()
+position_ids = torch.arange(context_len).int()
 alibi_bias = (position_ids - context_len + 1).float()
 alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
 1, 1, -1)

@@ -110,7 +112,7 @@ def ref_single_query_cached_kv_attention(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_paged_attention(
 kv_cache_factory,
 version: str,

@@ -122,33 +124,28 @@ def test_paged_attention(
 dtype: torch.dtype,
 kv_cache_dtype: str,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 random.seed(seed)
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
+torch.set_default_device(device)
 scale = float(1.0 / (head_size**0.5))
 num_query_heads, num_kv_heads = num_heads
-query = torch.empty(num_seqs,
-num_query_heads,
-head_size,
-dtype=dtype,
-device=gpu_id)
+query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
 query.uniform_(-scale, scale)

 assert num_query_heads % num_kv_heads == 0
 num_queries_per_kv = num_query_heads // num_kv_heads
 alibi_slopes = None
 if use_alibi:
-alibi_slopes = torch.randn(num_query_heads,
-dtype=torch.float,
-device=gpu_id)
+alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)

 context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
 context_lens[-1] = MAX_SEQ_LEN
 max_context_len = max(context_lens)
-context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id)
+context_lens = torch.tensor(context_lens, dtype=torch.int)

 # Create the block tables.
 max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size

@@ -159,13 +156,13 @@ def test_paged_attention(
 for _ in range(max_num_blocks_per_seq)
 ]
 block_tables.append(block_table)
-block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id)
+block_tables = torch.tensor(block_tables, dtype=torch.int)

 # Create the KV caches.
 key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
 num_kv_heads, head_size,
 kv_cache_dtype, dtype, seed,
-gpu_id)
+device)
 key_cache, value_cache = key_caches[0], value_caches[0]

 # Call the paged attention kernel.

@@ -193,12 +190,10 @@ def test_paged_attention(
 tmp_output = torch.empty(
 size=(num_seqs, num_heads, num_partitions, head_size),
 dtype=output.dtype,
-device=output.device,
 )
 exp_sums = torch.empty(
 size=(num_seqs, num_heads, num_partitions),
 dtype=torch.float32,
-device=output.device,
 )
 max_logits = torch.empty_like(exp_sums)
 ops.paged_attention_v2(

@@ -229,14 +224,14 @@ def test_paged_attention(
 block_size, x)
 dequantized_key_cache = torch.empty(size=key_cache_shape,
 dtype=dtype,
-device=gpu_id)
+device=device)
 cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
 key_cache = dequantized_key_cache

 value_cache_shape = value_cache.shape
 dequantized_value_cache = torch.empty(size=value_cache_shape,
 dtype=dtype,
-device=gpu_id)
+device=device)
 cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
 value_cache = dequantized_value_cache

@@ -283,7 +278,7 @@ def ref_multi_query_kv_attention(
 attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
 diagonal=1)
 attn_mask = attn_mask * torch.finfo(dtype).min
-attn_mask = attn_mask.to(dtype=dtype, device=query.device)
+attn_mask = attn_mask.to(dtype=dtype)

 ref_output = ref_masked_attention(
 query[start_idx:end_idx],

@@ -303,7 +298,7 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_multi_query_kv_attention(
 num_seqs: int,

@@ -311,12 +306,13 @@ def test_multi_query_kv_attention(
 head_size: int,
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 random.seed(seed)
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
+torch.set_default_device(device)
 # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
 # As the xformers library is already tested with its own tests, we can use
 # a smaller MAX_SEQ_LEN here.

@@ -329,8 +325,7 @@ def test_multi_query_kv_attention(
 qkv = torch.empty(num_tokens,
 num_query_heads + 2 * num_kv_heads,
 head_size,
-dtype=dtype,
-device=gpu_id)
+dtype=dtype)
 qkv.uniform_(-scale, scale)
 query, key, value = qkv.split(
 [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
@@ -17,7 +17,9 @@ BLOCK_SIZES = [8, 16, 32]
 NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing
 NUM_MAPPINGS = [256] # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]


@@ -29,7 +31,7 @@ KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @torch.inference_mode()
 def test_copy_blocks(

@@ -42,13 +44,14 @@ def test_copy_blocks(
 num_blocks: int,
 dtype: torch.dtype,
 seed: int,
-device: int,
 kv_cache_dtype: str,
+device: str,
 ) -> None:
 random.seed(seed)
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
+torch.set_default_device(device)
 # Generate random block mappings where each source block is mapped to two
 # destination blocks.
 assert 2 * num_mappings <= num_blocks

@@ -66,7 +69,7 @@ def test_copy_blocks(
 key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
 num_layers, num_heads,
 head_size, kv_cache_dtype,
-dtype, seed, gpu_id)
+dtype, seed, device)

 # Clone the KV caches.
 cloned_key_caches = [key_cache.clone() for key_cache in key_caches]

@@ -98,7 +101,7 @@ def test_copy_blocks(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_reshape_and_cache(
 kv_cache_factory,

@@ -109,29 +112,25 @@ def test_reshape_and_cache(
 num_blocks: int,
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 random.seed(seed)
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
+torch.set_default_device(device)
 # Create a random slot mapping.
 num_slots = block_size * num_blocks
 slot_mapping = random.sample(range(num_slots), num_tokens)
-slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id)
+slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)

-qkv = torch.randn(num_tokens,
-3,
-num_heads,
-head_size,
-dtype=dtype,
-device=gpu_id)
+qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
 _, key, value = qkv.unbind(dim=1)

 # Create the KV caches.
 key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
 num_heads, head_size, dtype,
-None, seed, gpu_id)
+None, seed, device)
 key_cache, value_cache = key_caches[0], value_caches[0]

 # Clone the KV caches.

@@ -166,7 +165,7 @@ def test_reshape_and_cache(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_swap_blocks(
 kv_cache_factory,

@@ -182,6 +181,7 @@ def test_swap_blocks(
 ) -> None:
 random.seed(seed)
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
 src_device = f"{direction[0]}:{device}" if direction[
 0] == "cuda" else direction[0]

@@ -8,7 +8,9 @@ NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
 HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing
 ADD_RESIDUAL = [False, True]
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)

@@ -16,7 +18,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rms_norm(
 num_tokens: int,

@@ -24,15 +26,16 @@ def test_rms_norm(
 add_residual: bool,
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 ) -> None:
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
-layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id)
+torch.set_default_device(device)
+layer = RMSNorm(hidden_size).to(dtype=dtype)
 layer.weight.data.normal_(mean=1.0, std=0.1)
 scale = 1 / (2 * hidden_size)
-x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id)
+x = torch.randn(num_tokens, hidden_size, dtype=dtype)
 x *= scale
 residual = torch.randn_like(x) * scale if add_residual else None

@@ -13,7 +13,9 @@ NUM_HEADS = [7, 17] # Arbitrary values for testing
 BATCH_SIZES = [1, 5] # Arbitrary values for testing
 SEQ_LENS = [11, 8192] # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)

@@ -24,7 +26,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rotary_embedding(
 is_neox_style: bool,

@@ -35,28 +37,26 @@ def test_rotary_embedding(
 rotary_dim: Optional[int],
 dtype: torch.dtype,
 seed: int,
-device: int,
+device: str,
 max_position: int = 8192,
 base: int = 10000,
 ) -> None:
 if rotary_dim is None:
 rotary_dim = head_size
 torch.random.manual_seed(seed)
+if torch.cuda.is_available():
 torch.cuda.manual_seed(seed)
-gpu_id = f"cuda:{device}"
+torch.set_default_device(device)
 if rotary_dim is None:
 rotary_dim = head_size
 rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
-rope = rope.to(dtype=dtype, device=gpu_id)
+rope = rope.to(dtype=dtype)

-positions = torch.randint(0,
-max_position, (batch_size, seq_len),
-device=gpu_id)
+positions = torch.randint(0, max_position, (batch_size, seq_len))
 query = torch.randn(batch_size,
 seq_len,
 num_heads * head_size,
-dtype=dtype,
-device=gpu_id)
+dtype=dtype)
 key = torch.randn_like(query)

 # NOTE(woosuk): The reference implementation should be executed first
@@ -11,19 +11,27 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
 NUM_HEADS = [12]
 HEAD_SIZES = [128]
 DTYPES = [torch.float16]
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_contexted_kv_attention(
 num_heads: int,
 head_size: int,
 dtype: torch.dtype,
+device: str,
 ) -> None:
 random.seed(0)
 torch.manual_seed(0)
+if torch.cuda.is_available():
+torch.cuda.manual_seed(0)
+torch.set_default_device(device)
 MAX_SEQ_LEN = 1024
 MAX_CTX_LEN = 1024
 BS = 10

@@ -35,24 +43,11 @@ def test_contexted_kv_attention(
 seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]

 num_tokens = sum(subquery_lens)
-query = torch.empty(num_tokens,
-num_heads,
-head_size,
-dtype=dtype,
-device='cuda')
+query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
 query.uniform_(-1e-3, 1e-3)
-output = torch.empty(num_tokens,
-num_heads,
-head_size,
-dtype=dtype,
-device='cuda')
+output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)

-kv = torch.empty(sum(seq_lens),
-2,
-num_heads,
-head_size,
-dtype=dtype,
-device='cuda')
+kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, dtype=dtype)
 kv.uniform_(-1e-3, 1e-3)
 key, value = kv.unbind(dim=1)

@@ -60,39 +55,27 @@ def test_contexted_kv_attention(
 block_size,
 num_heads,
 head_size,
-dtype=dtype,
-device='cuda')
+dtype=dtype)
 v_cache = torch.zeros(cache_size,
 block_size,
 num_heads,
 head_size,
-dtype=dtype,
-device='cuda')
-k = torch.zeros(sum(subquery_lens),
-num_heads,
-head_size,
-dtype=dtype,
-device='cuda')
-v = torch.zeros(sum(subquery_lens),
-num_heads,
-head_size,
-dtype=dtype,
-device='cuda')
-values = torch.arange(0, cache_size, dtype=torch.long, device='cuda')
+dtype=dtype)
+k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
+v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
+values = torch.arange(0, cache_size, dtype=torch.long)
 values = values[torch.randperm(cache_size)]
 block_table = values[:BS * max_block_per_request].view(
 BS, max_block_per_request)
-b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda')
-b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda')
+b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
+b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
 b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
-dtype=torch.long,
-device='cuda'),
+dtype=torch.long),
 dim=0)
 max_input_len = MAX_SEQ_LEN
 # copy kv to cache
 b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-dtype=torch.long,
-device='cuda'),
+dtype=torch.long),
 dim=0)
 for i in range(BS):
 for j in range(subquery_lens[i]):

@@ -126,8 +126,8 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:
 cleanup()
 get_model_old = get_model

-def get_model_patched(model_config, lora_config=None):
-return get_model_old(model_config,
+def get_model_patched(model_config, device_config, lora_config=None):
+return get_model_old(model_config, device_config,
 LoRAConfig(max_loras=4, max_lora_rank=8))

 with patch("vllm.worker.model_runner.get_model", get_model_patched):

@@ -34,6 +34,9 @@ TOLERANCES = {
 torch.float32: (5e-3, 5e-3),
 torch.bfloat16: (3e-2, 2e-2),
 }
+CUDA_DEVICES = [
+f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 def get_random_id_to_index(num_loras: int,

@@ -151,14 +154,10 @@ def create_random_inputs(
 for _ in range(num_inputs):
 if input_type == torch.int:
 inputs.append(
-torch.randint(low=int(low),
-high=int(high),
-size=input_size,
-device="cuda"))
+torch.randint(low=int(low), high=int(high), size=input_size))
 else:
 inputs.append(
-torch.rand(size=input_size, dtype=input_type, device="cuda") *
-high + low)
+torch.rand(size=input_size, dtype=input_type) * high + low)

 lora_id = random.choice(active_lora_ids)
 index_mapping += [lora_id] * input_size[0]

@@ -169,8 +168,10 @@ def create_random_inputs(

 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-def test_embeddings(dist_init, num_loras) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_embeddings(dist_init, num_loras, device) -> None:

+torch.set_default_device(device)
 max_loras = 8
 lora_config = LoRAConfig(max_loras=max_loras,
 max_lora_rank=8,

@@ -259,8 +260,10 @@ def test_embeddings(dist_init, num_loras) -> None:
 @torch.inference_mode()
 # @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.")
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:

+torch.set_default_device(device)
 max_loras = 8
 lora_config = LoRAConfig(max_loras=max_loras,
 max_lora_rank=8,

@@ -305,8 +308,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:

 # Add empty embeddings_tensors for unoccupied lora slots.
 for _ in range(max_loras - len(embeddings_tensors)):
-embeddings_tensors.append(
-torch.zeros(embeddings_tensors[0].shape, device="cuda"))
+embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape))

 inputs, index_mapping, prompt_mapping = create_random_inputs(
 active_lora_ids=list(lora_dict.keys()),

@@ -388,8 +390,10 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:

 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
-def test_lm_head_sampler(dist_init, num_loras) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_lm_head_sampler(dist_init, num_loras, device) -> None:

+torch.set_default_device(device)
 max_loras = 8
 lora_config = LoRAConfig(max_loras=max_loras,
 max_lora_rank=8,

@@ -432,7 +436,7 @@ def test_lm_head_sampler(dist_init, num_loras) -> None:
 )
 lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

-input_ = torch.rand(20, 1024, device="cuda")
+input_ = torch.rand(20, 1024)
 mapping_info = convert_mapping(
 lora_mapping,
 id_to_index,

@@ -500,8 +504,10 @@ def test_lm_head_sampler(dist_init, num_loras) -> None:
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("orientation", ["row", "column"])
-def test_linear_parallel(dist_init, num_loras, orientation) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:

+torch.set_default_device(device)
 max_loras = 8
 lora_config = LoRAConfig(max_loras=max_loras,
 max_lora_rank=8,

@@ -597,8 +603,10 @@ def test_linear_parallel(dist_init, num_loras, orientation) -> None:
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("repeats", [2, 3])
-def test_column_parallel_packed(dist_init, num_loras, repeats) -> None:
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:

+torch.set_default_device(device)
 max_loras = 8
 lora_config = LoRAConfig(max_loras=max_loras,
 max_lora_rank=8,

@@ -5,7 +5,8 @@ from unittest.mock import patch

 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
-from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig
+from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
+DeviceConfig, LoRAConfig)
 from vllm.worker.worker import Worker


@@ -25,6 +26,7 @@ def test_worker_apply_lora(sql_lora_files):
 ),
 parallel_config=ParallelConfig(1, 1, False),
 scheduler_config=SchedulerConfig(32, 32, 32, 256),
+device_config=DeviceConfig("cuda"),
 local_rank=0,
 rank=0,
 lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
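On the engine side, the device string now travels inside a DeviceConfig object, as the worker test above shows. A rough sketch of that plumbing follows; the constructor arities are copied from the hunk above and everything else is an assumption for illustration:

from vllm.config import DeviceConfig, ParallelConfig, SchedulerConfig

device_config = DeviceConfig("cuda")                 # device comes from config, not a literal inside the worker
parallel_config = ParallelConfig(1, 1, False)        # values as used in the test above
scheduler_config = SchedulerConfig(32, 32, 32, 256)
# Worker(..., parallel_config=parallel_config,
#        scheduler_config=scheduler_config,
#        device_config=device_config, local_rank=0, rank=0, ...)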
@ -9,6 +9,10 @@ from vllm.model_executor.utils import set_random_seed
|
|||||||
|
|
||||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||||
|
|
||||||
|
CUDA_DEVICES = [
|
||||||
|
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def mock_causal_accepted_tensor(
|
def mock_causal_accepted_tensor(
|
||||||
k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor:
|
k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor:
|
||||||
@ -39,11 +43,14 @@ def mock_causal_accepted_tensor(
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"which_tokens_accepted",
|
"which_tokens_accepted",
|
||||||
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
|
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_correct_output_format(which_tokens_accepted: str, seed: int):
|
def test_correct_output_format(which_tokens_accepted: str, seed: int,
|
||||||
|
device: str):
|
||||||
"""Verify the output has correct format given predetermined accepted matrix.
|
"""Verify the output has correct format given predetermined accepted matrix.
|
||||||
"""
|
"""
|
||||||
set_random_seed(seed)
|
set_random_seed(seed)
|
||||||
|
torch.set_default_device(device)
|
||||||
|
|
||||||
batch_size = 10
|
batch_size = 10
|
||||||
k = 5
|
k = 5
|
||||||
@ -66,18 +73,15 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int):
|
|||||||
recovered_token_ids = torch.randint(low=0,
|
recovered_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, k),
|
size=(batch_size, k),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
draft_token_ids = torch.randint(low=0,
|
draft_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, k),
|
size=(batch_size, k),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
bonus_token_ids = torch.randint(low=0,
|
bonus_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, 1),
|
size=(batch_size, 1),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
|
|
||||||
rejection_sampler = RejectionSampler()
|
rejection_sampler = RejectionSampler()
|
||||||
rejection_sampler.init_gpu_tensors(rank=0)
|
rejection_sampler.init_gpu_tensors(rank=0)
|
||||||
@ -120,31 +124,24 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int):
|
|||||||
@pytest.mark.parametrize("k", list(range(1, 6)))
|
@pytest.mark.parametrize("k", list(range(1, 6)))
|
||||||
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
|
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
|
||||||
@pytest.mark.parametrize("batch_size", list(range(1, 32)))
|
@pytest.mark.parametrize("batch_size", list(range(1, 32)))
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int):
|
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
|
||||||
|
device: str):
|
||||||
|
torch.set_default_device(device)
|
||||||
rejection_sampler = RejectionSampler()
|
rejection_sampler = RejectionSampler()
|
||||||
rejection_sampler.init_gpu_tensors(rank=0)
|
rejection_sampler.init_gpu_tensors(rank=0)
|
||||||
|
|
||||||
draft_probs = torch.rand(batch_size,
|
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
|
||||||
k,
|
target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
|
||||||
vocab_size,
|
|
||||||
dtype=torch.float32,
|
|
||||||
device="cuda")
|
|
||||||
target_probs = torch.rand(batch_size,
|
|
||||||
k,
|
|
||||||
vocab_size,
|
|
||||||
dtype=torch.float32,
|
|
||||||
device="cuda")
|
|
||||||
bonus_token_ids = torch.randint(low=0,
|
bonus_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, 1),
|
size=(batch_size, 1),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
draft_token_ids = torch.randint(low=0,
|
draft_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, k),
|
size=(batch_size, k),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
|
|
||||||
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
|
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
|
||||||
draft_token_ids)
|
draft_token_ids)
|
||||||
@ -153,36 +150,28 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int):
|
|||||||
@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
|
@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
|
||||||
@pytest.mark.parametrize("which_token_ids",
|
@pytest.mark.parametrize("which_token_ids",
|
||||||
["bonus_token_ids", "draft_token_ids"])
|
["bonus_token_ids", "draft_token_ids"])
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
|
def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
|
||||||
which_token_ids: str):
|
which_token_ids: str, device: str):
|
||||||
k = 3
|
k = 3
|
||||||
batch_size = 5
|
batch_size = 5
|
||||||
vocab_size = 30_000
|
vocab_size = 30_000
|
||||||
|
torch.set_default_device(device)
|
||||||
|
|
||||||
rejection_sampler = RejectionSampler(strict_mode=True)
|
rejection_sampler = RejectionSampler(strict_mode=True)
|
||||||
rejection_sampler.init_gpu_tensors(rank=0)
|
rejection_sampler.init_gpu_tensors(rank=0)
|
||||||
|
|
||||||
draft_probs = torch.rand(batch_size,
|
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
|
||||||
k,
|
target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
|
||||||
vocab_size,
|
|
||||||
dtype=torch.float32,
|
|
||||||
device="cuda")
|
|
||||||
target_probs = torch.rand(batch_size,
|
|
||||||
k,
|
|
||||||
vocab_size,
|
|
||||||
dtype=torch.float32,
|
|
||||||
device="cuda")
|
|
||||||
bonus_token_ids = torch.randint(low=0,
|
bonus_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, 1),
|
size=(batch_size, 1),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
draft_token_ids = torch.randint(low=0,
|
draft_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, k),
|
size=(batch_size, k),
|
||||||
dtype=torch.int64,
|
dtype=torch.int64)
|
||||||
device="cuda")
|
|
||||||
|
|
||||||
oob_token_ids = None
|
oob_token_ids = None
|
||||||
if which_token_ids == "bonus_token_ids":
|
if which_token_ids == "bonus_token_ids":
|
||||||
@ -237,6 +226,7 @@ def test_rejection_sampling_approximates_target_distribution(
|
|||||||
probabilities are exactly equal. Rejection sampling should
|
probabilities are exactly equal. Rejection sampling should
|
||||||
still work without any NaNs or exceptions.
|
still work without any NaNs or exceptions.
|
||||||
"""
|
"""
|
||||||
|
torch.set_default_device("cpu")
|
||||||
set_random_seed(seed)
|
set_random_seed(seed)
|
||||||
|
|
||||||
helper = _CorrectnessTestHelper(
|
helper = _CorrectnessTestHelper(
|
||||||
|
@ -31,24 +31,26 @@ def _prepare_test(
|
|||||||
batch_size: int
|
batch_size: int
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
|
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
|
||||||
vocab_size = 32000
|
vocab_size = 32000
|
||||||
input_tensor = torch.rand((batch_size, 1024),
|
input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
|
||||||
device="cuda",
|
|
||||||
dtype=torch.float16)
|
|
||||||
fake_logits = torch.full((batch_size, vocab_size),
|
fake_logits = torch.full((batch_size, vocab_size),
|
||||||
1e-2,
|
1e-2,
|
||||||
device=input_tensor.device,
|
|
||||||
dtype=input_tensor.dtype)
|
dtype=input_tensor.dtype)
|
||||||
sampler = MockLogitsSampler(32000, fake_logits)
|
sampler = MockLogitsSampler(32000, fake_logits)
|
||||||
model_runner = ModelRunner(None, None, None, None)
|
model_runner = ModelRunner(None, None, None, None, None)
|
||||||
return input_tensor, fake_logits, sampler, model_runner
|
return input_tensor, fake_logits, sampler, model_runner
|
||||||
|
|
||||||
|
|
||||||
RANDOM_SEEDS = list(range(128))
|
RANDOM_SEEDS = list(range(128))
|
||||||
|
CUDA_DEVICES = [
|
||||||
|
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||||
def test_sampler_all_greedy(seed: int):
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
|
def test_sampler_all_greedy(seed: int, device: str):
|
||||||
set_random_seed(seed)
|
set_random_seed(seed)
|
||||||
|
torch.set_default_device(device)
|
||||||
batch_size = random.randint(1, 256)
|
batch_size = random.randint(1, 256)
|
||||||
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
|
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
|
||||||
batch_size)
|
batch_size)
|
||||||
@@ -81,8 +83,10 @@ def test_sampler_all_greedy(seed: int):


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_all_random(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_random(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, fake_logits, sampler, model_runner = _prepare_test(
         batch_size)
@@ -120,8 +124,10 @@ def test_sampler_all_random(seed: int):


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_all_beam(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_all_beam(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, _, sampler, model_runner = _prepare_test(batch_size)

@@ -156,8 +162,10 @@ def test_sampler_all_beam(seed: int):


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_mixed(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_mixed(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, fake_logits, sampler, model_runner = _prepare_test(
         batch_size)
@@ -212,8 +220,10 @@ def test_sampler_mixed(seed: int):


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_logits_processors(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_logits_processors(seed: int, device: str):
     set_random_seed(seed)
+    torch.set_default_device(device)
     batch_size = random.randint(1, 256)
     input_tensor, _, sampler, model_runner = _prepare_test(batch_size)

@@ -252,14 +262,15 @@ def test_sampler_logits_processors(seed: int):


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_sampler_top_k_top_p(seed: int):
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_top_k_top_p(seed: int, device: str):
     set_random_seed(seed)
     batch_size = random.randint(1, 256)
     top_k = random.randint(100, 500)
     top_p = random.random() * 0.1
     vocab_size = 32000
     input_tensor = torch.rand((batch_size, 1024),
-                              device="cuda",
+                              device=device,
                               dtype=torch.float16)
     fake_logits = torch.normal(0,
                                5,
@@ -267,7 +278,7 @@ def test_sampler_top_k_top_p(seed: int):
                               device=input_tensor.device,
                               dtype=input_tensor.dtype)
     sampler = MockLogitsSampler(32000, fake_logits)
-    model_runner = ModelRunner(None, None, None, None)
+    model_runner = ModelRunner(None, None, None, None, None)

     generation_model = GenerationMixin()
     generation_config = GenerationConfig(top_k=top_k,
@@ -84,7 +84,7 @@ def create_worker(cls: type,
     )

     (model_config, cache_config, parallel_config, scheduler_config,
-     _) = engine_args.create_engine_configs()
+     device_config, _) = engine_args.create_engine_configs()

     distributed_init_method = get_distributed_init_method(
         get_ip(), get_open_port())
@@ -93,6 +93,7 @@ def create_worker(cls: type,
         model_config=model_config,
         parallel_config=parallel_config,
         scheduler_config=scheduler_config,
+        device_config=device_config,
         local_rank=0,
         rank=0,
         distributed_init_method=distributed_init_method,
@@ -6,7 +6,7 @@ from vllm.worker.model_runner import ModelRunner


 def test_prepare_prompt():
-    model_runner = ModelRunner(None, None, None, None)
+    model_runner = ModelRunner(None, None, None, None, None)
     model_runner.set_block_size(16)

     batch_size = random.randint(1, 256)
@@ -444,6 +444,12 @@ class SchedulerConfig:
                 f"({self.max_num_seqs}).")


+class DeviceConfig:
+
+    def __init__(self, device: str = "cuda") -> None:
+        self.device = torch.device(device)
+
+
 @dataclass
 class LoRAConfig:
     max_lora_rank: int
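Note: DeviceConfig is deliberately thin: it normalizes the configured device
string into a torch.device once, and everything downstream branches on its
.type instead of comparing raw strings. A short sketch of the intended use
(the helper function is illustrative, not part of the commit):

    from vllm.config import DeviceConfig

    def describe(device_config: DeviceConfig) -> str:
        # device_config.device is a torch.device, e.g. torch.device("cuda")
        if device_config.device.type == "cuda":
            return "running on a CUDA-compatible accelerator"
        return f"unsupported device type: {device_config.device}"

    print(describe(DeviceConfig()))   # DeviceConfig() defaults to "cuda"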
@@ -3,8 +3,8 @@ import dataclasses
 from dataclasses import dataclass
 from typing import Optional, Tuple

-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, LoRAConfig)


 @dataclass
@@ -43,6 +43,7 @@ class EngineArgs:
     lora_extra_vocab_size: int = 256
     lora_dtype = 'auto'
     max_cpu_loras: Optional[int] = None
+    device: str = 'cuda'

     def __post_init__(self):
         if self.tokenizer is None:
@@ -127,13 +128,13 @@ class EngineArgs:
             '--kv-cache-dtype',
             type=str,
             choices=['auto', 'fp8_e5m2'],
-            default='auto',
+            default=EngineArgs.kv_cache_dtype,
             help='Data type for kv cache storage. If "auto", will use model '
             'data type. Note FP8 is not supported when cuda version is '
             'lower than 11.8.')
         parser.add_argument('--max-model-len',
                             type=int,
-                            default=None,
+                            default=EngineArgs.max_model_len,
                             help='model context length. If unspecified, '
                             'will be automatically derived from the model.')
         # Parallel arguments
@@ -154,6 +155,7 @@ class EngineArgs:
         parser.add_argument(
             '--max-parallel-loading-workers',
             type=int,
+            default=EngineArgs.max_parallel_loading_workers,
             help='load model sequentially in multiple batches, '
             'to avoid RAM OOM when using tensor '
             'parallel and large models')
@@ -200,7 +202,7 @@ class EngineArgs:
             '-q',
             type=str,
             choices=['awq', 'gptq', 'squeezellm', None],
-            default=None,
+            default=EngineArgs.quantization,
             help='Method used to quantize the weights. If '
             'None, we first check the `quantization_config` '
             'attribute in the model config file. If that is '
@@ -255,6 +257,13 @@ class EngineArgs:
             help=('Maximum number of LoRAs to store in CPU memory. '
                   'Must be >= than max_num_seqs. '
                   'Defaults to max_num_seqs.'))
+        parser.add_argument(
+            "--device",
+            type=str,
+            default=EngineArgs.device,
+            choices=["cuda"],
+            help=('Device type for vLLM execution. '
+                  'Currently, only CUDA-compatible devices are supported.'))
         return parser

     @classmethod
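Note: because the flag is declared in EngineArgs.add_cli_args, any entrypoint
that builds its parser from EngineArgs picks up --device automatically; today
the only accepted value is "cuda". A hedged sketch of both usage styles (the
model name and the script invocation are illustrative, not prescribed by the
commit):

    from vllm.engine.arg_utils import EngineArgs

    # Programmatic: the dataclass default mirrors the CLI default ("cuda").
    engine_args = EngineArgs(model="facebook/opt-125m", device="cuda")

    # CLI: appended to an existing invocation that already uses these args,
    # e.g. one of the benchmark scripts touched earlier in this commit:
    #   python benchmark_throughput.py --model facebook/opt-125m --device cuda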
@@ -268,7 +277,8 @@ class EngineArgs:
     def create_engine_configs(
         self,
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
-               Optional[LoRAConfig]]:
+               DeviceConfig, Optional[LoRAConfig]]:
+        device_config = DeviceConfig(self.device)
         model_config = ModelConfig(self.model, self.tokenizer,
                                    self.tokenizer_mode, self.trust_remote_code,
                                    self.download_dir, self.load_format,
@@ -296,7 +306,8 @@ class EngineArgs:
             lora_dtype=self.lora_dtype,
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
-        return model_config, cache_config, parallel_config, scheduler_config, lora_config
+        return (model_config, cache_config, parallel_config, scheduler_config,
+                device_config, lora_config)


 @dataclass
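Note: create_engine_configs() now returns six values instead of five, so every
caller must unpack the extra DeviceConfig; the create_worker test helper
earlier in this diff follows the same pattern. A small sketch (model name is
illustrative):

    from vllm.engine.arg_utils import EngineArgs

    engine_args = EngineArgs(model="facebook/opt-125m")
    (model_config, cache_config, parallel_config, scheduler_config,
     device_config, lora_config) = engine_args.create_engine_configs()
    assert device_config.device.type == "cuda"   # the default device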
@@ -6,8 +6,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple,
                     Union)

 from vllm.lora.request import LoRARequest
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, LoRAConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics import StatLogger, Stats
@@ -53,6 +53,7 @@ class LLMEngine:
             management.
         parallel_config: The configuration related to distributed execution.
         scheduler_config: The configuration related to the request scheduler.
+        device_config: The configuration related to the device.
         placement_group: Ray placement group for distributed execution.
             Required for distributed execution.
         log_stats: Whether to log statistics.
@@ -64,6 +65,7 @@ class LLMEngine:
         cache_config: CacheConfig,
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
         placement_group: Optional["PlacementGroup"],
         log_stats: bool,
@@ -85,6 +87,7 @@ class LLMEngine:
             f"quantization={model_config.quantization}, "
             f"enforce_eager={model_config.enforce_eager}, "
             f"kv_cache_dtype={cache_config.cache_dtype}, "
+            f"device_config={device_config.device}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.

@@ -93,6 +96,7 @@ class LLMEngine:
         self.lora_config = lora_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
+        self.device_config = device_config
         self.log_stats = log_stats
         self._verify_args()

@@ -138,6 +142,7 @@ class LLMEngine:
             self.model_config,
             self.parallel_config,
             self.scheduler_config,
+            self.device_config,
             local_rank=0,
             rank=0,
             distributed_init_method=distributed_init_method,
@@ -233,6 +238,7 @@ class LLMEngine:
         model_config = copy.deepcopy(self.model_config)
         parallel_config = copy.deepcopy(self.parallel_config)
         scheduler_config = copy.deepcopy(self.scheduler_config)
+        device_config = copy.deepcopy(self.device_config)

         for rank, (worker, (node_id,
                             _)) in enumerate(zip(self.workers,
@@ -244,6 +250,7 @@ class LLMEngine:
                 model_config,
                 parallel_config,
                 scheduler_config,
+                device_config,
                 local_rank,
                 rank,
                 distributed_init_method,
@@ -257,6 +264,7 @@ class LLMEngine:
             model_config,
             parallel_config,
             scheduler_config,
+            device_config,
             driver_local_rank,
             driver_rank,
             distributed_init_method,
@@ -89,9 +89,7 @@ class ScaledActivation(nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         self.scales = nn.Parameter(
-            torch.empty(intermediate_size_per_partition,
-                        dtype=params_dtype,
-                        device="cuda"))
+            torch.empty(intermediate_size_per_partition, dtype=params_dtype))
         set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -200,7 +200,7 @@ def _make_alibi_bias(
     seq_len: int,
     dtype: torch.dtype,
 ) -> LowerTriangularMaskWithTensorBias:
-    bias = torch.arange(seq_len, dtype=dtype, device="cuda")
+    bias = torch.arange(seq_len, dtype=dtype)
     # NOTE(zhuohan): HF uses
     #     `bias = bias[None, :].repeat(prompt_len, 1)`
     # here. We find that both biases give the same results, but
@@ -54,7 +54,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
                        params_dtype: torch.dtype) -> Dict[str, Any]:
         weight = Parameter(torch.empty(output_size_per_partition,
                                        input_size_per_partition,
-                                       device=torch.cuda.current_device(),
                                        dtype=params_dtype),
                            requires_grad=False)
         set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
@@ -113,9 +112,7 @@ class ReplicatedLinear(torch.nn.Module):
         self.register_parameter(name, weight)
         if bias:
             self.bias = Parameter(
-                torch.empty(self.output_size,
-                            device=torch.cuda.current_device(),
-                            dtype=self.params_dtype))
+                torch.empty(self.output_size, dtype=self.params_dtype))
             set_weight_attrs(self.bias, {"output_dim": 0})
         else:
             self.register_parameter("bias", None)
@@ -183,7 +180,6 @@ class ColumnParallelLinear(torch.nn.Module):
         if bias:
             self.bias = Parameter(
                 torch.empty(self.output_size_per_partition,
-                            device=torch.cuda.current_device(),
                             dtype=params_dtype))
             set_weight_attrs(self.bias, {
                 "output_dim": 0,
@@ -509,9 +505,7 @@ class RowParallelLinear(torch.nn.Module):

         if bias:
             self.bias = Parameter(
-                torch.empty(self.output_size,
-                            device=torch.cuda.current_device(),
-                            dtype=params_dtype))
+                torch.empty(self.output_size, dtype=params_dtype))
             set_weight_attrs(self.bias, {
                 "output_dim": 0,
                 "weight_loader": self.weight_loader,
@@ -96,7 +96,6 @@ class AWQLinearMethod(LinearMethodBase):
             torch.empty(
                 input_size_per_partition,
                 output_size_per_partition // self.quant_config.pack_factor,
-                device="cuda",
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -112,7 +111,6 @@ class AWQLinearMethod(LinearMethodBase):
             torch.empty(
                 input_size_per_partition // self.quant_config.group_size,
                 output_size_per_partition // self.quant_config.pack_factor,
-                device="cuda",
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -128,7 +126,6 @@ class AWQLinearMethod(LinearMethodBase):
             torch.empty(
                 input_size_per_partition // self.quant_config.group_size,
                 output_size_per_partition,
-                device="cuda",
                 dtype=params_dtype,
             ),
             requires_grad=False,
@@ -127,7 +127,6 @@ class GPTQLinearMethod(LinearMethodBase):
             torch.empty(
                 input_size_per_partition // self.quant_config.pack_factor,
                 output_size_per_partition,
-                device="cuda",
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -145,7 +144,6 @@ class GPTQLinearMethod(LinearMethodBase):
                     i // self.quant_config.group_size
                     for i in range(input_size_per_partition)
                 ],
-                device="cuda",
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -156,7 +154,6 @@ class GPTQLinearMethod(LinearMethodBase):
             torch.empty(
                 scale_and_zero_size,
                 output_size_per_partition // self.quant_config.pack_factor,
-                device="cuda",
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -172,7 +169,6 @@ class GPTQLinearMethod(LinearMethodBase):
             torch.empty(
                 scale_and_zero_size,
                 output_size_per_partition,
-                device="cuda",
                 dtype=params_dtype,
             ),
             requires_grad=False,
@@ -80,7 +80,6 @@ class SqueezeLLMLinearMethod(LinearMethodBase):
             torch.empty(
                 input_size_per_partition // self.quant_config.pack_factor,
                 output_size_per_partition,
-                device="cuda",
                 dtype=torch.int32,
             ),
             requires_grad=False,
@@ -96,7 +95,6 @@ class SqueezeLLMLinearMethod(LinearMethodBase):
             torch.empty(
                 output_size,
                 self.quant_config.weight_bits**2,
-                device="cuda",
                 dtype=params_dtype,
             ),
             requires_grad=False,
@@ -118,12 +116,12 @@ class SqueezeLLMLinearMethod(LinearMethodBase):
         out_shape = x.shape[:-1] + (qweight.shape[-1], )
         reshaped_x = x.reshape(-1, x.shape[-1])
         if is_hip():
-            out_f = torch.zeros(out_shape, device="cuda", dtype=torch.float)
+            out_f = torch.zeros(out_shape, dtype=torch.float)
             ops.squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table)
             out = out_f.to(dtype=torch.float16)
         else:
             # NOTE: The output tensor should be zero-initialized.
-            out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
+            out = torch.zeros(out_shape, dtype=torch.float16)
             ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table)

         if bias is not None:
@@ -77,16 +77,13 @@ class RotaryEmbedding(nn.Module):
         # create the cache on GPU for faster initialization. This may cause
         # a slight numerical difference between the HF implementation and ours.
         inv_freq = 1.0 / (base**(torch.arange(
-            0, self.rotary_dim, 2, dtype=torch.float, device="cuda") /
-                                 self.rotary_dim))
+            0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
         return inv_freq

     def _compute_cos_sin_cache(self) -> torch.Tensor:
         """Compute the cos and sin cache."""
         inv_freq = self._compute_inv_freq(self.base)
-        t = torch.arange(self.max_position_embeddings,
-                         dtype=torch.float,
-                         device="cuda")
+        t = torch.arange(self.max_position_embeddings, dtype=torch.float)

         freqs = torch.einsum("i,j -> ij", t, inv_freq)
         cos = freqs.cos()
@@ -174,7 +171,7 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding):
         # Thus, the maximum length after applying the rope scaling is
         # self.max_position_embeddings * self.scaling_factor.
         max_len = self.max_position_embeddings * self.scaling_factor
-        t = torch.arange(max_len, dtype=torch.float, device="cuda")
+        t = torch.arange(max_len, dtype=torch.float)
         t = t / self.scaling_factor

         freqs = torch.einsum("i,j -> ij", t, inv_freq)
@@ -214,7 +211,7 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
                 (self.scaling_factor - 1))**(self.rotary_dim /
                                              (self.rotary_dim - 2))
         inv_freq = self._compute_inv_freq(base)
-        t = torch.arange(max_len, dtype=torch.float, device="cuda")
+        t = torch.arange(max_len, dtype=torch.float)

         freqs = torch.einsum("i,j -> ij", t, inv_freq)
         cos = freqs.cos()
@@ -297,8 +294,8 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
                          is_neox_style)

     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
-        pos_freqs = self.base**(torch.arange(
-            0, self.rotary_dim, 2, dtype=torch.float, device="cuda") /
+        pos_freqs = self.base**(
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) /
             self.rotary_dim)
         inv_freq_extrapolation = 1.0 / pos_freqs
         inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
@@ -308,8 +305,8 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
                                                 self.max_position_embeddings)
         # Get n-d rotational scaling corrected for extrapolation
         inv_freq_mask = (1 - _yarn_linear_ramp_mask(
-            low, high, self.rotary_dim // 2, dtype=torch.float,
-            device="cuda")) * self.extrapolation_factor
+            low, high, self.rotary_dim // 2,
+            dtype=torch.float)) * self.extrapolation_factor
         inv_freq = inv_freq_interpolation * (
             1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
         return inv_freq
@@ -317,7 +314,6 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
     def _compute_cos_sin_cache(self) -> torch.Tensor:
         inv_freq = self._compute_inv_freq(self.scaling_factor)
         t = torch.arange(self.max_position_embeddings * self.scaling_factor,
-                         device="cuda",
                          dtype=torch.float32)
         freqs = torch.einsum("i,j -> ij", t, inv_freq)
         cos = (freqs.cos() * self.mscale)
@@ -77,7 +77,6 @@ class VocabParallelEmbedding(torch.nn.Module):
         self.weight = Parameter(
             torch.empty(self.num_embeddings_per_partition,
                         self.embedding_dim,
-                        device=torch.cuda.current_device(),
                         dtype=params_dtype))
         set_weight_attrs(self.weight, {
             "parallel_dim": 0,
@@ -139,7 +138,6 @@ class ParallelLMHead(VocabParallelEmbedding):
         if bias:
             self.bias = Parameter(
                 torch.empty(self.num_embeddings_per_partition,
-                            device=torch.cuda.current_device(),
                             dtype=params_dtype))
             set_weight_attrs(self.bias, {
                 "parallel_dim": 0,
@@ -5,7 +5,7 @@ from typing import Optional, Type
 import torch
 import torch.nn as nn

-from vllm.config import ModelConfig, LoRAConfig
+from vllm.config import DeviceConfig, ModelConfig, LoRAConfig
 from vllm.model_executor.models import ModelRegistry
 from vllm.model_executor.weight_utils import (get_quant_config,
                                               initialize_dummy_weights)
@@ -38,6 +38,7 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]:


 def get_model(model_config: ModelConfig,
+              device_config: DeviceConfig,
               lora_config: Optional[LoRAConfig] = None) -> nn.Module:
     model_class = _get_model_architecture(model_config)

@@ -64,7 +65,7 @@ def get_model(model_config: ModelConfig,
     with _set_default_torch_dtype(model_config.dtype):
         # Create a model instance.
         # The weights will be initialized as empty tensors.
-        with torch.device("cuda"):
+        with torch.device(device_config.device):
             if getattr(model_class, "supports_lora", False):
                 model = model_class(model_config.hf_config, linear_method,
                                     lora_config)
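Note: get_model() now instantiates the model inside "with torch.device(...)"
driven by the DeviceConfig, so freshly constructed parameters are allocated on
the configured device rather than an assumed "cuda". The underlying PyTorch
2.x mechanism in isolation (module and sizes are illustrative):

    import torch
    import torch.nn as nn

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.device(device):
        layer = nn.Linear(16, 32)   # weight and bias are created on `device`
    assert layer.weight.device.type == device.type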
@@ -228,6 +228,7 @@ def create_kv_caches_with_random(
     device: Optional[str] = "cuda",
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
     torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
         torch.cuda.manual_seed(seed)

     if isinstance(cache_dtype, str):
@@ -104,11 +104,13 @@ class CacheEngine:
             size=(self.num_cpu_blocks, *key_block_shape),
             dtype=self.dtype,
             pin_memory=pin_memory,
+            device="cpu",
         )
         value_blocks = torch.empty(
             size=(self.num_cpu_blocks, *value_block_shape),
             dtype=self.dtype,
             pin_memory=pin_memory,
+            device="cpu",
         )
         cpu_cache.append((key_blocks, value_blocks))
         return cpu_cache
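Note: the CPU swap blocks are now allocated with an explicit device="cpu" so
they never follow a non-CPU default device, while pin_memory keeps the fast
host-to-device copy path. The same allocation pattern in isolation (shapes and
dtype are illustrative):

    import torch

    pin = torch.cuda.is_available()   # pinning only pays off with a CUDA target
    key_block = torch.empty(
        (16, 128),
        dtype=torch.float16,
        pin_memory=pin,
        device="cpu",
    )
    assert key_block.device.type == "cpu" and key_block.is_pinned() == pin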
@@ -5,7 +5,7 @@ import numpy as np
 import torch
 import torch.nn as nn

-from vllm.config import ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig
+from vllm.config import DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig
 from vllm.logger import init_logger
 from vllm.model_executor import get_model, InputMetadata, SamplingMetadata
 from vllm.model_executor.parallel_utils.communication_op import (
@@ -35,6 +35,7 @@ class ModelRunner:
         model_config: ModelConfig,
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
@@ -49,7 +50,10 @@ class ModelRunner:
         # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
         self.sliding_window = (model_config.get_sliding_window()
                                if model_config is not None else None)
-        self.device = torch.device(torch.cuda.current_device())
+        self.device_config = (device_config
+                              if device_config is not None else DeviceConfig())
+        self.device = self.device_config.device

         self.model = None
         self.block_size = None  # Set after initial profiling.
         self.lora_manager = None
@@ -72,7 +76,8 @@ class ModelRunner:
         self.kv_cache_dtype = kv_cache_dtype

     def load_model(self) -> None:
-        self.model = get_model(self.model_config, self.lora_config)
+        self.model = get_model(self.model_config, self.device_config,
+                               self.lora_config)

         vocab_size = self.model.config.vocab_size

@@ -182,22 +187,25 @@ class ModelRunner:
         input_tokens = _make_tensor_with_pad(input_tokens,
                                              max_prompt_len,
                                              pad=0,
-                                             dtype=torch.long)
+                                             dtype=torch.long,
+                                             device=self.device)
         input_positions = _make_tensor_with_pad(input_positions,
                                                 max_prompt_len,
                                                 pad=0,
-                                                dtype=torch.long)
+                                                dtype=torch.long,
+                                                device=self.device)
         slot_mapping = _make_tensor_with_pad(slot_mapping,
                                              max_prompt_len,
                                              pad=_PAD_SLOT_ID,
-                                             dtype=torch.long)
+                                             dtype=torch.long,
+                                             device=self.device)
         lora_index_mapping = [
             _pad_to_max(mapping, max_prompt_len, pad=0)
             for mapping in lora_index_mapping
         ]
         context_lens_tensor = torch.tensor(context_lens,
                                            dtype=torch.int,
-                                           device='cuda')
+                                           device=self.device)
         # Prepare prefix block tables
         max_prompt_block_table_len = max(len(t) for t in prefix_block_tables)
         block_tables = _make_tensor_with_pad(
@@ -205,15 +213,16 @@ class ModelRunner:
             max_len=max_prompt_block_table_len,
             pad=0,
             dtype=torch.int,
+            device=self.device,
         )
         start_loc_tensor = torch.arange(0,
                                         len(prompt_lens) * max_prompt_len,
                                         max_prompt_len,
                                         dtype=torch.long,
-                                        device='cuda')
+                                        device=self.device)
         prompt_lens_tensor = torch.tensor(prompt_lens,
                                           dtype=torch.long,
-                                          device='cuda')
+                                          device=self.device)

         input_metadata = InputMetadata(
             is_prompt=True,
@@ -305,20 +314,20 @@ class ModelRunner:
                                              max_len=1,
                                              pad=0,
                                              dtype=torch.long,
-                                             device="cuda")
+                                             device=self.device)
         input_positions = _make_tensor_with_pad(input_positions,
                                                 max_len=1,
                                                 pad=0,
                                                 dtype=torch.long,
-                                                device="cuda")
+                                                device=self.device)
         slot_mapping = _make_tensor_with_pad(slot_mapping,
                                              max_len=1,
                                              pad=_PAD_SLOT_ID,
                                              dtype=torch.long,
-                                             device="cuda")
+                                             device=self.device)
         context_lens = torch.tensor(context_lens,
                                     dtype=torch.int,
-                                    device="cuda")
+                                    device=self.device)

         if use_captured_graph:
             # The shape of graph_block_tables is
@@ -327,7 +336,7 @@ class ModelRunner:
             for i, block_table in enumerate(block_tables):
                 if block_table:
                     input_block_tables[i, :len(block_table)] = block_table
-            block_tables = torch.tensor(input_block_tables, device="cuda")
+            block_tables = torch.tensor(input_block_tables, device=self.device)
         else:
             max_block_table_len = max(
                 len(block_table) for block_table in block_tables)
@@ -336,7 +345,7 @@ class ModelRunner:
                 max_len=max_block_table_len,
                 pad=0,
                 dtype=torch.int,
-                device="cuda",
+                device=self.device,
             )

         lora_index_mapping = [
@@ -355,7 +364,8 @@ class ModelRunner:
             use_cuda_graph=use_captured_graph,
             kv_cache_dtype=self.kv_cache_dtype,
         )
-        return input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping, lora_requests
+        return (input_tokens, input_positions, input_metadata,
+                lora_index_mapping, lora_prompt_mapping, lora_requests)

     def _prepare_sample(
         self,
@@ -410,9 +420,13 @@ class ModelRunner:

         selected_token_indices = _async_h2d(selected_token_indices,
                                             dtype=torch.long,
+                                            target_device=self.device,
                                             pin_memory=not self.in_wsl)
         categorized_sample_indices = {
-            t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl)
+            t: _async_h2d(seq_ids,
+                          dtype=torch.int,
+                          target_device=self.device,
+                          pin_memory=not self.in_wsl)
             for t, seq_ids in categorized_sample_indices.items()
         }

@@ -511,7 +525,8 @@ class ModelRunner:
             perform_sampling=False,
         )

-        return input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping
+        return (input_tokens, input_positions, input_metadata,
+                sampling_metadata, lora_requests, lora_mapping)

     @torch.inference_mode()
     def execute_model(
@@ -519,8 +534,9 @@ class ModelRunner:
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
         kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
     ) -> Optional[SamplerOutput]:
-        input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping = (
-            self.prepare_input_tensors(seq_group_metadata_list))
+        (input_tokens, input_positions, input_metadata, sampling_metadata,
+         lora_requests,
+         lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)

         if self.lora_config:
             self.set_active_loras(lora_requests, lora_mapping)
@@ -789,14 +805,10 @@ def _make_tensor_with_pad(
     max_len: int,
     pad: int,
     dtype: torch.dtype,
-    device: Union[str, torch.device] = "cuda",
-    pin_memory: bool = False,
+    device: Optional[Union[str, torch.device]],
 ) -> torch.Tensor:
     padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x]
-    return torch.tensor(padded_x,
-                        dtype=dtype,
-                        device=device,
-                        pin_memory=pin_memory and str(device) == "cpu")
+    return torch.tensor(padded_x, dtype=dtype, device=device)


 def _get_graph_batch_size(batch_size: int) -> int:
@@ -808,6 +820,11 @@ def _get_graph_batch_size(batch_size: int) -> int:
     return (batch_size + 7) // 8 * 8


-def _async_h2d(data: list, dtype, pin_memory):
-    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory)
-    return t.to(device="cuda", non_blocking=True)
+def _async_h2d(
+    data: list,
+    dtype: torch.dtype,
+    target_device: Union[str, torch.device],
+    pin_memory: bool,
+) -> torch.Tensor:
+    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
+    return t.to(device=target_device, non_blocking=True)
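Note: _async_h2d now stages the tensor on the CPU (optionally in pinned
memory) and copies it to the caller-supplied target_device with
non_blocking=True, instead of assuming the destination is "cuda". The same
pattern written out independently (function name and values are illustrative):

    import torch

    def copy_to_device_async(data, target_device):
        pin = torch.cuda.is_available() and torch.device(target_device).type == "cuda"
        # Pinned host memory lets the copy overlap with device work on CUDA;
        # for a CPU target the non_blocking flag is effectively a no-op.
        staged = torch.tensor(data, dtype=torch.long, pin_memory=pin, device="cpu")
        return staged.to(device=target_device, non_blocking=True)

    out = copy_to_device_async([1, 2, 3],
                               "cuda" if torch.cuda.is_available() else "cpu")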
@@ -6,8 +6,8 @@ from typing import Dict, List, Tuple, Set, Optional
 import torch
 import torch.distributed

-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, LoRAConfig)
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.parallel_utils.communication_op import (
     broadcast_tensor_dict)
@@ -33,6 +33,7 @@ class Worker:
         model_config: ModelConfig,
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
@@ -43,6 +44,7 @@ class Worker:
         self.model_config = model_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
+        self.device_config = device_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
@@ -54,6 +56,7 @@ class Worker:
         self.model_runner = ModelRunner(model_config,
                                         parallel_config,
                                         scheduler_config,
+                                        device_config,
                                         lora_config=self.lora_config,
                                         kv_cache_dtype=kv_cache_dtype,
                                         is_driver_worker=is_driver_worker)
@@ -65,6 +68,7 @@ class Worker:
         self.gpu_cache = None

     def init_model(self) -> None:
+        if self.device_config.device.type == "cuda":
             # torch.distributed.all_reduce does not free the input tensor until
             # the synchronization point. This causes the memory usage to grow
             # as the number of all_reduce calls increases. This env var disables
@@ -79,7 +83,9 @@ class Worker:
             torch.cuda.set_device(self.device)

             _check_if_gpu_supports_dtype(self.model_config.dtype)
+        else:
+            raise RuntimeError(
+                f"Not support device type: {self.device_config.device}")
         # Initialize the distributed environment.
         init_distributed_environment(self.parallel_config, self.rank,
                                      self.distributed_init_method)