vllm/cacheflow/worker/controller.py

from typing import Dict, List, Union, Tuple, Optional

try:
    import ray
except ImportError:
    ray = None

from cacheflow.core.scheduler import Scheduler
from cacheflow.sequence import SequenceGroupInputs
from cacheflow.worker.worker import Worker


# Describes one worker device as a (rank, node resource, device id) triple;
# the node resource is the node IP.
DeviceID = Tuple[int, str, int]


class Controller:
    """Owns the workers of a single stage and drives them one step at a time.

    One ``Worker`` is constructed per entry of ``stage_devices``. Each call to
    ``execute_stage`` is fanned out to every worker, the per-worker results
    are checked for equality, and the agreed result is handed to the node
    registered through ``set_next``.
    """

    def __init__(
        self,
        stage_id: int,
        stage_devices: List['DeviceID'],
        world_size: int,
        tensor_parallel_size: int,
        pipeline_parallel_size: int,
        distributed_init_method: str,
        model_name: str,
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        dtype: str,
        seed: int,
        cache_dir: Optional[str],
        use_dummy_weights: bool,
        use_np_cache: bool,
        max_num_batched_tokens: int,
        use_ray: bool,
    ) -> None:
        """Create one worker for every device assigned to this stage.

        Args:
            stage_id: Index of this stage; stage 0 is flagged as the first
                stage.
            stage_devices: ``(rank, node resource, device id)`` triples, one
                per worker to create.
            use_ray: If True, every worker is created as a Ray actor;
                otherwise ``Worker`` is instantiated directly in this
                process.

        All remaining arguments are passed unchanged to each ``Worker``
        constructor (together with the per-device ``rank``).

        Raises:
            ImportError: If ``use_ray`` is True but the optional ``ray``
                package could not be imported.
        """
        # `ray` is an optional import and is None when missing. Fail early
        # with a clear message instead of an AttributeError on None below.
        if use_ray and ray is None:
            raise ImportError(
                "use_ray=True requires the `ray` package, but it could not "
                "be imported.")

        self.stage_id = stage_id
        self.stage_devices = stage_devices
        self.model_name = model_name
        self.block_size = block_size
        self.num_gpu_blocks = num_gpu_blocks
        self.num_cpu_blocks = num_cpu_blocks
        self.use_ray = use_ray

        # Which pipeline stage is this node assigned to?
        self.is_first_stage = stage_id == 0
        # Stays False until set_next() is given a Scheduler.
        self.is_last_stage = False
        # Downstream node; populated by set_next().
        self.next_node: Optional[Union['Controller', 'Scheduler']] = None

        self.workers: List['Worker'] = []
        for rank, node_resource, _device_id in stage_devices:
            if self.use_ray:
                # Each actor asks for one GPU plus a tiny share of the
                # per-node custom resource.
                # NOTE(review): presumably this pins the actor to the node
                # advertising `node_resource` -- confirm against the cluster
                # setup code.
                worker_cls = ray.remote(num_cpus=0,
                                        num_gpus=1,
                                        resources={node_resource: 1e-5})(Worker).remote
            else:
                worker_cls = Worker
            worker = worker_cls(
                model_name=model_name,
                block_size=block_size,
                num_gpu_blocks=num_gpu_blocks,
                num_cpu_blocks=num_cpu_blocks,
                dtype=dtype,
                seed=seed,
                distributed_init_method=distributed_init_method,
                rank=rank,
                world_size=world_size,
                tensor_parallel_size=tensor_parallel_size,
                pipeline_parallel_size=pipeline_parallel_size,
                cache_dir=cache_dir,
                use_dummy_weights=use_dummy_weights,
                use_np_cache=use_np_cache,
                max_num_batched_tokens=max_num_batched_tokens,
            )
            self.workers.append(worker)

    def set_next(
        self,
        next_node: Union['Controller', 'Scheduler'],
    ) -> None:
        """Register the node that receives this stage's output.

        The stage is treated as the last one exactly when ``next_node`` is a
        ``Scheduler``.
        """
        self.next_node = next_node
        self.is_last_stage = isinstance(next_node, Scheduler)

    def execute_stage(
        self,
        input_seq_groups: List['SequenceGroupInputs'],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> None:
        """Run one step on every worker and forward the agreed output.

        All four arguments are passed unchanged to each worker's
        ``execute_stage``. When this is the last stage, the common output is
        delivered through ``next_node.post_step``.

        Raises:
            AssertionError: If the workers return differing outputs, or if
                this is not the last stage (pipeline parallelism is not
                supported yet). These are raised explicitly so the checks
                still run under ``python -O``.
        """
        all_outputs = []
        for worker in self.workers:
            executor = (worker.execute_stage.remote
                        if self.use_ray else worker.execute_stage)
            output = executor(
                input_seq_groups,
                blocks_to_swap_in,
                blocks_to_swap_out,
                blocks_to_copy,
            )
            all_outputs.append(output)

        if self.use_ray:
            # The remote calls above returned handles; resolve them together.
            all_outputs = ray.get(all_outputs)

        # Make sure all workers have the same results.
        output = all_outputs[0]
        for other_output in all_outputs[1:]:
            if not output == other_output:
                raise AssertionError(
                    "Workers of the same stage returned different outputs.")

        if not self.is_last_stage:
            # TODO: Support pipeline parallelism.
            raise AssertionError("Pipeline parallelism is not supported yet.")
        self.next_node.post_step(output)
New weight loader without np copy (#52) 2023-05-03 15:32:04 +08:00			`from typing import Dict, List, Union, Tuple, Optional`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`try:`
			`import ray`
			`except ImportError:`
			`ray = None`
Add controller 2023-02-23 09:32:19 +00:00
Refactor system architecture (#82) 2023-05-09 15:30:12 -07:00			`from cacheflow.core.scheduler import Scheduler`
Support beam search & parallel generation (#7) 2023-03-10 09:58:21 -08:00			`from cacheflow.sequence import SequenceGroupInputs`
Add controller 2023-02-23 09:32:19 +00:00			`from cacheflow.worker.worker import Worker`


Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`DeviceID = Tuple[int, str, int] # rank, node resource (node IP), device id`


Add controller 2023-02-23 09:32:19 +00:00			`class Controller:`

			`def __init__(`
			`self,`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`stage_id: int,`
			`stage_devices: List[DeviceID],`
			`world_size: int,`
			`tensor_parallel_size: int,`
			`pipeline_parallel_size: int,`
			`distributed_init_method: str,`
Add controller 2023-02-23 09:32:19 +00:00			`model_name: str,`
			`block_size: int,`
			`num_gpu_blocks: int,`
			`num_cpu_blocks: int,`
Support beam search & parallel generation (#7) 2023-03-10 09:58:21 -08:00			`dtype: str,`
			`seed: int,`
New weight loader without np copy (#52) 2023-05-03 15:32:04 +08:00			`cache_dir: Optional[str],`
Add an option to use dummy model weights (#33) 2023-04-08 23:36:12 -07:00			`use_dummy_weights: bool,`
New weight loader without np copy (#52) 2023-05-03 15:32:04 +08:00			`use_np_cache: bool,`
Add CUDA graph-based all reduce launcher (#26) 2023-04-05 11:16:57 -07:00			`max_num_batched_tokens: int,`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`use_ray: bool,`
Add controller 2023-02-23 09:32:19 +00:00			`) -> None:`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`self.stage_id = stage_id`
			`self.stage_devices = stage_devices`
Add controller 2023-02-23 09:32:19 +00:00			`self.model_name = model_name`
			`self.block_size = block_size`
			`self.num_gpu_blocks = num_gpu_blocks`
			`self.num_cpu_blocks = num_cpu_blocks`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`self.use_ray = use_ray`
Add controller 2023-02-23 09:32:19 +00:00
			`# Which pipeline stage is this node assigned to?`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`self.is_first_stage = stage_id == 0`
Add controller 2023-02-23 09:32:19 +00:00			`self.is_last_stage = False`

			`self.workers: List[Worker] = []`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`for rank, node_resource, device_id in stage_devices:`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`if self.use_ray:`
			`worker_cls = ray.remote(num_cpus=0,`
			`num_gpus=1,`
			`resources={node_resource: 1e-5})(Worker).remote`
			`else:`
			`worker_cls = Worker`
			`worker = worker_cls(`
Add controller 2023-02-23 09:32:19 +00:00			`model_name=model_name,`
			`block_size=block_size,`
			`num_gpu_blocks=num_gpu_blocks,`
			`num_cpu_blocks=num_cpu_blocks,`
Set default dtype to half 2023-02-23 21:31:39 +00:00			`dtype=dtype,`
Support beam search & parallel generation (#7) 2023-03-10 09:58:21 -08:00			`seed=seed,`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`distributed_init_method=distributed_init_method,`
			`rank=rank,`
			`world_size=world_size,`
			`tensor_parallel_size=tensor_parallel_size,`
			`pipeline_parallel_size=pipeline_parallel_size,`
New weight loader without np copy (#52) 2023-05-03 15:32:04 +08:00			`cache_dir=cache_dir,`
Add an option to use dummy model weights (#33) 2023-04-08 23:36:12 -07:00			`use_dummy_weights=use_dummy_weights,`
New weight loader without np copy (#52) 2023-05-03 15:32:04 +08:00			`use_np_cache=use_np_cache,`
Add CUDA graph-based all reduce launcher (#26) 2023-04-05 11:16:57 -07:00			`max_num_batched_tokens=max_num_batched_tokens,`
Add controller 2023-02-23 09:32:19 +00:00			`)`
			`self.workers.append(worker)`

			`def set_next(`
			`self,`
			`next_node: Union['Controller', 'Scheduler'],`
			`) -> None:`
			`self.next_node = next_node`
			`self.is_last_stage = isinstance(next_node, Scheduler)`

			`def execute_stage(`
			`self,`
Support beam search & parallel generation (#7) 2023-03-10 09:58:21 -08:00			`input_seq_groups: List[SequenceGroupInputs],`
Add controller 2023-02-23 09:32:19 +00:00			`blocks_to_swap_in: Dict[int, int],`
			`blocks_to_swap_out: Dict[int, int],`
Support beam search & parallel generation (#7) 2023-03-10 09:58:21 -08:00			`blocks_to_copy: Dict[int, List[int]],`
Add controller 2023-02-23 09:32:19 +00:00			`) -> None:`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`all_outputs = []`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`for worker in self.workers:`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`executor = (worker.execute_stage.remote`
			`if self.use_ray else worker.execute_stage)`
			`output = executor(`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00			`input_seq_groups,`
			`blocks_to_swap_in,`
			`blocks_to_swap_out,`
			`blocks_to_copy,`
			`)`
Add an option to launch cacheflow without ray (#51) 2023-04-30 15:42:17 +08:00			`all_outputs.append(output)`

			`if self.use_ray:`
			`all_outputs = ray.get(all_outputs)`
Support tensor parallel (#2) 2023-03-22 04:45:42 +08:00
			`# Make sure all workers have the same results.`
			`output = all_outputs[0]`
			`for other_output in all_outputs[1:]:`
			`assert output == other_output`
Add controller 2023-02-23 09:32:19 +00:00
			`if self.is_last_stage:`
			`self.next_node.post_step(output)`
			`else:`
			`# TODO: Support pipeline parallelism.`
			`assert False`