[Hardware][CPU] Support MOE models on x86 CPU (#11831)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
parent 5959564f94
commit aa1e77a19c
@@ -5,7 +5,7 @@
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
 
 - Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ`)
+- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
 - Chunked-prefill
 - Prefix-caching
 - FP8-E5M2 KV-Caching (TODO)
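
The docs hunk above extends the CPU backend's quantization list to include GPTQ; MoE support is the subject of the code changes below. A minimal offline-inference sketch of the newly enabled MoE path (the model ID is taken from the test added in this commit; the dtype, prompt, and sampling settings are illustrative assumptions, and a CPU build of vLLM is assumed):

# Hedged usage sketch: exercising the new CPU MoE path end to end.
from vllm import LLM, SamplingParams

llm = LLM(
    model="ehristoforu/Falcon3-MoE-2x7B-Insruct",  # MoE model from the new test
    dtype="bfloat16",  # CPU backend supports FP32, FP16, and BF16
)
params = SamplingParams(temperature=0.0, max_tokens=32)
out = llm.generate(["The capital of France is"], params)
print(out[0].outputs[0].text)
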
@@ -48,6 +48,10 @@ from ...utils import check_logprobs_close
     ),
     pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
     pytest.param("bigcode/starcoder2-3b"),  # starcoder2
+    pytest.param(
+        "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
+        marks=[pytest.mark.cpu_model],
+    )
 ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
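
The test hunk registers an MoE model behind the cpu_model pytest marker so it is only selected in CPU test jobs. A standalone sketch of that pytest.param-with-marks pattern (the test body here is hypothetical, not vLLM's):

import pytest

# The cpu_model marker would normally be registered in pytest.ini/pyproject;
# unregistered markers still run but emit a warning.
@pytest.mark.parametrize("model", [
    pytest.param("bigcode/starcoder2-3b"),  # collected unconditionally
    pytest.param(
        "ehristoforu/Falcon3-MoE-2x7B-Insruct",
        marks=[pytest.mark.cpu_model],  # select with: pytest -m cpu_model
    ),
])
def test_model_id_is_namespaced(model):
    assert "/" in model
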
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
 
 if current_platform.is_cuda_alike():
     from .fused_moe import fused_experts
@@ -83,6 +84,20 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        if current_platform.is_cpu():
+            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                import intel_extension_for_pytorch as ipex
+                layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    use_prepack=True,
+                )
+            else:
+                raise NotImplementedError("CPU MOE only supports x86 arch.")
+
     def apply(
         self,
         layer: torch.nn.Module,
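
process_weights_after_loading hands the loaded expert weights to IPEX's GatedMLPMOE, which prepacks them for the fused CPU kernel. Conceptually, the fused op computes a gated MLP per selected expert and a routing-weighted sum. A naive per-token reference of that computation (a sketch under assumed tensor shapes and SwiGLU activation, not the IPEX implementation):

import torch

def gated_mlp_moe_reference(x, w13, w2, topk_weights, topk_ids):
    """Naive per-token reference for a gated-MLP MoE layer.

    x:            [num_tokens, hidden]
    w13:          [num_experts, 2 * intermediate, hidden] (gate/up fused)
    w2:           [num_experts, hidden, intermediate]
    topk_weights: [num_tokens, top_k] routing weights
    topk_ids:     [num_tokens, top_k] selected expert indices
    """
    out = torch.zeros_like(x)
    inter = w2.shape[-1]
    for t in range(x.shape[0]):
        for k in range(topk_ids.shape[1]):
            e = int(topk_ids[t, k])
            gate_up = w13[e] @ x[t]                     # [2 * intermediate]
            gate, up = gate_up[:inter], gate_up[inter:]
            h = torch.nn.functional.silu(gate) * up     # SwiGLU activation
            out[t] += topk_weights[t, k] * (w2[e] @ h)  # down projection
    return out

# Tiny shape check with random data:
x = torch.randn(4, 8)
w13 = torch.randn(3, 2 * 16, 8)
w2 = torch.randn(3, 8, 16)
ids = torch.randint(0, 3, (4, 2))
wts = torch.softmax(torch.randn(4, 2), dim=-1)
print(gated_mlp_moe_reference(x, w13, w2, wts, ids).shape)  # torch.Size([4, 8])
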
@@ -142,9 +157,29 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             topk_ids=topk_ids,
             inplace=True)
 
-    def forward_cpu(self, *args, **kwargs):
-        raise NotImplementedError(
-            "The CPU backend currently does not support MoE.")
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
+    ):
+        assert custom_routing_function is None
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+        )
 
     def forward_tpu(
         self,
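
forward_cpu now mirrors the CUDA signature but hands both routing and expert computation to the prepacked IPEX module in a single call (router_logits go in directly; there is no separate top-k step as in the CUDA path). Which forward_* runs is chosen per platform by vLLM's CustomOp; a simplified, hypothetical sketch of that dispatch pattern (not the actual CustomOp code):

import torch

class PlatformOp(torch.nn.Module):
    """Hypothetical sketch of CustomOp-style per-platform dispatch."""

    def __init__(self, platform: str):
        super().__init__()
        # vLLM resolves this once from current_platform; here it is a string.
        self._forward = {
            "cuda": self.forward_cuda,
            "cpu": self.forward_cpu,
            "tpu": self.forward_tpu,
        }[platform]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._forward(x)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return x  # placeholder: fused CUDA kernels in the real op

    def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
        return x  # placeholder: layer.ipex_fusion(...) in the real op

    def forward_tpu(self, x: torch.Tensor) -> torch.Tensor:
        return x  # placeholder: TPU path in the real op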