#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
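  // NOTE: the extension is imported from Python under the name given by
  // TORCH_EXTENSION_NAME at build time (in vLLM this is typically `vllm._C`).
  // Illustrative usage from Python, assuming that module name:
  //   from vllm._C import ops, cache_ops, cuda_utils
  //   ops.silu_and_mul(out, x)  # writes silu(x[..., :d]) * x[..., d:] into out
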
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

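  // PagedAttention reads keys/values from the block-based KV cache. The v2
  // kernel additionally splits long sequences into partitions and reduces the
  // partial results; the choice between v1 and v2 is made by the caller on the
  // Python side.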
  // Attention ops
  ops.def(
    "paged_attention_v1",
    &paged_attention_v1,
    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  ops.def(
    "paged_attention_v2",
    &paged_attention_v2,
    "Compute the attention between an input query and the cached keys/values using PagedAttention V2.");

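  // silu_and_mul and the gelu*_and_mul variants fuse the gated-MLP pattern
  // act(x[..., :d]) * x[..., d:] into a single kernel; gelu_new and gelu_fast
  // are plain element-wise GELU approximations.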
  // Activation ops
  ops.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
  ops.def(
    "gelu_and_mul",
    &gelu_and_mul,
    "Activation function used in GeGLU with `none` approximation.");
  ops.def(
    "gelu_tanh_and_mul",
    &gelu_tanh_and_mul,
    "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
    "gelu_new",
    &gelu_new,
    "GELU implementation used in GPT-2.");
  ops.def(
    "gelu_fast",
    &gelu_fast,
    "Approximate GELU implementation.");

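  // fused_add_rms_norm folds the residual addition into the normalization and
  // updates both tensors in place, saving an extra pass over the hidden states.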
  // Layernorm
  ops.def(
    "rms_norm",
    &rms_norm,
    "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def(
    "fused_add_rms_norm",
    &fused_add_rms_norm,
    "In-place fused Add and RMS Normalization");

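  // Both ops apply rotary position embeddings to the query and key tensors in
  // place; whether the GPT-NeoX (non-interleaved) or GPT-J (interleaved) layout
  // is used is selected by a boolean flag on the op (`is_neox` in the kernel
  // sources).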
  // Rotary embedding
  ops.def(
    "rotary_embedding",
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  ops.def(
    "batched_rotary_embedding",
    &batched_rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple LoRAs)");

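  // The kernels inside the #ifndef USE_ROCM block below are only compiled for
  // CUDA builds; the ops that follow the matching #endif are registered on both
  // CUDA and ROCm builds.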
  // Quantization ops
#ifndef USE_ROCM
  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
#endif

  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
  ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
  ops.def(
    "moe_align_block_size",
    &moe_align_block_size,
    "Align the number of tokens to be processed by each expert so that it is divisible by the block size.");

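  // The cache ops manage the paged KV cache: swap_blocks moves blocks between
  // device and host memory (e.g. on preemption), copy_blocks duplicates blocks
  // when sequences fork, reshape_and_cache writes freshly computed keys/values
  // into the cache layout, and convert_fp8 converts the cache to the FP8 data
  // type.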
  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def(
    "swap_blocks",
    &swap_blocks,
    "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
    "copy_blocks",
    &copy_blocks,
    "Copy the cache blocks from src to dst");
  cache_ops.def(
    "reshape_and_cache",
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
  cache_ops.def(
    "convert_fp8",
    &convert_fp8,
    "Convert the key and value cache to fp8 data type");

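  // cuda_utils exposes device-property queries used by the Python side, e.g. to
  // check the available shared memory per block when configuring kernels.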
  // Cuda utils
  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
  cuda_utils.def(
    "get_device_attribute",
    &get_device_attribute,
    "Gets the specified device attribute.");

  cuda_utils.def(
    "get_max_shared_memory_per_block_device_attribute",
    &get_max_shared_memory_per_block_device_attribute,
    "Gets the maximum shared memory per block device attribute.");

#ifndef USE_ROCM
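  // Custom all-reduce is vLLM's fast path for tensor-parallel all-reduce on
  // peer-to-peer-capable (e.g. NVLink-connected) GPUs. The Python side drives
  // the lifecycle: init_custom_ar, register_buffer / register_graph_buffers,
  // all_reduce_reg / all_reduce_unreg, and finally dispose.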
  // Custom all-reduce kernels
  pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
  custom_ar.def("dispose", &dispose, "dispose");
  custom_ar.def("meta_size", &meta_size, "meta_size");
  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
                "get_graph_buffer_ipc_meta");
  custom_ar.def("register_graph_buffers", &register_graph_buffers,
                "register_graph_buffers");
#endif
}