#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
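  // NOTE: the extension is imported from Python under the name given by
  // TORCH_EXTENSION_NAME at build time (in vLLM this is typically `vllm._C`).
  // Illustrative usage from Python, assuming that module name:
  //   from vllm._C import ops, cache_ops, cuda_utils
  //   ops.silu_and_mul(out, x)  # writes silu(x[..., :d]) * x[..., d:] into out
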
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

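  // PagedAttention reads keys/values from the block-based KV cache. The v2
  // kernel additionally splits long sequences into partitions and reduces the
  // partial results; the choice between v1 and v2 is made by the caller on the
  // Python side.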
  // Attention ops
  ops.def(
    "paged_attention_v1",
    &paged_attention_v1,
    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  ops.def(
    "paged_attention_v2",
    &paged_attention_v2,
    "Compute the attention between an input query and the cached keys/values using PagedAttention V2.");

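  // silu_and_mul and the gelu*_and_mul variants fuse the gated-MLP pattern
  // act(x[..., :d]) * x[..., d:] into a single kernel; gelu_new and gelu_fast
  // are plain element-wise GELU approximations.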
  // Activation ops
  ops.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
  ops.def(
    "gelu_and_mul",
    &gelu_and_mul,
    "Activation function used in GeGLU with `none` approximation.");
  ops.def(
    "gelu_tanh_and_mul",
    &gelu_tanh_and_mul,
    "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
    "gelu_new",
    &gelu_new,
    "GELU implementation used in GPT-2.");
  ops.def(
    "gelu_fast",
    &gelu_fast,
    "Approximate GELU implementation.");

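  // fused_add_rms_norm folds the residual addition into the normalization and
  // updates both tensors in place, saving an extra pass over the hidden states.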
  // Layernorm
  ops.def(
    "rms_norm",
    &rms_norm,
    "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def(
    "fused_add_rms_norm",
    &fused_add_rms_norm,
    "In-place fused Add and RMS Normalization");

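  // Both ops apply rotary position embeddings to the query and key tensors in
  // place; whether the GPT-NeoX (non-interleaved) or GPT-J (interleaved) layout
  // is used is selected by a boolean flag on the op (`is_neox` in the kernel
  // sources).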
  // Rotary embedding
  ops.def(
    "rotary_embedding",
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  ops.def(
    "batched_rotary_embedding",
    &batched_rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple LoRAs)");

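  // The kernels inside the #ifndef USE_ROCM block below are only compiled for
  // CUDA builds; the ops that follow the matching #endif are registered on both
  // CUDA and ROCm builds.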
  // Quantization ops
#ifndef USE_ROCM
  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
  ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
#endif

  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
  ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
  ops.def(
    "moe_align_block_size",
    &moe_align_block_size,
    "Align the number of tokens to be processed by each expert so that it is divisible by the block size.");

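  // The cache ops manage the paged KV cache: swap_blocks moves blocks between
  // device and host memory (e.g. on preemption), copy_blocks duplicates blocks
  // when sequences fork, reshape_and_cache writes freshly computed keys/values
  // into the cache layout, and convert_fp8 converts the cache to the FP8 data
  // type.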
  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def(
    "swap_blocks",
    &swap_blocks,
    "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
    "copy_blocks",
    &copy_blocks,
    "Copy the cache blocks from src to dst");
  cache_ops.def(
    "reshape_and_cache",
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
  cache_ops.def(
    "convert_fp8",
    &convert_fp8,
    "Convert the key and value cache to fp8 data type");

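  // cuda_utils exposes device-property queries used by the Python side, e.g. to
  // check the available shared memory per block when configuring kernels.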
  // Cuda utils
  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
  cuda_utils.def(
    "get_device_attribute",
    &get_device_attribute,
    "Gets the specified device attribute.");

  cuda_utils.def(
    "get_max_shared_memory_per_block_device_attribute",
    &get_max_shared_memory_per_block_device_attribute,
    "Gets the maximum shared memory per block device attribute.");

#ifndef USE_ROCM
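  // Custom all-reduce is vLLM's fast path for tensor-parallel all-reduce on
  // peer-to-peer-capable (e.g. NVLink-connected) GPUs. The Python side drives
  // the lifecycle: init_custom_ar, register_buffer / register_graph_buffers,
  // all_reduce_reg / all_reduce_unreg, and finally dispose.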
  // Custom all-reduce kernels
  pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
  custom_ar.def("dispose", &dispose, "dispose");
  custom_ar.def("meta_size", &meta_size, "meta_size");
  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
                "get_graph_buffer_ipc_meta");
  custom_ar.def("register_graph_buffers", &register_graph_buffers,
                "register_graph_buffers");
#endif
}