// vllm/csrc/activation_kernels.cu

#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>

#include <cmath>

#include "cuda_compat.h"
#include "dispatch_utils.h"

namespace vllm {

// Fused activation-and-gate kernel template.
//
// The thread block selected by blockIdx.x processes one token (one row).
// Its threads step through the d output columns with a stride of blockDim.x.
// For each column the activation operand is read from the first d elements of
// the token's input row, the gate operand from the following d elements, and
// ACT_FN(operand) * gate is written to the matching output column.
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel(
  scalar_t* __restrict__ out,               // [..., d]
  const scalar_t* __restrict__ input,       // [..., 2, d]
  const int d) {
  const int64_t row = blockIdx.x;
  // 64-bit offsets of this token's input row (2 * d wide) and output row.
  const int64_t in_base = row * 2 * d;
  const int64_t out_base = row * d;

  int64_t col = threadIdx.x;
  while (col < d) {
    const scalar_t act_operand = VLLM_LDG(&input[in_base + col]);
    const scalar_t gate_operand = VLLM_LDG(&input[in_base + d + col]);
    out[out_base + col] = ACT_FN(act_operand) * gate_operand;
    col += blockDim.x;
  }
}

template<typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
  // SiLU: x * sigmoid(x), written as x / (1 + exp(-x)).
  // The negation is applied in T; the exponential and division run in float.
  const float numerator = static_cast<float>(x);
  const float negated = static_cast<float>(-x);
  const float denominator = 1.0f + expf(negated);
  return static_cast<T>(numerator / denominator);
}

template<typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
  // erf-based GELU: 0.5 * x * (1 + erf(x / sqrt(2))).
  // Equivalent to PyTorch GELU with 'none' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
  constexpr float kInvSqrt2 = M_SQRT1_2;
  const float value = static_cast<float>(x);
  // `auto` keeps whatever type the selected ::erf overload returns.
  const auto one_plus_erf = 1.0f + ::erf(value * kInvSqrt2);
  return static_cast<T>(value * 0.5f * one_plus_erf);
}

template<typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
  // tanh-based GELU: 0.5 * x * (1 + tanh(BETA * (x + KAPPA * x^3))),
  // evaluated in float and converted back to T at the end.
  // Equivalent to PyTorch GELU with 'tanh' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
  constexpr float kBeta = M_SQRT2 * M_2_SQRTPI * 0.5f;
  constexpr float kKappa = 0.044715;
  const float value = static_cast<float>(x);
  const float cubed = value * value * value;
  const float tanh_arg = kBeta * (value + kKappa * cubed);
  const float one_plus_tanh = 1.0f + ::tanhf(tanh_arg);
  return static_cast<T>(0.5f * value * one_plus_tanh);
}

} // namespace vllm

// Launch activation and gating kernel.
//
// Expands inside a function that has torch::Tensor `out` ([..., d]) and
// `input` ([..., 2 * d]) in scope. KERNEL names a device function template
// that is instantiated with the dispatched scalar type. One thread block is
// launched per token (row of `input`) with min(d, 1024) threads, on the
// current CUDA stream after switching to `input`'s device.
//
// The expansion starts with an early `return;` when there is nothing to
// compute (empty `input`, or a last dimension smaller than 2 so that d == 0).
// Without it, a zero-sized last dimension divides by zero while computing
// num_tokens, and a zero-sized grid or block is an invalid launch
// configuration. Because of this `return;`, the macro may only be used in
// functions returning void.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                                             \
  if (input.numel() == 0 || input.size(-1) < 2) {                                         \
    return;                                                                               \
  }                                                                                       \
  int d = input.size(-1) / 2;                                                             \
  int64_t num_tokens = input.numel() / input.size(-1);                                    \
  dim3 grid(num_tokens);                                                                  \
  dim3 block(std::min(d, 1024));                                                          \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));                       \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                           \
  VLLM_DISPATCH_FLOATING_TYPES(                                                           \
    input.scalar_type(),                                                                  \
    "act_and_mul_kernel",                                                                 \
    [&] {                                                                                 \
      vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>(   \
        out.data_ptr<scalar_t>(),                                                         \
        input.data_ptr<scalar_t>(),                                                       \
        d);                                                                               \
    });

// Gated SiLU. With d equal to half of input's last dimension, each row gets
//   out[i] = silu(input[i]) * input[d + i]   for 0 <= i < d.
// The kernel is dispatched on input.scalar_type().
void silu_and_mul(torch::Tensor& out /* [..., d] */,
                  torch::Tensor& input /* [..., 2 * d] */) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}

// Gated erf-based GELU. With d equal to half of input's last dimension, each
// row gets
//   out[i] = gelu(input[i]) * input[d + i]   for 0 <= i < d.
// The kernel is dispatched on input.scalar_type().
void gelu_and_mul(torch::Tensor& out /* [..., d] */,
                  torch::Tensor& input /* [..., 2 * d] */) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}

// Gated tanh-based GELU. With d equal to half of input's last dimension, each
// row gets
//   out[i] = gelu_tanh(input[i]) * input[d + i]   for 0 <= i < d.
// The kernel is dispatched on input.scalar_type().
void gelu_tanh_and_mul(torch::Tensor& out /* [..., d] */,
                       torch::Tensor& input /* [..., 2 * d] */) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}

namespace vllm {

// Element-wise activation kernel template.
//
// The thread block selected by blockIdx.x processes one token (one row of d
// elements). Its threads step through the columns with a stride of blockDim.x,
// storing ACT_FN(input element) into the same position of `out`.
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
  scalar_t* __restrict__ out,               // [..., d]
  const scalar_t* __restrict__ input,       // [..., d]
  const int d) {
  const int64_t row = blockIdx.x;
  // 64-bit offset of this token's row; input and output share the layout.
  const int64_t row_base = row * d;

  int64_t col = threadIdx.x;
  while (col < d) {
    const scalar_t value = VLLM_LDG(&input[row_base + col]);
    out[row_base + col] = ACT_FN(value);
    col += blockDim.x;
  }
}

} // namespace vllm

// Launch element-wise activation kernel.
//
// Expands inside a function that has torch::Tensor `out` and `input` (both
// [..., d]) in scope. KERNEL names a device function template that is
// instantiated with the dispatched scalar type. One thread block is launched
// per token (row of `input`) with min(d, 1024) threads, on the current CUDA
// stream after switching to `input`'s device.
//
// The expansion starts with an early `return;` for an empty `input`. Without
// it, a zero-sized last dimension makes `input.numel() / d` divide by zero,
// and zero tokens yields a zero-sized grid, which is an invalid launch
// configuration. Because of this `return;`, the macro may only be used in
// functions returning void.
#define LAUNCH_ACTIVATION_KERNEL(KERNEL)                                                  \
  if (input.numel() == 0) {                                                               \
    return;                                                                               \
  }                                                                                       \
  int d = input.size(-1);                                                                 \
  int64_t num_tokens = input.numel() / d;                                                 \
  dim3 grid(num_tokens);                                                                  \
  dim3 block(std::min(d, 1024));                                                          \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));                       \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                           \
  VLLM_DISPATCH_FLOATING_TYPES(                                                           \
    input.scalar_type(),                                                                  \
    "activation_kernel",                                                                  \
    [&] {                                                                                 \
      vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>(    \
        out.data_ptr<scalar_t>(),                                                         \
        input.data_ptr<scalar_t>(),                                                       \
        d);                                                                               \
    });

namespace vllm {

template<typename T>
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
  // tanh-based GELU: 0.5 * x * (1 + tanh(0.79788456 * (x + 0.044715 * x^3))).
  // Intermediates are deliberately converted back to T between steps, so the
  // result depends on this exact sequence of conversions.
  const float x_cubed = static_cast<float>(x * x * x);       // cube formed in T
  const T cubic_term = static_cast<T>(0.044715f * x_cubed);
  const T polynomial = x + cubic_term;
  const T tanh_arg = static_cast<T>(0.79788456f * static_cast<float>(polynomial));
  const T tanh_val = static_cast<T>(tanhf(tanh_arg));
  return static_cast<T>(0.5) * x * (static_cast<T>(1.0) + tanh_val);
}

template<typename T>
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
  // tanh-based GELU in factored form:
  //   0.5 * x * (1 + tanh(0.79788456 * x * (1 + 0.044715 * x * x))).
  // Intermediates are deliberately converted back to T between steps, so the
  // result depends on this exact sequence of conversions.
  const float value = static_cast<float>(x);
  const T scaled_x = static_cast<T>(value * 0.79788456f);
  const T kappa_x = static_cast<T>(0.044715f * value);
  const T bracket = static_cast<T>(1.0) + kappa_x * x;
  const T tanh_val = static_cast<T>(tanhf(scaled_x * bracket));
  return static_cast<T>(0.5) * x * (static_cast<T>(1.0) + tanh_val);
}

} // namespace vllm

// Element-wise tanh-based GELU: every element of `input` is passed through
// vllm::gelu_new_kernel and stored at the same position in `out`.
// The kernel is dispatched on input.scalar_type().
void gelu_new(torch::Tensor& out /* [..., d] */,
              torch::Tensor& input /* [..., d] */) {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}

// Element-wise tanh-based GELU (factored form): every element of `input` is
// passed through vllm::gelu_fast_kernel and stored at the same position in
// `out`. The kernel is dispatched on input.scalar_type().
void gelu_fast(torch::Tensor& out /* [..., d] */,
               torch::Tensor& input /* [..., d] */) {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00			`#include <ATen/cuda/CUDAContext.h>`
[FIX] Support non-zero CUDA devices in custom kernels (#1959) 2024-01-03 11:09:59 +08:00			`#include <torch/extension.h>`
			`#include <c10/cuda/CUDAGuard.h>`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`#include <cmath>`

Merge EmbeddedLLM/vllm-rocm into vLLM main (#1836) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Amir Balwel <amoooori04@gmail.com> Co-authored-by: root <kuanfu.liu@akirakan.com> Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com> Co-authored-by: kuanfu <kuanfu.liu@embeddedllm.com> Co-authored-by: miloice <17350011+kliuae@users.noreply.github.com> 2023-12-08 15:16:52 +08:00			`#include "cuda_compat.h"`
Avoid compiling kernels for double data type (#933) 2023-09-02 14:59:47 +09:00			`#include "dispatch_utils.h"`

Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00			`namespace vllm {`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`// Activation and gating kernel template.`
			`template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>`
			`__global__ void act_and_mul_kernel(`
Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00			`scalar_t* __restrict__ out, // [..., d]`
			`const scalar_t* __restrict__ input, // [..., 2, d]`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00			`const int d) {`
Support YaRN models (#1264) Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> Co-authored-by: Viktor Ferenczi <viktor@ferenczi.eu> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2023-11-03 14:12:48 -07:00			`const int64_t token_idx = blockIdx.x;`
			`for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {`
Merge EmbeddedLLM/vllm-rocm into vLLM main (#1836) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Amir Balwel <amoooori04@gmail.com> Co-authored-by: root <kuanfu.liu@akirakan.com> Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com> Co-authored-by: kuanfu <kuanfu.liu@embeddedllm.com> Co-authored-by: miloice <17350011+kliuae@users.noreply.github.com> 2023-12-08 15:16:52 +08:00			`const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);`
			`const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);`
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`out[token_idx * d + idx] = ACT_FN(x) * y;`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00			`}`
			`}`

Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`template<typename T>`
			`__device__ __forceinline__ T silu_kernel(const T& x) {`
			`// x * sigmoid(x)`
			`return (T) (((float) x) / (1.0f + expf((float) -x)));`
			`}`

			`template<typename T>`
			`__device__ __forceinline__ T gelu_kernel(const T& x) {`
			`// Equivalent to PyTorch GELU with 'none' approximation.`
			`// Refer to:`
Add kernel for GeGLU with approximate GELU (#3337) 2024-03-12 22:06:17 -07:00			`// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38`
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`const float f = (float) x;`
			`constexpr float ALPHA = M_SQRT1_2;`
			`return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));`
			`}`

Add kernel for GeGLU with approximate GELU (#3337) 2024-03-12 22:06:17 -07:00			`template<typename T>`
			`__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {`
			`// Equivalent to PyTorch GELU with 'tanh' approximation.`
			`// Refer to:`
			`// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30`
			`const float f = (float) x;`
			`constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;`
			`constexpr float KAPPA = 0.044715;`
			`float x_cube = f * f * f;`
			`float inner = BETA * (f + KAPPA * x_cube);`
			`return (T) (0.5f * f * (1.0f + ::tanhf(inner)));`
			`}`

Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00			`} // namespace vllm`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`// Launch activation and gating kernel.`
			`#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \`
			`int d = input.size(-1) / 2; \`
			`int64_t num_tokens = input.numel() / input.size(-1); \`
			`dim3 grid(num_tokens); \`
			`dim3 block(std::min(d, 1024)); \`
			`const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \`
			`const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \`
			`VLLM_DISPATCH_FLOATING_TYPES( \`
			`input.scalar_type(), \`
			`"act_and_mul_kernel", \`
			`[&] { \`
			`vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \`
			`out.data_ptr<scalar_t>(), \`
			`input.data_ptr<scalar_t>(), \`
			`d); \`
			`});`

Optimize data movement (#20) 2023-04-02 00:30:17 -07:00			`void silu_and_mul(`
Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00			`torch::Tensor& out, // [..., d]`
			`torch::Tensor& input) // [..., 2 * d]`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00			`{`
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);`
			`}`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00
Optimize GeGLU layer in Gemma (#2975) 2024-02-21 20:17:52 -08:00			`void gelu_and_mul(`
			`torch::Tensor& out, // [..., d]`
			`torch::Tensor& input) // [..., 2 * d]`
			`{`
			`LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);`
Optimize data movement (#20) 2023-04-02 00:30:17 -07:00			`}`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00
Add kernel for GeGLU with approximate GELU (#3337) 2024-03-12 22:06:17 -07:00			`void gelu_tanh_and_mul(`
			`torch::Tensor& out, // [..., d]`
			`torch::Tensor& input) // [..., 2 * d]`
			`{`
			`LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);`
			`}`

Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`namespace vllm {`

			`// Element-wise activation kernel template.`
			`template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>`
			`__global__ void activation_kernel(`
Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00			`scalar_t* __restrict__ out, // [..., d]`
			`const scalar_t* __restrict__ input, // [..., d]`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`const int d) {`
Support YaRN models (#1264) Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> Co-authored-by: Viktor Ferenczi <viktor@ferenczi.eu> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2023-11-03 14:12:48 -07:00			`const int64_t token_idx = blockIdx.x;`
			`for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {`
Merge EmbeddedLLM/vllm-rocm into vLLM main (#1836) Co-authored-by: Philipp Moritz <pcmoritz@gmail.com> Co-authored-by: Amir Balwel <amoooori04@gmail.com> Co-authored-by: root <kuanfu.liu@akirakan.com> Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com> Co-authored-by: kuanfu <kuanfu.liu@embeddedllm.com> Co-authored-by: miloice <17350011+kliuae@users.noreply.github.com> 2023-12-08 15:16:52 +08:00			`const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`out[token_idx * d + idx] = ACT_FN(x);`
			`}`
			`}`

			`} // namespace vllm`

			`// Launch element-wise activation kernel.`
			`#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \`
Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00			`int d = input.size(-1); \`
Support YaRN models (#1264) Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> Co-authored-by: Viktor Ferenczi <viktor@ferenczi.eu> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> 2023-11-03 14:12:48 -07:00			`int64_t num_tokens = input.numel() / d; \`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`dim3 grid(num_tokens); \`
			`dim3 block(std::min(d, 1024)); \`
[FIX] Support non-zero CUDA devices in custom kernels (#1959) 2024-01-03 11:09:59 +08:00			`const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \`
Avoid compiling kernels for double data type (#933) 2023-09-02 14:59:47 +09:00			`VLLM_DISPATCH_FLOATING_TYPES( \`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`input.scalar_type(), \`
			`"activation_kernel", \`
			`[&] { \`
			`vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \`
			`out.data_ptr<scalar_t>(), \`
			`input.data_ptr<scalar_t>(), \`
			`d); \`
			`});`

			`namespace vllm {`

			`template<typename T>`
			`__device__ __forceinline__ T gelu_new_kernel(const T& x) {`
			`const float x3 = (float) (x * x * x);`
			`const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));`
			`return ((T) 0.5) * x * (((T) 1.0) + t);`
			`}`

			`template<typename T>`
			`__device__ __forceinline__ T gelu_fast_kernel(const T& x) {`
			`const float f = (float) x;`
			`const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));`
			`return ((T) 0.5) * x * (((T) 1.0) + t);`
			`}`

			`} // namespace vllm`

			`void gelu_new(`
Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00			`torch::Tensor& out, // [..., d]`
			`torch::Tensor& input) // [..., d]`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`{`
			`LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);`
			`}`

			`void gelu_fast(`
Change scheduler & input tensor shape (#1381) 2023-10-16 17:48:42 -07:00			`torch::Tensor& out, // [..., d]`
			`torch::Tensor& input) // [..., d]`
Implement approximate GELU kernels (#828) 2023-08-23 07:43:21 +09:00			`{`
			`LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);`
			`}`