vllm/csrc/cuda_utils.h

#pragma once

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <type_traits>

// Function-qualifier macros. They let one declaration be annotated the same
// way regardless of which toolchain compiles this header; the branch taken
// depends on the compiler-defined macros tested below. The branches are tested
// in order, so __HIPCC__ wins if more than one of the macros is defined.
#if defined(__HIPCC__)
  // __HIPCC__ defined: execution-space qualifiers only, no inlining hint.
  #define HOST_DEVICE_INLINE __host__ __device__
  #define DEVICE_INLINE __device__
  #define HOST_INLINE __host__
#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
  // __CUDACC__ or _NVHPC_CUDA defined: execution-space qualifiers plus
  // __forceinline__.
  #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
  #define DEVICE_INLINE __device__ __forceinline__
  #define HOST_INLINE __host__ __forceinline__
#else
  // None of the above: all three macros collapse to plain `inline`, so
  // functions annotated with them still compile as ordinary C++.
  #define HOST_DEVICE_INLINE inline
  #define DEVICE_INLINE inline
  #define HOST_INLINE inline
#endif

// Evaluates `cmd` (an expression yielding cudaError_t) exactly once and
// terminates the process if the result is not cudaSuccess.
//
// On failure it prints "Failed: Cuda error <file>:<line> '<error string>'" to
// stderr and calls exit(EXIT_FAILURE). The do { ... } while (0) wrapper makes
// the expansion a single statement, so the macro is safe inside an un-braced
// if/else.
//
// The temporary deliberately has a long, macro-specific name: with a short
// name such as `e`, a call like CUDA_CHECK(cudaEventRecord(e, stream)) would
// make the argument refer to the macro's own (uninitialised) local instead of
// the caller's variable.
#define CUDA_CHECK(cmd)                                                \
  do {                                                                 \
    const cudaError_t cuda_check_status_ = (cmd);                      \
    if (cuda_check_status_ != cudaSuccess) {                           \
      fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__,     \
              __LINE__, cudaGetErrorString(cuda_check_status_));       \
      exit(EXIT_FAILURE);                                              \
    }                                                                  \
  } while (0)

// Declaration only; the definition is not part of this header.
// NOTE(review): presumably returns the value of the device attribute
// identified by `attribute` for the device `device_id` -- confirm the accepted
// attribute values and error behaviour against the implementation.
int64_t get_device_attribute(int64_t attribute, int64_t device_id);

// Declaration only; the definition is not part of this header.
// NOTE(review): judging by the name, this reports the maximum shared memory
// per block for `device_id`; the unit and the exact attribute queried are not
// visible here -- confirm against the implementation.
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);

namespace cuda_utils {

// Integer division rounded towards positive infinity: returns the smallest
// integer q such that q * b >= a (for b > 0).
//
// Only participates in overload resolution for integral T. `b` must be
// non-zero. Usable in constant expressions and, through HOST_DEVICE_INLINE,
// from both host and device code.
//
// Implemented with quotient/remainder instead of (a + b - 1) / b so that the
// intermediate sum cannot overflow when `a` is close to the maximum of T, and
// so that negative operands are rounded correctly.
template <typename T>
HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T>
ceil_div(T a, T b) {
  const T quotient = a / b;   // truncates towards zero
  const T remainder = a % b;  // carries the sign of `a`
  // Truncation rounded down only when the exact quotient is positive and
  // inexact, i.e. when the remainder is non-zero and has the same sign as `b`.
  const bool round_up = (remainder != 0) && ((remainder > 0) == (b > 0));
  return round_up ? quotient + 1 : quotient;
}

}  // namespace cuda_utils
Avoid multiple redefinition (#1817) 2023-12-14 12:35:58 -05:00			`#pragma once`

[NVIDIA] Support nvfp4 quantization (#12784) 2025-02-12 19:51:51 -08:00			`#include <stdio.h>`

[Attention] MLA with chunked prefill (#12639) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2025-02-21 18:30:12 -05:00			`#if defined(__HIPCC__)`
			`#define HOST_DEVICE_INLINE __host__ __device__`
			`#define DEVICE_INLINE __device__`
			`#define HOST_INLINE __host__`
			`#elif defined(__CUDACC__) \|\| defined(_NVHPC_CUDA)`
			`#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__`
			`#define DEVICE_INLINE __device__ __forceinline__`
			`#define HOST_INLINE __host__ __forceinline__`
[Kernel] (1/N) Machete - Hopper Optimized Mixed Precision Linear Kernel (#7174) 2024-08-20 09:09:33 -04:00			`#else`
			`#define HOST_DEVICE_INLINE inline`
			`#define DEVICE_INLINE inline`
			`#define HOST_INLINE inline`
			`#endif`

[NVIDIA] Support nvfp4 quantization (#12784) 2025-02-12 19:51:51 -08:00			`#define CUDA_CHECK(cmd) \`
			`do { \`
			`cudaError_t e = cmd; \`
			`if (e != cudaSuccess) { \`
			`printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \`
			`cudaGetErrorString(e)); \`
			`exit(EXIT_FAILURE); \`
			`} \`
			`} while (0)`

[Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00			`int64_t get_device_attribute(int64_t attribute, int64_t device_id);`
[Build] Avoid building too many extensions (#1624) 2023-11-23 16:31:19 -08:00
[Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (#5047) 2024-06-09 16:23:30 -04:00			`int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);`
[Attention] MLA with chunked prefill (#12639) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Patrick Horn <patrick.horn@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> 2025-02-21 18:30:12 -05:00
			`namespace cuda_utils {`

			`template <typename T>`
			`HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T>`
			`ceil_div(T a, T b) {`
			`return (a + b - 1) / b;`
			`}`

			`}; // namespace cuda_utils`