#pragma once #include #if defined(__HIPCC__) #define HOST_DEVICE_INLINE __host__ __device__ #define DEVICE_INLINE __device__ #define HOST_INLINE __host__ #elif defined(__CUDACC__) || defined(_NVHPC_CUDA) #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__ #define DEVICE_INLINE __device__ __forceinline__ #define HOST_INLINE __host__ __forceinline__ #else #define HOST_DEVICE_INLINE inline #define DEVICE_INLINE inline #define HOST_INLINE inline #endif #define CUDA_CHECK(cmd) \ do { \ cudaError_t e = cmd; \ if (e != cudaSuccess) { \ printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ cudaGetErrorString(e)); \ exit(EXIT_FAILURE); \ } \ } while (0) int64_t get_device_attribute(int64_t attribute, int64_t device_id); int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); namespace cuda_utils { template HOST_DEVICE_INLINE constexpr std::enable_if_t, T> ceil_div(T a, T b) { return (a + b - 1) / b; } }; // namespace cuda_utils