vllm/csrc/cutlass_extensions/gemm/dispatch_policy.hpp

#pragma once

#include "cutlass/gemm/dispatch_policy.hpp"

namespace cutlass::gemm {

//////////////////////////////////////////////////////////////////////////////

// FP8 related policies (including Blocked Scaled Accumulation)
//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
//  `ScaleGranularityM` indicates that scaling granularity is
//  `size<0>(TileShape_MNK{})` along M.
template <int ScaleGranularityM = 0>
struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum
    : KernelTmaWarpSpecializedCooperative {};

// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
// specialized dynamic schedule For FP8 kernels with Block Scaling
template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
          class KernelSchedule = KernelTmaWarpSpecialized,
          int ScaleGranularityM =
              0  // `ScaleGranularityM` specifies scaling granularity along M,
                 // while zero-value `ScaleGranularityM` indicates that scaling
                 // granularity is `size<0>(TileShape_MNK{})` along M.
          >
struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
                                         KernelSchedule> {
  static_assert(
      cute::is_same_v<
          KernelSchedule,
          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
              ScaleGranularityM>>,
      "KernelSchedule must be one of the warp specialized policies");
};

//////////////////////////////////////////////////////////////////////////////

}  // namespace cutlass::gemm
[Kernel] Update `cutlass_scaled_mm` to support 2d group (blockwise) scaling (#11868) 2025-01-30 21:33:00 -05:00			`#pragma once`

			`#include "cutlass/gemm/dispatch_policy.hpp"`

			`namespace cutlass::gemm {`

			`//////////////////////////////////////////////////////////////////////////////`

			`// FP8 related policies (including Blocked Scaled Accumulation)`
			// `ScaleGranularityM` specifies scaling granularity along M, while zero-value
			// `ScaleGranularityM` indicates that scaling granularity is
			// `size<0>(TileShape_MNK{})` along M.
			`template <int ScaleGranularityM = 0>`
			`struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum`
			`: KernelTmaWarpSpecializedCooperative {};`

			`// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp`
			`// specialized dynamic schedule For FP8 kernels with Block Scaling`
			`template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,`
			`class KernelSchedule = KernelTmaWarpSpecialized,`
			`int ScaleGranularityM =`
			0 // `ScaleGranularityM` specifies scaling granularity along M,
			// while zero-value `ScaleGranularityM` indicates that scaling
			// granularity is `size<0>(TileShape_MNK{})` along M.
			`>`
			`struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8`
			`: MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,`
			`KernelSchedule> {`
			`static_assert(`
			`cute::is_same_v<`
			`KernelSchedule,`
			`KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<`
			`ScaleGranularityM>>,`
			`"KernelSchedule must be one of the warp specialized policies");`
			`};`

			`//////////////////////////////////////////////////////////////////////////////`

			`} // namespace cutlass::gemm`