From c34eeec58d3a94437c5311e256f8ba21d1912a39 Mon Sep 17 00:00:00 2001
From: Brayden Zhong
Date: Thu, 6 Mar 2025 19:42:49 -0500
Subject: [PATCH] [Bugfix] Correctly call `cudaProfilerStop` in benchmarks script (#14183)

Signed-off-by: Brayden Zhong
---
 benchmarks/kernels/benchmark_layernorm.py       | 2 +-
 benchmarks/kernels/benchmark_lora.py            | 1 -
 benchmarks/kernels/benchmark_machete.py         | 3 +--
 benchmarks/kernels/benchmark_moe.py             | 1 +
 benchmarks/kernels/benchmark_paged_attention.py | 2 +-
 benchmarks/kernels/benchmark_quant.py           | 2 +-
 6 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index d265c91b..e12d74c0 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
 
         end_time = time.perf_counter()
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
         return (end_time - start_time) / num_iters
 
     # Warmup.
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 5eaeec01..3c4d6a6a 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -153,7 +153,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
         result = torch.nn.functional.linear(x, w)
         result *= scaling
         out_list.append(result)
-    torch.cat(out_list, dim=0)
 
     cat_result = torch.cat(out_list, dim=0)
 
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 3fa57bd7..a661ea9d 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -45,7 +45,6 @@ def terse_type_name(dt):
         torch.float16: "fp16",
         torch.int8: "int8",
         torch.float8_e4m3fn: "fp8",
-        torch.bfloat16: "bf16",
         torch.float: "float",
         torch.int: "int",
     }[dt]
@@ -259,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
 
     return lambda: ops.machete_mm(
         a=bt.a,
-        b_q=bt.w_q,
+        b_q=w_q,
         b_type=bt.wtype,
         b_group_scales=bt.w_g_s,
         b_group_zeros=w_g_zp,
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index dce0bef4..9de8d5af 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+import json
 import time
 from contextlib import nullcontext
 from datetime import datetime
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 221d7b7d..48b351bc 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -176,7 +176,7 @@ def main(
 
         end_time = time.perf_counter()
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
         return (end_time - start_time) / num_iters
 
     # Warmup.
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 0ddea939..b643897a 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
 
         end_time = time.perf_counter()
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
         return (end_time - start_time) / num_iters
 
     # Warmup.
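
The hunks above restore the start/stop bracketing that the CUDA profiler API expects: `cudaProfilerStart()` opens the capture window before the timed loop and `cudaProfilerStop()` closes it afterwards, so an external profiler (for example Nsight Systems launched with `--capture-range=cudaProfilerApi`) records only the measured iterations. Below is a minimal sketch of that pattern, not the patched files themselves: the helper name `run_cuda_benchmark` mirrors the nested helpers in these scripts, but the matmul workload and the `__main__` driver are assumptions added for illustration.

# Sketch of the profiler bracketing this patch restores (assumed workload).
import time

import torch


def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
    # Assumed stand-in workload; the real scripts benchmark vLLM kernels.
    x = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
    w = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)

    torch.cuda.synchronize()
    if profile:
        # Open the capture window for an attached profiler.
        torch.cuda.cudart().cudaProfilerStart()
    start_time = time.perf_counter()

    for _ in range(num_iters):
        x @ w
    torch.cuda.synchronize()

    end_time = time.perf_counter()
    if profile:
        # Close the capture window; calling cudaProfilerStart() again here
        # (the bug fixed by this patch) would leave the window open.
        torch.cuda.cudart().cudaProfilerStop()
    return (end_time - start_time) / num_iters


if __name__ == "__main__":
    print(f"avg latency: {run_cuda_benchmark(100) * 1e6:.1f} us")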