From c34eeec58d3a94437c5311e256f8ba21d1912a39 Mon Sep 17 00:00:00 2001
From: Brayden Zhong
Date: Thu, 6 Mar 2025 19:42:49 -0500
Subject: [PATCH] [Bugfix] Correctly call `cudaProfilerStop` in benchmarks script (#14183)

Signed-off-by: Brayden Zhong
---
 benchmarks/kernels/benchmark_layernorm.py       | 2 +-
 benchmarks/kernels/benchmark_lora.py            | 1 -
 benchmarks/kernels/benchmark_machete.py         | 3 +--
 benchmarks/kernels/benchmark_moe.py             | 1 +
 benchmarks/kernels/benchmark_paged_attention.py | 2 +-
 benchmarks/kernels/benchmark_quant.py           | 2 +-
 6 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index d265c91b..e12d74c0 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
 
         end_time = time.perf_counter()
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
         return (end_time - start_time) / num_iters
 
     # Warmup.
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 5eaeec01..3c4d6a6a 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -153,7 +153,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
         result = torch.nn.functional.linear(x, w)
         result *= scaling
         out_list.append(result)
-    torch.cat(out_list, dim=0)
 
     cat_result = torch.cat(out_list, dim=0)
 
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 3fa57bd7..a661ea9d 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -45,7 +45,6 @@ def terse_type_name(dt):
         torch.float16: "fp16",
         torch.int8: "int8",
         torch.float8_e4m3fn: "fp8",
-        torch.bfloat16: "bf16",
         torch.float: "float",
         torch.int: "int",
     }[dt]
@@ -259,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
 
     return lambda: ops.machete_mm(
         a=bt.a,
-        b_q=bt.w_q,
+        b_q=w_q,
         b_type=bt.wtype,
         b_group_scales=bt.w_g_s,
         b_group_zeros=w_g_zp,
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index dce0bef4..9de8d5af 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+import json
 import time
 from contextlib import nullcontext
 from datetime import datetime
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 221d7b7d..48b351bc 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -176,7 +176,7 @@ def main(
 
         end_time = time.perf_counter()
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
         return (end_time - start_time) / num_iters
 
     # Warmup.
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 0ddea939..b643897a 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
 
         end_time = time.perf_counter()
         if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
         return (end_time - start_time) / num_iters
 
     # Warmup.
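
The hunks above restore the start/stop bracketing that the CUDA profiler API expects: `cudaProfilerStart()` opens the capture window before the timed loop and `cudaProfilerStop()` closes it afterwards, so an external profiler (for example Nsight Systems launched with `--capture-range=cudaProfilerApi`) records only the measured iterations. Below is a minimal sketch of that pattern, not the patched files themselves: the helper name `run_cuda_benchmark` mirrors the nested helpers in these scripts, but the matmul workload and the `__main__` driver are assumptions added for illustration.

# Sketch of the profiler bracketing this patch restores (assumed workload).
import time

import torch


def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
    # Assumed stand-in workload; the real scripts benchmark vLLM kernels.
    x = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
    w = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)

    torch.cuda.synchronize()
    if profile:
        # Open the capture window for an attached profiler.
        torch.cuda.cudart().cudaProfilerStart()
    start_time = time.perf_counter()

    for _ in range(num_iters):
        x @ w
    torch.cuda.synchronize()

    end_time = time.perf_counter()
    if profile:
        # Close the capture window; calling cudaProfilerStart() again here
        # (the bug fixed by this patch) would leave the window open.
        torch.cuda.cudart().cudaProfilerStop()
    return (end_time - start_time) / num_iters


if __name__ == "__main__":
    print(f"avg latency: {run_cuda_benchmark(100) * 1e6:.1f} us")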