Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/cuda_bindings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task:
```bash
# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench
pixi run -e wheel bench --min-time 0.1
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after bench


# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks
pixi run -e wheel bench-cpp
pixi run -e wheel bench-cpp --min-time 0.1
```

Both runners automatically save results to JSON files in the benchmarks
Expand Down
70 changes: 68 additions & 2 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
Expand All @@ -22,6 +23,9 @@ struct Options {
std::uint64_t warmups = 5;
std::uint64_t values = 20;
std::uint64_t runs = 20;
double min_time_sec = 0.0;
std::uint64_t max_loops = 1000000;
std::uint64_t calibrate_rounds = 3;
std::string output_path;
std::string benchmark_name;
};
Expand All @@ -46,6 +50,18 @@ inline Options parse_args(int argc, char** argv) {
options.warmups = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--min-time" && i + 1 < argc) {
options.min_time_sec = std::strtod(argv[++i], nullptr);
continue;
}
if (arg == "--max-loops" && i + 1 < argc) {
options.max_loops = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--calibrate-rounds" && i + 1 < argc) {
options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--values" && i + 1 < argc) {
options.values = std::strtoull(argv[++i], nullptr, 10);
continue;
Expand All @@ -68,6 +84,9 @@ inline Options parse_args(int argc, char** argv) {
<< " --warmups N Warmup values per run (default: 5)\n"
<< " --values N Timed values per run (default: 20)\n"
<< " --runs N Number of runs (default: 20)\n"
<< " --min-time S Calibrate loops to reach S seconds per value\n"
<< " --max-loops N Max loops used during calibration (default: 1000000)\n"
<< " --calibrate-rounds N Calibration passes (default: 3)\n"
<< " -o, --output F Write pyperf-compatible JSON to file\n"
<< " --name S Benchmark name (overrides default)\n";
std::exit(0);
Expand All @@ -93,6 +112,47 @@ inline std::string iso_now() {
return std::string(buf);
}

// Calibrate loop count to hit a minimum wall time per value.
template <typename Fn>
std::uint64_t calibrate_loops(const Options& options, Fn&& fn) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

calibrate_loops() looks like a custom reimplementation of the same minimum-time calibration idea that tools like pyperf and Google Benchmark already provide. I understand that there are advantages to keeping the existing custom C++ harness / pyperf-compatible JSON output, but if we go that route, I think the custom calibration should match the usual/expected --min-time semantics more closely.

Right now, I do not think --min-time actually guarantees the requested minimum sample duration for a large chunk of these microbenchmarks. The calibration stops once it hits max_loops = 1000000, and BenchmarkSuite::run() then uses that capped loop count as-is. For very fast benchmarks, that cap is still well short of 0.1 s per value. For example:

  • ~6 ns/op tops out around 6 ms
  • ~33 ns/op tops out around 33 ms
  • even ~79 ns/op only reaches around 79 ms

So the new "stable collection" mode still under-times many of the exact fast C++ benchmarks it is meant to stabilize. The same issue also seems to apply to the explicit-loop overload, which means the --min-time semantics are not consistently honored there either.

I think that is important to fix before we rely on these numbers as "min-time collected" results, because readers will reasonably assume that --min-time 0.1 means each measured value is actually being calibrated to about 100 ms.

if (options.min_time_sec <= 0.0) {
return options.loops;
}

std::uint64_t best = 1;
const std::uint64_t max_loops = std::max<std::uint64_t>(1, options.max_loops);
const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds);

for (std::uint64_t round = 0; round < rounds; ++round) {
std::uint64_t loops = 1;
double elapsed = 0.0;

while (true) {
const auto t0 = std::chrono::steady_clock::now();
for (std::uint64_t i = 0; i < loops; ++i) {
fn();
}
const auto t1 = std::chrono::steady_clock::now();
elapsed = std::chrono::duration<double>(t1 - t0).count();

if (elapsed >= options.min_time_sec || loops >= max_loops) {
break;
}
if (loops > max_loops / 2) {
loops = max_loops;
} else {
loops *= 2;
}
}

if (loops > best) {
best = loops;
}
}

return best;
}

// Run a benchmark function. The function signature is: void fn() — one call = one operation.
// The harness calls fn() in a tight loop `loops` times per value.
template <typename Fn>
Expand Down Expand Up @@ -238,9 +298,15 @@ class BenchmarkSuite {
// Run a benchmark and record it. The name is used as the benchmark ID.
//
// When --min-time is active, a calibration pass first determines how many
// loops are needed per value; the calibrated loop count is then used for
// the actual measurement AND recorded with the results, so the stored
// entry reflects the loop count that produced the numbers.
//
// NOTE(review): calibration calls fn() many times before the first timed
// warmup. For synchronous operations that is harmless, but for enqueue-style
// async benchmarks it may leave a backlog (e.g. on a persistent stream)
// that changes what the first samples measure — TODO confirm whether a
// post-calibration drain/sync step is needed for those benchmarks.
template <typename Fn>
void run(const std::string& name, Fn&& fn) {
  std::uint64_t loops = options_.loops;
  Options custom = options_;
  if (options_.min_time_sec > 0.0) {
    loops = calibrate_loops(options_, fn);
    custom.loops = loops;
  }
  auto results = run_benchmark(custom, std::forward<Fn>(fn));
  print_summary(name, results);
  entries_.push_back({name, loops, std::move(results)});
}

// Run a benchmark with a custom loop count (for slow operations like compilation).
Expand Down
44 changes: 34 additions & 10 deletions benchmarks/cuda_bindings/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,27 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]:
name = run.get("metadata", {}).get("name", "")
if name:
break
values = []
values: list[float] = []
for run in bench.get("runs", []):
values.extend(run.get("values", []))
if name and values:
results[name] = values
return results


def stats(values: list[float]) -> tuple[float, float, float, int]:
    """Summarize a sample: (mean, population stdev, relative stdev, count).

    The relative stdev (RSD) is stdev/mean, reported as 0.0 when the mean
    is zero to avoid division by zero.
    """
    count = len(values)
    mean = statistics.mean(values)
    stdev = statistics.pstdev(values) if count > 1 else 0.0
    rsd = stdev / mean if mean else 0.0
    return mean, stdev, rsd, count


def fmt_rsd(rsd: float | None) -> str:
if rsd is None:
return "-"
return f"{rsd * 100:.1f}%"


def fmt_ns(seconds: float) -> str:
ns = seconds * 1e9
if ns >= 1000:
Expand Down Expand Up @@ -79,13 +92,16 @@ def main() -> None:

# Header
if cpp_benchmarks:
header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}"
header = (
f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
)
sep = "-" * len(header)
print(sep)
print(header)
print(sep)
else:
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
sep = "-" * len(header)
print(sep)
print(header)
Expand All @@ -95,21 +111,29 @@ def main() -> None:
py_vals = py_benchmarks.get(name)
cpp_vals = cpp_benchmarks.get(name)

py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"
py_stats = stats(py_vals) if py_vals else None
cpp_stats = stats(cpp_vals) if cpp_vals else None

py_str = fmt_ns(py_stats[0]) if py_stats else "-"
cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-"
py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-"
cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"

if py_vals and cpp_vals:
py_mean = statistics.mean(py_vals)
cpp_mean = statistics.mean(cpp_vals)
if py_stats and cpp_stats:
py_mean = py_stats[0]
cpp_mean = cpp_stats[0]
overhead_ns = (py_mean - cpp_mean) * 1e9
overhead_str = f"+{overhead_ns:.0f} ns"
else:
overhead_str = "-"

if cpp_benchmarks:
print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}")
print(
f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} "
f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}"
)
else:
print(f"{name:<{name_width}} {py_str:>14}")
print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")

print(sep)

Expand Down
Loading