From daf603defeee60ef653867d0cb351092938553f1 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 22 Apr 2026 13:40:15 -0500 Subject: [PATCH 1/4] Improve Cpp harness with min-time for more stable collection --- benchmarks/cuda_bindings/README.md | 2 + .../benchmarks/cpp/bench_support.hpp | 70 ++++++++++++++++++- benchmarks/cuda_bindings/compare.py | 52 +++++++++++--- 3 files changed, 112 insertions(+), 12 deletions(-) diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index f8d5ccf0436..7881392b511 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task: ```bash # Run the Python benchmarks in the wheel environment pixi run -e wheel bench +pixi run -e wheel bench--min-time 0.1 # Run the Python benchmarks in the source environment pixi run -e source bench # Run the C++ benchmarks pixi run -e wheel bench-cpp +pixi run -e wheel bench-cpp --min-time 0.1 ``` Both runners automatically save results to JSON files in the benchmarks diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp index 8b541228667..2755bf21183 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -22,6 +23,9 @@ struct Options { std::uint64_t warmups = 5; std::uint64_t values = 20; std::uint64_t runs = 20; + double min_time_sec = 0.0; + std::uint64_t max_loops = 1000000; + std::uint64_t calibrate_rounds = 3; std::string output_path; std::string benchmark_name; }; @@ -46,6 +50,18 @@ inline Options parse_args(int argc, char** argv) { options.warmups = std::strtoull(argv[++i], nullptr, 10); continue; } + if (arg == "--min-time" && i + 1 < argc) { + options.min_time_sec = std::strtod(argv[++i], nullptr); + continue; + } + if (arg == 
"--max-loops" && i + 1 < argc) { + options.max_loops = std::strtoull(argv[++i], nullptr, 10); + continue; + } + if (arg == "--calibrate-rounds" && i + 1 < argc) { + options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10); + continue; + } if (arg == "--values" && i + 1 < argc) { options.values = std::strtoull(argv[++i], nullptr, 10); continue; @@ -68,6 +84,9 @@ inline Options parse_args(int argc, char** argv) { << " --warmups N Warmup values per run (default: 5)\n" << " --values N Timed values per run (default: 20)\n" << " --runs N Number of runs (default: 20)\n" + << " --min-time S Calibrate loops to reach S seconds per value\n" + << " --max-loops N Max loops used during calibration (default: 1000000)\n" + << " --calibrate-rounds N Calibration passes (default: 3)\n" << " -o, --output F Write pyperf-compatible JSON to file\n" << " --name S Benchmark name (overrides default)\n"; std::exit(0); @@ -93,6 +112,47 @@ inline std::string iso_now() { return std::string(buf); } +// Calibrate loop count to hit a minimum wall time per value. +template +std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { + if (options.min_time_sec <= 0.0) { + return options.loops; + } + + std::uint64_t best = 1; + const std::uint64_t max_loops = std::max(1, options.max_loops); + const std::uint64_t rounds = std::max(1, options.calibrate_rounds); + + for (std::uint64_t round = 0; round < rounds; ++round) { + std::uint64_t loops = 1; + double elapsed = 0.0; + + while (true) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + elapsed = std::chrono::duration(t1 - t0).count(); + + if (elapsed >= options.min_time_sec || loops >= max_loops) { + break; + } + if (loops > max_loops / 2) { + loops = max_loops; + } else { + loops *= 2; + } + } + + if (loops > best) { + best = loops; + } + } + + return best; +} + // Run a benchmark function. 
The function signature is: void fn() — one call = one operation. // The harness calls fn() in a tight loop `loops` times per value. template @@ -238,9 +298,15 @@ class BenchmarkSuite { // Run a benchmark and record it. The name is used as the benchmark ID. template void run(const std::string& name, Fn&& fn) { - auto results = run_benchmark(options_, std::forward(fn)); + std::uint64_t loops = options_.loops; + Options custom = options_; + if (options_.min_time_sec > 0.0) { + loops = calibrate_loops(options_, fn); + custom.loops = loops; + } + auto results = run_benchmark(custom, std::forward(fn)); print_summary(name, results); - entries_.push_back({name, options_.loops, std::move(results)}); + entries_.push_back({name, loops, std::move(results)}); } // Run a benchmark with a custom loop count (for slow operations like compilation). diff --git a/benchmarks/cuda_bindings/compare.py b/benchmarks/cuda_bindings/compare.py index 6a3e94f3447..7fd9ca87178 100644 --- a/benchmarks/cuda_bindings/compare.py +++ b/benchmarks/cuda_bindings/compare.py @@ -29,7 +29,7 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]: name = run.get("metadata", {}).get("name", "") if name: break - values = [] + values: list[float] = [] for run in bench.get("runs", []): values.extend(run.get("values", [])) if name and values: @@ -37,6 +37,19 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]: return results +def stats(values: list[float]) -> tuple[float, float, float, int]: + mean = statistics.mean(values) + stdev = statistics.pstdev(values) if len(values) > 1 else 0.0 + rsd = (stdev / mean) if mean else 0.0 + return mean, stdev, rsd, len(values) + + +def fmt_rsd(rsd: float | None) -> str: + if rsd is None: + return "-" + return f"{rsd * 100:.1f}%" + + def fmt_ns(seconds: float) -> str: ns = seconds * 1e9 if ns >= 1000: @@ -58,6 +71,12 @@ def main() -> None: default=DEFAULT_CPP, help=f"C++ results JSON (default: {DEFAULT_CPP.name})", ) + parser.add_argument( + "--target-us", + 
type=float, + default=1.0, + help="Overhead target in microseconds (default: 1.0)", + ) args = parser.parse_args() if not args.python.exists(): @@ -79,13 +98,16 @@ def main() -> None: # Header if cpp_benchmarks: - header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}" + header = ( + f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} " + f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10} {'Target':>6}" + ) sep = "-" * len(header) print(sep) print(header) print(sep) else: - header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}" + header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}" sep = "-" * len(header) print(sep) print(header) @@ -95,21 +117,31 @@ def main() -> None: py_vals = py_benchmarks.get(name) cpp_vals = cpp_benchmarks.get(name) - py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-" - cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-" + py_stats = stats(py_vals) if py_vals else None + cpp_stats = stats(cpp_vals) if cpp_vals else None + + py_str = fmt_ns(py_stats[0]) if py_stats else "-" + cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-" + py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-" + cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-" - if py_vals and cpp_vals: - py_mean = statistics.mean(py_vals) - cpp_mean = statistics.mean(cpp_vals) + if py_stats and cpp_stats: + py_mean = py_stats[0] + cpp_mean = cpp_stats[0] overhead_ns = (py_mean - cpp_mean) * 1e9 overhead_str = f"+{overhead_ns:.0f} ns" + target = "OK" if overhead_ns <= args.target_us * 1000 else "FAIL" else: overhead_str = "-" + target = "-" if cpp_benchmarks: - print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}") + print( + f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} " + f"{py_str:>14} {py_rsd:>7} {overhead_str:>10} {target:>6}" + ) else: - print(f"{name:<{name_width}} {py_str:>14}") + print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}") 
print(sep) From cd15d84728cb1574e38f98cb359c03fd9fa88ea1 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 22 Apr 2026 13:47:28 -0500 Subject: [PATCH 2/4] Improve Cpp harness with min-time for more stable collection --- benchmarks/cuda_bindings/compare.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/cuda_bindings/compare.py b/benchmarks/cuda_bindings/compare.py index 7fd9ca87178..7dbc972e92a 100644 --- a/benchmarks/cuda_bindings/compare.py +++ b/benchmarks/cuda_bindings/compare.py @@ -71,12 +71,6 @@ def main() -> None: default=DEFAULT_CPP, help=f"C++ results JSON (default: {DEFAULT_CPP.name})", ) - parser.add_argument( - "--target-us", - type=float, - default=1.0, - help="Overhead target in microseconds (default: 1.0)", - ) args = parser.parse_args() if not args.python.exists(): @@ -100,7 +94,7 @@ def main() -> None: if cpp_benchmarks: header = ( f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} " - f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10} {'Target':>6}" + f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}" ) sep = "-" * len(header) print(sep) @@ -130,15 +124,13 @@ def main() -> None: cpp_mean = cpp_stats[0] overhead_ns = (py_mean - cpp_mean) * 1e9 overhead_str = f"+{overhead_ns:.0f} ns" - target = "OK" if overhead_ns <= args.target_us * 1000 else "FAIL" else: overhead_str = "-" - target = "-" if cpp_benchmarks: print( f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} " - f"{py_str:>14} {py_rsd:>7} {overhead_str:>10} {target:>6}" + f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}" ) else: print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}") From abde710509fec7fa706e2ca6e8ebd2a99bf1c147 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 24 Apr 2026 18:34:57 -0500 Subject: [PATCH 3/4] Remove limit and added drain --- .../benchmarks/cpp/bench_event.cpp | 5 + .../benchmarks/cpp/bench_launch.cpp | 6 + .../benchmarks/cpp/bench_memory.cpp | 5 + 
.../benchmarks/cpp/bench_stream.cpp | 6 + .../benchmarks/cpp/bench_support.hpp | 112 ++++++++++++++++-- 5 files changed, 122 insertions(+), 12 deletions(-) diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp index 44cd6177786..c24aa983199 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp @@ -45,6 +45,11 @@ int main(int argc, char** argv) { check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); bench::BenchmarkSuite suite(options); + // Drain the persistent stream after calibration so event_record (which + // enqueues onto the stream) and event_synchronize start from a known state. + suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); // --- event_create_destroy --- { diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp index 984c82fcf32..4897859a61a 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -238,6 +238,12 @@ int main(int argc, char** argv) { void* struct_params[] = {&struct_2048B}; bench::BenchmarkSuite suite(options); + // After calibration, drain the persistent stream so the first measured + // sample does not start on a backlogged stream. Calibration for enqueue- + // style ops (kernel launches) may queue many thousands of operations. 
+ suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); suite.run("launch.launch_empty_kernel", [&]() { check_cu(cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp index 4e71b73fb5e..803363be480 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp @@ -52,6 +52,11 @@ int main(int argc, char** argv) { uint8_t host_dst[COPY_SIZE] = {}; bench::BenchmarkSuite suite(options); + // Drain the persistent stream after calibration so async benchmarks + // (mem_alloc_async_free_async) don't start measurement on a backlogged stream. + suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); // --- mem_alloc_free --- { diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp index 702e86aef02..95ad0790f9f 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp @@ -38,6 +38,12 @@ int main(int argc, char** argv) { check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); bench::BenchmarkSuite suite(options); + // Drain the persistent stream after calibration for completeness. + // stream_create_destroy uses a local stream, but stream_query/synchronize + // observe the persistent one. 
+ suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); // --- stream_create_destroy --- { diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp index 2755bf21183..131f69de540 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,10 @@ struct Options { std::uint64_t values = 20; std::uint64_t runs = 20; double min_time_sec = 0.0; - std::uint64_t max_loops = 1000000; + // Safety cap for the calibration doubling loop. Set high enough that even + // sub-nanosecond ops can reach typical --min-time targets (e.g. 100ms). + // A warning is printed if calibration hits this cap before reaching min-time. + std::uint64_t max_loops = 100000000; std::uint64_t calibrate_rounds = 3; std::string output_path; std::string benchmark_name; @@ -85,7 +89,7 @@ inline Options parse_args(int argc, char** argv) { << " --values N Timed values per run (default: 20)\n" << " --runs N Number of runs (default: 20)\n" << " --min-time S Calibrate loops to reach S seconds per value\n" - << " --max-loops N Max loops used during calibration (default: 1000000)\n" + << " --max-loops N Safety cap for calibration loop count (default: 100000000)\n" << " --calibrate-rounds N Calibration passes (default: 3)\n" << " -o, --output F Write pyperf-compatible JSON to file\n" << " --name S Benchmark name (overrides default)\n"; @@ -113,18 +117,34 @@ inline std::string iso_now() { } // Calibrate loop count to hit a minimum wall time per value. +// Returns the chosen loop count. If `capped_out` is non-null, it is set to +// true when calibration reached `max_loops` before hitting `min_time_sec` +// (meaning --min-time was NOT actually satisfied by the calibration). 
template -std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { +std::uint64_t calibrate_loops( + const Options& options, + Fn&& fn, + bool* capped_out = nullptr, + double* last_elapsed_out = nullptr +) { if (options.min_time_sec <= 0.0) { + if (capped_out) *capped_out = false; + if (last_elapsed_out) *last_elapsed_out = 0.0; return options.loops; } - std::uint64_t best = 1; - const std::uint64_t max_loops = std::max(1, options.max_loops); + // Allow callers (e.g. the explicit-loop overload) to request a minimum + // starting loop count via options.loops. + const std::uint64_t start_loops = std::max(1, options.loops); + std::uint64_t best = start_loops; + const std::uint64_t max_loops = std::max(start_loops, options.max_loops); const std::uint64_t rounds = std::max(1, options.calibrate_rounds); + bool capped = false; + double last_elapsed = 0.0; + for (std::uint64_t round = 0; round < rounds; ++round) { - std::uint64_t loops = 1; + std::uint64_t loops = start_loops; double elapsed = 0.0; while (true) { @@ -135,7 +155,11 @@ std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { const auto t1 = std::chrono::steady_clock::now(); elapsed = std::chrono::duration(t1 - t0).count(); - if (elapsed >= options.min_time_sec || loops >= max_loops) { + if (elapsed >= options.min_time_sec) { + break; + } + if (loops >= max_loops) { + capped = true; break; } if (loops > max_loops / 2) { @@ -148,8 +172,11 @@ std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { if (loops > best) { best = loops; } + last_elapsed = elapsed; } + if (capped_out) *capped_out = capped; + if (last_elapsed_out) *last_elapsed_out = last_elapsed; return best; } @@ -295,28 +322,59 @@ class BenchmarkSuite { public: explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} + // Post-calibration hook. If set, invoked after calibration and before the + // first measured warmup/value, for every benchmark in this suite. 
Intended + // for async benchmarks that need to drain state left behind by calibration + // (e.g. cuStreamSynchronize on a persistent stream). Can be overridden + // per-call via the `post_calibrate` parameter on `run()`. + void set_post_calibrate(std::function hook) { + post_calibrate_ = std::move(hook); + } + // Run a benchmark and record it. The name is used as the benchmark ID. + // If --min-time is set, loop count is auto-calibrated. `post_calibrate`, + // if provided, runs after calibration and before measurement. template - void run(const std::string& name, Fn&& fn) { + void run( + const std::string& name, + Fn&& fn, + std::function post_calibrate = {} + ) { std::uint64_t loops = options_.loops; Options custom = options_; if (options_.min_time_sec > 0.0) { - loops = calibrate_loops(options_, fn); + loops = calibrate_and_warn(name, options_, fn); custom.loops = loops; + invoke_post_calibrate(post_calibrate); } auto results = run_benchmark(custom, std::forward(fn)); print_summary(name, results); entries_.push_back({name, loops, std::move(results)}); } - // Run a benchmark with a custom loop count (for slow operations like compilation). + // Run a benchmark with a custom loop count (used as a floor for fast ops + // or a fixed count for slow ops like compilation). When --min-time is set, + // calibration still runs but starts from `loops_override` as the minimum. 
template - void run(const std::string& name, std::uint64_t loops_override, Fn&& fn) { + void run( + const std::string& name, + std::uint64_t loops_override, + Fn&& fn, + std::function post_calibrate = {} + ) { + std::uint64_t loops = loops_override; Options custom = options_; custom.loops = loops_override; + if (options_.min_time_sec > 0.0) { + Options calib_opts = options_; + calib_opts.loops = loops_override; // floor + loops = calibrate_and_warn(name, calib_opts, fn); + custom.loops = loops; + invoke_post_calibrate(post_calibrate); + } auto results = run_benchmark(custom, std::forward(fn)); print_summary(name, results); - entries_.push_back({name, loops_override, std::move(results)}); + entries_.push_back({name, loops, std::move(results)}); } // Write all collected benchmarks to the output file (if -o was given). @@ -329,6 +387,36 @@ class BenchmarkSuite { private: Options options_; std::vector entries_; + std::function post_calibrate_; + + void invoke_post_calibrate(const std::function& per_call) const { + if (per_call) { + per_call(); + } else if (post_calibrate_) { + post_calibrate_(); + } + } + + template + std::uint64_t calibrate_and_warn( + const std::string& name, + const Options& calib_opts, + Fn&& fn + ) const { + bool capped = false; + double last_elapsed = 0.0; + std::uint64_t loops = calibrate_loops( + calib_opts, std::forward(fn), &capped, &last_elapsed + ); + if (capped) { + std::cerr << "WARNING: " << name + << ": calibration hit --max-loops (" << calib_opts.max_loops + << ") before reaching --min-time (" << calib_opts.min_time_sec + << "s). Last sample: " << last_elapsed + << "s. 
Raise --max-loops to satisfy --min-time for this benchmark.\n"; } return loops; } static void write_multi_pyperf_json( const std::string& output_path, From 1fb84c997276c60ba1972cf5889ad32161ee4160 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 24 Apr 2026 18:35:16 -0500 Subject: [PATCH 4/4] Fix README typo: add missing space in bench --min-time example --- benchmarks/cuda_bindings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index 7881392b511..cffca57bef3 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -47,7 +47,7 @@ To run the benchmarks combine the environment and task: ```bash # Run the Python benchmarks in the wheel environment pixi run -e wheel bench -pixi run -e wheel bench--min-time 0.1 +pixi run -e wheel bench --min-time 0.1 # Run the Python benchmarks in the source environment pixi run -e source bench