From daf603defeee60ef653867d0cb351092938553f1 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 22 Apr 2026 13:40:15 -0500 Subject: [PATCH 1/4] Improve Cpp harness with min-time for more stable collection --- benchmarks/cuda_bindings/README.md | 2 + .../benchmarks/cpp/bench_support.hpp | 70 ++++++++++++++++++- benchmarks/cuda_bindings/compare.py | 52 +++++++++++--- 3 files changed, 112 insertions(+), 12 deletions(-) diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index f8d5ccf0436..7881392b511 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task: ```bash # Run the Python benchmarks in the wheel environment pixi run -e wheel bench +pixi run -e wheel bench--min-time 0.1 # Run the Python benchmarks in the source environment pixi run -e source bench # Run the C++ benchmarks pixi run -e wheel bench-cpp +pixi run -e wheel bench-cpp --min-time 0.1 ``` Both runners automatically save results to JSON files in the benchmarks diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp index 8b541228667..2755bf21183 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -22,6 +23,9 @@ struct Options { std::uint64_t warmups = 5; std::uint64_t values = 20; std::uint64_t runs = 20; + double min_time_sec = 0.0; + std::uint64_t max_loops = 1000000; + std::uint64_t calibrate_rounds = 3; std::string output_path; std::string benchmark_name; }; @@ -46,6 +50,18 @@ inline Options parse_args(int argc, char** argv) { options.warmups = std::strtoull(argv[++i], nullptr, 10); continue; } + if (arg == "--min-time" && i + 1 < argc) { + options.min_time_sec = std::strtod(argv[++i], nullptr); + continue; + } + if (arg == 
"--max-loops" && i + 1 < argc) { + options.max_loops = std::strtoull(argv[++i], nullptr, 10); + continue; + } + if (arg == "--calibrate-rounds" && i + 1 < argc) { + options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10); + continue; + } if (arg == "--values" && i + 1 < argc) { options.values = std::strtoull(argv[++i], nullptr, 10); continue; @@ -68,6 +84,9 @@ inline Options parse_args(int argc, char** argv) { << " --warmups N Warmup values per run (default: 5)\n" << " --values N Timed values per run (default: 20)\n" << " --runs N Number of runs (default: 20)\n" + << " --min-time S Calibrate loops to reach S seconds per value\n" + << " --max-loops N Max loops used during calibration (default: 1000000)\n" + << " --calibrate-rounds N Calibration passes (default: 3)\n" << " -o, --output F Write pyperf-compatible JSON to file\n" << " --name S Benchmark name (overrides default)\n"; std::exit(0); @@ -93,6 +112,47 @@ inline std::string iso_now() { return std::string(buf); } +// Calibrate loop count to hit a minimum wall time per value. +template +std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { + if (options.min_time_sec <= 0.0) { + return options.loops; + } + + std::uint64_t best = 1; + const std::uint64_t max_loops = std::max(1, options.max_loops); + const std::uint64_t rounds = std::max(1, options.calibrate_rounds); + + for (std::uint64_t round = 0; round < rounds; ++round) { + std::uint64_t loops = 1; + double elapsed = 0.0; + + while (true) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + elapsed = std::chrono::duration(t1 - t0).count(); + + if (elapsed >= options.min_time_sec || loops >= max_loops) { + break; + } + if (loops > max_loops / 2) { + loops = max_loops; + } else { + loops *= 2; + } + } + + if (loops > best) { + best = loops; + } + } + + return best; +} + // Run a benchmark function. 
The function signature is: void fn() — one call = one operation. // The harness calls fn() in a tight loop `loops` times per value. template @@ -238,9 +298,15 @@ class BenchmarkSuite { // Run a benchmark and record it. The name is used as the benchmark ID. template void run(const std::string& name, Fn&& fn) { - auto results = run_benchmark(options_, std::forward(fn)); + std::uint64_t loops = options_.loops; + Options custom = options_; + if (options_.min_time_sec > 0.0) { + loops = calibrate_loops(options_, fn); + custom.loops = loops; + } + auto results = run_benchmark(custom, std::forward(fn)); print_summary(name, results); - entries_.push_back({name, options_.loops, std::move(results)}); + entries_.push_back({name, loops, std::move(results)}); } // Run a benchmark with a custom loop count (for slow operations like compilation). diff --git a/benchmarks/cuda_bindings/compare.py b/benchmarks/cuda_bindings/compare.py index 6a3e94f3447..7fd9ca87178 100644 --- a/benchmarks/cuda_bindings/compare.py +++ b/benchmarks/cuda_bindings/compare.py @@ -29,7 +29,7 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]: name = run.get("metadata", {}).get("name", "") if name: break - values = [] + values: list[float] = [] for run in bench.get("runs", []): values.extend(run.get("values", [])) if name and values: @@ -37,6 +37,19 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]: return results +def stats(values: list[float]) -> tuple[float, float, float, int]: + mean = statistics.mean(values) + stdev = statistics.pstdev(values) if len(values) > 1 else 0.0 + rsd = (stdev / mean) if mean else 0.0 + return mean, stdev, rsd, len(values) + + +def fmt_rsd(rsd: float | None) -> str: + if rsd is None: + return "-" + return f"{rsd * 100:.1f}%" + + def fmt_ns(seconds: float) -> str: ns = seconds * 1e9 if ns >= 1000: @@ -58,6 +71,12 @@ def main() -> None: default=DEFAULT_CPP, help=f"C++ results JSON (default: {DEFAULT_CPP.name})", ) + parser.add_argument( + "--target-us", + 
type=float, + default=1.0, + help="Overhead target in microseconds (default: 1.0)", + ) args = parser.parse_args() if not args.python.exists(): @@ -79,13 +98,16 @@ def main() -> None: # Header if cpp_benchmarks: - header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}" + header = ( + f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} " + f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10} {'Target':>6}" + ) sep = "-" * len(header) print(sep) print(header) print(sep) else: - header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}" + header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}" sep = "-" * len(header) print(sep) print(header) @@ -95,21 +117,31 @@ def main() -> None: py_vals = py_benchmarks.get(name) cpp_vals = cpp_benchmarks.get(name) - py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-" - cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-" + py_stats = stats(py_vals) if py_vals else None + cpp_stats = stats(cpp_vals) if cpp_vals else None + + py_str = fmt_ns(py_stats[0]) if py_stats else "-" + cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-" + py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-" + cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-" - if py_vals and cpp_vals: - py_mean = statistics.mean(py_vals) - cpp_mean = statistics.mean(cpp_vals) + if py_stats and cpp_stats: + py_mean = py_stats[0] + cpp_mean = cpp_stats[0] overhead_ns = (py_mean - cpp_mean) * 1e9 overhead_str = f"+{overhead_ns:.0f} ns" + target = "OK" if overhead_ns <= args.target_us * 1000 else "FAIL" else: overhead_str = "-" + target = "-" if cpp_benchmarks: - print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}") + print( + f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} " + f"{py_str:>14} {py_rsd:>7} {overhead_str:>10} {target:>6}" + ) else: - print(f"{name:<{name_width}} {py_str:>14}") + print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}") 
print(sep) From cd15d84728cb1574e38f98cb359c03fd9fa88ea1 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 22 Apr 2026 13:47:28 -0500 Subject: [PATCH 2/4] Improve Cpp harness with min-time for more stable collection --- benchmarks/cuda_bindings/compare.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/benchmarks/cuda_bindings/compare.py b/benchmarks/cuda_bindings/compare.py index 7fd9ca87178..7dbc972e92a 100644 --- a/benchmarks/cuda_bindings/compare.py +++ b/benchmarks/cuda_bindings/compare.py @@ -71,12 +71,6 @@ def main() -> None: default=DEFAULT_CPP, help=f"C++ results JSON (default: {DEFAULT_CPP.name})", ) - parser.add_argument( - "--target-us", - type=float, - default=1.0, - help="Overhead target in microseconds (default: 1.0)", - ) args = parser.parse_args() if not args.python.exists(): @@ -100,7 +94,7 @@ def main() -> None: if cpp_benchmarks: header = ( f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} " - f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10} {'Target':>6}" + f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}" ) sep = "-" * len(header) print(sep) @@ -130,15 +124,13 @@ def main() -> None: cpp_mean = cpp_stats[0] overhead_ns = (py_mean - cpp_mean) * 1e9 overhead_str = f"+{overhead_ns:.0f} ns" - target = "OK" if overhead_ns <= args.target_us * 1000 else "FAIL" else: overhead_str = "-" - target = "-" if cpp_benchmarks: print( f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} " - f"{py_str:>14} {py_rsd:>7} {overhead_str:>10} {target:>6}" + f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}" ) else: print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}") From abde710509fec7fa706e2ca6e8ebd2a99bf1c147 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 24 Apr 2026 18:34:57 -0500 Subject: [PATCH 3/4] Remove limit and added drain --- .../benchmarks/cpp/bench_event.cpp | 5 + .../benchmarks/cpp/bench_launch.cpp | 6 + .../benchmarks/cpp/bench_memory.cpp | 5 + 
.../benchmarks/cpp/bench_stream.cpp | 6 + .../benchmarks/cpp/bench_support.hpp | 112 ++++++++++++++++-- 5 files changed, 122 insertions(+), 12 deletions(-) diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp index 44cd6177786..c24aa983199 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_event.cpp @@ -45,6 +45,11 @@ int main(int argc, char** argv) { check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); bench::BenchmarkSuite suite(options); + // Drain the persistent stream after calibration so event_record (which + // enqueues onto the stream) and event_synchronize start from a known state. + suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); // --- event_create_destroy --- { diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp index 984c82fcf32..4897859a61a 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_launch.cpp @@ -238,6 +238,12 @@ int main(int argc, char** argv) { void* struct_params[] = {&struct_2048B}; bench::BenchmarkSuite suite(options); + // After calibration, drain the persistent stream so the first measured + // sample does not start on a backlogged stream. Calibration for enqueue- + // style ops (kernel launches) may queue many thousands of operations. 
+ suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); suite.run("launch.launch_empty_kernel", [&]() { check_cu(cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp index 4e71b73fb5e..803363be480 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_memory.cpp @@ -52,6 +52,11 @@ int main(int argc, char** argv) { uint8_t host_dst[COPY_SIZE] = {}; bench::BenchmarkSuite suite(options); + // Drain the persistent stream after calibration so async benchmarks + // (mem_alloc_async_free_async) don't start measurement on a backlogged stream. + suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); // --- mem_alloc_free --- { diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp index 702e86aef02..95ad0790f9f 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_stream.cpp @@ -38,6 +38,12 @@ int main(int argc, char** argv) { check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); bench::BenchmarkSuite suite(options); + // Drain the persistent stream after calibration for completeness. + // stream_create_destroy uses a local stream, but stream_query/synchronize + // observe the persistent one. 
+ suite.set_post_calibrate([&]() { + check_cu(cuStreamSynchronize(stream), "post-calibrate sync failed"); + }); // --- stream_create_destroy --- { diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp index 2755bf21183..131f69de540 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,10 @@ struct Options { std::uint64_t values = 20; std::uint64_t runs = 20; double min_time_sec = 0.0; - std::uint64_t max_loops = 1000000; + // Safety cap for the calibration doubling loop. Set high enough that even + // sub-nanosecond ops can reach typical --min-time targets (e.g. 100ms). + // A warning is printed if calibration hits this cap before reaching min-time. + std::uint64_t max_loops = 100000000; std::uint64_t calibrate_rounds = 3; std::string output_path; std::string benchmark_name; @@ -85,7 +89,7 @@ inline Options parse_args(int argc, char** argv) { << " --values N Timed values per run (default: 20)\n" << " --runs N Number of runs (default: 20)\n" << " --min-time S Calibrate loops to reach S seconds per value\n" - << " --max-loops N Max loops used during calibration (default: 1000000)\n" + << " --max-loops N Safety cap for calibration loop count (default: 100000000)\n" << " --calibrate-rounds N Calibration passes (default: 3)\n" << " -o, --output F Write pyperf-compatible JSON to file\n" << " --name S Benchmark name (overrides default)\n"; @@ -113,18 +117,34 @@ inline std::string iso_now() { } // Calibrate loop count to hit a minimum wall time per value. +// Returns the chosen loop count. If `capped_out` is non-null, it is set to +// true when calibration reached `max_loops` before hitting `min_time_sec` +// (meaning --min-time was NOT actually satisfied by the calibration). 
template -std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { +std::uint64_t calibrate_loops( + const Options& options, + Fn&& fn, + bool* capped_out = nullptr, + double* last_elapsed_out = nullptr +) { if (options.min_time_sec <= 0.0) { + if (capped_out) *capped_out = false; + if (last_elapsed_out) *last_elapsed_out = 0.0; return options.loops; } - std::uint64_t best = 1; - const std::uint64_t max_loops = std::max(1, options.max_loops); + // Allow callers (e.g. the explicit-loop overload) to request a minimum + // starting loop count via options.loops. + const std::uint64_t start_loops = std::max(1, options.loops); + std::uint64_t best = start_loops; + const std::uint64_t max_loops = std::max(start_loops, options.max_loops); const std::uint64_t rounds = std::max(1, options.calibrate_rounds); + bool capped = false; + double last_elapsed = 0.0; + for (std::uint64_t round = 0; round < rounds; ++round) { - std::uint64_t loops = 1; + std::uint64_t loops = start_loops; double elapsed = 0.0; while (true) { @@ -135,7 +155,11 @@ std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { const auto t1 = std::chrono::steady_clock::now(); elapsed = std::chrono::duration(t1 - t0).count(); - if (elapsed >= options.min_time_sec || loops >= max_loops) { + if (elapsed >= options.min_time_sec) { + break; + } + if (loops >= max_loops) { + capped = true; break; } if (loops > max_loops / 2) { @@ -148,8 +172,11 @@ std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { if (loops > best) { best = loops; } + last_elapsed = elapsed; } + if (capped_out) *capped_out = capped; + if (last_elapsed_out) *last_elapsed_out = last_elapsed; return best; } @@ -295,28 +322,59 @@ class BenchmarkSuite { public: explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} + // Post-calibration hook. If set, invoked after calibration and before the + // first measured warmup/value, for every benchmark in this suite. 
Intended + // for async benchmarks that need to drain state left behind by calibration + // (e.g. cuStreamSynchronize on a persistent stream). Can be overridden + // per-call via the `post_calibrate` parameter on `run()`. + void set_post_calibrate(std::function hook) { + post_calibrate_ = std::move(hook); + } + // Run a benchmark and record it. The name is used as the benchmark ID. + // If --min-time is set, loop count is auto-calibrated. `post_calibrate`, + // if provided, runs after calibration and before measurement. template - void run(const std::string& name, Fn&& fn) { + void run( + const std::string& name, + Fn&& fn, + std::function post_calibrate = {} + ) { std::uint64_t loops = options_.loops; Options custom = options_; if (options_.min_time_sec > 0.0) { - loops = calibrate_loops(options_, fn); + loops = calibrate_and_warn(name, options_, fn); custom.loops = loops; + invoke_post_calibrate(post_calibrate); } auto results = run_benchmark(custom, std::forward(fn)); print_summary(name, results); entries_.push_back({name, loops, std::move(results)}); } - // Run a benchmark with a custom loop count (for slow operations like compilation). + // Run a benchmark with a custom loop count (used as a floor for fast ops + // or a fixed count for slow ops like compilation). When --min-time is set, + // calibration still runs but starts from `loops_override` as the minimum. 
template - void run(const std::string& name, std::uint64_t loops_override, Fn&& fn) { + void run( + const std::string& name, + std::uint64_t loops_override, + Fn&& fn, + std::function post_calibrate = {} + ) { + std::uint64_t loops = loops_override; Options custom = options_; custom.loops = loops_override; + if (options_.min_time_sec > 0.0) { + Options calib_opts = options_; + calib_opts.loops = loops_override; // floor + loops = calibrate_and_warn(name, calib_opts, fn); + custom.loops = loops; + invoke_post_calibrate(post_calibrate); + } auto results = run_benchmark(custom, std::forward(fn)); print_summary(name, results); - entries_.push_back({name, loops_override, std::move(results)}); + entries_.push_back({name, loops, std::move(results)}); } // Write all collected benchmarks to the output file (if -o was given). @@ -329,6 +387,36 @@ class BenchmarkSuite { private: Options options_; std::vector entries_; + std::function post_calibrate_; + + void invoke_post_calibrate(const std::function& per_call) const { + if (per_call) { + per_call(); + } else if (post_calibrate_) { + post_calibrate_(); + } + } + + template + std::uint64_t calibrate_and_warn( + const std::string& name, + const Options& calib_opts, + Fn&& fn + ) const { + bool capped = false; + double last_elapsed = 0.0; + std::uint64_t loops = calibrate_loops( + calib_opts, std::forward(fn), &capped, &last_elapsed + ); + if (capped) { + std::cerr << "WARNING: " << name + << ": calibration hit --max-loops (" << calib_opts.max_loops + << ") before reaching --min-time (" << calib_opts.min_time_sec + << "s). Last sample: " << last_elapsed + << "s. 
Raise --max-loops to satisfy --min-time for this benchmark.\n"; } return loops; } static void write_multi_pyperf_json( const std::string& output_path, From 1fb84c997276c60ba1972cf5889ad32161ee4160 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 24 Apr 2026 18:35:16 -0500 Subject: [PATCH 4/4] Fix README typo: add missing space in bench --min-time example --- benchmarks/cuda_bindings/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index 7881392b511..cffca57bef3 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -47,7 +47,7 @@ To run the benchmarks combine the environment and task: ```bash # Run the Python benchmarks in the wheel environment pixi run -e wheel bench -pixi run -e wheel bench--min-time 0.1 +pixi run -e wheel bench --min-time 0.1 # Run the Python benchmarks in the source environment pixi run -e source bench