diff --git a/benchmarks/cuda_bindings/README.md b/benchmarks/cuda_bindings/README.md index f8d5ccf043..7881392b51 100644 --- a/benchmarks/cuda_bindings/README.md +++ b/benchmarks/cuda_bindings/README.md @@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task: ```bash # Run the Python benchmarks in the wheel environment pixi run -e wheel bench +pixi run -e wheel bench--min-time 0.1 # Run the Python benchmarks in the source environment pixi run -e source bench # Run the C++ benchmarks pixi run -e wheel bench-cpp +pixi run -e wheel bench-cpp --min-time 0.1 ``` Both runners automatically save results to JSON files in the benchmarks diff --git a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp index 8b54122866..2755bf2118 100644 --- a/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp +++ b/benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -22,6 +23,9 @@ struct Options { std::uint64_t warmups = 5; std::uint64_t values = 20; std::uint64_t runs = 20; + double min_time_sec = 0.0; + std::uint64_t max_loops = 1000000; + std::uint64_t calibrate_rounds = 3; std::string output_path; std::string benchmark_name; }; @@ -46,6 +50,18 @@ inline Options parse_args(int argc, char** argv) { options.warmups = std::strtoull(argv[++i], nullptr, 10); continue; } + if (arg == "--min-time" && i + 1 < argc) { + options.min_time_sec = std::strtod(argv[++i], nullptr); + continue; + } + if (arg == "--max-loops" && i + 1 < argc) { + options.max_loops = std::strtoull(argv[++i], nullptr, 10); + continue; + } + if (arg == "--calibrate-rounds" && i + 1 < argc) { + options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10); + continue; + } if (arg == "--values" && i + 1 < argc) { options.values = std::strtoull(argv[++i], nullptr, 10); continue; @@ -68,6 +84,9 @@ inline Options parse_args(int argc, char** argv) { << " 
--warmups N Warmup values per run (default: 5)\n" << " --values N Timed values per run (default: 20)\n" << " --runs N Number of runs (default: 20)\n" + << " --min-time S Calibrate loops to reach S seconds per value\n" + << " --max-loops N Max loops used during calibration (default: 1000000)\n" + << " --calibrate-rounds N Calibration passes (default: 3)\n" << " -o, --output F Write pyperf-compatible JSON to file\n" << " --name S Benchmark name (overrides default)\n"; std::exit(0); @@ -93,6 +112,47 @@ inline std::string iso_now() { return std::string(buf); } +// Calibrate loop count to hit a minimum wall time per value. +template +std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { + if (options.min_time_sec <= 0.0) { + return options.loops; + } + + std::uint64_t best = 1; + const std::uint64_t max_loops = std::max(1, options.max_loops); + const std::uint64_t rounds = std::max(1, options.calibrate_rounds); + + for (std::uint64_t round = 0; round < rounds; ++round) { + std::uint64_t loops = 1; + double elapsed = 0.0; + + while (true) { + const auto t0 = std::chrono::steady_clock::now(); + for (std::uint64_t i = 0; i < loops; ++i) { + fn(); + } + const auto t1 = std::chrono::steady_clock::now(); + elapsed = std::chrono::duration(t1 - t0).count(); + + if (elapsed >= options.min_time_sec || loops >= max_loops) { + break; + } + if (loops > max_loops / 2) { + loops = max_loops; + } else { + loops *= 2; + } + } + + if (loops > best) { + best = loops; + } + } + + return best; +} + // Run a benchmark function. The function signature is: void fn() — one call = one operation. // The harness calls fn() in a tight loop `loops` times per value. template @@ -238,9 +298,15 @@ class BenchmarkSuite { // Run a benchmark and record it. The name is used as the benchmark ID. 
template <typename Fn> void run(const std::string& name, Fn&& fn) { - auto results = run_benchmark(options_, std::forward<Fn>(fn)); + std::uint64_t loops = options_.loops; + Options custom = options_; + if (options_.min_time_sec > 0.0) { + loops = calibrate_loops(options_, fn); + custom.loops = loops; + } + auto results = run_benchmark(custom, std::forward<Fn>(fn));
{'Python (mean)':>14}" + header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}" sep = "-" * len(header) print(sep) print(header) @@ -95,21 +111,29 @@ def main() -> None: py_vals = py_benchmarks.get(name) cpp_vals = cpp_benchmarks.get(name) - py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-" - cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-" + py_stats = stats(py_vals) if py_vals else None + cpp_stats = stats(cpp_vals) if cpp_vals else None + + py_str = fmt_ns(py_stats[0]) if py_stats else "-" + cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-" + py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-" + cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-" - if py_vals and cpp_vals: - py_mean = statistics.mean(py_vals) - cpp_mean = statistics.mean(cpp_vals) + if py_stats and cpp_stats: + py_mean = py_stats[0] + cpp_mean = cpp_stats[0] overhead_ns = (py_mean - cpp_mean) * 1e9 overhead_str = f"+{overhead_ns:.0f} ns" else: overhead_str = "-" if cpp_benchmarks: - print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}") + print( + f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} " + f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}" + ) else: - print(f"{name:<{name_width}} {py_str:>14}") + print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}") print(sep)