Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/cuda_bindings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@ To run the benchmarks combine the environment and task:
```bash
# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench
pixi run -e wheel bench --min-time 0.1
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space after bench


# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks
pixi run -e wheel bench-cpp
pixi run -e wheel bench-cpp --min-time 0.1
```

Both runners automatically save results to JSON files in the benchmarks
Expand Down
70 changes: 68 additions & 2 deletions benchmarks/cuda_bindings/benchmarks/cpp/bench_support.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
Expand All @@ -22,6 +23,9 @@ struct Options {
std::uint64_t warmups = 5;
std::uint64_t values = 20;
std::uint64_t runs = 20;
double min_time_sec = 0.0;
std::uint64_t max_loops = 1000000;
std::uint64_t calibrate_rounds = 3;
std::string output_path;
std::string benchmark_name;
};
Expand All @@ -46,6 +50,18 @@ inline Options parse_args(int argc, char** argv) {
options.warmups = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--min-time" && i + 1 < argc) {
options.min_time_sec = std::strtod(argv[++i], nullptr);
continue;
}
if (arg == "--max-loops" && i + 1 < argc) {
options.max_loops = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--calibrate-rounds" && i + 1 < argc) {
options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10);
continue;
}
if (arg == "--values" && i + 1 < argc) {
options.values = std::strtoull(argv[++i], nullptr, 10);
continue;
Expand All @@ -68,6 +84,9 @@ inline Options parse_args(int argc, char** argv) {
<< " --warmups N Warmup values per run (default: 5)\n"
<< " --values N Timed values per run (default: 20)\n"
<< " --runs N Number of runs (default: 20)\n"
<< " --min-time S Calibrate loops to reach S seconds per value\n"
<< " --max-loops N Max loops used during calibration (default: 1000000)\n"
<< " --calibrate-rounds N Calibration passes (default: 3)\n"
<< " -o, --output F Write pyperf-compatible JSON to file\n"
<< " --name S Benchmark name (overrides default)\n";
std::exit(0);
Expand All @@ -93,6 +112,47 @@ inline std::string iso_now() {
return std::string(buf);
}

// Calibrate loop count to hit a minimum wall time per value.
template <typename Fn>
std::uint64_t calibrate_loops(const Options& options, Fn&& fn) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

calibrate_loops() looks like a custom reimplementation of the same minimum-time calibration idea that tools like pyperf and Google Benchmark already provide. I understand that there are advantages to keeping the existing custom C++ harness / pyperf-compatible JSON output, but if we go that route, I think the custom calibration should match the usual/expected --min-time semantics more closely.

Right now, I do not think --min-time actually guarantees the requested minimum sample duration for a large chunk of these microbenchmarks. The calibration stops once it hits max_loops = 1000000, and BenchmarkSuite::run() then uses that capped loop count as-is. For very fast benchmarks, that cap is still well short of 0.1 s per value. For example:

  • ~6 ns/op tops out around 6 ms
  • ~33 ns/op tops out around 33 ms
  • even ~79 ns/op only reaches around 79 ms

So the new "stable collection" mode still under-times many of the exact fast C++ benchmarks it is meant to stabilize. The same issue also seems to apply to the explicit-loop overload, which means the --min-time semantics are not consistently honored there either.

I think that is important to fix before we rely on these numbers as "min-time collected" results, because readers will reasonably assume that --min-time 0.1 means each measured value is actually being calibrated to about 100 ms.

if (options.min_time_sec <= 0.0) {
return options.loops;
}

std::uint64_t best = 1;
const std::uint64_t max_loops = std::max<std::uint64_t>(1, options.max_loops);
const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds);

for (std::uint64_t round = 0; round < rounds; ++round) {
std::uint64_t loops = 1;
double elapsed = 0.0;

while (true) {
const auto t0 = std::chrono::steady_clock::now();
for (std::uint64_t i = 0; i < loops; ++i) {
fn();
}
const auto t1 = std::chrono::steady_clock::now();
elapsed = std::chrono::duration<double>(t1 - t0).count();

if (elapsed >= options.min_time_sec || loops >= max_loops) {
break;
}
if (loops > max_loops / 2) {
loops = max_loops;
} else {
loops *= 2;
}
}

if (loops > best) {
best = loops;
}
}

return best;
}

// Run a benchmark function. The function signature is: void fn() — one call = one operation.
// The harness calls fn() in a tight loop `loops` times per value.
template <typename Fn>
Expand Down Expand Up @@ -238,9 +298,15 @@ class BenchmarkSuite {
// Run a benchmark and record it. The name is used as the benchmark ID.
//
// When --min-time is active, a calibration pass first determines how many
// loops are needed per value; the calibrated loop count is then used for
// the actual measurement AND recorded with the results, so the stored
// entry reflects the loop count that produced the numbers.
//
// NOTE(review): calibration calls fn() many times before the first timed
// warmup. For synchronous operations that is harmless, but for enqueue-style
// async benchmarks it may leave a backlog (e.g. on a persistent stream)
// that changes what the first samples measure — TODO confirm whether a
// post-calibration drain/sync step is needed for those benchmarks.
template <typename Fn>
void run(const std::string& name, Fn&& fn) {
  std::uint64_t loops = options_.loops;
  Options custom = options_;
  if (options_.min_time_sec > 0.0) {
    loops = calibrate_loops(options_, fn);
    custom.loops = loops;
  }
  auto results = run_benchmark(custom, std::forward<Fn>(fn));
  print_summary(name, results);
  entries_.push_back({name, loops, std::move(results)});
}

// Run a benchmark with a custom loop count (for slow operations like compilation).
Expand Down
44 changes: 34 additions & 10 deletions benchmarks/cuda_bindings/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,27 @@ def load_benchmarks(path: Path) -> dict[str, list[float]]:
name = run.get("metadata", {}).get("name", "")
if name:
break
values = []
values: list[float] = []
for run in bench.get("runs", []):
values.extend(run.get("values", []))
if name and values:
results[name] = values
return results


def stats(values: list[float]) -> tuple[float, float, float, int]:
    """Summarize a sample: (mean, population stdev, relative stdev, count).

    The relative stdev (RSD) is stdev/mean, reported as 0.0 when the mean
    is zero to avoid division by zero.
    """
    count = len(values)
    mean = statistics.mean(values)
    stdev = statistics.pstdev(values) if count > 1 else 0.0
    rsd = stdev / mean if mean else 0.0
    return mean, stdev, rsd, count


def fmt_rsd(rsd: float | None) -> str:
if rsd is None:
return "-"
return f"{rsd * 100:.1f}%"


def fmt_ns(seconds: float) -> str:
ns = seconds * 1e9
if ns >= 1000:
Expand Down Expand Up @@ -79,13 +92,16 @@ def main() -> None:

# Header
if cpp_benchmarks:
header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}"
header = (
f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'C++ RSD':>8} "
f"{'Python (mean)':>14} {'Py RSD':>7} {'Overhead':>10}"
)
sep = "-" * len(header)
print(sep)
print(header)
print(sep)
else:
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}"
header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14} {'Py RSD':>7}"
sep = "-" * len(header)
print(sep)
print(header)
Expand All @@ -95,21 +111,29 @@ def main() -> None:
py_vals = py_benchmarks.get(name)
cpp_vals = cpp_benchmarks.get(name)

py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-"
cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-"
py_stats = stats(py_vals) if py_vals else None
cpp_stats = stats(cpp_vals) if cpp_vals else None

py_str = fmt_ns(py_stats[0]) if py_stats else "-"
cpp_str = fmt_ns(cpp_stats[0]) if cpp_stats else "-"
py_rsd = fmt_rsd(py_stats[2]) if py_stats else "-"
cpp_rsd = fmt_rsd(cpp_stats[2]) if cpp_stats else "-"

if py_vals and cpp_vals:
py_mean = statistics.mean(py_vals)
cpp_mean = statistics.mean(cpp_vals)
if py_stats and cpp_stats:
py_mean = py_stats[0]
cpp_mean = cpp_stats[0]
overhead_ns = (py_mean - cpp_mean) * 1e9
overhead_str = f"+{overhead_ns:.0f} ns"
else:
overhead_str = "-"

if cpp_benchmarks:
print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}")
print(
f"{name:<{name_width}} {cpp_str:>12} {cpp_rsd:>8} "
f"{py_str:>14} {py_rsd:>7} {overhead_str:>10}"
)
else:
print(f"{name:<{name_width}} {py_str:>14}")
print(f"{name:<{name_width}} {py_str:>14} {py_rsd:>7}")

print(sep)

Expand Down
Loading