-
Notifications
You must be signed in to change notification settings - Fork 275
cuda.bindings benchmarks part 5 #1964
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ | |
|
|
||
| #include <chrono> | ||
| #include <cmath> | ||
| #include <algorithm> | ||
| #include <cstdint> | ||
| #include <cstdlib> | ||
| #include <ctime> | ||
|
|
@@ -22,6 +23,9 @@ struct Options { | |
| std::uint64_t warmups = 5; | ||
| std::uint64_t values = 20; | ||
| std::uint64_t runs = 20; | ||
| double min_time_sec = 0.0; | ||
| std::uint64_t max_loops = 1000000; | ||
| std::uint64_t calibrate_rounds = 3; | ||
| std::string output_path; | ||
| std::string benchmark_name; | ||
| }; | ||
|
|
@@ -46,6 +50,18 @@ inline Options parse_args(int argc, char** argv) { | |
| options.warmups = std::strtoull(argv[++i], nullptr, 10); | ||
| continue; | ||
| } | ||
| if (arg == "--min-time" && i + 1 < argc) { | ||
| options.min_time_sec = std::strtod(argv[++i], nullptr); | ||
| continue; | ||
| } | ||
| if (arg == "--max-loops" && i + 1 < argc) { | ||
| options.max_loops = std::strtoull(argv[++i], nullptr, 10); | ||
| continue; | ||
| } | ||
| if (arg == "--calibrate-rounds" && i + 1 < argc) { | ||
| options.calibrate_rounds = std::strtoull(argv[++i], nullptr, 10); | ||
| continue; | ||
| } | ||
| if (arg == "--values" && i + 1 < argc) { | ||
| options.values = std::strtoull(argv[++i], nullptr, 10); | ||
| continue; | ||
|
|
@@ -68,6 +84,9 @@ inline Options parse_args(int argc, char** argv) { | |
| << " --warmups N Warmup values per run (default: 5)\n" | ||
| << " --values N Timed values per run (default: 20)\n" | ||
| << " --runs N Number of runs (default: 20)\n" | ||
| << " --min-time S Calibrate loops to reach S seconds per value\n" | ||
| << " --max-loops N Max loops used during calibration (default: 1000000)\n" | ||
| << " --calibrate-rounds N Calibration passes (default: 3)\n" | ||
| << " -o, --output F Write pyperf-compatible JSON to file\n" | ||
| << " --name S Benchmark name (overrides default)\n"; | ||
| std::exit(0); | ||
|
|
@@ -93,6 +112,47 @@ inline std::string iso_now() { | |
| return std::string(buf); | ||
| } | ||
|
|
||
| // Calibrate loop count to hit a minimum wall time per value. | ||
| template <typename Fn> | ||
| std::uint64_t calibrate_loops(const Options& options, Fn&& fn) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Right now, I do not think […]
So the new "stable collection" mode still under-times many of the exact fast C++ benchmarks it is meant to stabilize. The same issue also seems to apply to the explicit-loop overload, which means the […] I think that is important to fix before we rely on these numbers as "min-time collected" results, because readers will reasonably assume that […] |
||
| if (options.min_time_sec <= 0.0) { | ||
| return options.loops; | ||
| } | ||
|
|
||
| std::uint64_t best = 1; | ||
| const std::uint64_t max_loops = std::max<std::uint64_t>(1, options.max_loops); | ||
| const std::uint64_t rounds = std::max<std::uint64_t>(1, options.calibrate_rounds); | ||
|
|
||
| for (std::uint64_t round = 0; round < rounds; ++round) { | ||
| std::uint64_t loops = 1; | ||
| double elapsed = 0.0; | ||
|
|
||
| while (true) { | ||
| const auto t0 = std::chrono::steady_clock::now(); | ||
| for (std::uint64_t i = 0; i < loops; ++i) { | ||
| fn(); | ||
| } | ||
| const auto t1 = std::chrono::steady_clock::now(); | ||
| elapsed = std::chrono::duration<double>(t1 - t0).count(); | ||
|
|
||
| if (elapsed >= options.min_time_sec || loops >= max_loops) { | ||
| break; | ||
| } | ||
| if (loops > max_loops / 2) { | ||
| loops = max_loops; | ||
| } else { | ||
| loops *= 2; | ||
| } | ||
| } | ||
|
|
||
| if (loops > best) { | ||
| best = loops; | ||
| } | ||
| } | ||
|
|
||
| return best; | ||
| } | ||
|
|
||
| // Run a benchmark function. The function signature is: void fn() — one call = one operation. | ||
| // The harness calls fn() in a tight loop `loops` times per value. | ||
| template <typename Fn> | ||
|
|
@@ -238,9 +298,15 @@ class BenchmarkSuite { | |
| // Run a benchmark and record it. The name is used as the benchmark ID. | ||
| template <typename Fn> | ||
| void run(const std::string& name, Fn&& fn) { | ||
| auto results = run_benchmark(options_, std::forward<Fn>(fn)); | ||
| std::uint64_t loops = options_.loops; | ||
| Options custom = options_; | ||
| if (options_.min_time_sec > 0.0) { | ||
| loops = calibrate_loops(options_, fn); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Generated by Cursor GPT-5.4 Extra High Fast after a few rounds of prompting, with a few small edits. I think the […] More concretely, […] Why I think this matters:
That seems especially relevant for benchmarks like:
The effect may be small for some cases, but since this PR is specifically about improving measurement stability, I think it is worth making sure the calibration step is not also changing what is being measured. Helpful follow-up directions could be:
If those variants all produce essentially the same numbers, that would be reassuring. If they move the launch/async-memory/event benchmarks materially, then this concern is real. |
||
| custom.loops = loops; | ||
| } | ||
| auto results = run_benchmark(custom, std::forward<Fn>(fn)); | ||
| print_summary(name, results); | ||
| entries_.push_back({name, options_.loops, std::move(results)}); | ||
| entries_.push_back({name, loops, std::move(results)}); | ||
| } | ||
|
|
||
| // Run a benchmark with a custom loop count (for slow operations like compilation). | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing space after
bench