google · copybara-service · Mar 14, 2025 · Mar 14, 2025
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -29,15 +29,13 @@ cc_library(
     ],
 )
 
+# Split out of :threading to break a dependency cycle: :allocator needs this target, and :threading depends on :allocator.
 cc_library(
-    name = "threading",
-    srcs = ["util/threading.cc"],
-    hdrs = ["util/threading.h"],
+    name = "topology",
+    srcs = ["util/topology.cc"],
+    hdrs = ["util/topology.h"],
     deps = [
-        ":basics",
-        # Placeholder for container detection, do not remove
         "@highway//:hwy",
-        "@highway//:thread_pool",
         "@highway//:topology",
     ],
 )
@@ -48,32 +46,54 @@ cc_library(
     hdrs = ["util/allocator.h"],
     deps = [
         ":basics",
-        ":threading",
+        ":topology",
         "@highway//:hwy",
         "@highway//:thread_pool",
+        "@highway//:topology",
     ],
 )
 
 cc_library(
-    name = "test_util",
-    hdrs = ["util/test_util.h"],
+    name = "threading",
+    srcs = ["util/threading.cc"],
+    hdrs = ["util/threading.h"],
     deps = [
+        ":allocator",
+        ":basics",
+        ":topology",
+        # Placeholder for container detection, do not remove
         "@highway//:hwy",
-        "@highway//:hwy_test_util",
-        "@highway//:stats",
+        "@highway//:thread_pool",
+        "@highway//:topology",
     ],
 )
 
 cc_test(
     name = "threading_test",
     srcs = ["util/threading_test.cc"],
     deps = [
+        ":allocator",
+        ":basics",
         ":threading",
         "@googletest//:gtest_main",
+        "@highway//:auto_tune",
         "@highway//:hwy",
         "@highway//:hwy_test_util",
         "@highway//:nanobenchmark",
+        "@highway//:robust_statistics",
+        "@highway//:stats",
         "@highway//:thread_pool",
+        "@highway//:timer",
+    ],
+)
+
+cc_library(
+    name = "test_util",
+    hdrs = ["util/test_util.h"],
+    deps = [
+        "@highway//:hwy",
+        "@highway//:hwy_test_util",
+        "@highway//:stats",
     ],
 )
 
@@ -104,6 +124,7 @@ cc_library(
         ":allocator",
         ":basics",
         ":threading",
+        ":topology",
         "//compression:compress",
         "@highway//:algo",
         "@highway//:bit_set",
@@ -113,7 +134,6 @@ cc_library(
         "@highway//:nanobenchmark",
         "@highway//:profiler",
         "@highway//:thread_pool",
-        "@highway//:topology",
         "@highway//hwy/contrib/sort:vqsort",
     ],
 )
@@ -128,11 +148,11 @@ cc_test(
     tags = ["ops_tests"],
     deps = [
         ":allocator",
+        ":app",
         ":ops",
         ":test_util",
         ":threading",
         "@googletest//:gtest_main",  # buildcleaner: keep
-        "//:app",
         "//compression:compress",
         "//compression:test_util",
         "@highway//:hwy",
@@ -154,11 +174,12 @@ cc_test(
     tags = ["ops_tests"],
     deps = [
         ":allocator",
+        ":app",
         ":common",
         ":ops",
         ":test_util",
+        ":threading",
         "@googletest//:gtest_main",  # buildcleaner: keep
-        "//:app",
         "//compression:compress",
         "@highway//:hwy",
         "@highway//:hwy_test_util",
@@ -405,6 +426,7 @@ cc_library(
         ":cross_entropy",
         ":gemma_lib",
         ":kv_cache",
+        ":ops",
         ":threading",
         # Placeholder for internal dep, do not remove.,
         "@google_benchmark//:benchmark",
@@ -464,13 +486,13 @@ cc_binary(
         ":benchmark_helper",
         ":common",
         ":gemma_lib",
+        ":ops",
         ":threading",
         # Placeholder for internal dep, do not remove.,
         "//compression:sfp",
         "//paligemma:image",
         "@highway//:hwy",
         "@highway//:profiler",
-        "@highway//:thread_pool",
     ],
 )
 
@@ -634,13 +656,12 @@ cc_test(
         ":backprop",
         ":backprop_scalar",
         ":common",
-        ":gemma_lib",
         ":ops",
         ":prompt",
         ":sampler",
+        ":threading",
         ":weights",
         "@googletest//:gtest_main",
-        "//:threading",
         "//compression:compress",
         "@highway//:hwy",
         "@highway//:hwy_test_util",

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -22,7 +22,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG f2209b911c74019e85d0b7a7a2833c9a2e1b7995 EXCLUDE_FROM_ALL)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG c5bebf84ad01edec97e336f5c97ca4e0df6b4d06 EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(highway)
 
 ## Note: absl needs to be installed by sentencepiece. This will only happen if
@@ -108,6 +108,8 @@ set(SOURCES
   util/test_util.h
   util/threading.cc
   util/threading.h
+  util/topology.cc
+  util/topology.h
   )
 
 if(NOT CMAKE_BUILD_TYPE)

diff --git a/MODULE.bazel b/MODULE.bazel
@@ -18,7 +18,7 @@ bazel_dep(name = "google_benchmark", version = "1.8.5")
 # Require a more recent version.
 git_override(
     module_name = "highway",
-    commit = "f2209b911c74019e85d0b7a7a2833c9a2e1b7995",
+    commit = "c5bebf84ad01edec97e336f5c97ca4e0df6b4d06",
     remote = "https://github.com/google/highway",
 )
 

diff --git a/backprop/backward_test.cc b/backprop/backward_test.cc
@@ -35,7 +35,6 @@
 #include "ops/ops.h"
 #include "util/threading.h"
 #include "hwy/base.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
 
 // clang-format off
 #undef HWY_TARGET_INCLUDE
@@ -59,9 +58,9 @@ void TestMatMulVJP() {
   static const size_t kRows = 8;
   static const size_t kCols = 64;
   static const size_t kTokens = 5;
-  gcpp::NestedPools pools(1, /*pin=*/Tristate::kFalse, BoundedSlice(0, 1),
-                          BoundedSlice(0, 8));
-  Allocator::Init(pools.Topology());
+  const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 8));
+  Allocator::Init(topology);
+  gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse);
   std::mt19937 gen(42);
   MatStorageT<float> weights("weights", kRows, kCols);
   MatStorageT<float> x("x", kTokens, kCols);
@@ -105,9 +104,9 @@ void TestMultiHeadMatMulVJP() {
   static const size_t kCols = 16;
   static const size_t kHeads = 4;
   static const size_t kTokens = 3;
-  gcpp::NestedPools pools(1, /*pin=*/Tristate::kFalse, BoundedSlice(0, 1),
-                          BoundedSlice(0, 8));
-  Allocator::Init(pools.Topology());
+  const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 8));
+  Allocator::Init(topology);
+  gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse);
   std::mt19937 gen(42);
   MatStorageT<float> weights("weights", kRows, kCols * kHeads);
   MatStorageT<float> x("x", kTokens, kCols * kHeads);
@@ -150,9 +149,9 @@ void TestMultiHeadMatMulVJP() {
 void TestRMSNormVJP() {
   static const size_t K = 2;
   static const size_t N = 64;
-  gcpp::NestedPools pools(1, /*pin=*/Tristate::kFalse, BoundedSlice(0, 1),
-                          BoundedSlice(0, 8));
-  Allocator::Init(pools.Topology());
+  const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 8));
+  Allocator::Init(topology);
+  gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse);
   std::mt19937 gen(42);
   MatStorageT<float> weights("weights", N, 1);
   MatStorageT<float> x("x", K, N);
@@ -216,9 +215,9 @@ static ModelConfig TestConfig() {
 
 void TestEndToEnd() {
   std::mt19937 gen(42);
-  gcpp::NestedPools pools(1, /*pin=*/Tristate::kFalse, BoundedSlice(0, 1),
-                          BoundedSlice(0, 1));
-  Allocator::Init(pools.Topology());
+  const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 1));
+  Allocator::Init(topology);
+  gcpp::NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse);
   ModelConfig config = TestConfig();
   WeightsWrapper<float> weights(config);
   WeightsWrapper<float> grad(config);

diff --git a/backprop/optimize_test.cc b/backprop/optimize_test.cc
@@ -41,9 +41,10 @@
 namespace gcpp {
 
 TEST(OptimizeTest, GradientDescent) {
-  NestedPools pools(1, /*pin=*/Tristate::kFalse, BoundedSlice(0, 1),
-                    BoundedSlice(0, 1));
-  Allocator::Init(pools.Topology());
+  const BoundedTopology topology(BoundedSlice(0, 1), BoundedSlice(0, 1));
+  Allocator::Init(topology);
+  NestedPools pools(topology, 1, /*pin=*/Tristate::kFalse);
+  MatMulEnv env(topology, pools);
   hwy::ThreadPool& pool = pools.Pool();
   std::mt19937 gen(42);
 
@@ -66,7 +67,7 @@ TEST(OptimizeTest, GradientDescent) {
       config.layer_configs[0].qkv_dim,
       config.layer_configs[0].post_qk == PostQKType::HalfRope);
 
-  Gemma gemma(GemmaTokenizer(), info, pools);
+  Gemma gemma(GemmaTokenizer(), info, env);
 
   const auto generate = [&](const std::vector<int>& prompt) {
     std::vector<int> reply;

diff --git a/compression/blob_compare.cc b/compression/blob_compare.cc
@@ -202,8 +202,9 @@ void ReadAndCompareBlobs(const char* path1, const char* path2) {
   if (!CompareKeys(reader1, reader2)) return;
 
   // Single allocation, avoid initializing the memory.
-  NestedPools pools(0);
-  Allocator::Init(pools.Topology());
+  BoundedTopology topology;
+  Allocator::Init(topology);
+  NestedPools pools(topology);
   const size_t total_bytes = TotalBytes(reader1) + TotalBytes(reader2);
   BytePtr all_blobs = hwy::AllocateAligned<uint8_t>(total_bytes);
   size_t pos = 0;

diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc
@@ -56,8 +56,9 @@ void InitGenerator(const InferenceArgs& inference, std::mt19937& gen) {
 
 GemmaEnv::GemmaEnv(const LoaderArgs& loader, const InferenceArgs& inference,
                    const AppArgs& app)
-    : pools_(CreatePools(app)) {
-  Allocator::Init(pools_.Topology());
+    : topology_(CreateTopology(app)),
+      pools_(CreatePools(topology_, app)),
+      env_(topology_, pools_) {
   InferenceArgs mutable_inference = inference;
   AbortIfInvalidArgs(mutable_inference);
   LoaderArgs mutable_loader = loader;
@@ -66,7 +67,7 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const InferenceArgs& inference,
     fprintf(stderr, "Skipping model load because: %s\n", err);
   } else {
     fprintf(stderr, "Loading model...\n");
-    model_ = AllocateGemma(mutable_loader, pools_);
+    model_ = AllocateGemma(mutable_loader, env_);
     // Only allocate one for starters because GenerateBatch might not be called.
     kv_caches_.resize(1);
     kv_caches_[0] = KVCache::Create(model_->GetModelConfig(),
@@ -236,7 +237,7 @@ std::string CacheString() {
 }
 
 void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app,
-                NestedPools& pools) {
+                const BoundedTopology& topology, NestedPools& pools) {
   loader.Print(app.verbosity);
   inference.Print(app.verbosity);
   app.Print(app.verbosity);
@@ -255,7 +256,7 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app,
             "Compiled config               : %s\n"
             "Weight Type                   : %s\n"
             "EmbedderInput Type            : %s\n",
-            dt, cpu100, pools.TopologyString(), pools.PinString(),
+            dt, cpu100, topology.TopologyString(), pools.PinString(),
             CacheString().c_str(), hwy::TargetName(hwy::DispatchedTarget()),
             hwy::VectorBytes() * 8, CompiledConfig(),
             StringFromType(loader.Info().weight), TypeName<EmbedderInputT>());

diff --git a/evals/benchmark_helper.h b/evals/benchmark_helper.h
@@ -24,6 +24,7 @@
 #include <vector>
 
 #include "gemma/gemma.h"
+#include "ops/matmul.h"
 #include "util/app.h"
 #include "util/threading.h"
 #include "hwy/base.h"
@@ -105,23 +106,20 @@ class GemmaEnv {
   KVCache& MutableKVCache() { return kv_caches_[0]; }
 
  private:
-  // Thread pool for running inference.
-  NestedPools pools_;
-  // Random number generator.
-  std::mt19937 gen_;
-  // The model to run inference on.
+  BoundedTopology topology_;
+  NestedPools pools_;  // Thread pool.
+  MatMulEnv env_;
+  std::mt19937 gen_;  // Random number generator.
   std::unique_ptr<Gemma> model_;
-  // KV caches, same number as query batch.
-  std::vector<KVCache> kv_caches_;
-  // Runtime config for inference.
+  std::vector<KVCache> kv_caches_;  // Same number as query batch.
   RuntimeConfig runtime_config_;
 };
 
 // Logs the inference speed in tokens/sec.
 void LogSpeedStats(double time_start, size_t total_tokens);
 
 void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app,
-                NestedPools& pools);
+                const BoundedTopology& topology, NestedPools& pools);
 void ShowHelp(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app);
 
 }  // namespace gcpp

diff --git a/examples/hello_world/BUILD.bazel b/examples/hello_world/BUILD.bazel
@@ -13,7 +13,6 @@ cc_binary(
         # Placeholder for internal dep, do not remove.,
         "//:app",
         "//:args",
-        "//:common",
         "//:gemma_lib",
         "//:threading",
         "//:tokenizer",

diff --git a/examples/hello_world/CMakeLists.txt b/examples/hello_world/CMakeLists.txt
@@ -17,7 +17,7 @@ project(hello_world)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 include(FetchContent)
-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG f2209b911c74019e85d0b7a7a2833c9a2e1b7995)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG c5bebf84ad01edec97e336f5c97ca4e0df6b4d06)
 FetchContent_MakeAvailable(highway)
 FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)
 FetchContent_MakeAvailable(sentencepiece)

diff --git a/examples/hello_world/run.cc b/examples/hello_world/run.cc
@@ -58,9 +58,10 @@ int main(int argc, char** argv) {
   }
 
   // Instantiate model and KV Cache
-  gcpp::NestedPools pools = gcpp::CreatePools(app);
-  gcpp::Allocator::Init(pools.Topology());
-  gcpp::Gemma model = gcpp::CreateGemma(loader, pools);
+  gcpp::BoundedTopology topology(gcpp::CreateTopology(app));
+  gcpp::NestedPools pools = gcpp::CreatePools(topology, app);
+  gcpp::MatMulEnv env(topology, pools);
+  gcpp::Gemma model = gcpp::CreateGemma(loader, env);
   gcpp::KVCache kv_cache =
       gcpp::KVCache::Create(model.GetModelConfig(),
                             inference.prefill_tbatch_size);