Rolling back CacheTopology refactor.
PiperOrigin-RevId: 565124875
Change-Id: I582ba68e2fb0dcd2d692af4bb6946f186ab7a436
v-gogte authored and copybara-github committed Sep 13, 2023
1 parent 53116a3 commit f2a7d62
Showing 11 changed files with 71 additions and 51 deletions.
2 changes: 1 addition & 1 deletion tcmalloc/deallocation_profiler.cc
@@ -151,7 +151,7 @@ int ComputeIndex(CpuThreadMatchingStatus status, RpcMatchingStatus rpc_status) {

 int GetL3Id(int cpu_id) {
   return cpu_id >= 0
-             ? tcmalloc_internal::CacheTopology::Instance().GetL3FromCpuId(
+             ? tcmalloc_internal::Static::cache_topology().GetL3FromCpuId(
                    cpu_id)
              : -1;
 }
1 change: 0 additions & 1 deletion tcmalloc/internal/BUILD
@@ -391,7 +391,6 @@ cc_library(
     deps = [
         ":config",
         ":logging",
-        ":sysinfo",
         ":util",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
20 changes: 10 additions & 10 deletions tcmalloc/internal/cache_topology.cc
@@ -21,7 +21,6 @@
 #include "absl/strings/string_view.h"
 #include "tcmalloc/internal/config.h"
 #include "tcmalloc/internal/logging.h"
-#include "tcmalloc/internal/sysinfo.h"
 #include "tcmalloc/internal/util.h"

 GOOGLE_MALLOC_SECTION_BEGIN
@@ -50,9 +49,11 @@ int BuildCpuToL3CacheMap_FindFirstNumberInBuf(absl::string_view current) {
   return first_cpu;
 }

-void CacheTopology::Init() {
-  cpu_count_ = NumCPUs();
-  for (int cpu = 0; cpu < cpu_count_; ++cpu) {
+int BuildCpuToL3CacheMap(uint8_t l3_cache_index[CPU_SETSIZE]) {
+  int index = 0;
+  // Set to a sane value.
+  memset(l3_cache_index, 0, CPU_SETSIZE);
+  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
     const int fd = OpenSysfsCacheList(cpu);
     if (fd == -1) {
       // At some point we reach the number of CPU on the system, and
@@ -64,11 +65,9 @@ void CacheTopology::Init() {
       // TODO(b/210049384): find a better replacement for shared_cpu_list in
       // this case, e.g. based on numa nodes.
 #ifdef __aarch64__
-      if (l3_count_ == 0) {
-        l3_count_ = 1;
-      }
+      if (index == 0) return 1;
 #endif
-      return;
+      return index;
     }
     // The file contains something like:
     //   0-11,22-33
@@ -84,11 +83,12 @@ void CacheTopology::Init() {
     CHECK_CONDITION(first_cpu < CPU_SETSIZE);
     CHECK_CONDITION(first_cpu <= cpu);
     if (cpu == first_cpu) {
-      l3_cache_index_[cpu] = l3_count_++;
+      l3_cache_index[cpu] = index++;
     } else {
-      l3_cache_index_[cpu] = l3_cache_index_[first_cpu];
+      l3_cache_index[cpu] = l3_cache_index[first_cpu];
     }
   }
+  return index;
 }

 }  // namespace tcmalloc_internal
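To make the rolled-back mapping concrete, here is a minimal standalone sketch (not part of the commit) of what BuildCpuToL3CacheMap computes on a hypothetical 24-CPU machine whose per-CPU shared_cpu_list reads "0-11" for the first socket and "12-23" for the second; the first CPU named in each list opens a new cache index and the others inherit it:

#include <cstdint>
#include <cstdio>

// Standalone illustration, not tcmalloc code: emulates the index assignment
// of BuildCpuToL3CacheMap for an assumed two-L3 topology. The sysfs parse is
// replaced by a hard-coded "first CPU in the shared list" computation.
int main() {
  constexpr int kCpus = 24;
  uint8_t l3_cache_index[kCpus] = {0};
  int index = 0;
  for (int cpu = 0; cpu < kCpus; ++cpu) {
    const int first_cpu = cpu < 12 ? 0 : 12;  // stand-in for the sysfs read
    if (cpu == first_cpu) {
      l3_cache_index[cpu] = index++;  // first CPU of an L3 opens a new index
    } else {
      l3_cache_index[cpu] = l3_cache_index[first_cpu];  // others inherit it
    }
  }
  // Prints: caches=2 cpu0->0 cpu11->0 cpu12->1
  std::printf("caches=%d cpu0->%d cpu11->%d cpu12->%d\n", index,
              l3_cache_index[0], l3_cache_index[11], l3_cache_index[12]);
  return 0;
}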
29 changes: 14 additions & 15 deletions tcmalloc/internal/cache_topology.h
@@ -15,42 +15,41 @@
 #ifndef TCMALLOC_INTERNAL_CACHE_TOPOLOGY_H_
 #define TCMALLOC_INTERNAL_CACHE_TOPOLOGY_H_

-#include "absl/base/attributes.h"
 #include "tcmalloc/internal/config.h"
 #include "tcmalloc/internal/logging.h"
 #include "tcmalloc/internal/util.h"

 GOOGLE_MALLOC_SECTION_BEGIN
 namespace tcmalloc {
 namespace tcmalloc_internal {

+// Build a mapping from cpuid to the index of the L3 cache used by that cpu.
+// Returns the number of caches detected.
+int BuildCpuToL3CacheMap(uint8_t l3_cache_index[CPU_SETSIZE]);
+
+// Helper function exposed to permit testing it.
+int BuildCpuToL3CacheMap_FindFirstNumberInBuf(absl::string_view current);
+
 class CacheTopology {
  public:
-  static CacheTopology& Instance() {
-    ABSL_CONST_INIT static CacheTopology instance;
-    return instance;
-  }
-
   constexpr CacheTopology() = default;

-  void Init();
+  void Init() { shard_count_ = BuildCpuToL3CacheMap(l3_cache_index_); }

-  unsigned l3_count() const { return l3_count_; }
+  unsigned shard_count() const { return shard_count_; }

   unsigned GetL3FromCpuId(int cpu) const {
     ASSERT(cpu >= 0);
-    ASSERT(cpu < cpu_count_);
+    ASSERT(cpu < ABSL_ARRAYSIZE(l3_cache_index_));
     return l3_cache_index_[cpu];
   }

  private:
-  unsigned cpu_count_ = 0;
-  unsigned l3_count_ = 0;
-  uint8_t l3_cache_index_[CPU_SETSIZE] = {};
+  unsigned shard_count_ = 0;
+  // Mapping from cpu to the L3 cache used.
+  uint8_t l3_cache_index_[CPU_SETSIZE] = {0};
 };

-// Helper function exposed to permit testing it.
-int BuildCpuToL3CacheMap_FindFirstNumberInBuf(absl::string_view current);
-
 }  // namespace tcmalloc_internal
 }  // namespace tcmalloc
 GOOGLE_MALLOC_SECTION_END
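As a side note, the test-only helper kept in this header extracts the leading CPU number from a shared_cpu_list buffer; a hedged usage sketch against the example format quoted in cache_topology.cc ("0-11,22-33"), where the expected result is 0:

// Illustrative call only; the helper is exposed purely for testing.
#include "absl/strings/string_view.h"
#include "tcmalloc/internal/cache_topology.h"

namespace tcmalloc::tcmalloc_internal {
inline int FirstCpuOfExampleList() {
  return BuildCpuToL3CacheMap_FindFirstNumberInBuf("0-11,22-33");  // == 0
}
}  // namespace tcmalloc::tcmalloc_internal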
17 changes: 10 additions & 7 deletions tcmalloc/internal/cache_topology_test.cc
@@ -14,6 +14,9 @@

 #include "tcmalloc/internal/cache_topology.h"

+#include <sched.h>
+
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "tcmalloc/internal/sysinfo.h"

@@ -24,17 +27,17 @@ TEST(CacheTopology, ComputesSomethingReasonable) {
   // This test verifies that each L3 cache serves the same number of CPU. This
   // is not a strict requirement for the correct operation of this code, but a
   // sign of sanity.
-  CacheTopology topology;
-  topology.Init();
-  EXPECT_EQ(NumCPUs() % topology.l3_count(), 0);
-  ASSERT_GT(topology.l3_count(), 0);
+  uint8_t l3_cache_index[CPU_SETSIZE];
+  const int num_nodes = BuildCpuToL3CacheMap(l3_cache_index);
+  EXPECT_EQ(NumCPUs() % num_nodes, 0);
+  ASSERT_GT(num_nodes, 0);
   static const int kMaxNodes = 256 / 8;
   int count_per_node[kMaxNodes] = {0};
   for (int i = 0, n = NumCPUs(); i < n; ++i) {
-    count_per_node[topology.GetL3FromCpuId(i)]++;
+    count_per_node[l3_cache_index[i]]++;
   }
-  for (int i = 0; i < topology.l3_count(); ++i) {
-    EXPECT_EQ(count_per_node[i], NumCPUs() / topology.l3_count());
+  for (int i = 0; i < num_nodes; ++i) {
+    EXPECT_EQ(count_per_node[i], NumCPUs() / num_nodes);
   }
 }

1 change: 1 addition & 0 deletions tcmalloc/mock_transfer_cache.cc
@@ -24,5 +24,6 @@ ABSL_CONST_INIT bool FakeShardedTransferCacheManager::enable_generic_cache_(
 ABSL_CONST_INIT bool
     FakeShardedTransferCacheManager::enable_cache_for_large_classes_only_(
         false);
+ABSL_CONST_INIT int FakeCpuLayout::num_shards_(0);
 }  // namespace tcmalloc_internal
 }  // namespace tcmalloc
23 changes: 16 additions & 7 deletions tcmalloc/mock_transfer_cache.h
@@ -286,28 +286,37 @@ class TwoSizeClassManager : public FakeTransferCacheManager {

 class FakeCpuLayout {
  public:
-  static constexpr int kNumCpus = 6;
+  static constexpr int kNumCpus = 4;
   static constexpr int kCpusPerShard = 2;

+  FakeCpuLayout() : current_cpu_(0) {}
   void Init(int shards) {
     ASSERT(shards > 0);
-    ASSERT(shards * kCpusPerShard <= kNumCpus);
+    ASSERT(shards * kCpusPerShard <= CPU_SETSIZE);
     num_shards_ = shards;
   }

+  int CurrentCpu() { return current_cpu_; }
+
   void SetCurrentCpu(int cpu) {
     ASSERT(cpu >= 0);
     ASSERT(cpu < kNumCpus);
     current_cpu_ = cpu;
   }

-  unsigned NumShards() { return num_shards_; }
-  int CurrentCpu() { return current_cpu_; }
-  unsigned CpuShard(int cpu) { return cpu / kCpusPerShard; }
+  static int BuildCacheMap(uint8_t l3_cache_index[CPU_SETSIZE]) {
+    ASSERT(num_shards_ > 0);
+    ASSERT(num_shards_ * kCpusPerShard <= CPU_SETSIZE);
+    for (int cpu = 0; cpu < num_shards_ * kCpusPerShard; ++cpu) {
+      int shard = cpu / kCpusPerShard;
+      l3_cache_index[cpu] = shard;
+    }
+    return num_shards_;
+  }

  private:
-  int current_cpu_ = 0;
-  int num_shards_ = 0;
+  int current_cpu_;
+  static int num_shards_;
 };

 // Defines transfer cache manager for testing legacy transfer cache.
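A hedged sketch of how a test might exercise the restored mock (the test name and assertions are illustrative, not from this commit); it depends on the static num_shards_ definition added in mock_transfer_cache.cc above:

#include <sched.h>  // CPU_SETSIZE

#include <cstdint>

#include "gtest/gtest.h"
#include "tcmalloc/mock_transfer_cache.h"

namespace tcmalloc::tcmalloc_internal {

// With kCpusPerShard == 2, Init(2) should yield CPUs {0,1} -> shard 0 and
// CPUs {2,3} -> shard 1, and BuildCacheMap() should report two shards.
TEST(FakeCpuLayout, BuildCacheMapMatchesCpuShards) {
  FakeCpuLayout layout;
  layout.Init(/*shards=*/2);

  uint8_t l3_cache_index[CPU_SETSIZE] = {0};
  EXPECT_EQ(FakeCpuLayout::BuildCacheMap(l3_cache_index), 2);
  EXPECT_EQ(l3_cache_index[0], 0);
  EXPECT_EQ(l3_cache_index[1], 0);
  EXPECT_EQ(l3_cache_index[2], 1);
  EXPECT_EQ(l3_cache_index[3], 1);
}

}  // namespace tcmalloc::tcmalloc_internal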
1 change: 0 additions & 1 deletion tcmalloc/segv_handler.cc
@@ -17,7 +17,6 @@
 #include <fcntl.h>
 #include <unistd.h>

-#include "absl/base/internal/sysinfo.h"
 #include "absl/debugging/stacktrace.h"
 #include "tcmalloc/guarded_allocations.h"
 #include "tcmalloc/internal/environment.h"
5 changes: 3 additions & 2 deletions tcmalloc/static_vars.cc
@@ -79,6 +79,7 @@ ABSL_CONST_INIT GuardedPageAllocator Static::guardedpage_allocator_;
 ABSL_CONST_INIT StackTraceFilter Static::stacktrace_filter_;
 ABSL_CONST_INIT NumaTopology<kNumaPartitions, kNumBaseClasses>
     Static::numa_topology_;
+ABSL_CONST_INIT CacheTopology Static::cache_topology_;
 // LINT.ThenChange(:static_vars_size)

 ABSL_CONST_INIT Static tc_globals;
@@ -104,7 +105,7 @@ size_t Static::metadata_bytes() {
          sizeof(allocation_samples) + sizeof(deallocation_samples) +
          sizeof(sampled_alloc_handle_generator) + sizeof(peak_heap_tracker_) +
          sizeof(guardedpage_allocator_) + sizeof(stacktrace_filter_) +
-         sizeof(numa_topology_) + sizeof(CacheTopology::Instance());
+         sizeof(numa_topology_) + sizeof(cache_topology_);
   // LINT.ThenChange(:static_vars)

   const size_t allocated = arena().stats().bytes_allocated +
@@ -139,7 +140,7 @@ ABSL_ATTRIBUTE_COLD ABSL_ATTRIBUTE_NOINLINE void Static::SlowInitIfNecessary() {

   CHECK_CONDITION(sizemap_.Init(size_classes));
   numa_topology_.Init();
-  CacheTopology::Instance().Init();
+  cache_topology_.Init();
   sampledallocation_allocator_.Init(&arena_);
   sampled_allocation_recorder_.Construct(&sampledallocation_allocator_);
   sampled_allocation_recorder().Init();
3 changes: 3 additions & 0 deletions tcmalloc/static_vars.h
@@ -177,6 +177,8 @@ class Static final {
         static_cast<int>(subtle::percpu::IsFastNoInit());
   }

+  static CacheTopology& cache_topology() { return cache_topology_; }
+
   static size_t metadata_bytes() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

   // The root of the pagemap is potentially a large poorly utilized
@@ -197,6 +199,7 @@

   ABSL_CONST_INIT static Arena arena_;
   static SizeMap sizemap_;
+  ABSL_CONST_INIT static CacheTopology cache_topology_;
   TCMALLOC_ATTRIBUTE_NO_DESTROY ABSL_CONST_INIT static TransferCacheManager
       transfer_cache_;
   ABSL_CONST_INIT static ShardedTransferCacheManager sharded_transfer_cache_;
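Pulling the static_vars hunks together: Static now owns the CacheTopology instance, SlowInitIfNecessary() initializes it once, and callers reach it through the accessor, replacing the CacheTopology::Instance() singleton. A condensed sketch of the resulting call path (the helper name is hypothetical; its body mirrors GetL3Id in the deallocation_profiler.cc hunk above):

#include "tcmalloc/static_vars.h"

namespace tcmalloc::tcmalloc_internal {

// Hypothetical helper illustrating the accessor-based call path.
inline int GetL3IdSketch(int cpu_id) {
  return cpu_id >= 0 ? Static::cache_topology().GetL3FromCpuId(cpu_id) : -1;
}

}  // namespace tcmalloc::tcmalloc_internal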
20 changes: 13 additions & 7 deletions tcmalloc/transfer_cache.h
@@ -108,10 +108,9 @@ class ShardedStaticForwarder : public StaticForwarder {

 class ProdCpuLayout {
  public:
-  static unsigned NumShards() { return CacheTopology::Instance().l3_count(); }
   static int CurrentCpu() { return subtle::percpu::RseqCpuId(); }
-  static unsigned CpuShard(int cpu) {
-    return CacheTopology::Instance().GetL3FromCpuId(cpu);
+  static int BuildCacheMap(uint8_t l3_cache_index[CPU_SETSIZE]) {
+    return BuildCpuToL3CacheMap(l3_cache_index);
   }
 };

@@ -146,7 +145,9 @@ class ShardedTransferCacheManagerBase {

   void Init() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
     owner_->Init();
-    num_shards_ = cpu_layout_->NumShards();
+    // TODO(b/190711505): Shift to using state available in Static, rather than
+    // maintaining our own copy.
+    num_shards_ = CpuLayout::BuildCacheMap(l3_cache_index_);
     shards_ = reinterpret_cast<Shard *>(owner_->Alloc(
         sizeof(Shard) * num_shards_, std::align_val_t{ABSL_CACHELINE_SIZE}));
     ASSERT(shards_ != nullptr);
@@ -306,7 +307,7 @@

   int tc_length(int cpu, int size_class) const {
     if (shards_ == nullptr) return 0;
-    const uint8_t shard = cpu_layout_->CpuShard(cpu);
+    const uint8_t shard = l3_cache_index_[cpu];
     if (!shard_initialized(shard)) return 0;
     return shards_[shard].transfer_caches[size_class].tc_length();
   }
@@ -386,14 +387,19 @@
   // Returns the cache shard corresponding to the given size class and the
   // current cpu's L3 node. The cache will be initialized if required.
   TransferCache &get_cache(int size_class) {
-    const uint8_t shard_index =
-        cpu_layout_->CpuShard(cpu_layout_->CurrentCpu());
+    const int cpu = cpu_layout_->CurrentCpu();
+    ASSERT(cpu < ABSL_ARRAYSIZE(l3_cache_index_));
+    ASSERT(cpu >= 0);
+    const uint8_t shard_index = l3_cache_index_[cpu];
     ASSERT(shard_index < num_shards_);
     Shard &shard = shards_[shard_index];
     absl::call_once(shard.once_flag, [this, &shard]() { InitShard(shard); });
     return shard.transfer_caches[size_class];
   }

+  // Mapping from cpu to the L3 cache used.
+  uint8_t l3_cache_index_[CPU_SETSIZE] = {0};
+
   Shard *shards_ = nullptr;
   int num_shards_ = 0;
   std::atomic<int> active_shards_ = 0;
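For orientation, a standalone sketch (simplified, hypothetical names) of the lookup pattern the sharded manager uses after this rollback: Init() snapshots the cpu-to-shard map once through the layout's BuildCacheMap(), and later lookups index straight into that local copy instead of querying a singleton:

#include <sched.h>  // CPU_SETSIZE

#include <cstdint>

// CpuLayout is any type with a static BuildCacheMap(uint8_t[CPU_SETSIZE])
// that fills the map and returns the shard count, mirroring ProdCpuLayout
// and FakeCpuLayout above.
template <typename CpuLayout>
struct ShardPickerSketch {
  uint8_t l3_cache_index[CPU_SETSIZE] = {0};
  int num_shards = 0;

  // Mirrors ShardedTransferCacheManagerBase::Init(): build the map once.
  void Init() { num_shards = CpuLayout::BuildCacheMap(l3_cache_index); }

  // Mirrors get_cache()/tc_length(): a plain array read per lookup.
  int ShardForCpu(int cpu) const { return l3_cache_index[cpu]; }
};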
