
Commit 253603d

ckennelly authored and copybara-github committed
Deflake CpuCacheTest.Metadata.
If we are preempted during the allocation but returned to the same core, our CPU mask may be unchanged (!Tampered()), but we fail to refill in full and don't touch the slab. To avoid this, make multiple attempts and confirm that we were preempted by verifying the slab is in the uncached state on each failed attempt.

PiperOrigin-RevId: 703845482
Change-Id: I998a1015aa05d5a8ab1d7a468a9edb07e776d02b
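As a standalone illustration of the retry-and-verify pattern described above, here is a minimal, self-contained C++ sketch. RunWithRetries and its two callbacks are hypothetical stand-ins for the test's real steps (allocating through the per-CPU cache and checking that the slab is uncached); they are not tcmalloc or test APIs.

// Sketch of the deflaking pattern: retry a preemption-sensitive step a bounded
// number of times, and on each failed attempt require evidence of preemption
// before retrying.
#include <cstdio>
#include <functional>

bool RunWithRetries(int max_attempts,
                    const std::function<bool()>& racy_step,
                    const std::function<bool()>& was_preempted) {
  for (int attempt = 1; attempt <= max_attempts; ++attempt) {
    if (racy_step()) {
      return true;  // The step completed without interference.
    }
    if (!was_preempted()) {
      // The failure is not explained by preemption: surface it as a real bug
      // rather than silently retrying.
      std::fprintf(stderr, "attempt %d failed without preemption\n", attempt);
      return false;
    }
    // Preempted: clean up (omitted here) and try again with a fresh attempt.
  }
  return false;  // Preempted on every attempt; give up rather than loop forever.
}

int main() {
  int tries = 0;
  // Example: the step "fails" twice (simulating preemption), then succeeds.
  bool ok = RunWithRetries(
      /*max_attempts=*/3,
      /*racy_step=*/[&] { return ++tries == 3; },
      /*was_preempted=*/[] { return true; });
  std::printf("ok=%d after %d tries\n", ok, tries);
}

Bounding the number of attempts keeps a genuine failure visible: if the step keeps failing without evidence of preemption, the failure is reported instead of being retried forever.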
1 parent 1e53d3f commit 253603d

File tree

1 file changed: +163 −117 lines


tcmalloc/cpu_cache_test.cc

Lines changed: 163 additions & 117 deletions
@@ -375,135 +375,181 @@ TEST(CpuCacheTest, Metadata) {
 
   const int num_cpus = NumCPUs();
 
-  CpuCache cache;
-  cache.Activate();
-
-  cpu_cache_internal::SlabShiftBounds shift_bounds =
-      cache.GetPerCpuSlabShiftBounds();
-
-  PerCPUMetadataState r = cache.MetadataMemoryUsage();
-  size_t slabs_size = subtle::percpu::GetSlabsAllocSize(
-      subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus);
-  size_t resize_size = num_cpus * sizeof(bool);
-  size_t begins_size = kNumClasses * sizeof(std::atomic<uint16_t>);
-  EXPECT_EQ(r.virtual_size, slabs_size + resize_size + begins_size);
-  EXPECT_EQ(r.resident_size, 0);
-
-  auto count_cores = [&]() {
-    int populated_cores = 0;
-    for (int i = 0; i < num_cpus; i++) {
-      if (cache.HasPopulated(i)) {
-        populated_cores++;
+  const int kAttempts = 3;
+  for (int attempt = 1; attempt <= kAttempts; attempt++) {
+    SCOPED_TRACE(absl::StrCat("attempt=", attempt));
+
+    CpuCache cache;
+    cache.Activate();
+
+    cpu_cache_internal::SlabShiftBounds shift_bounds =
+        cache.GetPerCpuSlabShiftBounds();
+
+    PerCPUMetadataState r = cache.MetadataMemoryUsage();
+    size_t slabs_size = subtle::percpu::GetSlabsAllocSize(
+        subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus);
+    size_t resize_size = num_cpus * sizeof(bool);
+    size_t begins_size = kNumClasses * sizeof(std::atomic<uint16_t>);
+    EXPECT_EQ(r.virtual_size, slabs_size + resize_size + begins_size);
+    EXPECT_EQ(r.resident_size, 0);
+
+    auto count_cores = [&]() {
+      int populated_cores = 0;
+      for (int i = 0; i < num_cpus; i++) {
+        if (cache.HasPopulated(i)) {
+          populated_cores++;
+        }
+      }
+      return populated_cores;
+    };
+
+    EXPECT_EQ(0, count_cores());
+
+    int allowed_cpu_id;
+    const size_t kSizeClass = 2;
+    const size_t num_to_move =
+        cache.forwarder().num_objects_to_move(kSizeClass);
+
+    TransferCacheStats tc_stats =
+        cache.forwarder().transfer_cache().GetStats(kSizeClass);
+    EXPECT_EQ(tc_stats.remove_hits, 0);
+    EXPECT_EQ(tc_stats.remove_misses, 0);
+    EXPECT_EQ(tc_stats.remove_object_misses, 0);
+    EXPECT_EQ(tc_stats.insert_hits, 0);
+    EXPECT_EQ(tc_stats.insert_misses, 0);
+    EXPECT_EQ(tc_stats.insert_object_misses, 0);
+
+    void* ptr;
+    {
+      // Restrict this thread to a single core while allocating and processing
+      // the slow path.
+      //
+      // TODO(b/151313823): Without this restriction, we may access--for
+      // reading only--other slabs if we end up being migrated. These may cause
+      // huge pages to be faulted for those cores, leading to test flakiness.
+      tcmalloc_internal::ScopedAffinityMask mask(
+          tcmalloc_internal::AllowedCpus()[0]);
+      allowed_cpu_id = subtle::percpu::TcmallocTest::VirtualCpuSynchronize();
+
+      ptr = cache.Allocate(kSizeClass);
+
+      if (mask.Tampered() ||
+          allowed_cpu_id !=
+              subtle::percpu::TcmallocTest::VirtualCpuSynchronize()) {
+        return;
       }
     }
-    return populated_cores;
-  };
+    EXPECT_NE(ptr, nullptr);
+    EXPECT_EQ(1, count_cores());
+
+    // We don't care if the transfer cache hit or missed, but the CPU cache
+    // should have done the operation.
+    tc_stats = cache.forwarder().transfer_cache().GetStats(kSizeClass);
+    if ((tc_stats.remove_object_misses != num_to_move ||
+         tc_stats.insert_hits + tc_stats.insert_misses != 0) &&
+        attempt < kAttempts) {
+      // The operation didn't occur as expected, likely because we were
+      // preempted but returned to the same core (otherwise Tampered would have
+      // fired).
+      //
+      // The MSB of tcmalloc_slabs should be cleared to indicate we were
+      // preempted. As of December 2024, Refill and its callees do not invoke
+      // CacheCpuSlab. This check can spuriously pass if we're preempted
+      // between the end of Allocate and now, rather than within Allocate, but
+      // it ensures we do not silently break.
+      EXPECT_EQ(subtle::percpu::tcmalloc_slabs & TCMALLOC_CACHED_SLABS_MASK, 0);
+
+      cache.Deallocate(ptr, kSizeClass);
+      cache.Deactivate();
 
-  EXPECT_EQ(0, count_cores());
+      continue;
+    }
 
-  int allowed_cpu_id;
-  const size_t kSizeClass = 2;
-  const size_t num_to_move = cache.forwarder().num_objects_to_move(kSizeClass);
-  void* ptr;
-  {
-    // Restrict this thread to a single core while allocating and processing the
-    // slow path.
-    //
-    // TODO(b/151313823): Without this restriction, we may access--for reading
-    // only--other slabs if we end up being migrated. These may cause huge
-    // pages to be faulted for those cores, leading to test flakiness.
-    tcmalloc_internal::ScopedAffinityMask mask(
-        tcmalloc_internal::AllowedCpus()[0]);
-    allowed_cpu_id = subtle::percpu::TcmallocTest::VirtualCpuSynchronize();
+    EXPECT_EQ(tc_stats.remove_hits + tc_stats.remove_misses, 1);
+    EXPECT_EQ(tc_stats.remove_object_misses, num_to_move);
+    EXPECT_EQ(tc_stats.insert_hits, 0);
+    EXPECT_EQ(tc_stats.insert_misses, 0);
+    EXPECT_EQ(tc_stats.insert_object_misses, 0);
+
+    r = cache.MetadataMemoryUsage();
+    EXPECT_EQ(
+        r.virtual_size,
+        resize_size + begins_size +
+            subtle::percpu::GetSlabsAllocSize(
+                subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus));
+
+    // We expect to fault in a single core, but we may end up faulting an
+    // entire hugepage worth of memory when we touch that core and another when
+    // touching the header.
+    const size_t core_slab_size = r.virtual_size / num_cpus;
+    const size_t upper_bound =
+        ((core_slab_size + kHugePageSize - 1) & ~(kHugePageSize - 1)) +
+        kHugePageSize;
+
+    // A single core may be less than the full slab (core_slab_size), since we
+    // do not touch every page within the slab.
+    EXPECT_GT(r.resident_size, 0);
+    EXPECT_LE(r.resident_size, upper_bound)
+        << count_cores() << " " << core_slab_size << " " << kHugePageSize;
+
+    // This test is much more sensitive to implementation details of the per-CPU
+    // cache. It may need to be updated from time to time. These numbers were
+    // calculated by MADV_NOHUGEPAGE'ing the memory used for the slab and
+    // measuring the resident size.
+    switch (shift_bounds.max_shift) {
+      case 13:
+        EXPECT_GE(r.resident_size, 4096);
+        break;
+      case 19:
+        EXPECT_GE(r.resident_size, 8192);
+        break;
+      default:
+        ASSUME(false);
+        break;
+    }
 
-  ptr = cache.Allocate(kSizeClass);
+    // Read stats from the CPU caches. This should not impact resident_size.
+    const size_t max_cpu_cache_size = Parameters::max_per_cpu_cache_size();
+    size_t total_used_bytes = 0;
+    for (int cpu = 0; cpu < num_cpus; ++cpu) {
+      size_t used_bytes = cache.UsedBytes(cpu);
+      total_used_bytes += used_bytes;
+
+      if (cpu == allowed_cpu_id) {
+        EXPECT_GT(used_bytes, 0);
+        EXPECT_TRUE(cache.HasPopulated(cpu));
+      } else {
+        EXPECT_EQ(used_bytes, 0);
+        EXPECT_FALSE(cache.HasPopulated(cpu));
+      }
 
-    if (mask.Tampered() ||
-        allowed_cpu_id !=
-            subtle::percpu::TcmallocTest::VirtualCpuSynchronize()) {
-      return;
+      EXPECT_LE(cache.Unallocated(cpu), max_cpu_cache_size);
+      EXPECT_EQ(cache.Capacity(cpu), max_cpu_cache_size);
+      EXPECT_EQ(cache.Allocated(cpu) + cache.Unallocated(cpu),
+                cache.Capacity(cpu));
     }
-  }
-  EXPECT_NE(ptr, nullptr);
-  EXPECT_EQ(1, count_cores());
-
-  r = cache.MetadataMemoryUsage();
-  EXPECT_EQ(
-      r.virtual_size,
-      resize_size + begins_size +
-          subtle::percpu::GetSlabsAllocSize(
-              subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus));
-
-  // We expect to fault in a single core, but we may end up faulting an
-  // entire hugepage worth of memory when we touch that core and another when
-  // touching the header.
-  const size_t core_slab_size = r.virtual_size / num_cpus;
-  const size_t upper_bound =
-      ((core_slab_size + kHugePageSize - 1) & ~(kHugePageSize - 1)) +
-      kHugePageSize;
-
-  // A single core may be less than the full slab (core_slab_size), since we
-  // do not touch every page within the slab.
-  EXPECT_GT(r.resident_size, 0);
-  EXPECT_LE(r.resident_size, upper_bound)
-      << count_cores() << " " << core_slab_size << " " << kHugePageSize;
-
-  // This test is much more sensitive to implementation details of the per-CPU
-  // cache. It may need to be updated from time to time. These numbers were
-  // calculated by MADV_NOHUGEPAGE'ing the memory used for the slab and
-  // measuring the resident size.
-  switch (shift_bounds.max_shift) {
-    case 13:
-      EXPECT_GE(r.resident_size, 4096);
-      break;
-    case 19:
-      EXPECT_GE(r.resident_size, 8192);
-      break;
-    default:
-      ASSUME(false);
-      break;
-  }
 
-  // Read stats from the CPU caches. This should not impact resident_size.
-  const size_t max_cpu_cache_size = Parameters::max_per_cpu_cache_size();
-  size_t total_used_bytes = 0;
-  for (int cpu = 0; cpu < num_cpus; ++cpu) {
-    size_t used_bytes = cache.UsedBytes(cpu);
-    total_used_bytes += used_bytes;
-
-    if (cpu == allowed_cpu_id) {
-      EXPECT_GT(used_bytes, 0);
-      EXPECT_TRUE(cache.HasPopulated(cpu));
-    } else {
-      EXPECT_EQ(used_bytes, 0);
-      EXPECT_FALSE(cache.HasPopulated(cpu));
+    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
+      // This is sensitive to the current growth policies of CpuCache. It may
+      // require updating from time-to-time.
+      EXPECT_EQ(cache.TotalObjectsOfClass(size_class),
+                (size_class == kSizeClass ? num_to_move - 1 : 0))
+          << size_class;
     }
+    EXPECT_EQ(cache.TotalUsedBytes(), total_used_bytes);
 
-    EXPECT_LE(cache.Unallocated(cpu), max_cpu_cache_size);
-    EXPECT_EQ(cache.Capacity(cpu), max_cpu_cache_size);
-    EXPECT_EQ(cache.Allocated(cpu) + cache.Unallocated(cpu),
-              cache.Capacity(cpu));
-  }
+    PerCPUMetadataState post_stats = cache.MetadataMemoryUsage();
+    // Confirm stats are within expected bounds.
+    EXPECT_GT(post_stats.resident_size, 0);
+    EXPECT_LE(post_stats.resident_size, upper_bound) << count_cores();
+    // Confirm stats are unchanged.
+    EXPECT_EQ(r.resident_size, post_stats.resident_size);
 
-  for (int size_class = 1; size_class < kNumClasses; ++size_class) {
-    // This is sensitive to the current growth policies of CpuCache. It may
-    // require updating from time-to-time.
-    EXPECT_EQ(cache.TotalObjectsOfClass(size_class),
-              (size_class == kSizeClass ? num_to_move - 1 : 0))
-        << size_class;
+    // Tear down.
+    cache.Deallocate(ptr, kSizeClass);
+    cache.Deactivate();
+    break;
   }
-  EXPECT_EQ(cache.TotalUsedBytes(), total_used_bytes);
-
-  PerCPUMetadataState post_stats = cache.MetadataMemoryUsage();
-  // Confirm stats are within expected bounds.
-  EXPECT_GT(post_stats.resident_size, 0);
-  EXPECT_LE(post_stats.resident_size, upper_bound) << count_cores();
-  // Confirm stats are unchanged.
-  EXPECT_EQ(r.resident_size, post_stats.resident_size);
-
-  // Tear down.
-  cache.Deallocate(ptr, kSizeClass);
-  cache.Deactivate();
 }
 
 TEST(CpuCacheTest, CacheMissStats) {
