
Commit 253603d

ckennelly authored and copybara-github committed
Deflake CpuCacheTest.Metadata.
If we are preempted during the allocation but returned to the same core, our CPU mask may be unchanged (!Tampered()), but we fail to refill in full and don't touch the slab. To avoid this, make multiple attempts and confirm that we were preempted by verifying the slab is in the uncached state on each failed attempt.

PiperOrigin-RevId: 703845482
Change-Id: I998a1015aa05d5a8ab1d7a468a9edb07e776d02b
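As a standalone illustration of the retry-and-verify pattern described above, here is a minimal, self-contained C++ sketch. RunWithRetries and its two callbacks are hypothetical stand-ins for the test's real steps (allocating through the per-CPU cache and checking that the slab is uncached); they are not tcmalloc or test APIs.

// Sketch of the deflaking pattern: retry a preemption-sensitive step a bounded
// number of times, and on each failed attempt require evidence of preemption
// before retrying.
#include <cstdio>
#include <functional>

bool RunWithRetries(int max_attempts,
                    const std::function<bool()>& racy_step,
                    const std::function<bool()>& was_preempted) {
  for (int attempt = 1; attempt <= max_attempts; ++attempt) {
    if (racy_step()) {
      return true;  // The step completed without interference.
    }
    if (!was_preempted()) {
      // The failure is not explained by preemption: surface it as a real bug
      // rather than silently retrying.
      std::fprintf(stderr, "attempt %d failed without preemption\n", attempt);
      return false;
    }
    // Preempted: clean up (omitted here) and try again with a fresh attempt.
  }
  return false;  // Preempted on every attempt; give up rather than loop forever.
}

int main() {
  int tries = 0;
  // Example: the step "fails" twice (simulating preemption), then succeeds.
  bool ok = RunWithRetries(
      /*max_attempts=*/3,
      /*racy_step=*/[&] { return ++tries == 3; },
      /*was_preempted=*/[] { return true; });
  std::printf("ok=%d after %d tries\n", ok, tries);
}

Bounding the number of attempts keeps a genuine failure visible: if the step keeps failing without evidence of preemption, the failure is reported instead of being retried forever.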
1 parent 1e53d3f commit 253603d

File tree

1 file changed: +163 −117 lines


tcmalloc/cpu_cache_test.cc

Lines changed: 163 additions & 117 deletions
@@ -375,135 +375,181 @@ TEST(CpuCacheTest, Metadata) {
 
   const int num_cpus = NumCPUs();
 
-  CpuCache cache;
-  cache.Activate();
-
-  cpu_cache_internal::SlabShiftBounds shift_bounds =
-      cache.GetPerCpuSlabShiftBounds();
-
-  PerCPUMetadataState r = cache.MetadataMemoryUsage();
-  size_t slabs_size = subtle::percpu::GetSlabsAllocSize(
-      subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus);
-  size_t resize_size = num_cpus * sizeof(bool);
-  size_t begins_size = kNumClasses * sizeof(std::atomic<uint16_t>);
-  EXPECT_EQ(r.virtual_size, slabs_size + resize_size + begins_size);
-  EXPECT_EQ(r.resident_size, 0);
-
-  auto count_cores = [&]() {
-    int populated_cores = 0;
-    for (int i = 0; i < num_cpus; i++) {
-      if (cache.HasPopulated(i)) {
-        populated_cores++;
+  const int kAttempts = 3;
+  for (int attempt = 1; attempt <= kAttempts; attempt++) {
+    SCOPED_TRACE(absl::StrCat("attempt=", attempt));
+
+    CpuCache cache;
+    cache.Activate();
+
+    cpu_cache_internal::SlabShiftBounds shift_bounds =
+        cache.GetPerCpuSlabShiftBounds();
+
+    PerCPUMetadataState r = cache.MetadataMemoryUsage();
+    size_t slabs_size = subtle::percpu::GetSlabsAllocSize(
+        subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus);
+    size_t resize_size = num_cpus * sizeof(bool);
+    size_t begins_size = kNumClasses * sizeof(std::atomic<uint16_t>);
+    EXPECT_EQ(r.virtual_size, slabs_size + resize_size + begins_size);
+    EXPECT_EQ(r.resident_size, 0);
+
+    auto count_cores = [&]() {
+      int populated_cores = 0;
+      for (int i = 0; i < num_cpus; i++) {
+        if (cache.HasPopulated(i)) {
+          populated_cores++;
+        }
+      }
+      return populated_cores;
+    };
+
+    EXPECT_EQ(0, count_cores());
+
+    int allowed_cpu_id;
+    const size_t kSizeClass = 2;
+    const size_t num_to_move =
+        cache.forwarder().num_objects_to_move(kSizeClass);
+
+    TransferCacheStats tc_stats =
+        cache.forwarder().transfer_cache().GetStats(kSizeClass);
+    EXPECT_EQ(tc_stats.remove_hits, 0);
+    EXPECT_EQ(tc_stats.remove_misses, 0);
+    EXPECT_EQ(tc_stats.remove_object_misses, 0);
+    EXPECT_EQ(tc_stats.insert_hits, 0);
+    EXPECT_EQ(tc_stats.insert_misses, 0);
+    EXPECT_EQ(tc_stats.insert_object_misses, 0);
+
+    void* ptr;
+    {
+      // Restrict this thread to a single core while allocating and processing
+      // the slow path.
+      //
+      // TODO(b/151313823): Without this restriction, we may access--for
+      // reading only--other slabs if we end up being migrated. These may cause
+      // huge pages to be faulted for those cores, leading to test flakiness.
+      tcmalloc_internal::ScopedAffinityMask mask(
+          tcmalloc_internal::AllowedCpus()[0]);
+      allowed_cpu_id = subtle::percpu::TcmallocTest::VirtualCpuSynchronize();
+
+      ptr = cache.Allocate(kSizeClass);
+
+      if (mask.Tampered() ||
+          allowed_cpu_id !=
+              subtle::percpu::TcmallocTest::VirtualCpuSynchronize()) {
+        return;
       }
     }
-    return populated_cores;
-  };
+    EXPECT_NE(ptr, nullptr);
+    EXPECT_EQ(1, count_cores());
+
+    // We don't care if the transfer cache hit or missed, but the CPU cache
+    // should have done the operation.
+    tc_stats = cache.forwarder().transfer_cache().GetStats(kSizeClass);
+    if ((tc_stats.remove_object_misses != num_to_move ||
+         tc_stats.insert_hits + tc_stats.insert_misses != 0) &&
+        attempt < kAttempts) {
+      // The operation didn't occur as expected, likely because we were
+      // preempted but returned to the same core (otherwise Tampered would have
+      // fired).
+      //
+      // The MSB of tcmalloc_slabs should be cleared to indicate we were
+      // preempted. As of December 2024, Refill and its callees do not invoke
+      // CacheCpuSlab. This check can spuriously pass if we're preempted
+      // between the end of Allocate and now, rather than within Allocate, but
+      // it ensures we do not silently break.
+      EXPECT_EQ(subtle::percpu::tcmalloc_slabs & TCMALLOC_CACHED_SLABS_MASK, 0);
+
+      cache.Deallocate(ptr, kSizeClass);
+      cache.Deactivate();
 
-  EXPECT_EQ(0, count_cores());
+      continue;
+    }
 
-  int allowed_cpu_id;
-  const size_t kSizeClass = 2;
-  const size_t num_to_move = cache.forwarder().num_objects_to_move(kSizeClass);
-  void* ptr;
-  {
-    // Restrict this thread to a single core while allocating and processing the
-    // slow path.
-    //
-    // TODO(b/151313823): Without this restriction, we may access--for reading
-    // only--other slabs if we end up being migrated. These may cause huge
-    // pages to be faulted for those cores, leading to test flakiness.
-    tcmalloc_internal::ScopedAffinityMask mask(
-        tcmalloc_internal::AllowedCpus()[0]);
-    allowed_cpu_id = subtle::percpu::TcmallocTest::VirtualCpuSynchronize();
+    EXPECT_EQ(tc_stats.remove_hits + tc_stats.remove_misses, 1);
+    EXPECT_EQ(tc_stats.remove_object_misses, num_to_move);
+    EXPECT_EQ(tc_stats.insert_hits, 0);
+    EXPECT_EQ(tc_stats.insert_misses, 0);
+    EXPECT_EQ(tc_stats.insert_object_misses, 0);
+
+    r = cache.MetadataMemoryUsage();
+    EXPECT_EQ(
+        r.virtual_size,
+        resize_size + begins_size +
+            subtle::percpu::GetSlabsAllocSize(
+                subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus));
+
+    // We expect to fault in a single core, but we may end up faulting an
+    // entire hugepage worth of memory when we touch that core and another when
+    // touching the header.
+    const size_t core_slab_size = r.virtual_size / num_cpus;
+    const size_t upper_bound =
+        ((core_slab_size + kHugePageSize - 1) & ~(kHugePageSize - 1)) +
+        kHugePageSize;
+
+    // A single core may be less than the full slab (core_slab_size), since we
+    // do not touch every page within the slab.
+    EXPECT_GT(r.resident_size, 0);
+    EXPECT_LE(r.resident_size, upper_bound)
+        << count_cores() << " " << core_slab_size << " " << kHugePageSize;
+
+    // This test is much more sensitive to implementation details of the per-CPU
+    // cache. It may need to be updated from time to time. These numbers were
+    // calculated by MADV_NOHUGEPAGE'ing the memory used for the slab and
+    // measuring the resident size.
+    switch (shift_bounds.max_shift) {
+      case 13:
+        EXPECT_GE(r.resident_size, 4096);
+        break;
+      case 19:
+        EXPECT_GE(r.resident_size, 8192);
+        break;
+      default:
+        ASSUME(false);
+        break;
+    }
 
-  ptr = cache.Allocate(kSizeClass);
+    // Read stats from the CPU caches. This should not impact resident_size.
+    const size_t max_cpu_cache_size = Parameters::max_per_cpu_cache_size();
+    size_t total_used_bytes = 0;
+    for (int cpu = 0; cpu < num_cpus; ++cpu) {
+      size_t used_bytes = cache.UsedBytes(cpu);
+      total_used_bytes += used_bytes;
+
+      if (cpu == allowed_cpu_id) {
+        EXPECT_GT(used_bytes, 0);
+        EXPECT_TRUE(cache.HasPopulated(cpu));
+      } else {
+        EXPECT_EQ(used_bytes, 0);
+        EXPECT_FALSE(cache.HasPopulated(cpu));
+      }
 
-    if (mask.Tampered() ||
-        allowed_cpu_id !=
-            subtle::percpu::TcmallocTest::VirtualCpuSynchronize()) {
-      return;
+      EXPECT_LE(cache.Unallocated(cpu), max_cpu_cache_size);
+      EXPECT_EQ(cache.Capacity(cpu), max_cpu_cache_size);
+      EXPECT_EQ(cache.Allocated(cpu) + cache.Unallocated(cpu),
+                cache.Capacity(cpu));
     }
-  }
-  EXPECT_NE(ptr, nullptr);
-  EXPECT_EQ(1, count_cores());
-
-  r = cache.MetadataMemoryUsage();
-  EXPECT_EQ(
-      r.virtual_size,
-      resize_size + begins_size +
-          subtle::percpu::GetSlabsAllocSize(
-              subtle::percpu::ToShiftType(shift_bounds.max_shift), num_cpus));
-
-  // We expect to fault in a single core, but we may end up faulting an
-  // entire hugepage worth of memory when we touch that core and another when
-  // touching the header.
-  const size_t core_slab_size = r.virtual_size / num_cpus;
-  const size_t upper_bound =
-      ((core_slab_size + kHugePageSize - 1) & ~(kHugePageSize - 1)) +
-      kHugePageSize;
-
-  // A single core may be less than the full slab (core_slab_size), since we
-  // do not touch every page within the slab.
-  EXPECT_GT(r.resident_size, 0);
-  EXPECT_LE(r.resident_size, upper_bound)
-      << count_cores() << " " << core_slab_size << " " << kHugePageSize;
-
-  // This test is much more sensitive to implementation details of the per-CPU
-  // cache. It may need to be updated from time to time. These numbers were
-  // calculated by MADV_NOHUGEPAGE'ing the memory used for the slab and
-  // measuring the resident size.
-  switch (shift_bounds.max_shift) {
-    case 13:
-      EXPECT_GE(r.resident_size, 4096);
-      break;
-    case 19:
-      EXPECT_GE(r.resident_size, 8192);
-      break;
-    default:
-      ASSUME(false);
-      break;
-  }
 
-  // Read stats from the CPU caches. This should not impact resident_size.
-  const size_t max_cpu_cache_size = Parameters::max_per_cpu_cache_size();
-  size_t total_used_bytes = 0;
-  for (int cpu = 0; cpu < num_cpus; ++cpu) {
-    size_t used_bytes = cache.UsedBytes(cpu);
-    total_used_bytes += used_bytes;
-
-    if (cpu == allowed_cpu_id) {
-      EXPECT_GT(used_bytes, 0);
-      EXPECT_TRUE(cache.HasPopulated(cpu));
-    } else {
-      EXPECT_EQ(used_bytes, 0);
-      EXPECT_FALSE(cache.HasPopulated(cpu));
+    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
+      // This is sensitive to the current growth policies of CpuCache. It may
+      // require updating from time-to-time.
+      EXPECT_EQ(cache.TotalObjectsOfClass(size_class),
+                (size_class == kSizeClass ? num_to_move - 1 : 0))
+          << size_class;
     }
+    EXPECT_EQ(cache.TotalUsedBytes(), total_used_bytes);
 
-    EXPECT_LE(cache.Unallocated(cpu), max_cpu_cache_size);
-    EXPECT_EQ(cache.Capacity(cpu), max_cpu_cache_size);
-    EXPECT_EQ(cache.Allocated(cpu) + cache.Unallocated(cpu),
-              cache.Capacity(cpu));
-  }
+    PerCPUMetadataState post_stats = cache.MetadataMemoryUsage();
+    // Confirm stats are within expected bounds.
+    EXPECT_GT(post_stats.resident_size, 0);
+    EXPECT_LE(post_stats.resident_size, upper_bound) << count_cores();
+    // Confirm stats are unchanged.
+    EXPECT_EQ(r.resident_size, post_stats.resident_size);
 
-  for (int size_class = 1; size_class < kNumClasses; ++size_class) {
-    // This is sensitive to the current growth policies of CpuCache. It may
-    // require updating from time-to-time.
-    EXPECT_EQ(cache.TotalObjectsOfClass(size_class),
-              (size_class == kSizeClass ? num_to_move - 1 : 0))
-        << size_class;
+    // Tear down.
+    cache.Deallocate(ptr, kSizeClass);
+    cache.Deactivate();
+    break;
   }
-  EXPECT_EQ(cache.TotalUsedBytes(), total_used_bytes);
-
-  PerCPUMetadataState post_stats = cache.MetadataMemoryUsage();
-  // Confirm stats are within expected bounds.
-  EXPECT_GT(post_stats.resident_size, 0);
-  EXPECT_LE(post_stats.resident_size, upper_bound) << count_cores();
-  // Confirm stats are unchanged.
-  EXPECT_EQ(r.resident_size, post_stats.resident_size);
-
-  // Tear down.
-  cache.Deallocate(ptr, kSizeClass);
-  cache.Deactivate();
 }
 
 TEST(CpuCacheTest, CacheMissStats) {
