Rolling back CacheTopology refactor.
PiperOrigin-RevId: 565124875
Change-Id: I582ba68e2fb0dcd2d692af4bb6946f186ab7a436
v-gogte authored and copybara-github committed Sep 13, 2023
1 parent 53116a3 commit f2a7d62
Showing 11 changed files with 71 additions and 51 deletions.
2 changes: 1 addition & 1 deletion tcmalloc/deallocation_profiler.cc
@@ -151,7 +151,7 @@ int ComputeIndex(CpuThreadMatchingStatus status, RpcMatchingStatus rpc_status) {

 int GetL3Id(int cpu_id) {
   return cpu_id >= 0
-             ? tcmalloc_internal::CacheTopology::Instance().GetL3FromCpuId(
+             ? tcmalloc_internal::Static::cache_topology().GetL3FromCpuId(
                    cpu_id)
              : -1;
 }
1 change: 0 additions & 1 deletion tcmalloc/internal/BUILD
@@ -391,7 +391,6 @@ cc_library(
     deps = [
         ":config",
         ":logging",
-        ":sysinfo",
         ":util",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
20 changes: 10 additions & 10 deletions tcmalloc/internal/cache_topology.cc
@@ -21,7 +21,6 @@
 #include "absl/strings/string_view.h"
 #include "tcmalloc/internal/config.h"
 #include "tcmalloc/internal/logging.h"
-#include "tcmalloc/internal/sysinfo.h"
 #include "tcmalloc/internal/util.h"

 GOOGLE_MALLOC_SECTION_BEGIN
@@ -50,9 +49,11 @@ int BuildCpuToL3CacheMap_FindFirstNumberInBuf(absl::string_view current) {
   return first_cpu;
 }

-void CacheTopology::Init() {
-  cpu_count_ = NumCPUs();
-  for (int cpu = 0; cpu < cpu_count_; ++cpu) {
+int BuildCpuToL3CacheMap(uint8_t l3_cache_index[CPU_SETSIZE]) {
+  int index = 0;
+  // Set to a sane value.
+  memset(l3_cache_index, 0, CPU_SETSIZE);
+  for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
     const int fd = OpenSysfsCacheList(cpu);
     if (fd == -1) {
       // At some point we reach the number of CPU on the system, and
@@ -64,11 +65,9 @@ void CacheTopology::Init() {
       // TODO(b/210049384): find a better replacement for shared_cpu_list in
       // this case, e.g. based on numa nodes.
 #ifdef __aarch64__
-      if (l3_count_ == 0) {
-        l3_count_ = 1;
-      }
+      if (index == 0) return 1;
 #endif
-      return;
+      return index;
     }
     // The file contains something like:
     //   0-11,22-33
@@ -84,11 +83,12 @@ void CacheTopology::Init() {
     CHECK_CONDITION(first_cpu < CPU_SETSIZE);
     CHECK_CONDITION(first_cpu <= cpu);
     if (cpu == first_cpu) {
-      l3_cache_index_[cpu] = l3_count_++;
+      l3_cache_index[cpu] = index++;
     } else {
-      l3_cache_index_[cpu] = l3_cache_index_[first_cpu];
+      l3_cache_index[cpu] = l3_cache_index[first_cpu];
     }
   }
+  return index;
 }

 }  // namespace tcmalloc_internal
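To make the rolled-back mapping concrete, here is a minimal standalone sketch (not part of the commit) of what BuildCpuToL3CacheMap computes on a hypothetical 24-CPU machine whose per-CPU shared_cpu_list reads "0-11" for the first socket and "12-23" for the second; the first CPU named in each list opens a new cache index and the others inherit it:

#include <cstdint>
#include <cstdio>

// Standalone illustration, not tcmalloc code: emulates the index assignment
// of BuildCpuToL3CacheMap for an assumed two-L3 topology. The sysfs parse is
// replaced by a hard-coded "first CPU in the shared list" computation.
int main() {
  constexpr int kCpus = 24;
  uint8_t l3_cache_index[kCpus] = {0};
  int index = 0;
  for (int cpu = 0; cpu < kCpus; ++cpu) {
    const int first_cpu = cpu < 12 ? 0 : 12;  // stand-in for the sysfs read
    if (cpu == first_cpu) {
      l3_cache_index[cpu] = index++;  // first CPU of an L3 opens a new index
    } else {
      l3_cache_index[cpu] = l3_cache_index[first_cpu];  // others inherit it
    }
  }
  // Prints: caches=2 cpu0->0 cpu11->0 cpu12->1
  std::printf("caches=%d cpu0->%d cpu11->%d cpu12->%d\n", index,
              l3_cache_index[0], l3_cache_index[11], l3_cache_index[12]);
  return 0;
}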
29 changes: 14 additions & 15 deletions tcmalloc/internal/cache_topology.h
@@ -15,42 +15,41 @@
 #ifndef TCMALLOC_INTERNAL_CACHE_TOPOLOGY_H_
 #define TCMALLOC_INTERNAL_CACHE_TOPOLOGY_H_

-#include "absl/base/attributes.h"
 #include "tcmalloc/internal/config.h"
 #include "tcmalloc/internal/logging.h"
 #include "tcmalloc/internal/util.h"

 GOOGLE_MALLOC_SECTION_BEGIN
 namespace tcmalloc {
 namespace tcmalloc_internal {

+// Build a mapping from cpuid to the index of the L3 cache used by that cpu.
+// Returns the number of caches detected.
+int BuildCpuToL3CacheMap(uint8_t l3_cache_index[CPU_SETSIZE]);
+
+// Helper function exposed to permit testing it.
+int BuildCpuToL3CacheMap_FindFirstNumberInBuf(absl::string_view current);
+
 class CacheTopology {
  public:
-  static CacheTopology& Instance() {
-    ABSL_CONST_INIT static CacheTopology instance;
-    return instance;
-  }
-
   constexpr CacheTopology() = default;

-  void Init();
+  void Init() { shard_count_ = BuildCpuToL3CacheMap(l3_cache_index_); }

-  unsigned l3_count() const { return l3_count_; }
+  unsigned shard_count() const { return shard_count_; }

   unsigned GetL3FromCpuId(int cpu) const {
     ASSERT(cpu >= 0);
-    ASSERT(cpu < cpu_count_);
+    ASSERT(cpu < ABSL_ARRAYSIZE(l3_cache_index_));
     return l3_cache_index_[cpu];
   }

  private:
-  unsigned cpu_count_ = 0;
-  unsigned l3_count_ = 0;
-  uint8_t l3_cache_index_[CPU_SETSIZE] = {};
+  unsigned shard_count_ = 0;
+  // Mapping from cpu to the L3 cache used.
+  uint8_t l3_cache_index_[CPU_SETSIZE] = {0};
 };

-// Helper function exposed to permit testing it.
-int BuildCpuToL3CacheMap_FindFirstNumberInBuf(absl::string_view current);
-
 }  // namespace tcmalloc_internal
 }  // namespace tcmalloc
 GOOGLE_MALLOC_SECTION_END
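As a side note, the test-only helper kept in this header extracts the leading CPU number from a shared_cpu_list buffer; a hedged usage sketch against the example format quoted in cache_topology.cc ("0-11,22-33"), where the expected result is 0:

// Illustrative call only; the helper is exposed purely for testing.
#include "absl/strings/string_view.h"
#include "tcmalloc/internal/cache_topology.h"

namespace tcmalloc::tcmalloc_internal {
inline int FirstCpuOfExampleList() {
  return BuildCpuToL3CacheMap_FindFirstNumberInBuf("0-11,22-33");  // == 0
}
}  // namespace tcmalloc::tcmalloc_internal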
17 changes: 10 additions & 7 deletions tcmalloc/internal/cache_topology_test.cc
@@ -14,6 +14,9 @@

 #include "tcmalloc/internal/cache_topology.h"

+#include <sched.h>
+
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "tcmalloc/internal/sysinfo.h"

@@ -24,17 +27,17 @@ TEST(CacheTopology, ComputesSomethingReasonable) {
   // This test verifies that each L3 cache serves the same number of CPU. This
   // is not a strict requirement for the correct operation of this code, but a
   // sign of sanity.
-  CacheTopology topology;
-  topology.Init();
-  EXPECT_EQ(NumCPUs() % topology.l3_count(), 0);
-  ASSERT_GT(topology.l3_count(), 0);
+  uint8_t l3_cache_index[CPU_SETSIZE];
+  const int num_nodes = BuildCpuToL3CacheMap(l3_cache_index);
+  EXPECT_EQ(NumCPUs() % num_nodes, 0);
+  ASSERT_GT(num_nodes, 0);
   static const int kMaxNodes = 256 / 8;
   int count_per_node[kMaxNodes] = {0};
   for (int i = 0, n = NumCPUs(); i < n; ++i) {
-    count_per_node[topology.GetL3FromCpuId(i)]++;
+    count_per_node[l3_cache_index[i]]++;
   }
-  for (int i = 0; i < topology.l3_count(); ++i) {
-    EXPECT_EQ(count_per_node[i], NumCPUs() / topology.l3_count());
+  for (int i = 0; i < num_nodes; ++i) {
+    EXPECT_EQ(count_per_node[i], NumCPUs() / num_nodes);
   }
 }

1 change: 1 addition & 0 deletions tcmalloc/mock_transfer_cache.cc
@@ -24,5 +24,6 @@ ABSL_CONST_INIT bool FakeShardedTransferCacheManager::enable_generic_cache_(
 ABSL_CONST_INIT bool
     FakeShardedTransferCacheManager::enable_cache_for_large_classes_only_(
         false);
+ABSL_CONST_INIT int FakeCpuLayout::num_shards_(0);
 }  // namespace tcmalloc_internal
 }  // namespace tcmalloc
23 changes: 16 additions & 7 deletions tcmalloc/mock_transfer_cache.h
@@ -286,28 +286,37 @@ class TwoSizeClassManager : public FakeTransferCacheManager {

 class FakeCpuLayout {
  public:
-  static constexpr int kNumCpus = 6;
+  static constexpr int kNumCpus = 4;
   static constexpr int kCpusPerShard = 2;

+  FakeCpuLayout() : current_cpu_(0) {}
   void Init(int shards) {
     ASSERT(shards > 0);
-    ASSERT(shards * kCpusPerShard <= kNumCpus);
+    ASSERT(shards * kCpusPerShard <= CPU_SETSIZE);
     num_shards_ = shards;
   }

+  int CurrentCpu() { return current_cpu_; }
+
   void SetCurrentCpu(int cpu) {
     ASSERT(cpu >= 0);
     ASSERT(cpu < kNumCpus);
     current_cpu_ = cpu;
   }

-  unsigned NumShards() { return num_shards_; }
-  int CurrentCpu() { return current_cpu_; }
-  unsigned CpuShard(int cpu) { return cpu / kCpusPerShard; }
+  static int BuildCacheMap(uint8_t l3_cache_index[CPU_SETSIZE]) {
+    ASSERT(num_shards_ > 0);
+    ASSERT(num_shards_ * kCpusPerShard <= CPU_SETSIZE);
+    for (int cpu = 0; cpu < num_shards_ * kCpusPerShard; ++cpu) {
+      int shard = cpu / kCpusPerShard;
+      l3_cache_index[cpu] = shard;
+    }
+    return num_shards_;
+  }

  private:
-  int current_cpu_ = 0;
-  int num_shards_ = 0;
+  int current_cpu_;
+  static int num_shards_;
 };

 // Defines transfer cache manager for testing legacy transfer cache.
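A hedged sketch of how a test might exercise the restored mock (the test name and assertions are illustrative, not from this commit); it depends on the static num_shards_ definition added in mock_transfer_cache.cc above:

#include <sched.h>  // CPU_SETSIZE

#include <cstdint>

#include "gtest/gtest.h"
#include "tcmalloc/mock_transfer_cache.h"

namespace tcmalloc::tcmalloc_internal {

// With kCpusPerShard == 2, Init(2) should yield CPUs {0,1} -> shard 0 and
// CPUs {2,3} -> shard 1, and BuildCacheMap() should report two shards.
TEST(FakeCpuLayout, BuildCacheMapMatchesCpuShards) {
  FakeCpuLayout layout;
  layout.Init(/*shards=*/2);

  uint8_t l3_cache_index[CPU_SETSIZE] = {0};
  EXPECT_EQ(FakeCpuLayout::BuildCacheMap(l3_cache_index), 2);
  EXPECT_EQ(l3_cache_index[0], 0);
  EXPECT_EQ(l3_cache_index[1], 0);
  EXPECT_EQ(l3_cache_index[2], 1);
  EXPECT_EQ(l3_cache_index[3], 1);
}

}  // namespace tcmalloc::tcmalloc_internal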
1 change: 0 additions & 1 deletion tcmalloc/segv_handler.cc
@@ -17,7 +17,6 @@
 #include <fcntl.h>
 #include <unistd.h>

-#include "absl/base/internal/sysinfo.h"
 #include "absl/debugging/stacktrace.h"
 #include "tcmalloc/guarded_allocations.h"
 #include "tcmalloc/internal/environment.h"
5 changes: 3 additions & 2 deletions tcmalloc/static_vars.cc
@@ -79,6 +79,7 @@ ABSL_CONST_INIT GuardedPageAllocator Static::guardedpage_allocator_;
 ABSL_CONST_INIT StackTraceFilter Static::stacktrace_filter_;
 ABSL_CONST_INIT NumaTopology<kNumaPartitions, kNumBaseClasses>
     Static::numa_topology_;
+ABSL_CONST_INIT CacheTopology Static::cache_topology_;
 // LINT.ThenChange(:static_vars_size)

 ABSL_CONST_INIT Static tc_globals;
@@ -104,7 +105,7 @@ size_t Static::metadata_bytes() {
          sizeof(allocation_samples) + sizeof(deallocation_samples) +
          sizeof(sampled_alloc_handle_generator) + sizeof(peak_heap_tracker_) +
          sizeof(guardedpage_allocator_) + sizeof(stacktrace_filter_) +
-         sizeof(numa_topology_) + sizeof(CacheTopology::Instance());
+         sizeof(numa_topology_) + sizeof(cache_topology_);
   // LINT.ThenChange(:static_vars)

   const size_t allocated = arena().stats().bytes_allocated +
@@ -139,7 +140,7 @@ ABSL_ATTRIBUTE_COLD ABSL_ATTRIBUTE_NOINLINE void Static::SlowInitIfNecessary() {

   CHECK_CONDITION(sizemap_.Init(size_classes));
   numa_topology_.Init();
-  CacheTopology::Instance().Init();
+  cache_topology_.Init();
   sampledallocation_allocator_.Init(&arena_);
   sampled_allocation_recorder_.Construct(&sampledallocation_allocator_);
   sampled_allocation_recorder().Init();
3 changes: 3 additions & 0 deletions tcmalloc/static_vars.h
@@ -177,6 +177,8 @@ class Static final {
         static_cast<int>(subtle::percpu::IsFastNoInit());
   }

+  static CacheTopology& cache_topology() { return cache_topology_; }
+
   static size_t metadata_bytes() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

   // The root of the pagemap is potentially a large poorly utilized
@@ -197,6 +199,7 @@

   ABSL_CONST_INIT static Arena arena_;
   static SizeMap sizemap_;
+  ABSL_CONST_INIT static CacheTopology cache_topology_;
   TCMALLOC_ATTRIBUTE_NO_DESTROY ABSL_CONST_INIT static TransferCacheManager
       transfer_cache_;
   ABSL_CONST_INIT static ShardedTransferCacheManager sharded_transfer_cache_;
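Pulling the static_vars hunks together: Static now owns the CacheTopology instance, SlowInitIfNecessary() initializes it once, and callers reach it through the accessor, replacing the CacheTopology::Instance() singleton. A condensed sketch of the resulting call path (the helper name is hypothetical; its body mirrors GetL3Id in the deallocation_profiler.cc hunk above):

#include "tcmalloc/static_vars.h"

namespace tcmalloc::tcmalloc_internal {

// Hypothetical helper illustrating the accessor-based call path.
inline int GetL3IdSketch(int cpu_id) {
  return cpu_id >= 0 ? Static::cache_topology().GetL3FromCpuId(cpu_id) : -1;
}

}  // namespace tcmalloc::tcmalloc_internal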
20 changes: 13 additions & 7 deletions tcmalloc/transfer_cache.h
@@ -108,10 +108,9 @@ class ShardedStaticForwarder : public StaticForwarder {

 class ProdCpuLayout {
  public:
-  static unsigned NumShards() { return CacheTopology::Instance().l3_count(); }
   static int CurrentCpu() { return subtle::percpu::RseqCpuId(); }
-  static unsigned CpuShard(int cpu) {
-    return CacheTopology::Instance().GetL3FromCpuId(cpu);
+  static int BuildCacheMap(uint8_t l3_cache_index[CPU_SETSIZE]) {
+    return BuildCpuToL3CacheMap(l3_cache_index);
   }
 };

@@ -146,7 +145,9 @@ class ShardedTransferCacheManagerBase {

   void Init() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
     owner_->Init();
-    num_shards_ = cpu_layout_->NumShards();
+    // TODO(b/190711505): Shift to using state available in Static, rather than
+    // maintaining our own copy.
+    num_shards_ = CpuLayout::BuildCacheMap(l3_cache_index_);
     shards_ = reinterpret_cast<Shard *>(owner_->Alloc(
         sizeof(Shard) * num_shards_, std::align_val_t{ABSL_CACHELINE_SIZE}));
     ASSERT(shards_ != nullptr);
@@ -306,7 +307,7 @@

   int tc_length(int cpu, int size_class) const {
     if (shards_ == nullptr) return 0;
-    const uint8_t shard = cpu_layout_->CpuShard(cpu);
+    const uint8_t shard = l3_cache_index_[cpu];
     if (!shard_initialized(shard)) return 0;
     return shards_[shard].transfer_caches[size_class].tc_length();
   }
@@ -386,14 +387,19 @@
   // Returns the cache shard corresponding to the given size class and the
   // current cpu's L3 node. The cache will be initialized if required.
   TransferCache &get_cache(int size_class) {
-    const uint8_t shard_index =
-        cpu_layout_->CpuShard(cpu_layout_->CurrentCpu());
+    const int cpu = cpu_layout_->CurrentCpu();
+    ASSERT(cpu < ABSL_ARRAYSIZE(l3_cache_index_));
+    ASSERT(cpu >= 0);
+    const uint8_t shard_index = l3_cache_index_[cpu];
     ASSERT(shard_index < num_shards_);
     Shard &shard = shards_[shard_index];
     absl::call_once(shard.once_flag, [this, &shard]() { InitShard(shard); });
     return shard.transfer_caches[size_class];
   }

+  // Mapping from cpu to the L3 cache used.
+  uint8_t l3_cache_index_[CPU_SETSIZE] = {0};
+
   Shard *shards_ = nullptr;
   int num_shards_ = 0;
   std::atomic<int> active_shards_ = 0;
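For orientation, a standalone sketch (simplified, hypothetical names) of the lookup pattern the sharded manager uses after this rollback: Init() snapshots the cpu-to-shard map once through the layout's BuildCacheMap(), and later lookups index straight into that local copy instead of querying a singleton:

#include <sched.h>  // CPU_SETSIZE

#include <cstdint>

// CpuLayout is any type with a static BuildCacheMap(uint8_t[CPU_SETSIZE])
// that fills the map and returns the shard count, mirroring ProdCpuLayout
// and FakeCpuLayout above.
template <typename CpuLayout>
struct ShardPickerSketch {
  uint8_t l3_cache_index[CPU_SETSIZE] = {0};
  int num_shards = 0;

  // Mirrors ShardedTransferCacheManagerBase::Init(): build the map once.
  void Init() { num_shards = CpuLayout::BuildCacheMap(l3_cache_index); }

  // Mirrors get_cache()/tc_length(): a plain array read per lookup.
  int ShardForCpu(int cpu) const { return l3_cache_index[cpu]; }
};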
