Skip to content

Commit

Permalink
tcmalloc: improve malloc fast path codegen
Browse files Browse the repository at this point in the history
Currently the malloc fast path contains the following instruction, which is
part of the Sampler address calculation and whose result is unused on the fast path:

  31241c:       64 48 8b 0c 25 00 00    mov    %fs:0x0,%rcx
  312423:       00 00

And then the slow path (sampling undo) is:

  31248f:       48 8d 89 ec fc ff ff    lea    -0x314(%rcx),%rcx
  312496:       48 01 41 28             add    %rax,0x28(%rcx)

With this change, the redundant instruction is removed from the fast path,
and the slow path becomes:

  312346:       64 48 01 04 25 14 fd    add    %rax,%fs:0xfffffffffffffd14
  31234d:       ff ff

So that's -2 instructions/-11 bytes.

The only way I found to achieve this is to introduce an alias of the
Sampler variable, such that the compiler is not aware that it is an alias.

name                   old cpu/op   new cpu/op   delta
BM_new_sized_delete/1  4.16ns ± 1%  3.80ns ± 0%  -8.56%  (p=0.016 n=5+4)
BM_new_sized_delete/8  4.18ns ± 2%  3.81ns ± 0%  -8.89%  (p=0.008 n=5+5)

name                   old INSTRUCTIONS/op     new INSTRUCTIONS/op     delta
BM_new_sized_delete/1    54.0 ± 0%               53.0 ± 0%  -1.85%          (p=0.029 n=4+4)
BM_new_sized_delete/8    54.0 ± 0%               53.0 ± 0%  -1.85%          (p=0.008 n=5+5)

PiperOrigin-RevId: 571310663
Change-Id: Ia99695a099eddbb8492cdc993057c011063728fb
  • Loading branch information
dvyukov authored and copybara-github committed Oct 6, 2023
1 parent e239ffa commit 60282bf
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 9 deletions.
5 changes: 5 additions & 0 deletions tcmalloc/internal/percpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ ABSL_CONST_INIT static std::atomic<bool> using_upstream_fence{false};
#endif // TCMALLOC_INTERNAL_PERCPU_USE_RSEQ

// Thread-local sampler storage, declared here as opaque bytes with C linkage.
// tcmalloc_sampler_alias is expected to name the same address as
// tcmalloc_sampler; when rseq is in use, InitPerCpu() CHECKs that the two
// addresses are equal.
extern "C" thread_local char tcmalloc_sampler ABSL_ATTRIBUTE_INITIAL_EXEC;
extern "C" thread_local char tcmalloc_sampler_alias ABSL_ATTRIBUTE_INITIAL_EXEC;

// Reports whether this thread's __rseq_abi struct is currently registered
// with the kernel, i.e. whether RseqCpuId() is at least kCpuIdInitialized.
static bool ThreadRegistered() {
  const bool registered = RseqCpuId() >= kCpuIdInitialized;
  return registered;
}
Expand Down Expand Up @@ -89,6 +90,8 @@ static void InitPerCpu() {
#if TCMALLOC_INTERNAL_PERCPU_USE_RSEQ
// See the comment about data layout in percpu.h for details.
auto sampler_addr = reinterpret_cast<uintptr_t>(&tcmalloc_sampler);
auto sampler_alias_addr =
reinterpret_cast<uintptr_t>(&tcmalloc_sampler_alias);
// We have to use volatile because the C++ compiler refuses to believe that
// objects can overlap.
volatile auto slabs_addr = reinterpret_cast<uintptr_t>(&tcmalloc_slabs);
Expand All @@ -106,6 +109,8 @@ static void InitPerCpu() {
CHECK_CONDITION((sampler_addr % TCMALLOC_SAMPLER_ALIGN) == 0);
// Ensure that tcmalloc_sampler is located before tcmalloc_slabs.
CHECK_CONDITION(sampler_addr + TCMALLOC_SAMPLER_SIZE <= slabs_addr);
// Ensure that sampler alias is actually an alias.
CHECK_CONDITION(sampler_addr == sampler_alias_addr);

constexpr int kMEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = (1 << 8);
// It is safe to make the syscall below multiple times.
Expand Down
4 changes: 4 additions & 0 deletions tcmalloc/internal/percpu_rseq_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,18 @@

// See the comment about data layout in percpu.h for details.
.type tcmalloc_sampler, @object
.type tcmalloc_sampler_alias, @object
.type tcmalloc_slabs, @object
.type __rseq_abi, @object
.section .tdata, "awT", @progbits, unique, 1
.globl tcmalloc_sampler
.globl tcmalloc_sampler_alias
.globl tcmalloc_slabs
.globl __rseq_abi
.p2align 6
.zero 64 + 32 - TCMALLOC_SAMPLER_SIZE - 8
tcmalloc_sampler:
tcmalloc_sampler_alias:
.zero TCMALLOC_SAMPLER_SIZE
// alignment padding (since tcmalloc_slabs is only 4-bytes aligned)
.zero 4
Expand All @@ -51,6 +54,7 @@ __rseq_abi:
.short 0xffff // vcpu_id (kCpuIdUninitialized)
.size __rseq_abi, 32
.size tcmalloc_sampler, TCMALLOC_SAMPLER_SIZE
.size tcmalloc_sampler_alias, TCMALLOC_SAMPLER_SIZE
.size tcmalloc_slabs, 8

#endif // TCMALLOC_PERCPU_RSEQ_SUPPORTED_PLATFORM
Expand Down
25 changes: 16 additions & 9 deletions tcmalloc/sampler.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,13 @@ class Sampler {
ssize_t GetGeometricVariable(ssize_t mean);
};

extern "C" ABSL_CONST_INIT thread_local Sampler tcmalloc_sampler_alias
ABSL_ATTRIBUTE_INITIAL_EXEC;
#ifdef __x86_64__
ABSL_CONST_INIT ABSL_ATTRIBUTE_WEAK thread_local Sampler tcmalloc_sampler_alias
ABSL_ATTRIBUTE_INITIAL_EXEC;
#endif

inline size_t Sampler::RecordAllocation(size_t k) {
// The first time we enter this function we expect bytes_until_sample_
// to be zero, and we must call SampleAllocationSlow() to ensure
Expand Down Expand Up @@ -234,7 +241,7 @@ Sampler::TryRecordAllocationFast(size_t k) {
// are permitted. And thus it makes sense to assert on that.
ASSERT(static_cast<ssize_t>(k) > 0);

#ifdef __aarch64__
#ifndef __x86_64__
// TODO(b/271483758): This produces a more efficient compare on ARM.
if (ABSL_PREDICT_FALSE(bytes_until_sample_ <= k)) {
#else
Expand All @@ -247,18 +254,18 @@ Sampler::TryRecordAllocationFast(size_t k) {
// to avoid non-tail calls in malloc fast-path. See also comments
// on declaration inside Sampler class.
//
// volatile is used here to improve compiler's choice of
// instructions. We know that this path is very rare and that there
// is no need to keep previous value of bytes_until_sample_ in
// register. This helps compiler generate slightly more efficient
// TODO(b/302050723): tcmalloc_sampler_alias is used here to improve
// compiler's choice of instructions. We know that this path is very rare
// and that there is no need to keep previous value of bytes_until_sample_
// in register. This helps compiler generate slightly more efficient
// sub <reg>, <mem> instruction for subtraction above.
#ifndef __aarch64__
volatile ssize_t* ptr = const_cast<volatile ssize_t*>(&bytes_until_sample_);
*ptr = *ptr + k;
#ifdef __x86_64__
ASSERT(this == &tcmalloc_sampler_alias);
tcmalloc_sampler_alias.bytes_until_sample_ += k;
#endif
return false;
}
#ifdef __aarch64__
#ifndef __x86_64__
bytes_until_sample_ -= static_cast<ssize_t>(k);
#endif
return true;
Expand Down

0 comments on commit 60282bf

Please sign in to comment.