Skip to content

Commit 25d02bb

Browse files
committed
[x86-64] Implement TLSGD to TLSIE relaxation
If we know that the .so file we are creating will never be dlopen'ed (i.e. it is linked with `-z nodlopen`), we can relax calls to __tls_get_addr into loads from the GOT.
1 parent 983fe0f commit 25d02bb

File tree

8 files changed

+128
-24
lines changed

8 files changed

+128
-24
lines changed

docs/mold.1

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,6 +1198,8 @@ Mark DSO non-deletable at runtime.
11981198
.It Fl z Cm nodlopen
11991199
Mark DSO not available to
12001200
.Xr dlopen 3 .
1201+
This option makes it possible for the linker to optimize thread-local \
1202+
variable accesses by rewriting instructions for some targets.
12011203
.Pp
12021204
.It Fl z Cm nodump
12031205
Mark DSO not available to

elf/arch-i386.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
140140
0xcc, // (padding)
141141
};
142142
memcpy(buf, insn, sizeof(insn));
143+
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
143144
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
144145
} else {
145146
static const u8 insn[] = {
@@ -149,10 +150,9 @@ void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
149150
0xcc, // (padding)
150151
};
151152
memcpy(buf, insn, sizeof(insn));
153+
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
152154
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx);
153155
}
154-
155-
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
156156
}
157157

158158
template <>
@@ -528,7 +528,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
528528
ty != R_386_GOT32 && ty != R_386_GOT32X)
529529
Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
530530

531-
if (relax_tlsgd(ctx, sym))
531+
if (ctx.arg.relax && !ctx.arg.shared && !sym.is_imported)
532532
i++;
533533
else
534534
sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
@@ -542,7 +542,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
542542
ty != R_386_GOT32 && ty != R_386_GOT32X)
543543
Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
544544

545-
if (relax_tlsld(ctx))
545+
if (ctx.arg.relax && !ctx.arg.shared)
546546
i++;
547547
else
548548
ctx.needs_tlsld.store(true, std::memory_order_relaxed);

elf/arch-s390x.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -456,12 +456,13 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
456456
break;
457457
case R_390_TLS_GD32:
458458
case R_390_TLS_GD64:
459-
if (!relax_tlsgd(ctx, sym))
459+
if (bool do_relax = ctx.arg.relax && !ctx.arg.shared && !sym.is_imported;
460+
!do_relax)
460461
sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
461462
break;
462463
case R_390_TLS_LDM32:
463464
case R_390_TLS_LDM64:
464-
if (!relax_tlsld(ctx))
465+
if (bool do_relax = ctx.arg.relax && !ctx.arg.shared; !do_relax)
465466
ctx.needs_tlsld.store(true, std::memory_order_relaxed);
466467
break;
467468
case R_390_TLS_LE32:

elf/arch-x86-64.cc

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,37 @@ static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
234234
}
235235
}
236236

237+
// Rewrite a General-Dynamic TLS access (a TLSGD relocation followed by a
// relocation for the call to __tls_get_addr) into this sequence:
//
//   mov %fs:0, %rax
//   add foo@gottpoff(%rip), %rax
//
// `loc` is the location handed over by apply_reloc_alloc for the TLSGD
// relocation, `rel` is the relocation that follows it (its type selects
// which original call sequence is being overwritten), and `val` is what
// the caller computes as `sym.get_gottp_addr(ctx) - P`.
// NOTE(review): the offset math below assumes P is the address of `loc`
// -- confirm against apply_reloc_alloc.
static void relax_gd_to_ie(u8 *loc, ElfRel<E> rel, u64 val) {
238+
switch (rel.r_type) {
239+
case R_X86_64_PLT32:
240+
case R_X86_64_PC32:
241+
case R_X86_64_GOTPCREL:
242+
case R_X86_64_GOTPCRELX: {
243+
static const u8 insn[] = {
244+
0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
245+
0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax
246+
};
247+
// The replacement code begins 4 bytes before `loc`.
memcpy(loc - 4, insn, sizeof(insn));
248+
// The add's 32-bit displacement sits at insn[12], i.e. loc + 8, and the
// instruction ends at loc + 12, hence the -12 adjustment of `val`.
*(ul32 *)(loc + 8) = val - 12;
249+
break;
250+
}
251+
case R_X86_64_PLTOFF64: {
252+
static const u8 insn[] = {
253+
0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
254+
0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax
255+
0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop
256+
};
257+
// Here the replacement begins 3 bytes before `loc` and is followed by a
// 6-byte nop.
memcpy(loc - 3, insn, sizeof(insn));
258+
// Displacement is at insn[12], i.e. loc + 9; the add ends at loc + 13.
*(ul32 *)(loc + 9) = val - 13;
259+
break;
260+
}
261+
default:
262+
// Any other r_type for the following relocation is treated as impossible
// here.
unreachable();
263+
}
264+
}
265+
237266
// Rewrite a function call to __tls_get_addr to a cheaper instruction
238-
// sequence. The difference from relax_ld_to_le is that we are
267+
// sequence. The difference from relax_gd_to_le is that we are
239268
// materializing a Dynamic Thread Pointer for the current ELF module
240269
// instead of an address for a particular thread-local variable.
241270
static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
@@ -416,6 +445,9 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
416445
case R_X86_64_TLSGD:
417446
if (sym.has_tlsgd(ctx)) {
418447
write32s(sym.get_tlsgd_addr(ctx) + A - P);
448+
} else if (sym.has_gottp(ctx)) {
449+
relax_gd_to_ie(loc, rels[i + 1], sym.get_gottp_addr(ctx) - P);
450+
i++;
419451
} else {
420452
relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
421453
i++;
@@ -662,7 +694,7 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
662694
if (sym.is_imported)
663695
sym.flags.fetch_or(NEEDS_PLT, std::memory_order_relaxed);
664696
break;
665-
case R_X86_64_TLSGD: {
697+
case R_X86_64_TLSGD:
666698
if (rel.r_addend != -4)
667699
Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSGD";
668700

@@ -675,13 +707,17 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
675707
ty != R_X86_64_GOTPCRELX)
676708
Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL";
677709

678-
if (relax_tlsgd(ctx, sym))
710+
if (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared) {
679711
i++;
680-
else
712+
} else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
713+
!ctx.arg.z_dlopen) {
714+
sym.flags.fetch_or(NEEDS_GOTTP, std::memory_order_relaxed);
715+
i++;
716+
} else {
681717
sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed);
718+
}
682719
break;
683-
}
684-
case R_X86_64_TLSLD: {
720+
case R_X86_64_TLSLD:
685721
if (rel.r_addend != -4)
686722
Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSLD";
687723

@@ -694,12 +730,11 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
694730
ty != R_X86_64_GOTPCRELX)
695731
Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL";
696732

697-
if (relax_tlsld(ctx))
733+
if (ctx.arg.relax && !ctx.arg.shared)
698734
i++;
699735
else
700736
ctx.needs_tlsld.store(true, std::memory_order_relaxed);
701737
break;
702-
}
703738
case R_X86_64_GOTTPOFF: {
704739
if (rel.r_addend != -4)
705740
Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTTPOFF";

elf/mold.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2826,16 +2826,6 @@ inline bool is_c_identifier(std::string_view s) {
28262826
return true;
28272827
}
28282828

2829-
template <typename E>
2830-
inline bool relax_tlsgd(Context<E> &ctx, Symbol<E> &sym) {
2831-
return ctx.arg.relax && !ctx.arg.shared && !sym.is_imported;
2832-
}
2833-
2834-
template <typename E>
2835-
inline bool relax_tlsld(Context<E> &ctx) {
2836-
return ctx.arg.relax && !ctx.arg.shared;
2837-
}
2838-
28392829
template <typename E>
28402830
inline bool relax_tlsdesc(Context<E> &ctx, Symbol<E> &sym) {
28412831
// TLSDESC relocs must be always relaxed for statically-linked

test/elf/tls-gd-to-ie.sh

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
# Builds a shared object whose code reads three thread-local variables
# declared with the global-dynamic TLS model, links it four different ways
# (default, -no-relax, -z nodlopen, and -z nodlopen plus -no-relax) and
# checks that every resulting executable prints "1 2 3".
2+
. $(dirname $0)/common.inc
3+
4+
cat <<EOF | $GCC -fPIC -c -o $t/a.o -xc -
5+
#include <stdio.h>
6+
7+
__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1;
8+
__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2;
9+
__attribute__((tls_model("global-dynamic"))) _Thread_local int x3;
10+
11+
int foo() {
12+
x3 = 3;
13+
14+
printf("%d %d %d\n", x1, x2, x3);
15+
return 0;
16+
}
17+
EOF
18+
19+
cat <<EOF | $CC -fPIC -c -o $t/b.o -xc -
20+
int foo();
21+
int main() { foo(); }
22+
EOF
23+
24+
# Case 1: shared object linked with default options.
$CC -B. -shared -o $t/c.so $t/a.o
25+
$CC -B. -o $t/exe1 $t/b.o $t/c.so
26+
$QEMU $t/exe1 | grep -q '1 2 3'
27+
28+
# Case 2: relaxation explicitly disabled.
$CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax
29+
$CC -B. -o $t/exe2 $t/b.o $t/d.so
30+
$QEMU $t/exe2 | grep -q '1 2 3'
31+
32+
# Case 3: with -z nodlopen the linker is allowed to relax TLSGD to TLSIE.
$CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen
33+
$CC -B. -o $t/exe3 $t/b.o $t/e.so
34+
$QEMU $t/exe3 | grep -q '1 2 3'
35+
36+
# Case 4: -z nodlopen combined with -no-relax.
$CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax
37+
$CC -B. -o $t/exe4 $t/b.o $t/f.so
38+
$QEMU $t/exe4 | grep -q '1 2 3'

test/elf/x86_64_ifunc-alias.sh

100644 → 100755
File mode changed.

test/elf/x86_64_tls-gd-to-ie.sh

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
# Variant of the TLSGD-to-TLSIE test in which a.o is compiled with
# -mcmodel=large. The shared object is linked four different ways (default,
# -no-relax, -z nodlopen, and -z nodlopen plus -no-relax) and every
# resulting executable must print "1 2 3".
2+
. $(dirname $0)/common.inc
3+
4+
cat <<EOF | $GCC -fPIC -c -o $t/a.o -xc - -mcmodel=large
5+
#include <stdio.h>
6+
7+
__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1;
8+
__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2;
9+
__attribute__((tls_model("global-dynamic"))) _Thread_local int x3;
10+
11+
int foo() {
12+
x3 = 3;
13+
14+
printf("%d %d %d\n", x1, x2, x3);
15+
return 0;
16+
}
17+
EOF
18+
19+
cat <<EOF | $CC -fPIC -c -o $t/b.o -xc -
20+
int foo();
21+
int main() { foo(); }
22+
EOF
23+
24+
# Case 1: shared object linked with default options.
$CC -B. -shared -o $t/c.so $t/a.o
25+
$CC -B. -o $t/exe1 $t/b.o $t/c.so
26+
$QEMU $t/exe1 | grep -q '1 2 3'
27+
28+
# Case 2: relaxation explicitly disabled.
$CC -B. -shared -o $t/d.so $t/a.o -Wl,-no-relax
29+
$CC -B. -o $t/exe2 $t/b.o $t/d.so
30+
$QEMU $t/exe2 | grep -q '1 2 3'
31+
32+
# Case 3: with -z nodlopen the linker is allowed to relax TLSGD to TLSIE.
$CC -B. -shared -o $t/e.so $t/a.o -Wl,-z,nodlopen
33+
$CC -B. -o $t/exe3 $t/b.o $t/e.so
34+
$QEMU $t/exe3 | grep -q '1 2 3'
35+
36+
# Case 4: -z nodlopen combined with -no-relax.
$CC -B. -shared -o $t/f.so $t/a.o -Wl,-z,nodlopen -Wl,-no-relax
37+
$CC -B. -o $t/exe4 $t/b.o $t/f.so
38+
$QEMU $t/exe4 | grep -q '1 2 3'

0 commit comments

Comments
 (0)