Skip to content

Commit

Permalink
Merge pull request #9225 from bjorng/bjorn/erts/opt-bit-syntax-constr…
Browse files Browse the repository at this point in the history
…uction

Slightly optimize bit syntax construction
  • Loading branch information
bjorng authored Jan 10, 2025
2 parents d0ee4a0 + c5b6451 commit 4856f40
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 58 deletions.
24 changes: 6 additions & 18 deletions erts/emulator/beam/erl_bits.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,6 @@ typedef Uint16 erlfp16;
#include "erl_bits_f16.h"
#endif

/*
* Here is how many bits we can copy in each reduction.
*
* At the time of writing of this comment, CONTEXT_REDS was 4000 and
* BITS_PER_REDUCTION was 1 KiB (8192 bits). The time for copying an
* unaligned 4000 KiB binary on my computer (which has a 4,2 GHz Intel
* i7 CPU) was about 5 ms. The time was approximately 4 times lower if
* the source and destinations binaries were aligned.
*/

#define BITS_PER_REDUCTION (8*1024)

/*
* MAKE_MASK(n) constructs a mask with n bits.
* Example: MAKE_MASK(3) returns the binary number 00000111.
Expand Down Expand Up @@ -1191,7 +1179,7 @@ erts_bs_put_binary(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint num_bits)
base, offset, num_bits);
EBS->erts_bin_offset += num_bits;

BUMP_REDS(c_p, num_bits / BITS_PER_REDUCTION);
BUMP_REDS(c_p, num_bits / ERL_BITS_PER_REDUCTION);
return 1;
}

Expand All @@ -1215,7 +1203,7 @@ erts_bs_put_binary_all(ErlBitsState *EBS, Process *c_p, Eterm arg, Uint unit)
base, offset, size);
EBS->erts_bin_offset += size;

BUMP_REDS(c_p, size / BITS_PER_REDUCTION);
BUMP_REDS(c_p, size / ERL_BITS_PER_REDUCTION);
return 1;
}

Expand Down Expand Up @@ -1703,7 +1691,7 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
binp = erts_bin_realloc(binp, new_size);
br->val = binp;

BUMP_REDS(c_p, position / BITS_PER_REDUCTION);
BUMP_REDS(c_p, position / ERL_BITS_PER_REDUCTION);
}

binp->intern.apparent_size = NBYTES(used_size_in_bits);
Expand Down Expand Up @@ -1798,7 +1786,7 @@ erts_bs_append_checked(Process* c_p, Eterm* reg, Uint live,
src_bytes,
src_offset,
src_size);
BUMP_REDS(c_p, src_size / BITS_PER_REDUCTION);
BUMP_REDS(c_p, src_size / ERL_BITS_PER_REDUCTION);

return make_bitstring(sb);
}
Expand Down Expand Up @@ -1851,7 +1839,7 @@ erts_bs_private_append_checked(ErlBitsState* EBS, Process* p,
refc_binary = erts_bin_realloc(refc_binary, new_size);
br->val = refc_binary;

BUMP_REDS(p, EBS->erts_bin_offset / BITS_PER_REDUCTION);
BUMP_REDS(p, EBS->erts_bin_offset / ERL_BITS_PER_REDUCTION);
}

ASSERT(sb->start == 0);
Expand Down Expand Up @@ -1887,7 +1875,7 @@ erts_bs_private_append_checked(ErlBitsState* EBS, Process* p,
refc_binary->orig_bytes,
MIN(refc_binary->orig_size, new_size));

BUMP_REDS(p, EBS->erts_bin_offset / BITS_PER_REDUCTION);
BUMP_REDS(p, EBS->erts_bin_offset / ERL_BITS_PER_REDUCTION);
refc_binary = new_binary;
}

Expand Down
12 changes: 12 additions & 0 deletions erts/emulator/beam/erl_bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@
/** @brief returns the number of bits there are in \c x bytes. */
#define NBITS(x) ((Uint64)(x) << 3)

/*
* Here is how many bits we can copy in each reduction.
*
* At the time of writing of this comment, CONTEXT_REDS was 4000 and
* ERL_BITS_PER_REDUCTION was 1 KiB (8192 bits). The time for copying an
* unaligned 4000 KiB binary on my computer (which has a 4,2 GHz Intel
* i7 CPU) was about 5 ms. The time was approximately 4 times lower if
* the source and destinations binaries were aligned.
*/

#define ERL_BITS_PER_REDUCTION (8*1024)

#define BYTE_OFFSET(offset_in_bits) ((Uint)(offset_in_bits) >> 3)
#define BIT_OFFSET(offset_in_bits) ((offset_in_bits) & 7)

Expand Down
26 changes: 17 additions & 9 deletions erts/emulator/beam/jit/arm/instr_bs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2201,26 +2201,31 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
seg.size.as<ArgAtom>().get() == am_all) {
/* Include the entire binary/bitstring in the
* resulting binary. */

can_fail =
!(exact_type<BeamTypeId::Bitstring>(seg.src) &&
std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit);

load_erl_bits_state(ARG1);
a.mov(ARG2, c_p);
mov_arg(ARG3, seg.src);
mov_imm(ARG4, seg.unit);

emit_enter_runtime<Update::eReductions>(Live.get());
runtime_call<int (*)(ErlBitsState *, Process *, Eterm, Uint),
erts_bs_put_binary_all>();
if (can_fail) {
mov_imm(ARG4, seg.unit);
runtime_call<
int (*)(ErlBitsState *, Process *, Eterm, Uint),
erts_bs_put_binary_all>();
} else {
runtime_call<void (*)(ErlBitsState *, Process *, Eterm),
beam_jit_bs_put_binary_all>();
}
emit_leave_runtime<Update::eReductions>(Live.get());

error_info = beam_jit_update_bsc_reason_info(seg.error_info,
BSC_REASON_BADARG,
BSC_INFO_UNIT,
BSC_VALUE_FVALUE);
if (exact_type<BeamTypeId::Bitstring>(seg.src) &&
std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) {
comment("skipped test for success because units are "
"compatible");
can_fail = false;
}
} else {
/* The size is a variable. We have verified that
* the value is a non-negative small in the
Expand Down Expand Up @@ -2252,6 +2257,9 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
mov_imm(ARG4, error_info);
}
a.cbz(ARG1, resolve_label(error, disp1MB));
} else {
comment("skipped test for success because units are "
"compatible");
}
break;
}
Expand Down
16 changes: 16 additions & 0 deletions erts/emulator/beam/jit/beam_jit_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,22 @@ Eterm beam_jit_int128_to_big(Process *p, Uint sign, Uint low, Uint high) {
return make_big(hp);
}

void beam_jit_bs_put_binary_all(ErlBitsState *EBS, Process *c_p, Eterm arg) {
Uint offset, size;
byte *base;

ERTS_GET_BITSTRING(arg, base, offset, size);

copy_binary_to_buffer(EBS->erts_current_bin,
EBS->erts_bin_offset,
base,
offset,
size);
EBS->erts_bin_offset += size;

BUMP_REDS(c_p, size / ERL_BITS_PER_REDUCTION);
}

ErtsMessage *beam_jit_decode_dist(Process *c_p, ErtsMessage *msgp) {
if (!erts_proc_sig_decode_dist(c_p, ERTS_PROC_LOCK_MAIN, msgp, 0)) {
/*
Expand Down
1 change: 1 addition & 0 deletions erts/emulator/beam/jit/beam_jit_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,7 @@ void beam_jit_bs_construct_fail_info(Process *c_p,
Eterm arg3,
Eterm arg1);
Sint beam_jit_bs_bit_size(Eterm term);
void beam_jit_bs_put_binary_all(ErlBitsState *EBS, Process *c_p, Eterm arg);

Eterm beam_jit_int128_to_big(Process *p, Uint sign, Uint low, Uint high);

Expand Down
6 changes: 6 additions & 0 deletions erts/emulator/beam/jit/x86/beam_asm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1421,6 +1421,12 @@ class BeamModuleAssembler : public BeamAssembler,
x86::Gp size_reg);
bool need_mask(const ArgVal Val, Sint size);
void set_zero(Sint effectiveSize);
void emit_accumulate(ArgVal src,
Sint effectiveSize,
x86::Gp bin_data,
x86::Gp tmp,
x86::Gp value,
bool isFirst);
bool bs_maybe_enter_runtime(bool entered);
void bs_maybe_leave_runtime(bool entered);
void emit_construct_utf8_shared();
Expand Down
111 changes: 80 additions & 31 deletions erts/emulator/beam/jit/x86/instr_bs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,47 @@ void BeamModuleAssembler::set_zero(Sint effectiveSize) {
}
}

/*
* Efficiently accumulate a value for a binary segment,
* using the smallest possible instructions.
*/
void BeamModuleAssembler::emit_accumulate(ArgVal src,
Sint effectiveSize,
x86::Gp bin_data,
x86::Gp tmp,
x86::Gp value,
bool isFirst) {
if (isFirst) {
/* There is no need to mask the first value being
* accumulated. */
if (effectiveSize > 32) {
a.mov(bin_data, value);
} else {
a.mov(bin_data.r32(), value.r32());
}
return;
}

ASSERT(effectiveSize < 64);

if (!need_mask(src, effectiveSize)) {
comment("skipped masking because the value always fits");
} else if (effectiveSize == 32) {
a.mov(value.r32(), value.r32());
} else if (effectiveSize == 16) {
a.movzx(value.r32(), value.r16());
} else if (effectiveSize == 8) {
a.movzx(value.r32(), value.r8());
} else if (effectiveSize < 32) {
a.and_(value.r32(), (1ULL << effectiveSize) - 1);
} else {
mov_imm(tmp, (1ULL << effectiveSize) - 1);
a.and_(value, tmp);
}

a.or_(bin_data, value);
}

/*
* In:
*
Expand Down Expand Up @@ -2308,22 +2349,28 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
seg.size.as<ArgAtom>().get() == am_all) {
/* Include the entire binary/bitstring in the
* resulting binary. */
mov_imm(ARG4, seg.unit);
can_fail =
!(exact_type<BeamTypeId::Bitstring>(seg.src) &&
std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit);

if (can_fail) {
mov_imm(ARG4, seg.unit);
}
mov_arg(ARG3, seg.src);
a.mov(ARG2, c_p);
load_erl_bits_state(ARG1);
runtime_call<int (*)(ErlBitsState *, Process *, Eterm, Uint),
erts_bs_put_binary_all>();
if (can_fail) {
runtime_call<
int (*)(ErlBitsState *, Process *, Eterm, Uint),
erts_bs_put_binary_all>();
} else {
runtime_call<void (*)(ErlBitsState *, Process *, Eterm),
beam_jit_bs_put_binary_all>();
}
error_info = beam_jit_update_bsc_reason_info(seg.error_info,
BSC_REASON_BADARG,
BSC_INFO_UNIT,
BSC_VALUE_FVALUE);
if (exact_type<BeamTypeId::Bitstring>(seg.src) &&
std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) {
comment("skipped test for success because units are "
"compatible");
can_fail = false;
}
} else {
/* The size is a variable. We have verified that
* the value is a non-negative small in the
Expand All @@ -2347,7 +2394,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
BSC_VALUE_FVALUE);
}

if (can_fail) {
if (!can_fail) {
comment("skipped test for success because units are "
"compatible");
} else {
if (Fail.get() == 0) {
mov_imm(ARG4, error_info);
}
Expand Down Expand Up @@ -2399,12 +2449,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
x86::Gp bin_data = ARG5;

comment("accumulate value for integer segment");
if (seg.action == BscSegment::action::ACCUMULATE_FIRST) {
mov_imm(bin_data, 0);
} else if (seg.effectiveSize < 64) {
if (seg.action != BscSegment::action::ACCUMULATE_FIRST &&
seg.effectiveSize < 64) {
a.shl(bin_data, imm(seg.effectiveSize));
}
mov_arg(ARG1, seg.src);
if (!seg.src.isSmall()) {
mov_arg(ARG1, seg.src);
}

if (!always_small(seg.src)) {
if (always_one_of<BeamTypeId::Integer,
Expand Down Expand Up @@ -2441,26 +2492,24 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
}

a.bind(value_is_small);
a.sar(ARG1, imm(_TAG_IMMED1_SIZE));
if (seg.src.isSmall()) {
Sint val = signed_val(seg.src.as<ArgSmall>().get());
mov_imm(ARG1, val);
} else if (seg.effectiveSize + _TAG_IMMED1_SIZE <= 32) {
a.shr(ARG1d, imm(_TAG_IMMED1_SIZE));
} else {
a.sar(ARG1, imm(_TAG_IMMED1_SIZE));
}

/* Mask (if needed) and accumulate. */
a.bind(accumulate);
if (seg.effectiveSize == 64) {
a.mov(bin_data, ARG1);
} else if (!need_mask(seg.src, seg.effectiveSize)) {
comment("skipped masking because the value always fits");
a.or_(bin_data, ARG1);
} else if (seg.effectiveSize == 32) {
a.mov(ARG1d, ARG1d);
a.or_(bin_data, ARG1);
} else if (seg.effectiveSize < 32) {
a.and_(ARG1, (1ULL << seg.effectiveSize) - 1);
a.or_(bin_data, ARG1);
} else {
mov_imm(tmp, (1ULL << seg.effectiveSize) - 1);
a.and_(ARG1, tmp);
a.or_(bin_data, ARG1);
}
emit_accumulate(seg.src,
seg.effectiveSize,
bin_data,
tmp,
ARG1,
seg.action ==
BscSegment::action::ACCUMULATE_FIRST);
break;
}
case BscSegment::action::STORE: {
Expand Down

0 comments on commit 4856f40

Please sign in to comment.