diff --git a/bench/LUT6.S b/bench/LUT6.S
new file mode 100644
index 0000000..b97bdf7
--- /dev/null
+++ b/bench/LUT6.S
@@ -0,0 +1,101 @@
+#if MX_N == 4
+
+.global LUT6_rvv_vloxei8_m4
+LUT6_rvv_vloxei8_m4:	# a0 = lut, a1 = ptr, a2 = len; lookup through ordered indexed loads
+	vsetvli t0, x0, e8, m4, ta, ma	# vl = VLMAX so the splat below fills the whole group
+	li t0, 63
+	vmv.v.x v24, t0	# v24 = 63 in every lane: mask for a 6-bit index
+1:
+	vsetvli a3, a2, e8, m4, ta, ma	# a3 = number of bytes handled in this pass
+	vle8.v v8, (a1)
+	vand.vv v8, v8, v24	# keep the low 6 bits as byte offsets into lut
+	vloxei8.v v8, (a0), v8	# v8[i] = lut[v8[i]]
+	vse8.v v8, (a1)	# results overwrite the input in place
+	sub a2, a2, a3
+	add a1, a1, a3
+	bnez a2, 1b
+	ret
+
+.global LUT6_rvv_vluxei8_m4
+LUT6_rvv_vluxei8_m4:	# a0 = lut, a1 = ptr, a2 = len; same loop with unordered indexed loads
+	vsetvli t0, x0, e8, m4, ta, ma
+	li t0, 63
+	vmv.v.x v24, t0	# 6-bit index mask in every lane
+1:
+	vsetvli a3, a2, e8, m4, ta, ma	# a3 = number of bytes handled in this pass
+	vle8.v v8, (a1)
+	vand.vv v8, v8, v24
+	vluxei8.v v8, (a0), v8	# v8[i] = lut[v8[i]]
+	vse8.v v8, (a1)
+	sub a2, a2, a3
+	add a1, a1, a3
+	bnez a2, 1b
+	ret
+
+# a0 = lut, a1 = ptr, a2 = len
+.global LUT6_rvv_gather_m4
+LUT6_rvv_gather_m4:
+	li t0, 64
+	vsetvli zero, t0, e8, m4, ta, ma	# request vl = 64, the size of the table
+	vle8.v v0, (a0)	# the table stays in the v0 group for the whole call
+
+	vsetvli t0, x0, e8, m4, ta, ma
+	li t0, 63
+	vmv.v.x v24, t0	# 6-bit index mask in every lane
+1:
+	vsetvli a0, a2, e8, m4, ta, ma	# a0 is reused as the pass length; lut is already loaded
+	vle8.v v8, (a1)
+	vand.vv v8, v8, v24
+	vrgather.vv v16, v0, v8	# v16[i] = lut[v8[i]] without touching memory
+	vse8.v v16, (a1)
+	sub a2, a2, a0
+	add a1, a1, a0
+	bnez a2, 1b
+	ret
+
+.global LUT6_rvv_m1m2m4_gathers_m4
+LUT6_rvv_m1m2m4_gathers_m4:	# a0 = lut, a1 = ptr, a2 = len; gathers at the smallest LMUL that holds lut
+	li t0, 64
+	vsetvli zero, t0, e8, m4, ta, ma
+	vle8.v v0, (a0)	# 64-byte table into the v0 group
+
+	vsetvli t0, x0, e8, m4, ta, ma
+	li t0, 63
+	vmv.v.x v24, t0	# 6-bit index mask in every lane
+
+	csrr t0, vlenb
+	srl t0, t0, 4	# t0 = vlenb / 16
+	sltiu t1, t0, 4	# t1 = 1 when vlenb < 64: table does not fit in one register
+	sltiu t0, t0, 2	# t0 = 1 when vlenb < 32: table does not fit in two registers
+	j 0f
+1:	# m1 path: the whole table is in v0
+	vsetvli t2, x0, e8, m1, ta, ma	# t2 is scratch; t1 must keep its flag for the next pass
+	vrgather.vv v16, v0, v8
+	vrgather.vv v17, v0, v9
+	vrgather.vv v18, v0, v10
+	vrgather.vv v19, v0, v11
+8:
+	vsetvli x0, a0, e8, m4, ta, ma	# restore the m4 length chosen at label 0
+	vse8.v v16, (a1)
+	sub a2, a2, a0
+	add a1, a1, a0
+	beqz a2, 9f
+0:
+	vsetvli a0, a2, e8, m4, ta, ma	# a0 is reused as the pass length; lut is already loaded
+	vle8.v v8, (a1)
+	vand.vv v8, v8, v24
+	beqz t1, 1b	# vlenb >= 64: four m1 gathers
+	beqz t0, 2f	# vlenb >= 32: two m2 gathers
+	vrgather.vv v16, v0, v8	# otherwise a single m4 gather
+	j 8b
+2:	# m2 path: the table spans v0 and v1
+	vsetvli t2, x0, e8, m2, ta, ma	# t2 is scratch; t1 must keep its flag for the next pass
+	vrgather.vv v16, v0, v8
+	vrgather.vv v18, v0, v10
+	j 8b
+9:
+	ret
+
+#endif
+
+
diff --git a/bench/LUT6.c b/bench/LUT6.c
new file mode 100644
index 0000000..074ef39
--- /dev/null
+++ b/bench/LUT6.c @@ -0,0 +1,58 @@ +#include "bench.h" + +void +LUT6_scalar(uint8_t lut[64], uint8_t *ptr, size_t n) /* in place: each of the n bytes at ptr becomes lut[byte & 63]; BENCH_CLOBBER() runs after every store (presumably a compiler barrier that keeps the loop scalar; confirm in bench.h) */ +{ + for (; n--; ++ptr) + *ptr = lut[*ptr & 63], BENCH_CLOBBER(); +} + +void +LUT6_scalar_autovec(uint8_t lut[64], uint8_t *ptr, size_t n) /* same loop as LUT6_scalar but without the per-element BENCH_CLOBBER() */ +{ + for (; n--; ++ptr) + *ptr = lut[*ptr & 63]; +} + + +#define IMPLS(f) \ + f(scalar) \ + f(scalar_autovec) \ + f(rvv_gather_m4) \ + f(rvv_m1m2m4_gathers_m4) \ + f(rvv_vluxei8_m4) \ + f(rvv_vloxei8_m4) \ + +typedef void Func(uint8_t lut[64], uint8_t *ptr, size_t n); /* signature shared by every LUT6 implementation listed in IMPLS */ + +#define DECLARE(f) extern Func LUT6_##f; +IMPLS(DECLARE) /* one extern declaration per implementation */ + +#define EXTRACT(f) { #f, &LUT6_##f }, +Impl impls[] = { IMPLS(EXTRACT) }; /* one { "name", &LUT6_name } entry per implementation */ + +uint8_t *ptr; /* buffer the implementations transform in place; assigned by init() */ + +void init(void) { ptr = (uint8_t*)mem; } /* ptr aliases mem, which comes from bench.h */ + +ux checksum(size_t n) { /* folds the first n bytes of ptr into one value: sum = uhash(sum) + byte */ + ux sum = 0; + for (size_t i = 0; i < n; ++i) + sum = uhash(sum) + ptr[i]; + return sum; +} + +BENCH_BEG(base) { /* refills n bytes of ptr through bench_memrand(), then runs f(lut, ptr, n) under TIME (presumably the timed region; confirm in bench.h) */ + static uint8_t lut[] = /* Base64 alphabet: 64 symbols followed by the string terminator */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789" + "+/"; + bench_memrand(ptr, n * sizeof *ptr); + TIME f(lut, ptr, n); +} BENCH_END + +Bench benches[] = { + BENCH( impls, MAX_MEM, "LUT6", bench_base ) +}; BENCH_MAIN(benches) + diff --git a/bench/Makefile b/bench/Makefile index 41399a8..33f6087 100644 --- a/bench/Makefile +++ b/bench/Makefile @@ -2,7 +2,7 @@ include ../config.mk -EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 +EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6 all: ${EXECS} diff --git a/bench/bench.h b/bench/bench.h index 453f260..8b9556c 100644 --- a/bench/bench.h +++ b/bench/bench.h @@ -78,7 +78,7 @@ main(void) size_t x; randState.x ^= rv_cycles()*7; - randState.y += rv_cycles() ^ (uintptr_t)&x + 666*(uintptr_t)mem; + randState.y += rv_cycles() ^ ((uintptr_t)&x + 666*(uintptr_t)mem); /* initialize memory */ bench_memrand(mem, MAX_MEM);