Skip to content

Commit

Permalink
add bench/hist
Browse files Browse the repository at this point in the history
  • Loading branch information
camel-cdr committed Nov 3, 2024
1 parent 7f1e460 commit 1a32036
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 1 deletion.
2 changes: 1 addition & 1 deletion bench/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

include ../config.mk

EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6
EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6 hist

all: ${EXECS}

Expand Down
1 change: 1 addition & 0 deletions bench/bench.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ compare_ux(void const *a, void const *b)

static URand randState = { 123, 456, 789 };
static ux bench_urand(void) { return urand(&randState); }
static float bench_urandf(void) { return urandf(&randState); }
static void bench_memrand(void *ptr, size_t n) { return memrand(&randState, ptr, n); }

typedef struct {
Expand Down
118 changes: 118 additions & 0 deletions bench/hist.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#if 0

void
hist_rvv_no_conflict(uint16_t hist[100], float *x, float *y, size_t n)
{
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = __riscv_vsetvl_e32m8(n);
vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl);
vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl);
vfloat32m8_t v = __riscv_vfsqrt(vsq, vl);
vuint16m4_t vidx = __riscv_vminu(__riscv_vfncvt_rtz_xu(v, vl), 100, vl);
vidx = __riscv_vadd(vidx, vidx, vl);
vuint16m4_t vcnt =__riscv_vluxei16(hist, vidx, vl);
vcnt = __riscv_vadd(vcnt, 1, vl);
__riscv_vsuxei16(hist, vidx, vcnt, vl);
}
}

void
hist_rvv_slidedown(uint16_t hist[100], float *x, float *y, size_t n)
{
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = __riscv_vsetvl_e32m8(n);
vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl);
vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl);
vfloat32m8_t v = __riscv_vfsqrt(vsq, vl);
vuint16m4_t vidx = __riscv_vminu(__riscv_vfncvt_rtz_xu(v, vl), 100, vl);

for (size_t i = 0; i < vl; ++i) {
size_t idx = __riscv_vmv_x(__riscv_vslidedown(vidx, i, 1));
++hist[idx];
}
}
}
#endif

#ifdef MX

.global MX(LUT4_rvv_vloxei8_)
MX(LUT4_rvv_vloxei8_):
1:
vsetvli a3, a2, e8, MX(), ta, ma
vle8.v v8, (a1)
vand.vi v8, v8, 15
vloxei8.v v8, (a0), v8
vse8.v v8, (a1)
sub a2, a2, a3
add a1, a1, a3
bnez a2, 1b
ret

/* assumes no conflicts, which causes the wrong result */
.global MX(hist_rvv_no_conflict_)
MX(hist_rvv_no_conflict_):
li a4, 100
1:
vsetvli a5, a3, e32, m8, ta, ma
vle32.v v8, (a1)
vle32.v v16, (a2)
vfmul.vv v8, v8, v8
vfmacc.vv v8, v16, v16
vfsqrt.v v8, v8
vsetvli zero, zero, e16, m4, ta, ma
vfncvt.rtz.xu.f.w v16, v8
vminu.vx v8, v16, a4
vadd.vv v8, v8, v8
vluxei16.v v12, (a0), v8
vadd.vi v12, v12, 1
vsuxei16.v v12, (a0), v8
sub a3, a3, a5
slli a5, a5, 2
add a1, a1, a5
add a2, a2, a5
bnez a3, 1b
ret

.global MX(hist_rvv_slidedown_)
MX(hist_rvv_slidedown_):
li a6, 100
j 2f
1:
sub a3, a3, a7
slli a5, a7, 2
add a1, a1, a5
add a2, a2, a5
beqz a3, 4f
2:
vsetvli a7, a3, e32, MX(), ta, ma
beqz a7, 1b
vle32.v v8, (a1)
vle32.v v16, (a2)
li a4, 0
vfmul.vv v8, v8, v8
vfmacc.vv v8, v16, v16
vfsqrt.v v8, v8
vsetvli zero, zero, e16, MXf2(), ta, ma
vfncvt.rtz.xu.f.w v16, v8
vminu.vx v8, v16, a6
vsetivli zero, 1, e16, MXf2(), ta, ma
3:
vslidedown.vx v12, v8, a4
vmv.x.s a5, v12
slli a5, a5, 1
add t0, a0, a5
lh a5, 0(t0)
addi a5, a5, 1
addi a4, a4, 1
sh a5, 0(t0)
bne a7, a4, 3b
j 1b
4:
ret

#endif


83 changes: 83 additions & 0 deletions bench/hist.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include "bench.h"

#if __STDC_HOSTED__
#include <math.h>
#endif

void
hist_scalar(uint16_t hist[100], float *x, float *y, size_t n)
{
for (size_t i = 0; i < n; ++i) {
float dist = x[i]*x[i] + y[i]*y[i];
#if __STDC_HOSTED__
dist = sqrtf(dist);
#else
__asm volatile("fsqrt.s %0, %0\n" : "+f"(dist));
#endif
size_t idx = dist;
idx = idx > 100 ? 100 : dist;
++hist[idx];

}
}

void
hist_scalar_autovec(uint16_t hist[restrict 100], float *restrict x, float *restrict y, size_t n)
{
for (size_t i = 0; i < n; ++i) {
float dist = x[i]*x[i] + y[i]*y[i];
#if __STDC_HOSTED__
dist = sqrtf(dist);
#else
__asm volatile("fsqrt.s %0, %0\n" : "+f"(dist));
#endif
size_t idx = dist;
idx = idx > 100 ? 100 : dist;
++hist[idx];
}
}

#define IMPLS(f) \
f(scalar) \
f(scalar_autovec) \
MX(f, rvv_no_conflict) \
MX(f, rvv_slidedown) \

typedef void Func(uint16_t hist[100], float *x, float *y, size_t n);

#define DECLARE(f) extern Func hist_##f;
IMPLS(DECLARE)

#define EXTRACT(f) { #f, &hist_##f },
Impl impls[] = { IMPLS(EXTRACT) };

static uint16_t hist[100];
float *inx, *iny;

void init(void) {
inx = (float*)mem;
iny = (float*)(mem + MAX_MEM/2);
}

ux checksum(size_t n) {
ux sum = 0;
for (size_t i = 0; i < 100; ++i)
sum = hist[i];
return sum <= n; // sanity check for no_conflict
}

BENCH_BEG(base) {
n /= sizeof(float);
memset(hist, 0, sizeof hist);
float max = 70.71; // approx. sqrtf(100*100/2);
for (size_t i = 0; i < n; ++i) {
inx[i] = bench_urandf() * 2 * max - max;
iny[i] = bench_urandf() * 2 * max - max;
}
TIME f(hist, inx, iny, n);
} BENCH_END

Bench benches[] = {
BENCH( impls, MAX_MEM/2, "hist", bench_base)
}; BENCH_MAIN(benches)

6 changes: 6 additions & 0 deletions nolibc.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,12 @@ urand(URand *r)
return xp;
}

static inline float
urandf(URand *r) {
uint32_t x = urand(r);
return (x >> (32-24)) * (1.0f / (((uint32_t)1) << 24));
}

static void
memrand(URand *r, void *ptr, size_t n)
{
Expand Down

0 comments on commit 1a32036

Please sign in to comment.