diff --git a/.github/workflows/bench-config.h b/.github/workflows/bench-config.h new file mode 100644 index 0000000..78954b1 --- /dev/null +++ b/.github/workflows/bench-config.h @@ -0,0 +1,12 @@ +#define HAS_E64 (__riscv_v_elen >= 64) +#define HAS_F16 1 +#define MAX_MEM (4096*8) +#define NEXT(c) (c + c/3 + 3) +#define VALIDATE 1 +#define MIN_REPEATS 2 +#define MAX_REPEATS 2 + +#define STOP_CYCLES (1024*1024*500) +#define SCALE_mandelbrot(N) ((N)/10) +#define SCALE_mergelines(N) ((N)/10) +#define mandelbrot_ITER 100 diff --git a/.github/workflows/rv32-config.mk b/.github/workflows/rv32-config.mk new file mode 100644 index 0000000..6e597f9 --- /dev/null +++ b/.github/workflows/rv32-config.mk @@ -0,0 +1,3 @@ +WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter +CC=clang-17 +CFLAGS=--target=riscv32 -march=rv32gc_zve32f_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding diff --git a/.github/workflows/rv32-run.sh b/.github/workflows/rv32-run.sh new file mode 100755 index 0000000..91a3031 --- /dev/null +++ b/.github/workflows/rv32-run.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ &&\ +qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@ && \ +qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@ + diff --git a/.github/workflows/rv64-config.mk b/.github/workflows/rv64-config.mk new file mode 100644 index 0000000..8aa0106 --- /dev/null +++ b/.github/workflows/rv64-config.mk @@ -0,0 +1,3 @@ +WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter +CC=clang-17 +CFLAGS=--target=riscv64 -march=rv64gcv_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding diff --git a/.github/workflows/rv64-run.sh b/.github/workflows/rv64-run.sh new file mode 100755 index 0000000..c94eb38 --- /dev/null +++ b/.github/workflows/rv64-run.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,x-zvfh=true $@ && \ +qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@ && \ +qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off,zfh=true,x-zvfh=true $@ diff --git a/.github/workflows/validate-bench.yml b/.github/workflows/validate-bench.yml new file mode 100644 index 0000000..afcaed3 --- /dev/null +++ b/.github/workflows/validate-bench.yml @@ -0,0 +1,32 @@ +name: Validate bench + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + Tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install packages + run: | + sudo sed 's/jammy/devel/g' -i /etc/apt/sources.list + sudo apt-get update -q -y + sudo apt-get install -y make qemu-user-static clang-17 + - name: Validate RV64 + run: | + cp .github/workflows/rv64-config.mk ./config.mk + cp .github/workflows/rv64-run.sh ./run.sh + cp .github/workflows/bench-config.h ./bench/config.h + make -C bench run -j$(nproc) + make -C bench clean + - name: Validate RV32 + run: | + cp .github/workflows/rv32-config.mk ./config.mk + cp .github/workflows/rv32-run.sh ./run.sh + cp .github/workflows/bench-config.h ./bench/config.h + make -C bench run -j$(nproc) + make -C bench clean diff --git a/bench/Makefile b/bench/Makefile index b2597c5..41399a8 100644 --- a/bench/Makefile +++ b/bench/Makefile @@ -13,5 +13,5 @@ clean: rm -f ${EXECS} run: all - for i in ${EXECS}; do ../run.sh ./$$i; done + for i in ${EXECS}; do ../run.sh ./$$i || { printf "\n\n\033[0;31mFAILED\033[0m\n\n"; exit 1; } ; done diff --git a/bench/bench.h b/bench/bench.h index e422c05..24879ae 100644 --- a/bench/bench.h +++ b/bench/bench.h @@ -137,7 +137,7 @@ bench_run(Bench *benches, size_t nBenches) for (size_t n = 1; n < N; n = BENCH_NEXT(n)) { ux si = 0, s0 = 0; -#if MAX_REPEATS > 4 +#if VALIDATE if (i != b->impls) { URand seed = randState; (void)b->func(i->func, n); diff --git a/bench/byteswap.S b/bench/byteswap.S index 79154ef..367cc1f 100644 --- a/bench/byteswap.S +++ b/bench/byteswap.S @@ -1,15 +1,10 @@ -/* - * TODO: This currently only works for VLEN<=256. - * I think rvv 1.0 should only vrgatherei16.vv here in the future. - */ - -#ifdef MX - +#if HAS_RVV_1_0 +#if MX_N == 4 || MX_N == 2 || MX_N == 1 # a0 = ptr, a1 = len -.global MX(byteswap32_rvv_gather_) -MX(byteswap32_rvv_gather_): - vsetvli t0, x0, e8, MX(), ta, ma +.global MX(byteswap32_rvv_gatherei16_) +MX(byteswap32_rvv_gatherei16_): + vsetvli t0, x0, e16, MX2(), ta, ma vid.v v0 vand.vi v8, v0, 3 vrsub.vi v8, v8, 3 @@ -21,7 +16,7 @@ MX(byteswap32_rvv_gather_): vle32.v v8, (a0) slli t1, t0, 2 vsetvli x0, t1, e8, MX(), ta, ma - vrgather.vv v16, v8, v0 + vrgatherei16.vv v16, v8, v0 vsetvli x0, t0, e32, MX(), ta, ma vse32.v v16, (a0) sub a1, a1, t0 @@ -32,10 +27,10 @@ MX(byteswap32_rvv_gather_): #if MX_N == 2 -.macro byteswap32_rvv_m1_gathers n - .global byteswap32_rvv_m1_gathers_m\n - byteswap32_rvv_m1_gathers_m\n: - vsetvli t0, x0, e8, m1, ta, ma +.macro byteswap32_rvv_m1_gatherei16s n + .global byteswap32_rvv_m1_gatherei16s_m\n + byteswap32_rvv_m1_gatherei16s_m\n: + vsetvli t0, x0, e16, MX(), ta, ma vid.v v0 vand.vi v8, v0, 3 vrsub.vi v8, v8, 3 @@ -46,17 +41,17 @@ MX(byteswap32_rvv_gather_): vsetvli t0, a1, e32, m\n, ta, ma vle32.v v8, (a0) vsetvli t1, x0, e8, m1, ta, ma - vrgather.vv v16, v8, v0 + vrgatherei16.vv v16, v8, v0 .ifge \n-2 - vrgather.vv v17, v9, v0 + vrgatherei16.vv v17, v9, v0 .ifge \n-4 - vrgather.vv v18, v10, v0 - vrgather.vv v19, v11, v0 + vrgatherei16.vv v18, v10, v0 + vrgatherei16.vv v19, v11, v0 .ifge \n-8 - vrgather.vv v20, v12, v0 - vrgather.vv v21, v13, v0 - vrgather.vv v22, v14, v0 - vrgather.vv v23, v15, v0 + vrgatherei16.vv v20, v12, v0 + vrgatherei16.vv v21, v13, v0 + vrgatherei16.vv v22, v14, v0 + vrgatherei16.vv v23, v15, v0 .endif .endif .endif @@ -69,13 +64,13 @@ MX(byteswap32_rvv_gather_): ret .endm -byteswap32_rvv_m1_gathers 2 +byteswap32_rvv_m1_gatherei16s 2 #endif #if MX_N == 4 -byteswap32_rvv_m1_gathers 4 +byteswap32_rvv_m1_gatherei16s 4 #endif #if MX_N == 8 -byteswap32_rvv_m1_gathers 8 +byteswap32_rvv_m1_gatherei16s 8 #endif - +#endif diff --git a/bench/byteswap.c b/bench/byteswap.c index 2e1f607..ee98d94 100644 --- a/bench/byteswap.c +++ b/bench/byteswap.c @@ -40,14 +40,24 @@ byteswap32_SWAR_rev8(uint32_t *ptr, size_t n) #endif +/* we don't support these on XTheadVector */ +#ifndef __riscv_vector +#define IMPLS_RVV(f) +#else +#define IMPLS_RVV(f) \ + f(rvv_gatherei16_m1) \ + f(rvv_gatherei16_m2) \ + f(rvv_gatherei16_m4) \ + f(rvv_m1_gatherei16s_m2) \ + f(rvv_m1_gatherei16s_m4) \ + f(rvv_m1_gatherei16s_m8) +#endif + #define IMPLS(f) \ f(scalar) \ f(scalar_autovec) \ REV8(f) \ - MX(f, rvv_gather) \ - f(rvv_m1_gathers_m2) \ - f(rvv_m1_gathers_m4) \ - f(rvv_m1_gathers_m8) \ + IMPLS_RVV(f) typedef void Func(uint32_t *ptr, size_t n); diff --git a/bench/config.h b/bench/config.h index 895b438..0078049 100644 --- a/bench/config.h +++ b/bench/config.h @@ -15,6 +15,8 @@ /* stop repeats early afer this many cycles have elapsed */ #define STOP_CYCLES (1024*1024*500) +/* validate against reference implementation on the first repetition */ +#define VALIDATE 1 /* custom scaling factors for benchmarks, these are used to make sure each * benchmark approximately takes the same amount of time. */ @@ -24,3 +26,4 @@ /* benchmark specific configurations */ #define mandelbrot_ITER 100 + diff --git a/nolibc.h b/nolibc.h index 5a595d1..94d4235 100644 --- a/nolibc.h +++ b/nolibc.h @@ -26,15 +26,15 @@ static void print_flush(void); #define EXIT_SUCCESS 0 /* customize me */ -static void -exit(int x) { __asm volatile("unimp\n"); } - static void memwrite(void const *ptr, size_t len) { } // static size_t /* only needed for vector-utf/bench.c */ // memread(void *ptr, size_t len) { } +static void +exit(int x) { __asm volatile("unimp\n"); } + int main(void); void _start(void) {