riscv64: Support scalar-vector bitcasts (#8692)

afonso360 · web-flow · commit 9a443bcb3b21 · 2024-05-28T14:39:12.000Z
* riscv64: Implement scalar to vector bitcast and vice-versa

* riscv64: Delete `vfslide1up.vf` instruction

* fuzzgen: Enable `bitcast.i128` for RISC-V
diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -2815,6 +2815,10 @@
 ;; Generates a bitcast instruction.
 ;; Args are: src, src_ty, dst_ty
 (decl gen_bitcast (Reg Type Type) Reg)
+(rule 5 (gen_bitcast r (ty_scalar_float src_ty) (ty_vec_fits_in_register _)) (rv_vfmv_sf r src_ty))
+(rule 4 (gen_bitcast r (ty_int_ref_scalar_64 src_ty) (ty_vec_fits_in_register _)) (rv_vmv_sx r src_ty))
+(rule 3 (gen_bitcast r (ty_vec_fits_in_register _) (ty_scalar_float dst_ty)) (rv_vfmv_fs r dst_ty))
+(rule 2 (gen_bitcast r (ty_vec_fits_in_register _) (ty_int_ref_scalar_64 dst_ty)) (rv_vmv_xs r dst_ty))
 (rule 1 (gen_bitcast r $F32 $I32) (rv_fmvxw r))
 (rule 1 (gen_bitcast r $F64 $I64) (rv_fmvxd r))
 (rule 1 (gen_bitcast r $I32 $F32) (rv_fmvwx r))
diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -268,13 +268,16 @@ impl VecAluOpRRRR {
             VecAluOpRRRR::VfnmaccVV | VecAluOpRRRR::VfnmaccVF => 0b101101,
             VecAluOpRRRR::VfmsacVV | VecAluOpRRRR::VfmsacVF => 0b101110,
             VecAluOpRRRR::VfnmsacVV | VecAluOpRRRR::VfnmsacVF => 0b101111,
+            VecAluOpRRRR::Vslide1upVX => 0b001110,
         }
     }
 
     pub fn category(&self) -> VecOpCategory {
         match self {
             VecAluOpRRRR::VmaccVV | VecAluOpRRRR::VnmsacVV => VecOpCategory::OPMVV,
-            VecAluOpRRRR::VmaccVX | VecAluOpRRRR::VnmsacVX => VecOpCategory::OPMVX,
+            VecAluOpRRRR::VmaccVX | VecAluOpRRRR::VnmsacVX | VecAluOpRRRR::Vslide1upVX => {
+                VecOpCategory::OPMVX
+            }
             VecAluOpRRRR::VfmaccVV
             | VecAluOpRRRR::VfnmaccVV
             | VecAluOpRRRR::VfmsacVV
@@ -299,7 +302,10 @@ impl VecAluOpRRRR {
 
 impl VecInstOverlapInfo for VecAluOpRRRR {
     fn forbids_src_dst_overlaps(&self) -> bool {
-        false
+        match self {
+            VecAluOpRRRR::Vslide1upVX => true,
+            _ => false,
+        }
     }
 }
 
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -229,6 +229,7 @@
   (VfnmaccVF)
   (VfmsacVF)
   (VfnmsacVF)
+  (Vslide1upVX)
 ))
 
 ;; Register-Imm ALU Ops
@@ -1095,6 +1096,13 @@
 (rule (rv_vslideup_vvi vd vs2 imm mask vstate)
   (vec_alu_rrr_uimm5 (VecAluOpRRRImm5.VslideupVI) vd vs2 imm mask vstate))
 
+;; Helper for emitting the `vslide1up.vx` instruction.
+;; The destination may not overlap `vs2` (see `forbids_src_dst_overlaps`).
+;; # vd[0]=x[rs1], vd[i+1] = vs2[i]
+(decl rv_vslide1up_vx (VReg VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vslide1up_vx vd vs2 rs1 mask vstate)
+  (vec_alu_rrrr (VecAluOpRRRR.Vslide1upVX) vd vs2 rs1 mask vstate))
+
 ;; Helper for emitting the `vmv.x.s` instruction.
 ;; This instruction copies the first element of the source vector to the destination X register.
 ;; Masked versions of this instruction are not supported.
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -2436,8 +2436,24 @@
       (elf_tls_get_addr name))
 
 ;;;;;  Rules for `bitcast`;;;;;;;;;
-(rule
-   (lower (has_type out_ty (bitcast _ v @ (value_type in_ty))))
+
+;; These rules should probably be handled in `gen_bitcast`, but it's convenient for that helper to
+;; take and return a single register, and an `i128` lives in a pair of registers (a `ValueRegs`).
+(rule 2 (lower (has_type $I128 (bitcast _ v @ (value_type (ty_vec_fits_in_register _)))))
+    (value_regs
+      (gen_extractlane $I64X2 v 0)
+      (gen_extractlane $I64X2 v 1)))
+
+;; Move the high half into element 0 of a vector register, then use `vslide1up.vx` to shift it up
+;; to element 1 while inserting the low half into element 0, all in one instruction.
+(rule 1 (lower (has_type (ty_vec_fits_in_register _) (bitcast _ v @ (value_type $I128))))
+    (let ((lo XReg (value_regs_get v 0))
+          (hi XReg (value_regs_get v 1))
+          (vstate VState (vstate_from_type $I64X2))
+          (vec VReg (rv_vmv_sx hi vstate)))
+      (rv_vslide1up_vx vec vec lo (unmasked) vstate)))
+
+(rule 0 (lower (has_type out_ty (bitcast _ v @ (value_type in_ty))))
    (gen_bitcast v in_ty out_ty))
 
 ;;;;;  Rules for `ceil`;;;;;;;;;
diff --git a/cranelift/filetests/filetests/isa/riscv64/bitcast-scalar-vector.clif b/cranelift/filetests/filetests/isa/riscv64/bitcast-scalar-vector.clif
@@ -0,0 +1,236 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+function %bitcast_vec_to_i128(i64x2) -> i128 {
+block0(v0: i64x2):
+    v1 = bitcast.i128 little v0
+    return v1
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v8,-16(incoming_arg) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vmv.x.s a0,v8 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vslidedown.vi v12,v8,1 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmv.x.s a1,v12 #avl=2, #vtype=(e64, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, sp, 0x10
+;   .byte 0x07, 0x84, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x25, 0x80, 0x42
+;   .byte 0x57, 0xb6, 0x80, 0x3e
+;   .byte 0xd7, 0x25, 0xc0, 0x42
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %bitcast_i128_to_vec(i128) -> i64x2 {
+block0(v0: i128):
+    v1 = bitcast.i64x2 little v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vmv.s.x v12,a1 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vmv1r.v v14,v12
+;   vslide1up.vx v14,v12,a0 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v14,0(a2) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0xe6, 0x05, 0x42
+;   .byte 0x57, 0x37, 0xc0, 0x9e
+;   .byte 0x57, 0x67, 0xc5, 0x3a
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x07, 0x06, 0x02
+;   ret
+
+function %bitcast_vec_to_i64(i32x2) -> i64 {
+block0(v0: i32x2):
+    v1 = bitcast.i64 little v0
+    return v1
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v8,-16(incoming_arg) #avl=8, #vtype=(e8, m1, ta, ma)
+;   vmv.x.s a0,v8 #avl=1, #vtype=(e64, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x04, 0xcc
+;   addi t6, sp, 0x10
+;   .byte 0x07, 0x84, 0x0f, 0x02
+;   .byte 0x57, 0xf0, 0x80, 0xcd
+;   .byte 0x57, 0x25, 0x80, 0x42
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %bitcast_i64_to_vec(i64) -> i32x2 {
+block0(v0: i64):
+    v1 = bitcast.i32x2 little v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vmv.s.x v11,a0 #avl=1, #vtype=(e64, m1, ta, ma)
+;   vse8.v v11,0(a1) #avl=8, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0xf0, 0x80, 0xcd
+;   .byte 0xd7, 0x65, 0x05, 0x42
+;   .byte 0x57, 0x70, 0x04, 0xcc
+;   .byte 0xa7, 0x85, 0x05, 0x02
+;   ret
+
+function %bitcast_vec_to_f64(i32x2) -> f64 {
+block0(v0: i32x2):
+    v1 = bitcast.f64 little v0
+    return v1
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v8,-16(incoming_arg) #avl=8, #vtype=(e8, m1, ta, ma)
+;   vfmv.f.s fa0,v8 #avl=1, #vtype=(e64, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x04, 0xcc
+;   addi t6, sp, 0x10
+;   .byte 0x07, 0x84, 0x0f, 0x02
+;   .byte 0x57, 0xf0, 0x80, 0xcd
+;   .byte 0x57, 0x15, 0x80, 0x42
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %bitcast_f64_to_vec(f64) -> i32x2 {
+block0(v0: f64):
+    v1 = bitcast.i32x2 little v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vfmv.s.f v11,fa0 #avl=1, #vtype=(e64, m1, ta, ma)
+;   vse8.v v11,0(a0) #avl=8, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0xf0, 0x80, 0xcd
+;   .byte 0xd7, 0x55, 0x05, 0x42
+;   .byte 0x57, 0x70, 0x04, 0xcc
+;   .byte 0xa7, 0x05, 0x05, 0x02
+;   ret
+
+function %bitcast_i16x2_to_f32(i16x2) -> f32 {
+block0(v0: i16x2):
+    v1 = bitcast.f32 little v0
+    return v1
+}
+
+; VCode:
+;   addi sp,sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v8,-16(incoming_arg) #avl=4, #vtype=(e8, m1, ta, ma)
+;   vfmv.f.s fa0,v8 #avl=1, #vtype=(e32, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   addi sp,sp,16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   mv s0, sp
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x02, 0xcc
+;   addi t6, sp, 0x10
+;   .byte 0x07, 0x84, 0x0f, 0x02
+;   .byte 0x57, 0xf0, 0x00, 0xcd
+;   .byte 0x57, 0x15, 0x80, 0x42
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %bitcast_f32_to_i16x2(f32) -> i16x2 {
+block0(v0: f32):
+    v1 = bitcast.i16x2 little v0
+    return v1
+}
+
+; VCode:
+; block0:
+;   vfmv.s.f v11,fa0 #avl=1, #vtype=(e32, m1, ta, ma)
+;   vse8.v v11,0(a0) #avl=4, #vtype=(e8, m1, ta, ma)
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   .byte 0x57, 0xf0, 0x00, 0xcd
+;   .byte 0xd7, 0x55, 0x05, 0x42
+;   .byte 0x57, 0x70, 0x02, 0xcc
+;   .byte 0xa7, 0x05, 0x05, 0x02
+;   ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-bitcast-64bit.clif b/cranelift/filetests/filetests/runtests/simd-bitcast-64bit.clif
@@ -0,0 +1,36 @@
+test run
+target riscv64 has_v
+target riscv64 has_v has_c has_zcb
+
+function %bitcast_i32x2_to_i64(i32x2) -> i64 {
+block0(v0: i32x2):
+    v1 = bitcast.i64 little v0
+    return v1
+}
+; run: %bitcast_i32x2_to_i64([0xBEEF 0xC0FFEE]) == 0x00c0ffee_0000beef
+; run: %bitcast_i32x2_to_i64([-1 127]) == 0x0000007f_ffffffff
+
+
+function %bitcast_i64_to_i32x2(i64) -> i32x2 {
+block0(v0: i64):
+    v1 = bitcast.i32x2 little v0
+    return v1
+}
+; run: %bitcast_i64_to_i32x2(0x00c0ffee_0000beef) == [0xBEEF 0xC0FFEE]
+; run: %bitcast_i64_to_i32x2(0x0000007f_ffffffff) == [-1 127]
+
+function %bitcast_i32x2_to_f64(i32x2) -> f64 {
+block0(v0: i32x2):
+    v1 = bitcast.f64 little v0
+    return v1
+}
+; run: %bitcast_i32x2_to_f64([0xBEEF 0xC0FFEE]) == 0x1.0ffee0000beefp-1011
+; run: %bitcast_i32x2_to_f64([-1 127]) == 0x0.0007fffffffffp-1022
+
+function %bitcast_f64_to_i32x2(f64) -> i32x2 {
+block0(v0: f64):
+    v1 = bitcast.i32x2 little v0
+    return v1
+}
+; run: %bitcast_f64_to_i32x2(0x1.0ffee0000beefp-1011) == [0xBEEF 0xC0FFEE]
+; run: %bitcast_f64_to_i32x2(0x0.0007fffffffffp-1022) == [-1 127]
diff --git a/cranelift/filetests/filetests/runtests/simd-bitcast-i128.clif b/cranelift/filetests/filetests/runtests/simd-bitcast-i128.clif
@@ -0,0 +1,20 @@
+test run
+target riscv64 has_v
+target riscv64 has_v has_c has_zcb
+
+function %bitcast_i64x2_to_i128(i64x2) -> i128 {
+block0(v0: i64x2):
+    v1 = bitcast.i128 little v0
+    return v1
+}
+; run: %bitcast_i64x2_to_i128([0xBEEF 0xC0FFEE]) == 0x0000000000c0ffee_000000000000beef
+; run: %bitcast_i64x2_to_i128([-1 127]) == 0x000000000000007f_ffffffffffffffff
+
+
+function %bitcast_i128_to_i64x2(i128) -> i64x2 {
+block0(v0: i128):
+    v1 = bitcast.i64x2 little v0
+    return v1
+}
+; run: %bitcast_i128_to_i64x2(0x0000000000c0ffee_000000000000beef) == [0xBEEF 0xC0FFEE]
+; run: %bitcast_i128_to_i64x2(0x000000000000007f_ffffffffffffffff) == [-1 127]
diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs
@@ -764,9 +764,6 @@ fn valid_for_target(triple: &Triple, op: Opcode, args: &[Type], rets: &[Type]) -
                     &[I128],
                     &[F32 | F64]
                 ),
-                // https://github.com/bytecodealliance/wasmtime/issues/6104
-                (Opcode::Bitcast, &[I128], &[_]),
-                (Opcode::Bitcast, &[_], &[I128]),
                 // TODO
                 (
                     Opcode::SelectSpectreGuard,