diff --git a/sim/simx/Makefile b/sim/simx/Makefile
index d3e726bbe..83054edc4 100644
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -17,12 +17,12 @@ CXXFLAGS += $(CONFIGS)
 LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator
 
-SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
 SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
 
 # Add V extension sources
 ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
-  SRCS += $(SRC_DIR)/execute_v.cpp
+  SRCS += $(SRC_DIR)/vpu.cpp
 endif
 
 # Debugging
diff --git a/sim/simx/arch.h b/sim/simx/arch.h
index d68345db6..6becf5c91 100644
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -29,7 +29,6 @@ class Arch {
   uint16_t num_cores_;
   uint16_t num_clusters_;
   uint16_t socket_size_;
-  uint16_t vsize_;
   uint16_t num_barriers_;
   uint64_t local_mem_base_;
 
@@ -40,7 +39,6 @@ class Arch {
     , num_cores_(num_cores)
     , num_clusters_(NUM_CLUSTERS)
     , socket_size_(SOCKET_SIZE)
-    , vsize_(VLEN / 8)
     , num_barriers_(NUM_BARRIERS)
     , local_mem_base_(LMEM_BASE_ADDR)
   {}
@@ -73,10 +71,6 @@ class Arch {
     return socket_size_;
   }
 
-  uint16_t vsize() const {
-    return vsize_;
-  }
-
 };
 
 }
\ No newline at end of file
diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp
index a4c0bb2ad..b57893daa 100644
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -453,6 +453,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
     if (sep++ != 0) { os << ", "; } else { os << " "; }
     os << "0x" << std::hex << instr.getImm() << std::dec;
   }
+#ifdef EXT_V_ENABLE
   if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) {
     // CSRs with immediate values
     if (sep++ != 0) { os << ", "; } else { os << " "; }
@@ -462,6 +463,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
   if (instr.getVattrMask() != 0) {
     print_vec_attr(os, instr);
   }
+#endif
   return os;
 }
 }
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index b834a87f2..7abec98c5 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -33,7 +33,9 @@ using namespace vortex;
 Emulator::warp_t::warp_t(const Arch& arch)
   : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
   , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
-  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(arch.vsize()))
+#ifdef EXT_V_ENABLE
+  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8)) // MAX_NUM_REGS vector registers, each VLEN bits (VLEN / 8 bytes) wide, as the removed Arch::vsize() provided
+#endif
   , uuid(0)
 {}
 
@@ -43,9 +45,11 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
   this->uuid = 0;
   this->fcsr = 0;
 
+#ifdef EXT_V_ENABLE
   this->vtype = {0, 0, 0, 0, 0};
   this->vl = 0;
   this->vlmax = 0;
+#endif
 
   for (auto& reg_file : this->ireg_file) {
     for (auto& reg : reg_file) {
@@ -68,6 +72,7 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
     }
   }
 
+#ifdef EXT_V_ENABLE
   for (auto& reg_file : this->vreg_file) {
     for (auto& reg : reg_file) {
     #ifndef NDEBUG
@@ -77,16 +82,7 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
     #endif
     }
   }
-
-  for (auto& reg_file : this->vreg_file) {
-    for (auto& reg : reg_file) {
-    #ifndef NDEBUG
-      reg = 0;
-    #else
-      reg = std::rand();
-    #endif
-    }
-  }
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -102,13 +98,17 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
     // considered to be big enough to hold input tiles for one output tile.
     // In future versions, scratchpad size should be fixed to an appropriate value.
     , scratchpad(std::vector<Word>(32 * 32 * 32768))
+  #ifdef EXT_V_ENABLE
     , csrs_(arch.num_warps())
+  #endif
 {
   std::srand(50);
 
+#ifdef EXT_V_ENABLE
   for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
     csrs_.at(i).resize(arch.num_threads());
   }
+#endif
 
   this->clear();
 }
@@ -490,6 +490,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
   case VX_CSR_FRM:        return (warps_.at(wid).fcsr >> 5);
   case VX_CSR_FCSR:       return warps_.at(wid).fcsr;
 
+#ifdef EXT_V_ENABLE
   // Vector CRSs
   case VX_CSR_VSTART:
     return csrs_.at(wid).at(tid)[VX_CSR_VSTART];
@@ -514,6 +515,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
     return csrs_.at(wid).at(tid)[VX_CSR_VTIME];
   case VX_CSR_VINSTRET:
     return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET];
+#endif
 
   case VX_CSR_MHARTID:    return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
   case VX_CSR_THREAD_ID:  return tid;
@@ -631,6 +633,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
     csr_mscratch_ = value;
     break;
 
+#ifdef EXT_V_ENABLE
   // Vector CRSs
   case VX_CSR_VSTART:
     csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value;
@@ -652,6 +655,7 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
     csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value;
     break;
   case VX_CSR_VLENB: // read only, set to VLEN / 8
+#endif
 
   case VX_CSR_SATP:
   #ifdef VM_ENABLE
diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp
index 436d43486..aae018fc5 100644
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -25,7 +25,9 @@
 #include "emulator.h"
 #include "instr.h"
 #include "core.h"
+#ifdef EXT_V_ENABLE
 #include "processor_impl.h"
+#endif
 #include "VX_types.h"
 
 using namespace vortex;
@@ -117,8 +119,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         }
         DPN(2, "}" << std::endl);
         break;
+    #ifdef EXT_V_ENABLE
       case RegType::Vector:
         break;
+    #endif
       default:
         break;
       }
@@ -707,11 +711,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         }
       }
       rd_write = true;
-    } else {
-    #ifdef EXT_V_ENABLE
+    }
+  #ifdef EXT_V_ENABLE
+    else {
       this->loadVector(instr, wid, rsdata);
-    #endif
     }
+  #endif
     break;
   }
   case Opcode::S:
@@ -744,11 +749,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
           std::abort();
         }
       }
-    } else {
-    #ifdef EXT_V_ENABLE
+    }
+  #ifdef EXT_V_ENABLE
+    else {
       this->storeVector(instr, wid, rsdata);
-    #endif
     }
+  #endif
     break;
   }
   case Opcode::AMO: {
@@ -932,7 +938,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
     for (uint32_t t = thread_start; t < num_threads; ++t) {
       if (!warp.tmask.test(t))
         continue;
-      uint32_t frm = (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, t, wid) : func3;
+      uint32_t frm = this->get_fpu_rm(func3, t, wid);
       uint32_t fflags = 0;
       switch (func7) {
       case 0x00: { // RV32F: FADD.S
@@ -1247,10 +1253,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
         break;
       }
       }
-      if (fflags) {
-        this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
-        this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
-      }
+      this->update_fcrs(fflags, t, wid);
     }
     rd_write = true;
     break;
@@ -1304,10 +1307,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
       default:
         break;
       }
-      if (fflags) {
-        this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, t, wid) | fflags, t, wid);
-        this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, t, wid) | fflags, t, wid);
-      }
+      this->update_fcrs(fflags, t, wid);
     }
     rd_write = true;
     break;
diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp
index 3df8b0e1a..d6ed15a25 100644
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -120,7 +120,9 @@ int main(int argc, char **argv) {
 #endif
     // run simulation
     // vector test exitcode is a special case
+  #ifdef EXT_V_ENABLE
     if (vector_test) return processor.run();
+  #endif
     // else continue as normal
     processor.run();
 
diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp
index 96fc49df9..a11351d03 100644
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -127,7 +127,9 @@ int ProcessorImpl::run() {
         done = false;
         continue;
       }
+    #ifdef EXT_V_ENABLE
       exitcode |= cluster->get_exitcode();
+    #endif
     }
     perf_mem_latency_ += perf_mem_pending_reads_;
   } while (!done);
diff --git a/sim/simx/execute_v.cpp b/sim/simx/vpu.cpp
similarity index 55%
rename from sim/simx/execute_v.cpp
rename to sim/simx/vpu.cpp
index d14338024..3a70560ec 100644
--- a/sim/simx/execute_v.cpp
+++ b/sim/simx/vpu.cpp
@@ -1,7 +1,8 @@
 // This is a fork of https://github.com/troibe/vortex/tree/simx-v2-vector
-// The purpose of this fork is to make the simx-v2-vector up to date with master
+// The purpose of this fork is to make simx-v2-vector up to date with master
 // Thanks to Troibe for his amazing work
 
+#ifdef EXT_V_ENABLE
 #include "emulator.h"
 #include "instr.h"
 #include "processor_impl.h"
@@ -10,2397 +11,10 @@
 #include <math.h>
 #include <rvfloats.h>
 #include <stdlib.h>
+#include "vpu.h"
 
 using namespace vortex;
 
-template <typename T, typename R>
-class Add {
-public:
-  static R apply(T first, T second, R) {
-    return (R)first + (R)second;
-  }
-  static std::string name() { return "Add"; }
-};
-
-template <typename T, typename R>
-class Sub {
-public:
-  static R apply(T first, T second, R) {
-    return (R)second - (R)first;
-  }
-  static std::string name() { return "Sub"; }
-};
-
-template <typename T, typename R>
-class Adc {
-public:
-  static R apply(T first, T second, R third) {
-    return (R)first + (R)second + third;
-  }
-  static std::string name() { return "Adc"; }
-};
-
-template <typename T, typename R>
-class Madc {
-public:
-  static R apply(T first, T second, R third) {
-    return ((R)first + (R)second + third) > (R)std::numeric_limits<T>::max();
-  }
-  static std::string name() { return "Madc"; }
-};
-
-template <typename T, typename R>
-class Sbc {
-public:
-  static R apply(T first, T second, R third) {
-    return (R)second - (R)first - third;
-  }
-  static std::string name() { return "Sbc"; }
-};
-
-template <typename T, typename R>
-class Msbc {
-public:
-  static R apply(T first, T second, R third) {
-    return (R)second < ((R)first + third);
-  }
-  static std::string name() { return "Msbc"; }
-};
-
-template <typename T, typename R>
-class Ssub {
-public:
-  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
-    // rounding mode is not relevant for this operation
-    T unclippedResult = second - first;
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Ssub"; }
-};
-
-template <typename T, typename R>
-class Ssubu {
-public:
-  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
-    // rounding mode is not relevant for this operation
-    if (first > second) {
-      vxsat_ = true;
-      return 0;
-    } else {
-      vxsat_ = false;
-      return second - first;
-    }
-  }
-  static std::string name() { return "Ssubu"; }
-};
-
-template <typename T, typename R>
-class Sadd {
-public:
-  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
-    // rounding mode is not relevant for this operation
-    T unclippedResult = second + first;
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Sadd"; }
-};
-
-template <typename T, typename R>
-class Rsub {
-public:
-  static R apply(T first, T second, R) {
-    return first - second;
-  }
-  static std::string name() { return "Rsub"; }
-};
-
-template <typename T, typename R>
-class Div {
-public:
-  static R apply(T first, T second, R) {
-    // logic taken from scalar div
-    if (first == 0) {
-      return -1;
-    } else if (second == std::numeric_limits<T>::min() && first == T(-1)) {
-      return second;
-    } else {
-      return (R)second / (R)first;
-    }
-  }
-  static std::string name() { return "Div"; }
-};
-
-template <typename T, typename R>
-class Rem {
-public:
-  static R apply(T first, T second, R) {
-    // logic taken from scalar rem
-    if (first == 0) {
-      return second;
-    } else if (second == std::numeric_limits<T>::min() && first == T(-1)) {
-      return 0;
-    } else {
-      return (R)second % (R)first;
-    }
-  }
-  static std::string name() { return "Rem"; }
-};
-
-template <typename T, typename R>
-class Mul {
-public:
-  static R apply(T first, T second, R) {
-    return (R)first * (R)second;
-  }
-  static std::string name() { return "Mul"; }
-};
-
-template <typename T, typename R>
-class Mulsu {
-public:
-  static R apply(T first, T second, R) {
-    R first_ext = zext((R)first, (sizeof(T) * 8));
-    return first_ext * (R)second;
-  }
-  static std::string name() { return "Mulsu"; }
-};
-
-template <typename T, typename R>
-class Mulh {
-public:
-  static R apply(T first, T second, R) {
-    __int128_t first_ext = sext((__int128_t)first, (sizeof(T) * 8));
-    __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) >> (sizeof(T) * 8);
-  }
-  static std::string name() { return "Mulh"; }
-};
-
-template <typename T, typename R>
-class Mulhsu {
-public:
-  static R apply(T first, T second, R) {
-    __int128_t first_ext = zext((__int128_t)first, (sizeof(T) * 8));
-    __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) >> (sizeof(T) * 8);
-  }
-  static std::string name() { return "Mulhsu"; }
-};
-
-template <typename T, typename R>
-class Mulhu {
-public:
-  static R apply(T first, T second, R) {
-    return ((__uint128_t)first * (__uint128_t)second) >> (sizeof(T) * 8);
-  }
-  static std::string name() { return "Mulhu"; }
-};
-
-template <typename T, typename R>
-class Madd {
-public:
-  static R apply(T first, T second, R third) {
-    return ((R)first * third) + (R)second;
-  }
-  static std::string name() { return "Madd"; }
-};
-
-template <typename T, typename R>
-class Nmsac {
-public:
-  static R apply(T first, T second, R third) {
-    return -((R)first * (R)second) + third;
-  }
-  static std::string name() { return "Nmsac"; }
-};
-
-template <typename T, typename R>
-class Macc {
-public:
-  static R apply(T first, T second, R third) {
-    return ((R)first * (R)second) + third;
-  }
-  static std::string name() { return "Macc"; }
-};
-
-template <typename T, typename R>
-class Maccsu {
-public:
-  static R apply(T first, T second, R third) {
-    R first_ext = sext((R)first, (sizeof(T) * 8));
-    R second_ext = zext((R)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) + third;
-  }
-  static std::string name() { return "Maccsu"; }
-};
-
-template <typename T, typename R>
-class Maccus {
-public:
-  static R apply(T first, T second, R third) {
-    R first_ext = zext((R)first, (sizeof(T) * 8));
-    R second_ext = sext((R)second, (sizeof(T) * 8));
-    return (first_ext * second_ext) + third;
-  }
-  static std::string name() { return "Maccus"; }
-};
-
-template <typename T, typename R>
-class Nmsub {
-public:
-  static R apply(T first, T second, R third) {
-    return -((R)first * third) + (R)second;
-  }
-  static std::string name() { return "Nmsub"; }
-};
-
-template <typename T, typename R>
-class Min {
-public:
-  static R apply(T first, T second, R) {
-    return std::min(first, second);
-  }
-  static std::string name() { return "Min"; }
-};
-
-template <typename T, typename R>
-class Max {
-public:
-  static R apply(T first, T second, R) {
-    return std::max(first, second);
-  }
-  static std::string name() { return "Max"; }
-};
-
-template <typename T, typename R>
-class And {
-public:
-  static R apply(T first, T second, R) {
-    return first & second;
-  }
-  static std::string name() { return "And"; }
-};
-
-template <typename T, typename R>
-class Or {
-public:
-  static R apply(T first, T second, R) {
-    return first | second;
-  }
-  static std::string name() { return "Or"; }
-};
-
-template <typename T, typename R>
-class Xor {
-public:
-  static R apply(T first, T second, R) {
-    return first ^ second;
-  }
-  static std::string name() { return "Xor"; }
-};
-
-template <typename T, typename R>
-class Sll {
-public:
-  static R apply(T first, T second, R) {
-    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
-    return second << (first & (sizeof(T) * 8 - 1));
-  }
-  static std::string name() { return "Sll"; }
-};
-
-template <typename T, typename R>
-bool bitAt(T value, R pos, R negOffset) {
-  R offsetPos = pos - negOffset;
-  return pos >= negOffset && ((value >> offsetPos) & 0x1);
-}
-
-template <typename T, typename R>
-bool anyBitUpTo(T value, R to, R negOffset) {
-  R offsetTo = to - negOffset;
-  return to >= negOffset && (value & (((R)1 << (offsetTo + 1)) - 1));
-}
-
-template <typename T, typename R>
-bool roundBit(T value, R shiftDown, uint32_t vxrm) {
-  switch (vxrm) {
-  case 0: // round-to-nearest-up
-    return bitAt(value, shiftDown, (R)1);
-  case 1: // round-to-nearest-even
-    return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0));
-  case 2: // round-down (truncate)
-    return 0;
-  case 3: // round-to-odd
-    return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1);
-  default:
-    std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl;
-    std::abort();
-  }
-}
-
-template <typename T, typename R>
-class SrlSra {
-public:
-  static R apply(T first, T second, R) {
-    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
-    return second >> (first & (sizeof(T) * 8 - 1));
-  }
-  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
-    // Saturation is not relevant for this operation
-    // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount.
-    T firstValid = first & (sizeof(T) * 8 - 1);
-    return apply(firstValid, second, 0) + roundBit(second, firstValid, vxrm);
-  }
-  static std::string name() { return "SrlSra"; }
-};
-
-template <typename T, typename R>
-class Aadd {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
-    // Saturation is not relevant for this operation
-    T sum = second + first;
-    return (sum >> 1) + roundBit(sum, 1, vxrm);
-  }
-  static std::string name() { return "Aadd"; }
-};
-
-template <typename T, typename R>
-class Asub {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t) {
-    // Saturation is not relevant for this operation
-    T difference = second - first;
-    return (difference >> 1) + roundBit(difference, 1, vxrm);
-  }
-  static std::string name() { return "Asub"; }
-};
-
-template <typename T, typename R>
-class Eq {
-public:
-  static R apply(T first, T second, R) {
-    return first == second;
-  }
-  static std::string name() { return "Eq"; }
-};
-
-template <typename T, typename R>
-class Ne {
-public:
-  static R apply(T first, T second, R) {
-    return first != second;
-  }
-  static std::string name() { return "Ne"; }
-};
-
-template <typename T, typename R>
-class Lt {
-public:
-  static R apply(T first, T second, R) {
-    return first > second;
-  }
-  static std::string name() { return "Lt"; }
-};
-
-template <typename T, typename R>
-class Le {
-public:
-  static R apply(T first, T second, R) {
-    return first >= second;
-  }
-  static std::string name() { return "Le"; }
-};
-
-template <typename T, typename R>
-class Gt {
-public:
-  static R apply(T first, T second, R) {
-    return first < second;
-  }
-  static std::string name() { return "Gt"; }
-};
-
-template <typename T, typename R>
-class AndNot {
-public:
-  static R apply(T first, T second, R) {
-    return second & ~first;
-  }
-  static std::string name() { return "AndNot"; }
-};
-
-template <typename T, typename R>
-class OrNot {
-public:
-  static R apply(T first, T second, R) {
-    return second | ~first;
-  }
-  static std::string name() { return "OrNot"; }
-};
-
-template <typename T, typename R>
-class Nand {
-public:
-  static R apply(T first, T second, R) {
-    return ~(second & first);
-  }
-  static std::string name() { return "Nand"; }
-};
-
-template <typename T, typename R>
-class Mv {
-public:
-  static R apply(T first, T, R) {
-    return first;
-  }
-  static std::string name() { return "Mv"; }
-};
-
-template <typename T, typename R>
-class Nor {
-public:
-  static R apply(T first, T second, R) {
-    return ~(second | first);
-  }
-  static std::string name() { return "Nor"; }
-};
-
-template <typename T, typename R>
-class Xnor {
-public:
-  static R apply(T first, T second, R) {
-    return ~(second ^ first);
-  }
-  static std::string name() { return "Xnor"; }
-};
-
-template <typename T, typename R>
-class Fadd {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fadd_s(first, second, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fadd_d(first_d, second_d, frm, &fflags);
-    } else {
-      std::cout << "Fadd only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fadd"; }
-};
-
-template <typename T, typename R>
-class Fsub {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fsub_s(second, first, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fsub_d(second_d, first_d, frm, &fflags);
-    } else {
-      std::cout << "Fsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsub"; }
-};
-
-template <typename T, typename R>
-class Fmacc {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fmadd_s(first, second, third, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fmadd_d(first_d, second_d, third, frm, &fflags);
-    } else {
-      std::cout << "Fmacc only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmacc"; }
-};
-
-template <typename T, typename R>
-class Fnmacc {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fnmadd_s(first, second, third, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fnmadd_d(first_d, second_d, third, frm, &fflags);
-    } else {
-      std::cout << "Fnmacc only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmacc"; }
-};
-
-template <typename T, typename R>
-class Fmsac {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
-    } else {
-      std::cout << "Fmsac only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmsac"; }
-};
-
-template <typename T, typename R>
-class Fnmsac {
-public:
-  static R apply(T first, T second, R third) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fnmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags);
-    } else {
-      std::cout << "Fnmsac only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmsac"; }
-};
-
-template <typename T, typename R>
-class Fmadd {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fmacc<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fmadd only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmadd"; }
-};
-
-template <typename T, typename R>
-class Fnmadd {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fnmacc<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fnmadd only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmadd"; }
-};
-
-template <typename T, typename R>
-class Fmsub {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fmsac<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fmsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmsub"; }
-};
-
-template <typename T, typename R>
-class Fnmsub {
-public:
-  static R apply(T first, T second, R third) {
-    if (sizeof(T) == 4 || sizeof(T) == 8) {
-      return Fnmsac<T, R>::apply(first, third, second);
-    } else {
-      std::cout << "Fnmsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fnmsub"; }
-};
-
-template <typename T, typename R>
-class Fmin {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring rounding modes for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fmin_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fmin_d(first, second, &fflags);
-    } else {
-      std::cout << "Fmin only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmin"; }
-};
-
-template <typename T, typename R>
-class Fmax {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring rounding modes for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fmax_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fmax_d(first, second, &fflags);
-    } else {
-      std::cout << "Fmax only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmax"; }
-};
-
-template <typename T, typename R>
-class Fsgnj {
-public:
-  static R apply(T first, T second, R) {
-    if (sizeof(T) == 4) {
-      return rv_fsgnj_s(second, first);
-    } else if (sizeof(T) == 8) {
-      return rv_fsgnj_d(second, first);
-    } else {
-      std::cout << "Fsgnj only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsgnj"; }
-};
-
-template <typename T, typename R>
-class Fsgnjn {
-public:
-  static R apply(T first, T second, R) {
-    if (sizeof(T) == 4) {
-      return rv_fsgnjn_s(second, first);
-    } else if (sizeof(T) == 8) {
-      return rv_fsgnjn_d(second, first);
-    } else {
-      std::cout << "Fsgnjn only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsgnjn"; }
-};
-
-template <typename T, typename R>
-class Fsgnjx {
-public:
-  static R apply(T first, T second, R) {
-    if (sizeof(T) == 4) {
-      return rv_fsgnjx_s(second, first);
-    } else if (sizeof(T) == 8) {
-      return rv_fsgnjx_d(second, first);
-    } else {
-      std::cout << "Fsgnjx only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fsgnjx"; }
-};
-
-template <typename T, typename R>
-class Fcvt {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      switch (first) {
-      case 0b00000: // vfcvt.xu.f.v
-        return rv_ftou_s(second, frm, &fflags);
-      case 0b00001: // vfcvt.x.f.v
-        return rv_ftoi_s(second, frm, &fflags);
-      case 0b00010: // vfcvt.f.xu.v
-        return rv_utof_s(second, frm, &fflags);
-      case 0b00011: // vfcvt.f.x.v
-        return rv_itof_s(second, frm, &fflags);
-      case 0b00110: // vfcvt.rtz.xu.f.v
-        return rv_ftou_s(second, 1, &fflags);
-      case 0b00111: // vfcvt.rtz.x.f.v
-        return rv_ftoi_s(second, 1, &fflags);
-      case 0b01000: // vfwcvt.xu.f.v
-        return rv_ftolu_s(second, frm, &fflags);
-      case 0b01001: // vfwcvt.x.f.v
-        return rv_ftol_s(second, frm, &fflags);
-      case 0b01010: // vfwcvt.f.xu.v
-        return rv_utof_d(second, frm, &fflags);
-      case 0b01011: // vfwcvt.f.x.v
-        return rv_itof_d(second, frm, &fflags);
-      case 0b01100: // vfwcvt.f.f.v
-        return rv_ftod(second);
-      case 0b01110: // vfwcvt.rtz.xu.f.v
-        return rv_ftolu_s(second, 1, &fflags);
-      case 0b01111: // vfwcvt.rtz.x.f.v
-        return rv_ftol_s(second, 1, &fflags);
-      default:
-        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else if (sizeof(T) == 8) {
-      switch (first) {
-      case 0b00000: // vfcvt.xu.f.v
-        return rv_ftolu_d(second, frm, &fflags);
-      case 0b00001: // vfcvt.x.f.v
-        return rv_ftol_d(second, frm, &fflags);
-      case 0b00010: // vfcvt.f.xu.v
-        return rv_lutof_d(second, frm, &fflags);
-      case 0b00011: // vfcvt.f.x.v
-        return rv_ltof_d(second, frm, &fflags);
-      case 0b00110: // vfcvt.rtz.xu.f.v
-        return rv_ftolu_d(second, 1, &fflags);
-      case 0b00111: // vfcvt.rtz.x.f.v
-        return rv_ftol_d(second, 1, &fflags);
-      case 0b01000: // vfwcvt.xu.f.v
-      case 0b01001: // vfwcvt.x.f.v
-      case 0b01010: // vfwcvt.f.xu.v
-      case 0b01011: // vfwcvt.f.x.v
-      case 0b01100: // vfwcvt.f.f.v
-      case 0b01110: // vfwcvt.rtz.xu.f.v
-      case 0b01111: // vfwcvt.rtz.x.f.v
-        std::cout << "Fwcvt only supports f32" << std::endl;
-        std::abort();
-      default:
-        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else {
-      std::cout << "Fcvt only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 8) {
-      switch (first) {
-      case 0b10000: // vfncvt.xu.f.w
-        return rv_ftou_d(second, vxrm, &fflags);
-      case 0b10001: // vfncvt.x.f.w
-        return rv_ftoi_d(second, vxrm, &fflags);
-      case 0b10010: // vfncvt.f.xu.w
-        return rv_lutof_s(second, vxrm, &fflags);
-      case 0b10011: // vfncvt.f.x.w
-        return rv_ltof_s(second, vxrm, &fflags);
-      case 0b10100: // vfncvt.f.f.w
-        return rv_dtof_r(second, vxrm);
-      case 0b10101: // vfncvt.rod.f.f.w
-        return rv_dtof_r(second, 6);
-      case 0b10110: // vfncvt.rtz.xu.f.w
-        return rv_ftou_d(second, 1, &fflags);
-      case 0b10111: // vfncvt.rtz.x.f.w
-        return rv_ftoi_d(second, 1, &fflags);
-      default:
-        std::cout << "Fncvt has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else {
-      std::cout << "Fncvt only supports f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fcvt"; }
-};
-
-template <typename T, typename R>
-class Funary1 {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      switch (first) {
-      case 0b00000: // vfsqrt.v
-        return rv_fsqrt_s(second, frm, &fflags);
-      case 0b00100: // vfrsqrt7.v
-        return rv_frsqrt7_s(second, frm, &fflags);
-      case 0b00101: // vfrec7.v
-        return rv_frecip7_s(second, frm, &fflags);
-      case 0b10000: // vfclass.v
-        return rv_fclss_s(second);
-      default:
-        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else if (sizeof(T) == 8) {
-      switch (first) {
-      case 0b00000: // vfsqrt.v
-        return rv_fsqrt_d(second, frm, &fflags);
-      case 0b00100: // vfrsqrt7.v
-        return rv_frsqrt7_d(second, frm, &fflags);
-      case 0b00101: // vfrec7.v
-        return rv_frecip7_d(second, frm, &fflags);
-      case 0b10000: // vfclass.v
-        return rv_fclss_d(second);
-      default:
-        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
-        std::abort();
-      }
-    } else {
-      std::cout << "Funary1 only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Funary1"; }
-};
-
-template <typename T, typename R>
-class Xunary0 {
-public:
-  static R apply(T, T second, T) {
-    return second;
-  }
-  static std::string name() { return "Xunary0"; }
-};
-
-template <typename T, typename R>
-class Feq {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_feq_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_feq_d(second, first, &fflags);
-    } else {
-      std::cout << "Feq only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Feq"; }
-};
-
-template <typename T, typename R>
-class Fle {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fle_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fle_d(second, first, &fflags);
-    } else {
-      std::cout << "Fle only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fle"; }
-};
-
-template <typename T, typename R>
-class Flt {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_flt_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_flt_d(second, first, &fflags);
-    } else {
-      std::cout << "Flt only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Flt"; }
-};
-
-template <typename T, typename R>
-class Fne {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return !rv_feq_s(second, first, &fflags);
-    } else if (sizeof(T) == 8) {
-      return !rv_feq_d(second, first, &fflags);
-    } else {
-      std::cout << "Fne only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fne"; }
-};
-
-template <typename T, typename R>
-class Fgt {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_flt_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_flt_d(first, second, &fflags);
-    } else {
-      std::cout << "Fgt only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fgt"; }
-};
-
-template <typename T, typename R>
-class Fge {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    if (sizeof(T) == 4) {
-      return rv_fle_s(first, second, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fle_d(first, second, &fflags);
-    } else {
-      std::cout << "Fge only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fge"; }
-};
-
-template <typename T, typename R>
-class Fdiv {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      return rv_fdiv_s(second, first, frm, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fdiv_d(second, first, frm, &fflags);
-    } else {
-      std::cout << "Fdiv only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fdiv"; }
-};
-
-template <typename T, typename R>
-class Frdiv {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      return rv_fdiv_s(first, second, frm, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fdiv_d(first, second, frm, &fflags);
-    } else {
-      std::cout << "Frdiv only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Frdiv"; }
-};
-
-template <typename T, typename R>
-class Fmul {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(R) == 4) {
-      return rv_fmul_s(first, second, frm, &fflags);
-    } else if (sizeof(R) == 8) {
-      uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first);
-      uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second);
-      return rv_fmul_d(first_d, second_d, frm, &fflags);
-    } else {
-      std::cout << "Fmul only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Fmul"; }
-};
-
-template <typename T, typename R>
-class Frsub {
-public:
-  static R apply(T first, T second, R) {
-    // ignoring flags for now
-    uint32_t fflags = 0;
-    // ignoring rounding mode for now
-    uint32_t frm = 0;
-    if (sizeof(T) == 4) {
-      return rv_fsub_s(first, second, frm, &fflags);
-    } else if (sizeof(T) == 8) {
-      return rv_fsub_d(first, second, frm, &fflags);
-    } else {
-      std::cout << "Frsub only supports f32 and f64" << std::endl;
-      std::abort();
-    }
-  }
-  static std::string name() { return "Frsub"; }
-};
-
-template <typename T, typename R>
-class Clip {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
-    // The low lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the low 6 bits for a SEW=64-bit to
-    // SEW=32-bit narrowing operation) are used to control the right shift amount, which provides the scaling.
-    R firstValid = first & (sizeof(T) * 8 - 1);
-    T unclippedResult = (second >> firstValid) + roundBit(second, firstValid, vxrm);
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Clip"; }
-};
-
-template <typename T, typename R>
-class Smul {
-public:
-  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
-    R shift = sizeof(R) * 8 - 1;
-    T unshiftedResult = first * second;
-    T unclippedResult = (unshiftedResult >> shift) + roundBit(unshiftedResult, shift, vxrm);
-    R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits<R>::min(), (T)std::numeric_limits<R>::max());
-    vxsat_ |= clippedResult != unclippedResult;
-    return clippedResult;
-  }
-  static std::string name() { return "Smul"; }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) {
-  auto &mask = vreg_file.at(maskVreg);
-  uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8);
-  uint8_t value = (emask >> (byteI % 8)) & 0x1;
-  DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value);
-  return !vmask && value == 0;
-}
-
-template <typename DT>
-uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) {
-  uint32_t vsew = sizeof(DT) * 8;
-  return (baseVreg + (byteI / (VLEN / vsew))) % 32;
-}
-
-template <typename DT>
-DT &getVregData(std::vector<vortex::Byte> &baseVregVec, uint32_t byteI) {
-  uint32_t vsew = sizeof(DT) * 8;
-  return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8);
-}
-
-template <typename DT>
-DT &getVregData(std::vector<std::vector<vortex::Byte>> &vreg_file, uint32_t baseVreg, uint32_t byteI) {
-  auto &vr1 = vreg_file.at(getVreg<DT>(baseVreg, byteI));
-  return getVregData<DT>(vr1, byteI);
-}
-
-template <typename DT>
-void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  if (nfields * emul > 8) {
-    std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
-    std::abort();
-  }
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    uint32_t nfields_strided = strided ? nfields : 1;
-    Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
-    Word mem_data = 0;
-    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(4, "Previous data: " << +result);
-    result = (DT)mem_data;
-  }
-}
-
-void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_load<uint8_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vix_load<uint16_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vix_load<uint32_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vix_load<uint64_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  if (nfields * emul > 8) {
-    std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
-    std::abort();
-  }
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    Word offset = 0;
-    switch (iSew) {
-    case 8:
-      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 16:
-      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 32:
-      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 64:
-      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    default:
-      std::cout << "Unsupported iSew: " << iSew << std::endl;
-      std::abort();
-    }
-
-    Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
-    Word mem_data = 0;
-    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(4, "Previous data: " << +result);
-    result = (DT)mem_data;
-  }
-}
-
-void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_load<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vv_load<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vv_load<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vv_load<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    uint32_t nfields_strided = strided ? nfields : 1;
-    Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
-    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
-  }
-}
-
-void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_store<uint8_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vix_store<uint16_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vix_store<uint32_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vix_store<uint64_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    Word offset = 0;
-    switch (iSew) {
-    case 8:
-      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 16:
-      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 32:
-      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 64:
-      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    default:
-      std::cout << "Unsupported iSew: " << iSew << std::endl;
-      std::abort();
-    }
-
-    Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT);
-    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
-  }
-}
-
-void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_store<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vv_store<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vv_store<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vv_store<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DT third = getVregData<DT>(vreg_file, rdest, i);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    bool third = !isMasked(vreg_file, 0, i, false);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_carry(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_carry<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  case 16:
-    vector_op_vix_carry<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  case 32:
-    vector_op_vix_carry<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  case 64:
-    vector_op_vix_carry<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX carry for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
-    bool result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vix_carry_out(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_carry_out<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_carry_out<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_carry_out<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix_carry_out<OP, DT64, DT128>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX carry out for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT result = isMasked(vreg_file, 0, i, vmask) ? getVregData<DT>(vreg_file, rsrc0, i) : first;
-    DP(4, "Merge - Choosing result: " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_merge(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_merge<DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_merge<DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_merge<DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix_merge<DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_scalar(DT &dest, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t vsew) {
-  if (rsrc0 != 0) {
-    std::cout << "Vwxunary0/Vwfunary0 has unsupported value for vs2: " << rsrc0 << std::endl;
-    std::abort();
-  }
-  switch (vsew) {
-  case 8:
-    dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
-    break;
-  case 16:
-    dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
-    break;
-  case 32:
-    dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
-    break;
-  case 64:
-    dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
-    break;
-  default:
-    std::cout << "Failed to execute vmv.x.s/vfmv.f.s for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_w(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_w<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_w<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_w<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX widening for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_wx(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX widening wx for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_n(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_n<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vix_n<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vix_n<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX narrowing for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DTR>(vreg_file, rsrc0, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vix_sat(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_sat<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vix_sat<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vix_sat<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vix_sat<OP, DT128, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX saturating for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_scale(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_sat<OP, DT8, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vix_sat<OP, DT16, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vix_sat<OP, DT32, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vix_sat<OP, DT64, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX scale for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP>
-void vector_op_vix_ext(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  if (vsew == 16) {
-    switch (src1) {
-    case 0b00110: // vzext.vf2
-      vector_op_vix_w<OP, uint8_t, uint16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00111: // vsext.vf2
-      vector_op_vix_w<OP, int8_t, int16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    default:
-      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
-      std::abort();
-    }
-  } else if (vsew == 32) {
-    switch (src1) {
-    case 0b00100: // vzext.vf4
-      vector_op_vix_w<OP, uint8_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00101: // vsext.vf4
-      vector_op_vix_w<OP, int8_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00110: // vzext.vf2
-      vector_op_vix_w<OP, uint16_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00111: // vsext.vf2
-      vector_op_vix_w<OP, int16_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    default:
-      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
-      std::abort();
-    }
-  } else if (vsew == 64) {
-    switch (src1) {
-    case 0b00010: // vzext.vf8
-      vector_op_vix_w<OP, uint8_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00011: // vsext.vf8
-      vector_op_vix_w<OP, int8_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00100: // vzext.vf4
-      vector_op_vix_w<OP, uint16_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00101: // vsext.vf4
-      vector_op_vix_w<OP, int16_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00110: // vzext.vf2
-      vector_op_vix_w<OP, uint32_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    case 0b00111: // vsext.vf2
-      vector_op_vix_w<OP, int32_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-      break;
-    default:
-      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
-      std::abort();
-    }
-  } else {
-    std::cout << "Failed to execute Xunary0 for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    bool result = OP<DT, bool>::apply(first, second, 0);
-    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_mask(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_mask<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vix_mask<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vix_mask<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vix_mask<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX integer/float compare mask for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
-  // If vlmax > 0 this means we have a vslidedown instruction, vslideup does not require vlmax
-  bool slideDown = vlmax;
-  uint32_t scalarPos = slideDown ? vl - 1 : 0;
-  // If scalar set is set this means we have a v(f)slide1up or v(f)slide1down instruction,
-  // so first is our scalar value and we need to overwrite it with 1 for later computations
-  if (scalar && vl && !isMasked(vreg_file, 0, scalarPos, vmask)) {
-    DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
-    getVregData<DT>(vreg_file, rdest, scalarPos) = first;
-  }
-  first = scalar ? 1 : first;
-
-  for (Word i = slideDown ? 0 : first; i < vl - (scalar && vl && slideDown); i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    __uint128_t iSrc = slideDown ? (__uint128_t)i + (__uint128_t)first : (__uint128_t)i - (__uint128_t)first; // prevent overflows/underflows
-    DT value = (!slideDown || iSrc < vlmax) ? getVregData<DT>(vreg_file, rsrc0, iSrc) : 0;
-    DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_slide(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_slide<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  case 16:
-    vector_op_vix_slide<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  case 32:
-    vector_op_vix_slide<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  case 64:
-    vector_op_vix_slide<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX slide for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask) {
-  for (Word i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc0, first) : 0;
-    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vix_gather(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_gather<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  case 16:
-    vector_op_vix_gather<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  case 32:
-    vector_op_vix_gather<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  case 64:
-    vector_op_vix_gather<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VI/VX register gather for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DT third = getVregData<DT>(vreg_file, rdest, i);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    bool third = !isMasked(vreg_file, 0, i, false);
-    DT result = OP<DT, DT>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_carry<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 16:
-    vector_op_vv_carry<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 32:
-    vector_op_vv_carry<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 64:
-    vector_op_vv_carry<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  default:
-    std::cout << "Failed to execute VV carry for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
-    bool result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_carry_out<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_carry_out<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_carry_out<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_carry_out<OP, DT64, DT128>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV carry out for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    uint32_t rsrc = isMasked(vreg_file, 0, i, vmask) ? rsrc1 : rsrc0;
-    DT result = getVregData<DT>(vreg_file, rsrc, i);
-    DP(4, "Merge - Choosing result: " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_merge<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_merge<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_merge<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_merge<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
-  for (Word i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    uint32_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
-    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc1, first) : 0;
-    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_gather<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  case 16:
-    vector_op_vv_gather<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  case 32:
-    vector_op_vv_gather<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  case 64:
-    vector_op_vv_gather<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV register gather for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DT, DTR>::apply(first, second, third);
-    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV widening for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DTR, DTR>::apply(first, second, third);
-    DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_wv<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_wv<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_wv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV widening wv for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
-    DTR result = OP<DTR, DTR>::apply(rv_ftod(first), second, third);
-    DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  if (vsew == 32) {
-    vector_op_vv_wfv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-  } else {
-    std::cout << "Failed to execute VV widening wfv for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_n<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vv_n<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vv_n<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VV narrowing for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DTR>(vreg_file, rsrc0, i);
-    DT second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
-    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
-void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_sat<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vv_sat<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vv_sat<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vv_sat<OP, DT128, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VV saturating for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_scale(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_sat<OP, DT8, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 16:
-    vector_op_vv_sat<OP, DT16, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 32:
-    vector_op_vv_sat<OP, DT32, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  case 64:
-    vector_op_vv_sat<OP, DT64, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
-    break;
-  default:
-    std::cout << "Failed to execute VV scale for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    // use rdest as accumulator
-    if (i == 0) {
-      getVregData<DT>(vreg_file, rdest, 0) = getVregData<DT>(vreg_file, rsrc0, 0);
-    }
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DT result = OP<DT, DT>::apply(first, second, 0);
-    DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, 0) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_red<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_red<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_red<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_red<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV reduction for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    // use rdest as accumulator
-    if (i == 0) {
-      getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
-    }
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR second_w = std::is_signed<DT>() ? sext((DTR)second, sizeof(DT) * 8) : zext((DTR)second, sizeof(DT) * 8);
-    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
-    DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, 0) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_red_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_red_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_red_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV widening reduction for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
-void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    // use rdest as accumulator
-    if (i == 0) {
-      getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
-    }
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR second_w = rv_ftod(second);
-    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
-    DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, 0) = result;
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  if (vsew == 32) {
-    vector_op_vv_red_wf<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-  } else {
-    std::cout << "Failed to execute VV float widening reduction for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DP(4, "Element Index = " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = i;
-  }
-}
-
-void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vid<uint8_t>(vreg_file, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vid<uint16_t>(vreg_file, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vid<uint32_t>(vreg_file, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vid<uint64_t>(vreg_file, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute vector element index for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT>
-void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
-  for (uint32_t i = 0; i < vl; i++) {
-    if (isMasked(vreg_file, 0, i, vmask))
-      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    bool result = OP<DT, bool>::apply(first, second, 0);
-    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_mask<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 16:
-    vector_op_vv_mask<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 32:
-    vector_op_vv_mask<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  case 64:
-    vector_op_vv_mask<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VV integer/float compare mask for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <template <typename DT1, typename DT2> class OP>
-void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
-  for (uint32_t i = 0; i < vl; i++) {
-    uint8_t firstMask = getVregData<uint8_t>(vreg_file, rsrc0, i / 8);
-    bool first = (firstMask >> (i % 8)) & 0x1;
-    uint8_t secondMask = getVregData<uint8_t>(vreg_file, rsrc1, i / 8);
-    bool second = (secondMask >> (i % 8)) & 0x1;
-    bool result = OP<uint8_t, uint8_t>::apply(first, second, 0) & 0x1;
-    DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
-    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
-    }
-  }
-}
-
-template <typename DT>
-void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
-  int currPos = 0;
-  for (uint32_t i = 0; i < vl; i++) {
-    // Special case: use rsrc0 as mask vector register instead of default v0
-    // This instruction is always masked (vmask == 0), but encoded as unmasked (vmask == 1)
-    if (isMasked(vreg_file, rsrc0, i, 0))
-      continue;
-
-    DT value = getVregData<DT>(vreg_file, rsrc1, i);
-    DP(4, "Compression - Moving value " << +value << " from position " << i << " to position " << currPos);
-    getVregData<DT>(vreg_file, rdest, currPos) = value;
-    currPos++;
-  }
-}
-
-template <typename DT8, typename DT16, typename DT32, typename DT64>
-void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_compress<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 16:
-    vector_op_vv_compress<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 32:
-    vector_op_vv_compress<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  case 64:
-    vector_op_vv_compress<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
-    break;
-  default:
-    std::cout << "Failed to execute VV compression for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
 void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
   auto &warp = warps_.at(wid);
   auto vmask = instr.getVmask();
@@ -4864,3 +2478,4 @@ void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector<reg_d
     std::abort();
   }
 }
+#endif
\ No newline at end of file
diff --git a/sim/simx/vpu.h b/sim/simx/vpu.h
new file mode 100644
index 000000000..9ea9ec389
--- /dev/null
+++ b/sim/simx/vpu.h
@@ -0,0 +1,2393 @@
+#ifdef EXT_V_ENABLE
+#pragma once
+
+using namespace vortex;
+
+// Addition functor: converts both operands to R and returns their sum.
+template <typename T, typename R>
+struct Add {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first) + static_cast<R>(second);
+  }
+  static std::string name() { return "Add"; }
+};
+
+// Subtraction functor: returns second - first, each converted to R beforehand.
+template <typename T, typename R>
+struct Sub {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(second) - static_cast<R>(first);
+  }
+  static std::string name() { return "Sub"; }
+};
+
+// Add-with-carry functor: first + second (both converted to R) plus the carry-in `third`.
+template <typename T, typename R>
+struct Adc {
+  static R apply(T first, T second, R third) {
+    return static_cast<R>(first) + static_cast<R>(second) + third;
+  }
+  static std::string name() { return "Adc"; }
+};
+
+// Carry-out of first + second + third: nonzero when the sum (operands converted to R) exceeds T's maximum.
+template <typename T, typename R>
+struct Madc {
+  static R apply(T first, T second, R third) {
+    return (static_cast<R>(first) + static_cast<R>(second) + third) > static_cast<R>(std::numeric_limits<T>::max());
+  }
+  static std::string name() { return "Madc"; }
+};
+
+// Subtract-with-borrow functor: second - first (both converted to R) minus the borrow-in `third`.
+template <typename T, typename R>
+struct Sbc {
+  static R apply(T first, T second, R third) {
+    return static_cast<R>(second) - static_cast<R>(first) - third;
+  }
+  static std::string name() { return "Sbc"; }
+};
+
+// Borrow-out of second - first - third: nonzero when second is smaller than first + third.
+template <typename T, typename R>
+struct Msbc {
+  static R apply(T first, T second, R third) {
+    return static_cast<R>(second) < (static_cast<R>(first) + third);
+  }
+  static std::string name() { return "Msbc"; }
+};
+
+// Saturating subtract: second - first is computed in T and then clamped to the range of R.
+template <typename T, typename R>
+struct Ssub {
+  // The third argument (rounding mode) is ignored; vxsat_ is only ever OR-ed, never cleared.
+  static R apply(T first, T second, uint32_t /*unused*/, uint32_t &vxsat_) {
+    const T raw = second - first;
+    const R saturated = std::clamp(raw, static_cast<T>(std::numeric_limits<R>::min()), static_cast<T>(std::numeric_limits<R>::max()));
+    vxsat_ |= saturated != raw;
+    return saturated;
+  }
+  static std::string name() { return "Ssub"; }
+};
+
+// Saturating unsigned subtract: returns second - first, or 0 when first is larger than second.
+template <typename T, typename R>
+class Ssubu {
+public:
+  // The third argument (rounding mode) is ignored. vxsat_ is sticky, as in Ssub/Sadd/Clip/Smul.
+  static R apply(T first, T second, uint32_t, uint32_t &vxsat_) {
+    if (first > second) {
+      // Underflow: saturate to 0 and record it without clearing saturation seen on earlier elements.
+      vxsat_ |= 1;
+      return 0;
+    }
+    return second - first;
+  }
+  static std::string name() { return "Ssubu"; }
+};
+
+// Saturating add: second + first is computed in T and then clamped to the range of R.
+template <typename T, typename R>
+struct Sadd {
+  // The third argument (rounding mode) is ignored; vxsat_ is only ever OR-ed, never cleared.
+  static R apply(T first, T second, uint32_t /*unused*/, uint32_t &vxsat_) {
+    const T raw = second + first;
+    const R saturated = std::clamp(raw, static_cast<T>(std::numeric_limits<R>::min()), static_cast<T>(std::numeric_limits<R>::max()));
+    vxsat_ |= saturated != raw;
+    return saturated;
+  }
+  static std::string name() { return "Sadd"; }
+};
+
+// Reverse subtraction functor: returns first - second (operand order opposite to Sub).
+template <typename T, typename R>
+struct Rsub {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first - second);
+  }
+  static std::string name() { return "Rsub"; }
+};
+
+// Division functor computing second / first, with the special cases taken from the scalar div logic.
+template <typename T, typename R>
+struct Div {
+  static R apply(T first, T second, R /*unused*/) {
+    // A zero divisor yields -1 converted to R.
+    if (first == 0)
+      return -1;
+    // T's minimum divided by T(-1) returns the dividend unchanged instead of evaluating the division.
+    const bool overflowCase = (second == std::numeric_limits<T>::min()) && (first == T(-1));
+    if (overflowCase)
+      return second;
+    return static_cast<R>(second) / static_cast<R>(first);
+  }
+  static std::string name() { return "Div"; }
+};
+
+// Remainder functor computing second % first, with the special cases taken from the scalar rem logic.
+template <typename T, typename R>
+struct Rem {
+  static R apply(T first, T second, R /*unused*/) {
+    // A zero divisor returns the dividend unchanged.
+    if (first == 0)
+      return second;
+    // T's minimum with divisor T(-1) returns 0 instead of evaluating the % operator.
+    const bool overflowCase = (second == std::numeric_limits<T>::min()) && (first == T(-1));
+    if (overflowCase)
+      return 0;
+    return static_cast<R>(second) % static_cast<R>(first);
+  }
+  static std::string name() { return "Rem"; }
+};
+
+// Multiplication functor: converts both operands to R and returns their product.
+template <typename T, typename R>
+struct Mul {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first) * static_cast<R>(second);
+  }
+  static std::string name() { return "Mul"; }
+};
+
+// Multiply where `first` goes through zext() at T's bit width while `second` is only converted to R.
+template <typename T, typename R>
+struct Mulsu {
+  static R apply(T first, T second, R /*unused*/) {
+    const R unsignedFirst = zext(static_cast<R>(first), (sizeof(T) * 8));
+    return unsignedFirst * static_cast<R>(second);
+  }
+  static std::string name() { return "Mulsu"; }
+};
+
+// High part of first * second: both operands go through sext() at T's bit width, the 128-bit product is shifted right by that width.
+template <typename T, typename R>
+struct Mulh {
+  static R apply(T first, T second, R /*unused*/) {
+    const __int128_t lhs = sext(static_cast<__int128_t>(first), (sizeof(T) * 8));
+    const __int128_t rhs = sext(static_cast<__int128_t>(second), (sizeof(T) * 8));
+    return (lhs * rhs) >> (sizeof(T) * 8);
+  }
+  static std::string name() { return "Mulh"; }
+};
+
+// High part of first * second where `first` goes through zext() and `second` through sext(), both at T's bit width.
+template <typename T, typename R>
+struct Mulhsu {
+  static R apply(T first, T second, R /*unused*/) {
+    const __int128_t lhs = zext(static_cast<__int128_t>(first), (sizeof(T) * 8));
+    const __int128_t rhs = sext(static_cast<__int128_t>(second), (sizeof(T) * 8));
+    return (lhs * rhs) >> (sizeof(T) * 8);
+  }
+  static std::string name() { return "Mulhsu"; }
+};
+
+// High part of the unsigned product: first * second formed in 128 bits, then shifted right by T's bit width.
+template <typename T, typename R>
+struct Mulhu {
+  static R apply(T first, T second, R /*unused*/) {
+    return (static_cast<__uint128_t>(first) * static_cast<__uint128_t>(second)) >> (sizeof(T) * 8);
+  }
+  static std::string name() { return "Mulhu"; }
+};
+
+// Multiply-add functor: (first * third) + second, with first and second converted to R.
+template <typename T, typename R>
+struct Madd {
+  static R apply(T first, T second, R third) {
+    return (static_cast<R>(first) * third) + static_cast<R>(second);
+  }
+  static std::string name() { return "Madd"; }
+};
+
+// Negated multiply-accumulate functor: -(first * second) + third, with first and second converted to R.
+template <typename T, typename R>
+struct Nmsac {
+  static R apply(T first, T second, R third) {
+    return -(static_cast<R>(first) * static_cast<R>(second)) + third;
+  }
+  static std::string name() { return "Nmsac"; }
+};
+
+// Multiply-accumulate functor: (first * second) + third, with first and second converted to R.
+template <typename T, typename R>
+struct Macc {
+  static R apply(T first, T second, R third) {
+    return (static_cast<R>(first) * static_cast<R>(second)) + third;
+  }
+  static std::string name() { return "Macc"; }
+};
+
+// Multiply-accumulate where `first` goes through sext() and `second` through zext() (both at T's bit width) before adding `third`.
+template <typename T, typename R>
+struct Maccsu {
+  static R apply(T first, T second, R third) {
+    const R lhs = sext(static_cast<R>(first), (sizeof(T) * 8));
+    const R rhs = zext(static_cast<R>(second), (sizeof(T) * 8));
+    return (lhs * rhs) + third;
+  }
+  static std::string name() { return "Maccsu"; }
+};
+
+// Multiply-accumulate where `first` goes through zext() and `second` through sext() (both at T's bit width) before adding `third`.
+template <typename T, typename R>
+struct Maccus {
+  static R apply(T first, T second, R third) {
+    const R lhs = zext(static_cast<R>(first), (sizeof(T) * 8));
+    const R rhs = sext(static_cast<R>(second), (sizeof(T) * 8));
+    return (lhs * rhs) + third;
+  }
+  static std::string name() { return "Maccus"; }
+};
+
+// Negated multiply-subtract functor: -(first * third) + second, with first and second converted to R.
+template <typename T, typename R>
+struct Nmsub {
+  static R apply(T first, T second, R third) {
+    return -(static_cast<R>(first) * third) + static_cast<R>(second);
+  }
+  static std::string name() { return "Nmsub"; }
+};
+
+// Minimum functor: the smaller of the two operands (compared as T), converted to R.
+template <typename T, typename R>
+struct Min {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(std::min(first, second));
+  }
+  static std::string name() { return "Min"; }
+};
+
+// Maximum functor: the larger of the two operands (compared as T), converted to R.
+template <typename T, typename R>
+struct Max {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(std::max(first, second));
+  }
+  static std::string name() { return "Max"; }
+};
+
+// Bitwise AND functor over the two operands.
+template <typename T, typename R>
+struct And {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first & second);
+  }
+  static std::string name() { return "And"; }
+};
+
+// Bitwise OR functor over the two operands.
+template <typename T, typename R>
+struct Or {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first | second);
+  }
+  static std::string name() { return "Or"; }
+};
+
+// Bitwise XOR functor over the two operands.
+template <typename T, typename R>
+struct Xor {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first ^ second);
+  }
+  static std::string name() { return "Xor"; }
+};
+
+// Left-shift functor: shifts `second` by `first`, keeping only the low lg2(bit width of T) bits of the amount.
+template <typename T, typename R>
+struct Sll {
+  static R apply(T first, T second, R /*unused*/) {
+    const auto shamt = first & (sizeof(T) * 8 - 1);
+    return second << shamt;
+  }
+  static std::string name() { return "Sll"; }
+};
+
+// Returns bit (pos - negOffset) of value, or false when pos is smaller than negOffset.
+template <typename T, typename R>
+bool bitAt(T value, R pos, R negOffset) {
+  return !(pos < negOffset) && ((value >> static_cast<R>(pos - negOffset)) & 0x1) != 0;
+}
+
+template <typename T, typename R> // true iff any bit of value in [0, to - negOffset] is set; false when to < negOffset
+bool anyBitUpTo(T value, R to, R negOffset) {
+  // The mask is built in 128 bits: R may be narrower than T, where (R)1 << n would shift past R's width.
+  return to >= negOffset && ((__uint128_t)value & ((((__uint128_t)1) << ((to - negOffset) + 1)) - 1)) != 0;
+}
+
+// Rounding increment (0 or 1) to add after shifting `value` right by `shiftDown` bits.
+// vxrm selects the mode; each mode is decided with bitAt() / anyBitUpTo() on `value`.
+template <typename T, typename R>
+bool roundBit(T value, R shiftDown, uint32_t vxrm) {
+  if (vxrm == 0) // round-to-nearest-up
+    return bitAt(value, shiftDown, (R)1);
+  if (vxrm == 1) // round-to-nearest-even
+    return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0));
+  if (vxrm == 2) // round-down (truncate)
+    return false;
+  if (vxrm == 3) // round-to-odd
+    return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1);
+  // Any other mode value is reported and aborts the process.
+  std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl;
+  std::abort();
+}
+
+// Right-shift functor: shifts `second` by `first`, keeping only the low lg2(bit width of T) bits of the amount.
+// The shift itself is T's built-in >> operator.
+template <typename T, typename R>
+struct SrlSra {
+  // Plain shift; the third argument is ignored.
+  static R apply(T first, T second, R /*unused*/) {
+    return second >> (first & (sizeof(T) * 8 - 1));
+  }
+  // Rounding shift: adds the roundBit() increment selected by vxrm; the saturation argument is ignored.
+  static R apply(T first, T second, uint32_t vxrm, uint32_t /*unused*/) {
+    const T shamt = first & (sizeof(T) * 8 - 1);
+    return apply(shamt, second, 0) + roundBit(second, shamt, vxrm);
+  }
+  static std::string name() { return "SrlSra"; }
+};
+
+// Averaging add: (second + first) >> 1 plus the roundBit() increment for vxrm; the saturation argument is ignored.
+template <typename T, typename R>
+struct Aadd {
+  static R apply(T first, T second, uint32_t vxrm, uint32_t /*unused*/) {
+    const T total = second + first;
+    const bool increment = roundBit(total, 1, vxrm);
+    return (total >> 1) + increment;
+  }
+  static std::string name() { return "Aadd"; }
+};
+
+// Averaging subtract: (second - first) >> 1 plus the roundBit() increment for vxrm; the saturation argument is ignored.
+template <typename T, typename R>
+struct Asub {
+  static R apply(T first, T second, uint32_t vxrm, uint32_t /*unused*/) {
+    const T delta = second - first;
+    const bool increment = roundBit(delta, 1, vxrm);
+    return (delta >> 1) + increment;
+  }
+  static std::string name() { return "Asub"; }
+};
+
+// Equality compare functor: nonzero when the two operands are equal.
+template <typename T, typename R>
+struct Eq {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first == second);
+  }
+  static std::string name() { return "Eq"; }
+};
+
+// Inequality compare functor: nonzero when the two operands differ.
+template <typename T, typename R>
+struct Ne {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(first != second);
+  }
+  static std::string name() { return "Ne"; }
+};
+
+// Less-than compare functor: nonzero when second is smaller than first.
+template <typename T, typename R>
+struct Lt {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(second < first);
+  }
+  static std::string name() { return "Lt"; }
+};
+
+// Less-or-equal compare functor: nonzero when second is smaller than or equal to first.
+template <typename T, typename R>
+struct Le {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(second <= first);
+  }
+  static std::string name() { return "Le"; }
+};
+
+// Greater-than compare functor: nonzero when second is larger than first.
+template <typename T, typename R>
+struct Gt {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(second > first);
+  }
+  static std::string name() { return "Gt"; }
+};
+
+// AND-NOT functor: second AND the bitwise complement of first.
+template <typename T, typename R>
+struct AndNot {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(second & ~first);
+  }
+  static std::string name() { return "AndNot"; }
+};
+
+// OR-NOT functor: second OR the bitwise complement of first.
+template <typename T, typename R>
+struct OrNot {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(second | ~first);
+  }
+  static std::string name() { return "OrNot"; }
+};
+
+// NAND functor: bitwise complement of (second AND first).
+template <typename T, typename R>
+struct Nand {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(~(second & first));
+  }
+  static std::string name() { return "Nand"; }
+};
+
+// Move functor: returns `first` converted to R; the other two arguments are ignored.
+template <typename T, typename R>
+struct Mv {
+  static R apply(T first, T /*unused*/, R /*unused*/) {
+    return static_cast<R>(first);
+  }
+  static std::string name() { return "Mv"; }
+};
+
+// NOR functor: bitwise complement of (second OR first).
+template <typename T, typename R>
+struct Nor {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(~(second | first));
+  }
+  static std::string name() { return "Nor"; }
+};
+
+// XNOR functor: bitwise complement of (second XOR first).
+template <typename T, typename R>
+struct Xnor {
+  static R apply(T first, T second, R /*unused*/) {
+    return static_cast<R>(~(second ^ first));
+  }
+  static std::string name() { return "Xnor"; }
+};
+
+// Floating-point add: dispatches on sizeof(R) to rv_fadd_s (4 bytes) or rv_fadd_d (8 bytes); other sizes abort.
+template <typename T, typename R>
+struct Fadd {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fadd_s(first, second, frm, &fflags);
+    case 8: {
+      uint64_t lhs = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t rhs = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fadd_d(lhs, rhs, frm, &fflags);
+    }
+    default:
+      std::cout << "Fadd only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fadd"; }
+};
+
+// Floating-point subtract (second - first): dispatches on sizeof(R) to rv_fsub_s or rv_fsub_d; other sizes abort.
+template <typename T, typename R>
+struct Fsub {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fsub_s(second, first, frm, &fflags);
+    case 8: {
+      uint64_t subtrahend = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t minuend = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fsub_d(minuend, subtrahend, frm, &fflags);
+    }
+    default:
+      std::cout << "Fsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fsub"; }
+};
+
+// Fused multiply-accumulate: passes (first, second, third) to rv_fmadd_s or rv_fmadd_d, chosen by sizeof(R).
+template <typename T, typename R>
+struct Fmacc {
+  static R apply(T first, T second, R third) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fmadd_s(first, second, third, frm, &fflags);
+    case 8: {
+      uint64_t lhs = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t rhs = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fmadd_d(lhs, rhs, third, frm, &fflags);
+    }
+    default:
+      std::cout << "Fmacc only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmacc"; }
+};
+
+// Negated fused multiply-accumulate: passes (first, second, third) to rv_fnmadd_s or rv_fnmadd_d, chosen by sizeof(R).
+template <typename T, typename R>
+struct Fnmacc {
+  static R apply(T first, T second, R third) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fnmadd_s(first, second, third, frm, &fflags);
+    case 8: {
+      uint64_t lhs = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t rhs = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fnmadd_d(lhs, rhs, third, frm, &fflags);
+    }
+    default:
+      std::cout << "Fnmacc only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fnmacc"; }
+};
+
+// Multiply-subtract-accumulate: like Fmacc, but `third` is first passed through rv_fsgnjn_*(third, third) (presumably negating it).
+template <typename T, typename R>
+struct Fmsac {
+  static R apply(T first, T second, R third) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
+    case 8: {
+      uint64_t lhs = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t rhs = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fmadd_d(lhs, rhs, rv_fsgnjn_d(third, third), frm, &fflags);
+    }
+    default:
+      std::cout << "Fmsac only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmsac"; }
+};
+
+// Negated multiply-subtract-accumulate: like Fnmacc, but `third` is first passed through rv_fsgnjn_*(third, third) (presumably negating it).
+template <typename T, typename R>
+struct Fnmsac {
+  static R apply(T first, T second, R third) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags);
+    case 8: {
+      uint64_t lhs = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t rhs = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fnmadd_d(lhs, rhs, rv_fsgnjn_d(third, third), frm, &fflags);
+    }
+    default:
+      std::cout << "Fnmsac only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fnmsac"; }
+};
+
+// Multiply-add with `second` and `third` exchanged: forwards to Fmacc as (first, third, second).
+template <typename T, typename R>
+struct Fmadd {
+  static R apply(T first, T second, R third) {
+    // Element sizes other than 4 or 8 bytes are rejected before delegating.
+    if (sizeof(T) != 4 && sizeof(T) != 8) {
+      std::cout << "Fmadd only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+    return Fmacc<T, R>::apply(first, third, second);
+  }
+  static std::string name() { return "Fmadd"; }
+};
+
+// Negated multiply-add with `second` and `third` exchanged: forwards to Fnmacc as (first, third, second).
+template <typename T, typename R>
+struct Fnmadd {
+  static R apply(T first, T second, R third) {
+    // Element sizes other than 4 or 8 bytes are rejected before delegating.
+    if (sizeof(T) != 4 && sizeof(T) != 8) {
+      std::cout << "Fnmadd only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+    return Fnmacc<T, R>::apply(first, third, second);
+  }
+  static std::string name() { return "Fnmadd"; }
+};
+
+// Multiply-subtract with `second` and `third` exchanged: forwards to Fmsac as (first, third, second).
+template <typename T, typename R>
+struct Fmsub {
+  static R apply(T first, T second, R third) {
+    // Element sizes other than 4 or 8 bytes are rejected before delegating.
+    if (sizeof(T) != 4 && sizeof(T) != 8) {
+      std::cout << "Fmsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+    return Fmsac<T, R>::apply(first, third, second);
+  }
+  static std::string name() { return "Fmsub"; }
+};
+
+// Negated multiply-subtract with `second` and `third` exchanged: forwards to Fnmsac as (first, third, second).
+template <typename T, typename R>
+struct Fnmsub {
+  static R apply(T first, T second, R third) {
+    // Element sizes other than 4 or 8 bytes are rejected before delegating.
+    if (sizeof(T) != 4 && sizeof(T) != 8) {
+      std::cout << "Fnmsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+    return Fnmsac<T, R>::apply(first, third, second);
+  }
+  static std::string name() { return "Fnmsub"; }
+};
+
+// Floating-point minimum: calls rv_fmin_s (4-byte T) or rv_fmin_d (8-byte T); any other size aborts.
+template <typename T, typename R>
+struct Fmin {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fmin_s(first, second, &fflags);
+    case 8:
+      return rv_fmin_d(first, second, &fflags);
+    default:
+      std::cout << "Fmin only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmin"; }
+};
+
+// Floating-point maximum: calls rv_fmax_s (4-byte T) or rv_fmax_d (8-byte T); any other size aborts.
+template <typename T, typename R>
+struct Fmax {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fmax_s(first, second, &fflags);
+    case 8:
+      return rv_fmax_d(first, second, &fflags);
+    default:
+      std::cout << "Fmax only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmax"; }
+};
+
+// Sign-injection functor: calls rv_fsgnj_s (4-byte T) or rv_fsgnj_d (8-byte T)
+// with the operands in the order (second, first).
+template <typename T, typename R>
+struct Fsgnj {
+  static R apply(T first, T second, R /*unused*/) {
+    if (sizeof(T) == 4)
+      return rv_fsgnj_s(second, first);
+    if (sizeof(T) == 8)
+      return rv_fsgnj_d(second, first);
+    // Any other element size is unsupported.
+    std::cout << "Fsgnj only supports f32 and f64" << std::endl;
+    std::abort();
+  }
+  static std::string name() { return "Fsgnj"; }
+};
+
+// Negated sign-injection functor: calls rv_fsgnjn_s (4-byte T) or rv_fsgnjn_d (8-byte T)
+// with the operands in the order (second, first).
+template <typename T, typename R>
+struct Fsgnjn {
+  static R apply(T first, T second, R /*unused*/) {
+    if (sizeof(T) == 4)
+      return rv_fsgnjn_s(second, first);
+    if (sizeof(T) == 8)
+      return rv_fsgnjn_d(second, first);
+    // Any other element size is unsupported.
+    std::cout << "Fsgnjn only supports f32 and f64" << std::endl;
+    std::abort();
+  }
+  static std::string name() { return "Fsgnjn"; }
+};
+
+// XOR sign-injection functor: calls rv_fsgnjx_s (4-byte T) or rv_fsgnjx_d (8-byte T)
+// with the operands in the order (second, first).
+template <typename T, typename R>
+struct Fsgnjx {
+  static R apply(T first, T second, R /*unused*/) {
+    if (sizeof(T) == 4)
+      return rv_fsgnjx_s(second, first);
+    if (sizeof(T) == 8)
+      return rv_fsgnjx_d(second, first);
+    // Any other element size is unsupported.
+    std::cout << "Fsgnjx only supports f32 and f64" << std::endl;
+    std::abort();
+  }
+  static std::string name() { return "Fsgnjx"; }
+};
+
+template <typename T, typename R>
+class Fcvt { // conversion functor: `first` selects the conversion, `second` is the value to convert
+public:
+  static R apply(T first, T second, R) { // same-width and widening conversions; third argument is ignored
+    // flags written by the helpers are collected here but discarded for now
+    uint32_t fflags = 0;
+    // rounding mode is fixed to 0 for now; the rtz cases below pass 1 instead
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      switch (first) {
+      case 0b00000: // vfcvt.xu.f.v
+        return rv_ftou_s(second, frm, &fflags);
+      case 0b00001: // vfcvt.x.f.v
+        return rv_ftoi_s(second, frm, &fflags);
+      case 0b00010: // vfcvt.f.xu.v
+        return rv_utof_s(second, frm, &fflags);
+      case 0b00011: // vfcvt.f.x.v
+        return rv_itof_s(second, frm, &fflags);
+      case 0b00110: // vfcvt.rtz.xu.f.v
+        return rv_ftou_s(second, 1, &fflags);
+      case 0b00111: // vfcvt.rtz.x.f.v
+        return rv_ftoi_s(second, 1, &fflags);
+      case 0b01000: // vfwcvt.xu.f.v
+        return rv_ftolu_s(second, frm, &fflags);
+      case 0b01001: // vfwcvt.x.f.v
+        return rv_ftol_s(second, frm, &fflags);
+      case 0b01010: // vfwcvt.f.xu.v
+        return rv_utof_d(second, frm, &fflags);
+      case 0b01011: // vfwcvt.f.x.v
+        return rv_itof_d(second, frm, &fflags);
+      case 0b01100: // vfwcvt.f.f.v
+        return rv_ftod(second);
+      case 0b01110: // vfwcvt.rtz.xu.f.v
+        return rv_ftolu_s(second, 1, &fflags);
+      case 0b01111: // vfwcvt.rtz.x.f.v
+        return rv_ftol_s(second, 1, &fflags);
+      default:
+        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else if (sizeof(T) == 8) {
+      switch (first) {
+      case 0b00000: // vfcvt.xu.f.v
+        return rv_ftolu_d(second, frm, &fflags);
+      case 0b00001: // vfcvt.x.f.v
+        return rv_ftol_d(second, frm, &fflags);
+      case 0b00010: // vfcvt.f.xu.v
+        return rv_lutof_d(second, frm, &fflags);
+      case 0b00011: // vfcvt.f.x.v
+        return rv_ltof_d(second, frm, &fflags);
+      case 0b00110: // vfcvt.rtz.xu.f.v
+        return rv_ftolu_d(second, 1, &fflags);
+      case 0b00111: // vfcvt.rtz.x.f.v
+        return rv_ftol_d(second, 1, &fflags);
+      case 0b01000: // vfwcvt.xu.f.v
+      case 0b01001: // vfwcvt.x.f.v
+      case 0b01010: // vfwcvt.f.xu.v
+      case 0b01011: // vfwcvt.f.x.v
+      case 0b01100: // vfwcvt.f.f.v
+      case 0b01110: // vfwcvt.rtz.xu.f.v
+      case 0b01111: // vfwcvt.rtz.x.f.v
+        std::cout << "Fwcvt only supports f32" << std::endl; // widening from an 8-byte T is rejected
+        std::abort();
+      default:
+        std::cout << "Fcvt has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else {
+      std::cout << "Fcvt only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // narrowing conversions; saturation argument is unused
+    // flags written by the helpers are collected here but discarded for now
+    uint32_t fflags = 0;
+    if (sizeof(T) == 8) { // vxrm is passed to the helpers as the rounding-mode argument
+      switch (first) {
+      case 0b10000: // vfncvt.xu.f.w
+        return rv_ftou_d(second, vxrm, &fflags);
+      case 0b10001: // vfncvt.x.f.w
+        return rv_ftoi_d(second, vxrm, &fflags);
+      case 0b10010: // vfncvt.f.xu.w
+        return rv_lutof_s(second, vxrm, &fflags);
+      case 0b10011: // vfncvt.f.x.w
+        return rv_ltof_s(second, vxrm, &fflags);
+      case 0b10100: // vfncvt.f.f.w
+        return rv_dtof_r(second, vxrm);
+      case 0b10101: // vfncvt.rod.f.f.w
+        return rv_dtof_r(second, 6);
+      case 0b10110: // vfncvt.rtz.xu.f.w
+        return rv_ftou_d(second, 1, &fflags);
+      case 0b10111: // vfncvt.rtz.x.f.w
+        return rv_ftoi_d(second, 1, &fflags);
+      default:
+        std::cout << "Fncvt has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else {
+      std::cout << "Fncvt only supports f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fcvt"; }
+};
+
+template <typename T, typename R>
+class Funary1 { // unary floating-point functor: `first` selects the operation, `second` is the operand
+public:
+  static R apply(T first, T second, R) { // third argument is ignored
+    // flags written by the helpers are collected here but discarded for now
+    uint32_t fflags = 0;
+    // rounding mode is fixed to 0 for now
+    uint32_t frm = 0;
+    if (sizeof(T) == 4) {
+      switch (first) {
+      case 0b00000: // vfsqrt.v
+        return rv_fsqrt_s(second, frm, &fflags);
+      case 0b00100: // vfrsqrt7.v
+        return rv_frsqrt7_s(second, frm, &fflags);
+      case 0b00101: // vfrec7.v
+        return rv_frecip7_s(second, frm, &fflags);
+      case 0b10000: // vfclass.v
+        return rv_fclss_s(second);
+      default:
+        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else if (sizeof(T) == 8) {
+      switch (first) {
+      case 0b00000: // vfsqrt.v
+        return rv_fsqrt_d(second, frm, &fflags);
+      case 0b00100: // vfrsqrt7.v
+        return rv_frsqrt7_d(second, frm, &fflags);
+      case 0b00101: // vfrec7.v
+        return rv_frecip7_d(second, frm, &fflags);
+      case 0b10000: // vfclass.v
+        return rv_fclss_d(second);
+      default:
+        std::cout << "Funary1 has unsupported value for first: " << first << std::endl;
+        std::abort();
+      }
+    } else {
+      std::cout << "Funary1 only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Funary1"; }
+};
+
+// Pass-through functor: returns `second` converted to R; the first and third arguments are ignored.
+template <typename T, typename R>
+struct Xunary0 {
+  static R apply(T /*unused*/, T second, T /*unused*/) {
+    return static_cast<R>(second);
+  }
+  static std::string name() { return "Xunary0"; }
+};
+
+// Floating-point equality: calls rv_feq_s / rv_feq_d (chosen by sizeof(T)) with the operands as (second, first).
+template <typename T, typename R>
+struct Feq {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_feq_s(second, first, &fflags);
+    case 8:
+      return rv_feq_d(second, first, &fflags);
+    default:
+      std::cout << "Feq only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Feq"; }
+};
+
+// Floating-point less-or-equal: calls rv_fle_s / rv_fle_d (chosen by sizeof(T)) with the operands as (second, first).
+template <typename T, typename R>
+struct Fle {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fle_s(second, first, &fflags);
+    case 8:
+      return rv_fle_d(second, first, &fflags);
+    default:
+      std::cout << "Fle only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fle"; }
+};
+
+// Floating-point less-than: calls rv_flt_s / rv_flt_d (chosen by sizeof(T)) with the operands as (second, first).
+template <typename T, typename R>
+struct Flt {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_flt_s(second, first, &fflags);
+    case 8:
+      return rv_flt_d(second, first, &fflags);
+    default:
+      std::cout << "Flt only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Flt"; }
+};
+
+// Floating-point inequality: logical negation of rv_feq_s / rv_feq_d (chosen by sizeof(T)) on (second, first).
+template <typename T, typename R>
+struct Fne {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return !rv_feq_s(second, first, &fflags);
+    case 8:
+      return !rv_feq_d(second, first, &fflags);
+    default:
+      std::cout << "Fne only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fne"; }
+};
+
+// Floating-point greater-than: calls rv_flt_s / rv_flt_d (chosen by sizeof(T)) with the operands as (first, second).
+template <typename T, typename R>
+struct Fgt {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_flt_s(first, second, &fflags);
+    case 8:
+      return rv_flt_d(first, second, &fflags);
+    default:
+      std::cout << "Fgt only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fgt"; }
+};
+
+// Floating-point greater-or-equal: calls rv_fle_s / rv_fle_d (chosen by sizeof(T)) with the operands as (first, second).
+template <typename T, typename R>
+struct Fge {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fle_s(first, second, &fflags);
+    case 8:
+      return rv_fle_d(first, second, &fflags);
+    default:
+      std::cout << "Fge only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fge"; }
+};
+
+// Floating-point divide (second / first): calls rv_fdiv_s (4-byte T) or rv_fdiv_d (8-byte T)
+// with the operands in the order (second, first); any other size aborts.
+template <typename T, typename R>
+struct Fdiv {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fdiv_s(second, first, frm, &fflags);
+    case 8:
+      return rv_fdiv_d(second, first, frm, &fflags);
+    default:
+      std::cout << "Fdiv only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fdiv"; }
+};
+
+// Reverse floating-point divide (first / second): calls rv_fdiv_s (4-byte T) or rv_fdiv_d (8-byte T)
+// with the operands in the order (first, second); any other size aborts.
+template <typename T, typename R>
+struct Frdiv {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fdiv_s(first, second, frm, &fflags);
+    case 8:
+      return rv_fdiv_d(first, second, frm, &fflags);
+    default:
+      std::cout << "Frdiv only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Frdiv"; }
+};
+
+// Floating-point multiply: dispatches on sizeof(R) to rv_fmul_s (4 bytes) or rv_fmul_d (8 bytes); other sizes abort.
+template <typename T, typename R>
+struct Fmul {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(R)) {
+    case 4:
+      return rv_fmul_s(first, second, frm, &fflags);
+    case 8: {
+      uint64_t lhs = sizeof(T) == 8 ? first : rv_ftod(first); // non-8-byte operands go through rv_ftod first
+      uint64_t rhs = sizeof(T) == 8 ? second : rv_ftod(second);
+      return rv_fmul_d(lhs, rhs, frm, &fflags);
+    }
+    default:
+      std::cout << "Fmul only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Fmul"; }
+};
+
+// Reverse floating-point subtract (first - second): calls rv_fsub_s (4-byte T) or rv_fsub_d (8-byte T)
+// with the operands in the order (first, second); any other size aborts.
+template <typename T, typename R>
+struct Frsub {
+  static R apply(T first, T second, R /*unused*/) {
+    uint32_t fflags = 0; // flags written by the helper are discarded for now
+    uint32_t frm = 0;    // rounding mode is fixed to 0 for now
+    switch (sizeof(T)) {
+    case 4:
+      return rv_fsub_s(first, second, frm, &fflags);
+    case 8:
+      return rv_fsub_d(first, second, frm, &fflags);
+    default:
+      std::cout << "Frsub only supports f32 and f64" << std::endl;
+      std::abort();
+    }
+  }
+  static std::string name() { return "Frsub"; }
+};
+
+// Narrowing clip: shifts `second` right, rounds per vxrm, then clamps the result to the range of R.
+template <typename T, typename R>
+struct Clip {
+  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
+    // Only the low lg2(bit width of T) bits of `first` are used as the shift amount
+    // (e.g. the low 6 bits for a 64-bit to 32-bit narrowing operation).
+    const R shamt = first & (sizeof(T) * 8 - 1);
+    const T rounded = (second >> shamt) + roundBit(second, shamt, vxrm);
+    const R clipped = std::clamp(rounded, static_cast<T>(std::numeric_limits<R>::min()), static_cast<T>(std::numeric_limits<R>::max()));
+    vxsat_ |= clipped != rounded;
+    return clipped;
+  }
+  static std::string name() { return "Clip"; }
+};
+
+// Fractional multiply functor: multiplies in the wide type T, shifts the product
+// right by (bits of R - 1) with rounding from roundBit, and saturates into R.
+template <typename T, typename R>
+class Smul {
+public:
+  // Sets bit 0 of vxsat_ when saturation changed the value.
+  static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) {
+    T product = first * second;
+    R shift = sizeof(R) * 8 - 1;
+    T rounded = (product >> shift) + roundBit(product, shift, vxrm);
+    const T lowest = (T)std::numeric_limits<R>::min();
+    const T highest = (T)std::numeric_limits<R>::max();
+    R saturated = std::clamp(rounded, lowest, highest);
+    if (saturated != rounded) {
+      vxsat_ |= true;
+    }
+    return saturated;
+  }
+  static std::string name() { return "Smul"; }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Returns true when masking is enabled (vmask == false) and the mask bit for the
+// given element is 0. Despite its name, byteI is used as a bit index into
+// register maskVreg: byteI / 8 selects the byte and byteI % 8 the bit within it.
+bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) {
+  const uint32_t byteOffset = byteI / 8;
+  const uint32_t bitOffset = byteI % 8;
+  auto &maskReg = vreg_file.at(maskVreg);
+  uint8_t maskByte = *(uint8_t *)(maskReg.data() + byteOffset);
+  uint8_t value = (maskByte >> bitOffset) & 0x1;
+  DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value);
+  if (vmask) {
+    return false;
+  }
+  return value == 0;
+}
+
+// Maps element index byteI (in units of DT) to the register that holds it,
+// starting from baseVreg and wrapping around the 32 vector registers.
+template <typename DT>
+uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) {
+  const uint32_t elemBits = sizeof(DT) * 8;
+  const uint32_t elemsPerReg = VLEN / elemBits;
+  const uint32_t regOffset = byteI / elemsPerReg;
+  return (baseVreg + regOffset) % 32;
+}
+
+// Returns a reference to the DT-sized element inside a single register's byte
+// storage; byteI is reduced modulo the number of DT elements per register.
+template <typename DT>
+DT &getVregData(std::vector<vortex::Byte> &baseVregVec, uint32_t byteI) {
+  const uint32_t elemBits = sizeof(DT) * 8;
+  const uint32_t slot = byteI % (VLEN / elemBits);
+  auto *raw = baseVregVec.data() + slot * elemBits / 8;
+  return *(DT *)raw;
+}
+
+// Returns a reference to element byteI (in units of DT) of the register group
+// starting at baseVreg: picks the register with getVreg, then the slot inside it.
+template <typename DT>
+DT &getVregData(std::vector<std::vector<vortex::Byte>> &vreg_file, uint32_t baseVreg, uint32_t byteI) {
+  const uint32_t regIndex = getVreg<DT>(baseVreg, byteI);
+  return getVregData<DT>(vreg_file.at(regIndex), byteI);
+}
+
+// Loads vl * nfields elements of type DT from memory into the register groups
+// starting at rdest; field f of each element goes to register rdest + f * emul.
+// Elements whose mask bit (register 0) disables them are skipped.
+//
+// When `strided` is set, consecutive elements are `stride` bytes apart and the
+// fields of one element are contiguous; otherwise every access is `stride`
+// bytes after the previous one.
+// Aborts when nfields * emul exceeds 8.
+template <typename DT>
+void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  // lmul is an encoded value: with bit 2 set emul is 1, otherwise 2^(low two bits).
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  if (nfields * emul > 8) {
+    // Report the product that was actually checked (emul, not the raw lmul encoding).
+    std::cout << "NFIELDS * EMUL = " << nfields * emul << " but it should be <= 8" << std::endl;
+    std::abort();
+  }
+  // Loop-invariant: number of fields that share one stride step.
+  const uint32_t nfields_strided = strided ? nfields : 1;
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    // NOTE(review): the base address is masked with 0xFFFFFFFC here while the
+    // matching store path uses base_addr unmodified - confirm this is intended.
+    Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
+    // Read into a DT-sized buffer: a Word may be narrower than DT, in which
+    // case reading sizeof(DT) bytes into it would overflow the buffer.
+    DT mem_data = 0;
+    emul_->dcache_read(&mem_data, mem_addr, sizeof(DT));
+    DP(4, "Loading data " << +mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
+    DP(4, "Previous data: " << +result);
+    result = mem_data;
+  }
+}
+
+// Selects the element type from vsew (element width in bits) and forwards to
+// the templated vector_op_vix_load; aborts for any other width.
+void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_load<uint8_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  case 16:
+    return vector_op_vix_load<uint16_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  case 32:
+    return vector_op_vix_load<uint32_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  case 64:
+    return vector_op_vix_load<uint64_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Indexed load: for each of the vl elements, reads a byte offset of width iSew
+// bits from register group rsrc1 and loads nfields values of type DT from
+// base + offset into the register groups starting at rdest (field f goes to
+// register rdest + f * emul). Elements disabled by the mask in register 0 are
+// skipped. Aborts when nfields * emul exceeds 8 or iSew is unsupported.
+template <typename DT>
+void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  // lmul is an encoded value: with bit 2 set emul is 1, otherwise 2^(low two bits).
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  if (nfields * emul > 8) {
+    // Report the product that was actually checked (emul, not the raw lmul encoding).
+    std::cout << "NFIELDS * EMUL = " << nfields * emul << " but it should be <= 8" << std::endl;
+    std::abort();
+  }
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    // Fetch the per-element offset using the index element width.
+    Word offset = 0;
+    switch (iSew) {
+    case 8:
+      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 16:
+      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 32:
+      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 64:
+      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    default:
+      std::cout << "Unsupported iSew: " << iSew << std::endl;
+      std::abort();
+    }
+
+    // NOTE(review): the base address is masked with 0xFFFFFFFC here while the
+    // matching store path uses base_addr unmodified - confirm this is intended.
+    Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
+    // Read into a DT-sized buffer: a Word may be narrower than DT, in which
+    // case reading sizeof(DT) bytes into it would overflow the buffer.
+    DT mem_data = 0;
+    emul_->dcache_read(&mem_data, mem_addr, sizeof(DT));
+    DP(4, "VLUX/VLOX - Loading data " << +mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
+    DP(4, "Previous data: " << +result);
+    result = mem_data;
+  }
+}
+
+// Selects the data element type from vsew (bits) and forwards to the templated
+// indexed load; aborts for any other width.
+void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_load<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_load<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_load<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_load<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
+  } else {
+    std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Stores vl * nfields elements of type DT from the register groups starting at
+// rsrc3 (field f lives in register rsrc3 + f * emul) to memory. Elements
+// disabled by the mask in register 0 are skipped.
+//
+// When `strided` is set, consecutive elements are `stride` bytes apart and the
+// fields of one element are contiguous; otherwise every access is `stride`
+// bytes after the previous one.
+template <typename DT>
+void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  // lmul is an encoded value: with bit 2 set emul is 1, otherwise 2^(low two bits).
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  // Loop-invariant: number of fields that share one stride step.
+  const uint32_t nfields_strided = strided ? nfields : 1;
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
+    // Keep the value in a DT-sized buffer: a Word may be narrower than DT, which
+    // would truncate the element and make the write read past the buffer.
+    DT mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
+    DP(4, "Storing: " << std::hex << +mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    emul_->dcache_write(&mem_data, mem_addr, sizeof(DT));
+  }
+}
+
+// Selects the element type from vsew (element width in bits) and forwards to
+// the templated vector_op_vix_store; aborts for any other width.
+void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_store<uint8_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  case 16:
+    return vector_op_vix_store<uint16_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  case 32:
+    return vector_op_vix_store<uint32_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  case 64:
+    return vector_op_vix_store<uint64_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Indexed store: for each of the vl elements, reads a byte offset of width iSew
+// bits from register group rsrc1 and stores nfields values of type DT from the
+// register groups starting at rsrc3 (field f lives in register rsrc3 + f * emul)
+// to base + offset. Elements disabled by the mask in register 0 are skipped.
+// Aborts when iSew is unsupported.
+template <typename DT>
+void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  // lmul is an encoded value: with bit 2 set emul is 1, otherwise 2^(low two bits).
+  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
+  for (uint32_t i = 0; i < vl * nfields; i++) {
+    if (isMasked(vreg_file, 0, i / nfields, vmask))
+      continue;
+
+    // Fetch the per-element offset using the index element width.
+    Word offset = 0;
+    switch (iSew) {
+    case 8:
+      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 16:
+      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 32:
+      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    case 64:
+      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
+      break;
+    default:
+      std::cout << "Unsupported iSew: " << iSew << std::endl;
+      std::abort();
+    }
+
+    Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT);
+    // Keep the value in a DT-sized buffer: a Word may be narrower than DT, which
+    // would truncate the element and make the write read past the buffer.
+    DT mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
+    DP(4, "VSUX/VSOX - Storing: " << std::hex << +mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
+    emul_->dcache_write(&mem_data, mem_addr, sizeof(DT));
+  }
+}
+
+// Selects the data element type from vsew (bits) and forwards to the templated
+// indexed store; aborts for any other width.
+void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_store<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_store<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_store<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_store<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
+  } else {
+    std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Applies OP element-wise with a scalar/immediate first operand: for every
+// active element, rdest[i] = OP(first, rsrc0[i], rdest[i]). Elements disabled
+// by the mask in register 0 are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  using Op = OP<DT, DT>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, elem);
+      DT &dest = getVregData<DT>(vreg_file, rdest, elem);
+      DT third = dest;
+      DT result = Op::apply(first, second, third);
+      DP(4, Op::name() << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+      dest = result;
+    }
+  }
+}
+
+// Picks the element type matching vsew (bits) from DT8..DT64 and runs the
+// element-wise VI/VX operation; aborts for any other width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 16:
+    return vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 32:
+    return vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 64:
+    return vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Element-wise operation with carry-in: for every element below vl,
+// rdest[i] = OP(first, rsrc0[i], carry), where carry is the element's bit in
+// mask register 0. No element is skipped.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl) {
+  using Op = OP<DT, DT>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    DT second = getVregData<DT>(vreg_file, rsrc0, elem);
+    // isMasked with masking forced on is true for a 0 bit, so negate it.
+    bool third = !isMasked(vreg_file, 0, elem, false);
+    DT result = Op::apply(first, second, third);
+    DP(4, Op::name() << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    DT &dest = getVregData<DT>(vreg_file, rdest, elem);
+    dest = result;
+  }
+}
+
+// Picks the element type matching vsew (bits) and runs the carry-in VI/VX
+// operation; aborts for any other width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_carry(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl) {
+  if (vsew == 8) {
+    vector_op_vix_carry<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl);
+  } else if (vsew == 16) {
+    vector_op_vix_carry<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl);
+  } else if (vsew == 32) {
+    vector_op_vix_carry<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl);
+  } else if (vsew == 64) {
+    vector_op_vix_carry<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl);
+  } else {
+    std::cout << "Failed to execute VI/VX carry for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Computes a carry/borrow-out bit per element and writes it to bit i of the
+// mask-layout register rdest. The carry-in is the element's bit in mask
+// register 0 when vmask is 0, and false otherwise.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    DT second = getVregData<DT>(vreg_file, rsrc0, elem);
+    bool third = false;
+    if (!vmask) {
+      third = !isMasked(vreg_file, 0, elem, vmask);
+    }
+    bool result = OP<DT, DTR>::apply(first, second, third);
+    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+    // Set or clear the destination bit for this element.
+    uint8_t &destByte = getVregData<uint8_t>(vreg_file, rdest, elem / 8);
+    const uint8_t bit = 1 << (elem % 8);
+    if (result) {
+      destByte |= bit;
+    } else {
+      destByte &= ~bit;
+    }
+  }
+}
+
+// Picks the operand type matching vsew (bits) together with the next wider
+// type as the result type, then runs the carry-out VI/VX operation; aborts for
+// any other width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vix_carry_out(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_carry_out<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 16:
+    return vector_op_vix_carry_out<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 32:
+    return vector_op_vix_carry_out<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 64:
+    return vector_op_vix_carry_out<OP, DT64, DT128>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VI/VX carry out for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Merge: for every element below vl, writes `first` to rdest[i] when the
+// element is active and rsrc0[i] when it is masked off.
+template <typename DT>
+void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    DT result = first;
+    if (isMasked(vreg_file, 0, elem, vmask)) {
+      result = getVregData<DT>(vreg_file, rsrc0, elem);
+    }
+    DP(4, "Merge - Choosing result: " << +result);
+    DT &dest = getVregData<DT>(vreg_file, rdest, elem);
+    dest = result;
+  }
+}
+
+// Picks the element type matching vsew (bits) and runs the merge; aborts for
+// any other width.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_merge(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_merge<DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_merge<DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_merge<DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_merge<DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Copies element 0 of register rsrc1, read with an unsigned type of vsew bits,
+// into dest. Aborts when rsrc0 is non-zero or vsew is unsupported.
+template <typename DT>
+void vector_op_scalar(DT &dest, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t vsew) {
+  if (rsrc0 != 0) {
+    std::cout << "Vwxunary0/Vwfunary0 has unsupported value for vs2: " << rsrc0 << std::endl;
+    std::abort();
+  }
+  if (vsew == 8) {
+    dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
+  } else if (vsew == 16) {
+    dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
+  } else if (vsew == 32) {
+    dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
+  } else if (vsew == 64) {
+    dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
+  } else {
+    std::cout << "Failed to execute vmv.x.s/vfmv.f.s for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening element-wise operation: sources are read as DT, the destination is
+// read and written as DTR. For every active element,
+// rdest[i] = OP(first, rsrc0[i], rdest[i]).
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  using Op = OP<DT, DTR>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, elem);
+      DTR &dest = getVregData<DTR>(vreg_file, rdest, elem);
+      DTR third = dest;
+      DTR result = Op::apply(first, second, third);
+      DP(4, "Widening " << Op::name() << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+      dest = result;
+    }
+  }
+}
+
+// Picks the source type matching vsew (bits) and the next wider type as the
+// destination type, then runs the widening VI/VX operation. Only 8, 16 and 32
+// are handled; any other width aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_w(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_w<OP, DT8, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 16:
+    return vector_op_vix_w<OP, DT16, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  case 32:
+    return vector_op_vix_w<OP, DT32, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VI/VX widening for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Runs the plain element-wise VI/VX operation at twice the current element
+// width: vsew of 8, 16 or 32 selects DT16, DT32 or DT64 respectively. Any other
+// width aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_wx(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX widening wx for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Narrowing element-wise operation: the source is read as DT and the result is
+// written as DTR. For every active element,
+// rdest[i] = OP(first, rsrc0[i], vxrm, vxsat); OP may update vxsat.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  using Op = OP<DT, DTR>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT second = getVregData<DT>(vreg_file, rsrc0, elem);
+      DTR result = Op::apply(first, second, vxrm, vxsat);
+      DP(4, "Narrowing " << Op::name() << "(" << +first << ", " << +second << ")" << " = " << +result);
+      DTR &dest = getVregData<DTR>(vreg_file, rdest, elem);
+      dest = result;
+    }
+  }
+}
+
+// Picks the destination type matching vsew (bits) and the next wider type as
+// the source type, then runs the narrowing VI/VX operation. Only 8, 16 and 32
+// are handled; any other width aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_n(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_n<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  case 16:
+    return vector_op_vix_n<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  case 32:
+    return vector_op_vix_n<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VI/VX narrowing for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Saturating element-wise operation: operands are stored as DTR in the register
+// file and handed to OP as DT. For every active element,
+// rdest[i] = OP(first, rsrc0[i], vxrm, vxsat); OP may update vxsat.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  using Op = OP<DT, DTR>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT second = getVregData<DTR>(vreg_file, rsrc0, elem);
+      DTR result = Op::apply(first, second, vxrm, vxsat);
+      DP(4, "Saturating " << Op::name() << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
+      DTR &dest = getVregData<DTR>(vreg_file, rdest, elem);
+      dest = result;
+    }
+  }
+}
+
+// Picks the stored element type matching vsew (bits) and the next wider type
+// as the computation type, then runs the saturating VI/VX operation; aborts for
+// any other width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vix_sat(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vix_sat<OP, DT16, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vix_sat<OP, DT32, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vix_sat<OP, DT64, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vix_sat<OP, DT128, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VI/VX saturating for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Same-width variant of the saturating dispatch: the computation type and the
+// stored type are both the type matching vsew (bits). Aborts for any other
+// width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_scale(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_sat<OP, DT8, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  case 16:
+    return vector_op_vix_sat<OP, DT16, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  case 32:
+    return vector_op_vix_sat<OP, DT32, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  case 64:
+    return vector_op_vix_sat<OP, DT64, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask, vxrm, vxsat);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VI/VX scale for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Integer extension dispatch. vsew is the destination element width in bits and
+// src1 encodes the variant: even codes are the zero-extending forms, odd codes
+// the sign-extending forms, and the code also fixes the source width
+// (vf2/vf4/vf8 = destination width divided by 2/4/8). Aborts on an unsupported
+// vsew or an encoding that is not valid for that vsew.
+template <template <typename DT1, typename DT2> class OP>
+void vector_op_vix_ext(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  switch (vsew) {
+  case 16:
+    switch (src1) {
+    case 0b00110: // vzext.vf2
+      return vector_op_vix_w<OP, uint8_t, uint16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00111: // vsext.vf2
+      return vector_op_vix_w<OP, int8_t, int16_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    default:
+      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
+      std::abort();
+    }
+  case 32:
+    switch (src1) {
+    case 0b00100: // vzext.vf4
+      return vector_op_vix_w<OP, uint8_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00101: // vsext.vf4
+      return vector_op_vix_w<OP, int8_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00110: // vzext.vf2
+      return vector_op_vix_w<OP, uint16_t, uint32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00111: // vsext.vf2
+      return vector_op_vix_w<OP, int16_t, int32_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    default:
+      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
+      std::abort();
+    }
+  case 64:
+    switch (src1) {
+    case 0b00010: // vzext.vf8
+      return vector_op_vix_w<OP, uint8_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00011: // vsext.vf8
+      return vector_op_vix_w<OP, int8_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00100: // vzext.vf4
+      return vector_op_vix_w<OP, uint16_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00101: // vsext.vf4
+      return vector_op_vix_w<OP, int16_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00110: // vzext.vf2
+      return vector_op_vix_w<OP, uint32_t, uint64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    case 0b00111: // vsext.vf2
+      return vector_op_vix_w<OP, int32_t, int64_t>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+    default:
+      std::cout << "Xunary0 has unsupported value for vf: " << src1 << std::endl;
+      std::abort();
+    }
+  default:
+    std::cout << "Failed to execute Xunary0 for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Compare-to-mask operation: for every active element, evaluates
+// OP(first, rsrc0[i], 0) as a bool and stores it in bit i of the mask-layout
+// register rdest. Bits of inactive elements are left unchanged.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  using Op = OP<DT, bool>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (isMasked(vreg_file, 0, elem, vmask))
+      continue;
+
+    DT second = getVregData<DT>(vreg_file, rsrc0, elem);
+    bool result = Op::apply(first, second, 0);
+    DP(4, "Integer/float compare mask " << Op::name() << "(" << +first << ", " << +second << ")" << " = " << +result);
+    // Set or clear the destination bit for this element.
+    uint8_t &destByte = getVregData<uint8_t>(vreg_file, rdest, elem / 8);
+    const uint8_t bit = 1 << (elem % 8);
+    if (result) {
+      destByte |= bit;
+    } else {
+      destByte &= ~bit;
+    }
+  }
+}
+
+// Picks the element type matching vsew (bits) and runs the compare-to-mask
+// VI/VX operation; aborts for any other width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_mask(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_mask<OP, DT8>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_mask<OP, DT16>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_mask<OP, DT32>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_mask<OP, DT64>(src1, vreg_file, rsrc0, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX integer/float compare mask for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Slides the elements of register group rsrc0 up or down into rdest.
+//
+//   first  - slide amount, or (when `scalar` is set) the scalar value to insert
+//   vlmax  - non-zero selects slide-down and bounds the source index; zero
+//            selects slide-up
+//   scalar - set for the slide1up/slide1down forms: `first` is written to the
+//            vacated position and the slide amount becomes 1
+//
+// Elements disabled by the mask in register 0 are left untouched.
+template <typename DT>
+void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
+  // If vlmax > 0 this means we have a vslidedown instruction, vslideup does not require vlmax
+  bool slideDown = vlmax;
+  // Position that receives the scalar: the last element when sliding down, the first when sliding up.
+  uint32_t scalarPos = slideDown ? vl - 1 : 0;
+  // If scalar set is set this means we have a v(f)slide1up or v(f)slide1down instruction,
+  // so first is our scalar value and we need to overwrite it with 1 for later computations
+  if (scalar && vl && !isMasked(vreg_file, 0, scalarPos, vmask)) {
+    DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
+    getVregData<DT>(vreg_file, rdest, scalarPos) = first;
+  }
+  first = scalar ? 1 : first;
+
+  // Slide-up starts at index `first` (lower destination elements are not written);
+  // slide1down stops one element early so the scalar written above is kept.
+  for (Word i = slideDown ? 0 : first; i < vl - (scalar && vl && slideDown); i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+
+    __uint128_t iSrc = slideDown ? (__uint128_t)i + (__uint128_t)first : (__uint128_t)i - (__uint128_t)first; // prevent overflows/underflows
+    // When sliding down, source indices at or beyond vlmax read as zero.
+    DT value = (!slideDown || iSrc < vlmax) ? getVregData<DT>(vreg_file, rsrc0, iSrc) : 0;
+    DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
+    getVregData<DT>(vreg_file, rdest, i) = value;
+  }
+}
+
+// Picks the element type matching vsew (bits) and runs the slide; aborts for
+// any other width.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_slide(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask, bool scalar) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vix_slide<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  case 16:
+    return vector_op_vix_slide<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  case 32:
+    return vector_op_vix_slide<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  case 64:
+    return vector_op_vix_slide<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask, scalar);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VI/VX slide for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Register gather with a scalar/immediate index: every active destination
+// element receives rsrc0[first], or 0 when `first` is not below vlmax.
+template <typename DT>
+void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, Word vlmax, uint32_t vmask) {
+  for (Word elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT value = 0;
+      if (first < vlmax) {
+        value = getVregData<DT>(vreg_file, rsrc0, first);
+      }
+      DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +elem);
+      DT &dest = getVregData<DT>(vreg_file, rdest, elem);
+      dest = value;
+    }
+  }
+}
+
+// Picks the element type matching vsew (bits) and runs the register gather;
+// aborts for any other width.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vix_gather(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vsew, uint32_t vl, Word vlmax, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vix_gather<DT8>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else if (vsew == 16) {
+    vector_op_vix_gather<DT16>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else if (vsew == 32) {
+    vector_op_vix_gather<DT32>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else if (vsew == 64) {
+    vector_op_vix_gather<DT64>(src1, vreg_file, rsrc0, rdest, vl, vlmax, vmask);
+  } else {
+    std::cout << "Failed to execute VI/VX register gather for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Vector-vector element-wise operation: for every active element,
+// rdest[i] = OP(rsrc0[i], rsrc1[i], rdest[i]). Elements disabled by the mask in
+// register 0 are left untouched.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  using Op = OP<DT, DT>;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT first = getVregData<DT>(vreg_file, rsrc0, elem);
+      DT second = getVregData<DT>(vreg_file, rsrc1, elem);
+      DT &dest = getVregData<DT>(vreg_file, rdest, elem);
+      DT third = dest;
+      DT result = Op::apply(first, second, third);
+      DP(4, Op::name() << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
+      dest = result;
+    }
+  }
+}
+
+// Picks the element type matching vsew (bits) and runs the vector-vector
+// operation; aborts for any other width.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    return vector_op_vv<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  case 16:
+    return vector_op_vv<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  case 32:
+    return vector_op_vv<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  case 64:
+    return vector_op_vv<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  default:
+    break;
+  }
+  std::cout << "Failed to execute VV for vsew: " << vsew << std::endl;
+  std::abort();
+}
+
+// Element-wise operation with a per-element carry/borrow input.
+// Every index below vl is processed (no element is skipped). The third operand
+// passed to OP<DT, DT>::apply() is the negation of isMasked() queried on
+// register 0 with a mask flag of false.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
+  uint32_t elem = 0;
+  while (elem < vl) {
+    DT lhs = getVregData<DT>(vreg_file, rsrc0, elem);
+    DT rhs = getVregData<DT>(vreg_file, rsrc1, elem);
+    bool carry_in = !isMasked(vreg_file, 0, elem, false);
+    DT out = OP<DT, DT>::apply(lhs, rhs, carry_in);
+    DP(4, (OP<DT, DT>::name()) << "(" << +lhs << ", " << +rhs << ", " << +carry_in << ")" << " = " << +out);
+    getVregData<DT>(vreg_file, rdest, elem) = out;
+    ++elem;
+  }
+}
+
+// Width dispatcher for vector_op_vv_carry: a vsew of 8/16/32/64 selects
+// DT8/DT16/DT32/DT64. Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
+  if (vsew == 8) {
+    vector_op_vv_carry<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 16) {
+    vector_op_vv_carry<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 32) {
+    vector_op_vv_carry<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 64) {
+    vector_op_vv_carry<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else {
+    std::cout << "Failed to execute VV carry for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Computes a per-element boolean (carry/borrow out) and packs it into rdest as a bit mask.
+// The carry input is true only when vmask is zero and isMasked() (register 0)
+// returns false for the element. The boolean returned by OP<DT, DTR>::apply()
+// is stored in bit (i % 8) of byte (i / 8) of rdest; all indices below vl are processed.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    DT lhs = getVregData<DT>(vreg_file, rsrc0, elem);
+    DT rhs = getVregData<DT>(vreg_file, rsrc1, elem);
+    // Same short-circuit as "!vmask && !isMasked(...)": isMasked is only consulted when vmask is zero.
+    bool carry_in = !(vmask || isMasked(vreg_file, 0, elem, vmask));
+    bool carry_out = OP<DT, DTR>::apply(lhs, rhs, carry_in);
+    DP(4, (OP<DT, DT>::name()) << "(" << +lhs << ", " << +rhs << ", " << +carry_in << ")" << " = " << +carry_out);
+    const uint32_t byte_idx = elem / 8;
+    const uint32_t bit_idx = elem % 8;
+    if (!carry_out) {
+      getVregData<uint8_t>(vreg_file, rdest, byte_idx) &= ~(1 << bit_idx);
+    } else {
+      getVregData<uint8_t>(vreg_file, rdest, byte_idx) |= 1 << bit_idx;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_carry_out. A vsew of 8/16/32/64 selects the
+// (element, wider) type pairs (DT8, DT16), (DT16, DT32), (DT32, DT64), (DT64, DT128).
+// Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_carry_out<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_carry_out<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_carry_out<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_carry_out<OP, DT64, DT128>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV carry out for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Element-wise merge of two source registers into rdest.
+// For each index below vl, the element is copied from rsrc1 when isMasked()
+// (register 0) returns true for it, and from rsrc0 otherwise.
+template <typename DT>
+void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    uint32_t chosen = rsrc0;
+    if (isMasked(vreg_file, 0, elem, vmask)) {
+      chosen = rsrc1;
+    }
+    DT picked = getVregData<DT>(vreg_file, chosen, elem);
+    DP(4, "Merge - Choosing result: " << +picked);
+    getVregData<DT>(vreg_file, rdest, elem) = picked;
+  }
+}
+
+// Width dispatcher for vector_op_vv_merge: a vsew of 8/16/32/64 selects
+// DT8/DT16/DT32/DT64 as the element type.
+// Any other vsew prints a diagnostic to std::cout and aborts. The diagnostic
+// names the merge operation so it is distinguishable from the plain VV dispatcher.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  switch (vsew) {
+  case 8:
+    vector_op_vv_merge<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+    break;
+  case 16:
+    vector_op_vv_merge<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+    break;
+  case 32:
+    vector_op_vv_merge<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+    break;
+  case 64:
+    vector_op_vv_merge<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+    break;
+  default:
+    std::cout << "Failed to execute VV merge for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Register gather: rdest[i] = rsrc1[index[i]], where index[i] is read from rsrc0.
+// - ei16 selects 16-bit indices; otherwise indices have the element type DT.
+// - Elements for which isMasked() (register 0) returns true are left untouched.
+// - An index that is not below vlmax produces the value 0.
+template <typename DT>
+void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
+  for (Word i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+
+    // Keep the index at 64 bits for the range check: narrowing a 64-bit index to
+    // 32 bits first would wrap values >= 2^32 and could alias a valid element.
+    uint64_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
+    // The cast is lossless here because first < vlmax, and vlmax is a uint32_t.
+    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc1, static_cast<uint32_t>(first)) : 0;
+    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
+    getVregData<DT>(vreg_file, rdest, i) = value;
+  }
+}
+
+// Width dispatcher for vector_op_vv_gather: a vsew of 8/16/32/64 selects
+// DT8/DT16/DT32/DT64. Any other vsew prints a diagnostic to std::cout and aborts.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, bool ei16, uint32_t vlmax, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_gather<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_gather<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_gather<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_gather<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, ei16, vlmax, vmask);
+  } else {
+    std::cout << "Failed to execute VV register gather for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening element-wise operation: both sources are read as DT, while the
+// current destination value and the result use the result type DTR.
+// Elements for which isMasked() (register 0) returns true are skipped.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT lhs = getVregData<DT>(vreg_file, rsrc0, elem);
+      DT rhs = getVregData<DT>(vreg_file, rsrc1, elem);
+      DTR prev = getVregData<DTR>(vreg_file, rdest, elem);
+      DTR out = OP<DT, DTR>::apply(lhs, rhs, prev);
+      DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +lhs << ", " << +rhs << ", " << +prev << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, elem) = out;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_w. A vsew of 8/16/32 selects the
+// (source, result) type pairs (DT8, DT16), (DT16, DT32), (DT32, DT64).
+// Any other vsew (including 64) prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV widening for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening operation with mixed-width sources: rsrc0 is read as DT, while rsrc1,
+// the current destination value and the result all use DTR. The operation itself
+// is instantiated as OP<DTR, DTR>.
+// Elements for which isMasked() (register 0) returns true are skipped.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT narrow = getVregData<DT>(vreg_file, rsrc0, elem);
+      DTR wide = getVregData<DTR>(vreg_file, rsrc1, elem);
+      DTR prev = getVregData<DTR>(vreg_file, rdest, elem);
+      DTR out = OP<DTR, DTR>::apply(narrow, wide, prev);
+      DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +narrow << ", " << +wide << ", " << +prev << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, elem) = out;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_wv. A vsew of 8/16/32 selects the
+// (narrow, wide) type pairs (DT8, DT16), (DT16, DT32), (DT32, DT64).
+// Any other vsew (including 64) prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_wv<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_wv<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_wv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV widening wv for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Floating-point variant of the mixed-width widening operation: the DT element
+// read from rsrc0 is converted with rv_ftod() before being passed to
+// OP<DTR, DTR>::apply() together with the DTR elements of rsrc1 and rdest.
+// Elements for which isMasked() (register 0) returns true are skipped.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT narrow = getVregData<DT>(vreg_file, rsrc0, elem);
+      DTR wide = getVregData<DTR>(vreg_file, rsrc1, elem);
+      DTR prev = getVregData<DTR>(vreg_file, rdest, elem);
+      DTR out = OP<DTR, DTR>::apply(rv_ftod(narrow), wide, prev);
+      DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +narrow << ", " << +wide << ", " << +prev << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, elem) = out;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_wfv: only vsew == 32 is handled, using the
+// (DT32, DT64) type pair. Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew != 32) {
+    std::cout << "Failed to execute VV widening wfv for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+  vector_op_vv_wfv<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+}
+
+// Narrowing element-wise operation: rsrc0 is read as DTR, rsrc1 as DT, and the
+// DTR result of OP<DT, DTR>::apply() is stored in rdest. vxrm and vxsat are
+// forwarded to apply(); vxsat is passed by reference, so apply() may update it.
+// Elements for which isMasked() (register 0) returns true are skipped.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DTR lhs = getVregData<DTR>(vreg_file, rsrc0, elem);
+      DT rhs = getVregData<DT>(vreg_file, rsrc1, elem);
+      DTR out = OP<DT, DTR>::apply(lhs, rhs, vxrm, vxsat);
+      DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +lhs << ", " << +rhs << ")" << " = " << +out);
+      getVregData<DTR>(vreg_file, rdest, elem) = out;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_n. A vsew of 8/16/32 selects the (DT, DTR)
+// pairs (DT16, DT8), (DT32, DT16), (DT64, DT32).
+// Any other vsew (including 64) prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vv_n<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vv_n<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vv_n<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VV narrowing for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Saturating element-wise operation: both sources are read as DTR and converted
+// to the working type DT before OP<DT, DTR>::apply() produces the DTR result.
+// vxrm and vxsat are forwarded to apply(); vxsat is passed by reference, so
+// apply() may update it.
+// Elements for which isMasked() (register 0) returns true are skipped.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT lhs = getVregData<DTR>(vreg_file, rsrc0, elem);
+      DT rhs = getVregData<DTR>(vreg_file, rsrc1, elem);
+      DTR out = OP<DT, DTR>::apply(lhs, rhs, vxrm, vxsat);
+      DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)lhs << ", " << +(DTR)rhs << ")" << " = " << +(DTR)out);
+      getVregData<DTR>(vreg_file, rdest, elem) = out;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_sat. A vsew of 8/16/32/64 selects the
+// (working, element) type pairs (DT16, DT8), (DT32, DT16), (DT64, DT32), (DT128, DT64).
+// Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64, typename DT128>
+void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vv_sat<OP, DT16, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vv_sat<OP, DT32, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vv_sat<OP, DT64, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vv_sat<OP, DT128, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VV saturating for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Scaling operations reuse the vector_op_vv_sat loop with identical working and
+// element types: a vsew of 8/16/32/64 selects DT8/DT16/DT32/DT64 for both.
+// Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_scale(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask, uint32_t vxrm, uint32_t &vxsat) {
+  if (vsew == 8) {
+    vector_op_vv_sat<OP, DT8, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 16) {
+    vector_op_vv_sat<OP, DT16, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 32) {
+    vector_op_vv_sat<OP, DT32, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else if (vsew == 64) {
+    vector_op_vv_sat<OP, DT64, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask, vxrm, vxsat);
+  } else {
+    std::cout << "Failed to execute VV scale for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Reduction: folds the elements of rsrc1 into a single value with OP<DT, DT>::apply(),
+// seeded with element 0 of rsrc0, and stores the result in element 0 of rdest.
+// - Elements for which isMasked() (register 0) returns true do not contribute.
+// - When vl is 0 nothing is read or written, so rdest is left unchanged.
+// The running value is kept in a local and written once at the end. Writing the
+// seed into rdest up front would corrupt element 0 of rsrc1, or the mask in
+// register 0, whenever rdest aliases one of them.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  if (vl == 0)
+    return;
+
+  DT first = getVregData<DT>(vreg_file, rsrc0, 0);
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+
+    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DT result = OP<DT, DT>::apply(first, second, 0);
+    DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
+    first = result;
+  }
+  // Single store after all sources (and the mask) have been consumed.
+  getVregData<DT>(vreg_file, rdest, 0) = first;
+}
+
+// Width dispatcher for vector_op_vv_red: a vsew of 8/16/32/64 selects
+// DT8/DT16/DT32/DT64. Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_red<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_red<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_red<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_red<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV reduction for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Widening reduction: each DT element of rsrc1 is sign- or zero-extended to DTR
+// (depending on whether DT is signed) and folded with OP<DTR, DTR>::apply() into
+// an accumulator seeded with DTR element 0 of rsrc0. The result is stored in DTR
+// element 0 of rdest.
+// - Elements for which isMasked() (register 0) returns true do not contribute.
+// - When vl is 0 nothing is read or written, so rdest is left unchanged.
+// The running value is kept in a local and written once at the end. Writing the
+// seed into rdest up front would corrupt the low elements of rsrc1, or the mask
+// in register 0, whenever rdest aliases one of them.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  if (vl == 0)
+    return;
+
+  DTR first = getVregData<DTR>(vreg_file, rsrc0, 0);
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+
+    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR second_w = std::is_signed<DT>() ? sext((DTR)second, sizeof(DT) * 8) : zext((DTR)second, sizeof(DT) * 8);
+    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
+    DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
+    first = result;
+  }
+  // Single store after all sources (and the mask) have been consumed.
+  getVregData<DTR>(vreg_file, rdest, 0) = first;
+}
+
+// Width dispatcher for vector_op_vv_red_w. A vsew of 8/16/32 selects the
+// (source, accumulator) type pairs (DT8, DT16), (DT16, DT32), (DT32, DT64).
+// Any other vsew (including 64) prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_red_w<OP, DT8, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_red_w<OP, DT16, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_red_w<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV widening reduction for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Floating-point widening reduction: each DT element of rsrc1 is converted with
+// rv_ftod() and folded with OP<DTR, DTR>::apply() into an accumulator seeded with
+// DTR element 0 of rsrc0. The result is stored in DTR element 0 of rdest.
+// - Elements for which isMasked() (register 0) returns true do not contribute.
+// - When vl is 0 nothing is read or written, so rdest is left unchanged.
+// The running value is kept in a local and written once at the end. Writing the
+// seed into rdest up front would corrupt the low elements of rsrc1, or the mask
+// in register 0, whenever rdest aliases one of them.
+template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
+void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  if (vl == 0)
+    return;
+
+  DTR first = getVregData<DTR>(vreg_file, rsrc0, 0);
+  for (uint32_t i = 0; i < vl; i++) {
+    if (isMasked(vreg_file, 0, i, vmask))
+      continue;
+
+    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR second_w = rv_ftod(second);
+    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
+    DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
+    first = result;
+  }
+  // Single store after all sources (and the mask) have been consumed.
+  getVregData<DTR>(vreg_file, rdest, 0) = first;
+}
+
+// Width dispatcher for vector_op_vv_red_wf: only vsew == 32 is handled, using the
+// (DT32, DT64) type pair. Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew != 32) {
+    std::cout << "Failed to execute VV float widening reduction for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+  vector_op_vv_red_wf<OP, DT32, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+}
+
+// Writes each element's own index into rdest: rdest[i] = i for every index below
+// vl, skipping elements for which isMasked() (register 0) returns true.
+// The index is converted to the element type DT on store.
+template <typename DT>
+void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DP(4, "Element Index = " << +elem);
+      getVregData<DT>(vreg_file, rdest, elem) = elem;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vid: a vsew of 8/16/32/64 selects
+// uint8_t/uint16_t/uint32_t/uint64_t as the element type.
+// Any other vsew prints a diagnostic to std::cout and aborts.
+void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vid<uint8_t>(vreg_file, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vid<uint16_t>(vreg_file, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vid<uint32_t>(vreg_file, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vid<uint64_t>(vreg_file, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute vector element index for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Element-wise comparison producing a bit mask in rdest.
+// For every index below vl that isMasked() (register 0) does not reject, the
+// boolean returned by OP<DT, bool>::apply() on element i of rsrc0 and rsrc1 is
+// stored in bit (i % 8) of byte (i / 8) of rdest. Bits of skipped elements are
+// left as they were.
+template <template <typename DT1, typename DT2> class OP, typename DT>
+void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    if (!isMasked(vreg_file, 0, elem, vmask)) {
+      DT lhs = getVregData<DT>(vreg_file, rsrc0, elem);
+      DT rhs = getVregData<DT>(vreg_file, rsrc1, elem);
+      bool flag = OP<DT, bool>::apply(lhs, rhs, 0);
+      DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +lhs << ", " << +rhs << ")" << " = " << +flag);
+      const uint32_t byte_idx = elem / 8;
+      const uint32_t bit_idx = elem % 8;
+      if (!flag) {
+        getVregData<uint8_t>(vreg_file, rdest, byte_idx) &= ~(1 << bit_idx);
+      } else {
+        getVregData<uint8_t>(vreg_file, rdest, byte_idx) |= 1 << bit_idx;
+      }
+    }
+  }
+}
+
+// Width dispatcher for the comparing vector_op_vv_mask: a vsew of 8/16/32/64
+// selects DT8/DT16/DT32/DT64. Any other vsew prints a diagnostic to std::cout and aborts.
+template <template <typename DT1, typename DT2> class OP, typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl, uint32_t vmask) {
+  if (vsew == 8) {
+    vector_op_vv_mask<OP, DT8>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 16) {
+    vector_op_vv_mask<OP, DT16>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 32) {
+    vector_op_vv_mask<OP, DT32>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else if (vsew == 64) {
+    vector_op_vv_mask<OP, DT64>(vreg_file, rsrc0, rsrc1, rdest, vl, vmask);
+  } else {
+    std::cout << "Failed to execute VV integer/float compare mask for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+
+// Bitwise operation between two mask registers.
+// For every index below vl, bit (i % 8) of byte (i / 8) is extracted from rsrc0
+// and rsrc1, combined with OP<uint8_t, uint8_t>::apply(), and the lowest bit of
+// the outcome is stored at the same bit position of rdest. No element is skipped.
+template <template <typename DT1, typename DT2> class OP>
+void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    const uint32_t byte_idx = elem / 8;
+    const uint32_t bit_idx = elem % 8;
+    uint8_t lhs_byte = getVregData<uint8_t>(vreg_file, rsrc0, byte_idx);
+    bool lhs = (lhs_byte >> bit_idx) & 0x1;
+    uint8_t rhs_byte = getVregData<uint8_t>(vreg_file, rsrc1, byte_idx);
+    bool rhs = (rhs_byte >> bit_idx) & 0x1;
+    bool flag = OP<uint8_t, uint8_t>::apply(lhs, rhs, 0) & 0x1;
+    DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +lhs << ", " << +rhs << ")" << " = " << +flag);
+    if (!flag) {
+      getVregData<uint8_t>(vreg_file, rdest, byte_idx) &= ~(1 << bit_idx);
+    } else {
+      getVregData<uint8_t>(vreg_file, rdest, byte_idx) |= 1 << bit_idx;
+    }
+  }
+}
+
+// Compress: copies the selected elements of rsrc1 into consecutive positions of
+// rdest, starting at position 0. Elements are visited in index order below vl.
+template <typename DT>
+void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
+  int writePos = 0;
+  for (uint32_t elem = 0; elem < vl; ++elem) {
+    // Special case: the mask comes from rsrc0 rather than the default v0.
+    // The instruction is always masked (vmask == 0) even though it is encoded
+    // as unmasked (vmask == 1), hence the literal 0 passed to isMasked().
+    if (!isMasked(vreg_file, rsrc0, elem, 0)) {
+      DT picked = getVregData<DT>(vreg_file, rsrc1, elem);
+      DP(4, "Compression - Moving value " << +picked << " from position " << elem << " to position " << writePos);
+      getVregData<DT>(vreg_file, rdest, writePos) = picked;
+      ++writePos;
+    }
+  }
+}
+
+// Width dispatcher for vector_op_vv_compress: a vsew of 8/16/32/64 selects
+// DT8/DT16/DT32/DT64. Any other vsew prints a diagnostic to std::cout and aborts.
+template <typename DT8, typename DT16, typename DT32, typename DT64>
+void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t vl) {
+  if (vsew == 8) {
+    vector_op_vv_compress<DT8>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 16) {
+    vector_op_vv_compress<DT16>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 32) {
+    vector_op_vv_compress<DT32>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else if (vsew == 64) {
+    vector_op_vv_compress<DT64>(vreg_file, rsrc0, rsrc1, rdest, vl);
+  } else {
+    std::cout << "Failed to execute VV compression for vsew: " << vsew << std::endl;
+    std::abort();
+  }
+}
+#endif
\ No newline at end of file
diff --git a/tests/riscv/riscv-vector-tests/README b/tests/riscv/riscv-vector-tests/README
index bf75d2675..78af65edf 100644
--- a/tests/riscv/riscv-vector-tests/README
+++ b/tests/riscv/riscv-vector-tests/README
@@ -11,7 +11,7 @@ XLEN=64 ./run-test.sh
 
 ## Adding a new testcase
 
-The source code for the vector extension can be found in `sim/simx/execute_vector.cpp`.
+The source code for the vector extension can be found in `sim/simx/vpu.cpp`.
 If you add support for a new vector instruction please go to `run-test.sh` and add it to the default testcases.
 This will ensure your instruction is included in the regression test suite.
 
diff --git a/tests/riscv/riscv-vector-tests/run-test.sh.in b/tests/riscv/riscv-vector-tests/run-test.sh.in
index 68b4b6563..11ebcf313 100755
--- a/tests/riscv/riscv-vector-tests/run-test.sh.in
+++ b/tests/riscv/riscv-vector-tests/run-test.sh.in
@@ -1,7 +1,4 @@
 #!/bin/bash
-VLEN=${VLEN:-256}
-XLEN=${XLEN:-32}
-
 RISCV_TOOLCHAIN_PATH=${RISCV_TOOLCHAIN_PATH:-$TOOLDIR"/riscv"$XLEN"-gnu-toolchain"}
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )