NVIDIA · rdspring1 · Jun 19, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -512,6 +512,7 @@ if(BUILD_CUTLASS)
     set(NVFUSER_CUTLASS_SRCS)
     list(APPEND NVFUSER_CUTLASS_SRCS
         ${NVFUSER_CUTLASS}/fp8_blockwise_moe_kernel.cu
+        ${NVFUSER_CUTLASS}/nvfp4_blockwise_moe_kernel.cu  # compiled into the nvf_cutlass shared library with the other listed sources
         ${NVFUSER_CUTLASS}/cutlass_utils.cpp
         )
     add_library(nvf_cutlass SHARED ${NVFUSER_CUTLASS_SRCS})

diff --git a/cutlass/nvf_cutlass.h b/cutlass/nvf_cutlass.h
@@ -31,4 +31,17 @@ void fp8_blockwise_scaled_grouped_mm(
     const torch::Tensor& expert_offsets,
     const torch::Tensor& workspace);
 
+void nvfp4_blockwise_scaled_grouped_mm(  // declaration only; the definition is not visible in this hunk
+    torch::Tensor& output,  // the only non-const parameter; presumably receives the result -- TODO confirm
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& a_blockscale,  // NOTE(review): singular here vs. plural `b_blockscales` below -- confirm the naming is intentional
+    const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas,
+    const torch::Tensor& ab_strides,
+    const torch::Tensor& c_strides,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& sf_offsets);  // NOTE(review): the fp8 declaration above ends with `workspace` after `expert_offsets`; this one ends with `sf_offsets`
+
 } // namespace nvfuser::cutlass_kernels