NVIDIA
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/cutlass/nvf_cutlass.h b/cutlass/nvf_cutlass.h
Lines changed: 13 additions & 0 deletions
@@ -512,6 +512,7 @@ if(BUILD_CUTLASS)
     set(NVFUSER_CUTLASS_SRCS)
     list(APPEND NVFUSER_CUTLASS_SRCS
         ${NVFUSER_CUTLASS}/fp8_blockwise_moe_kernel.cu
+        ${NVFUSER_CUTLASS}/nvfp4_blockwise_moe_kernel.cu
         ${NVFUSER_CUTLASS}/cutlass_utils.cpp
         )
     add_library(nvf_cutlass SHARED ${NVFUSER_CUTLASS_SRCS})
 
@@ -31,4 +31,17 @@ void fp8_blockwise_scaled_grouped_mm(
     const torch::Tensor& expert_offsets,
     const torch::Tensor& workspace);
 
+void nvfp4_blockwise_scaled_grouped_mm( // Declaration only; presumably defined in nvfp4_blockwise_moe_kernel.cu (added to NVFUSER_CUTLASS_SRCS in this change) — confirm.
+    torch::Tensor& output, // The only non-const parameter; presumably receives the result — confirm against the definition.
+    const torch::Tensor& a, // Read-only input.
+    const torch::Tensor& b, // Read-only input.
+    const torch::Tensor& a_blockscale, // NOTE(review): singular here vs. plural `b_blockscales` on the next line — confirm the naming is intentional.
+    const torch::Tensor& b_blockscales, // Read-only input; presumably the block scales paired with `b` — TODO confirm.
+    const torch::Tensor& alphas, // Read-only input; meaning not visible from this header — TODO document shape/dtype.
+    const torch::Tensor& ab_strides, // Read-only input; presumably strides shared by `a` and `b` — TODO confirm.
+    const torch::Tensor& c_strides, // Read-only input; presumably strides of `output` — TODO confirm.
+    const torch::Tensor& problem_sizes, // Read-only input; layout not visible from this header — TODO confirm.
+    const torch::Tensor& expert_offsets, // Same parameter name appears in fp8_blockwise_scaled_grouped_mm above.
+    const torch::Tensor& sf_offsets); // The fp8 declaration above ends with `workspace` instead; presumably scale-factor offsets — TODO confirm.
+
 } // namespace nvfuser::cutlass_kernels
Original file line number	Diff line number	Diff line change
`@@ -512,6 +512,7 @@ if(BUILD_CUTLASS)`
`512`	`512`	`set(NVFUSER_CUTLASS_SRCS)`
`513`	`513`	`list(APPEND NVFUSER_CUTLASS_SRCS`
`514`	`514`	`${NVFUSER_CUTLASS}/fp8_blockwise_moe_kernel.cu`
	`515`	`+ ${NVFUSER_CUTLASS}/nvfp4_blockwise_moe_kernel.cu`
`515`	`516`	`${NVFUSER_CUTLASS}/cutlass_utils.cpp`
`516`	`517`	`)`
`517`	`518`	`add_library(nvf_cutlass SHARED ${NVFUSER_CUTLASS_SRCS})`