gfx-rs · 39ali · Mar 28, 2026
diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs
@@ -7235,6 +7235,15 @@ template <typename A>
                 }
             }
 
+            //https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - 1.6.6
+            if ep.stage == crate::ShaderStage::Compute && options.lang_version >= (3, 0) {
+                let total_threads =
+                    ep.workgroup_size[0] * ep.workgroup_size[1] * ep.workgroup_size[2];
+                write!(
+                    self.out,
+                    "[[max_total_threads_per_threadgroup({total_threads})]] "
+                )?;
+            }
             // Write the entry point function's name, and begin its argument list.
             writeln!(self.out, "{em_str} {result_type_name} {fun_name}(")?;
 

diff --git a/naga/tests/out/msl/wgsl-atomicOps-float32.metal b/naga/tests/out/msl/wgsl-atomicOps-float32.metal
@@ -14,7 +14,7 @@ struct Struct {
 
 struct cs_mainInput {
 };
-kernel void cs_main(
+[[max_total_threads_per_threadgroup(2)]] kernel void cs_main(
   metal::uint3 id [[thread_position_in_threadgroup]]
 , device metal::atomic_float& storage_atomic_scalar [[user(fake0)]]
 , device type_2& storage_atomic_arr [[user(fake0)]]

diff --git a/naga/tests/out/msl/wgsl-atomicTexture-int64.metal b/naga/tests/out/msl/wgsl-atomicTexture-int64.metal
@@ -7,7 +7,7 @@ using metal::uint;
 
 struct cs_mainInput {
 };
-kernel void cs_main(
+[[max_total_threads_per_threadgroup(2)]] kernel void cs_main(
   metal::uint3 id [[thread_position_in_threadgroup]]
 , metal::texture2d<ulong, metal::access::read_write> image [[user(fake0)]]
 ) {

diff --git a/naga/tests/out/msl/wgsl-atomicTexture.metal b/naga/tests/out/msl/wgsl-atomicTexture.metal
@@ -7,7 +7,7 @@ using metal::uint;
 
 struct cs_mainInput {
 };
-kernel void cs_main(
+[[max_total_threads_per_threadgroup(2)]] kernel void cs_main(
   metal::uint3 id [[thread_position_in_threadgroup]]
 , metal::texture2d<uint, metal::access::read_write> image_u [[user(fake0)]]
 , metal::texture2d<int, metal::access::read_write> image_s [[user(fake0)]]

diff --git a/naga/tests/out/msl/wgsl-memory-decorations-coherent.metal b/naga/tests/out/msl/wgsl-memory-decorations-coherent.metal
@@ -14,7 +14,7 @@ struct Data {
     type_1 values;
 };
 
-kernel void main_(
+[[max_total_threads_per_threadgroup(1)]] kernel void main_(
   coherent device Data& coherent_buf [[user(fake0)]]
 , device Data const& plain_buf [[user(fake0)]]
 , constant _mslBufferSizes& _buffer_sizes [[user(fake0)]]

diff --git a/wgpu-types/src/limits.rs b/wgpu-types/src/limits.rs
@@ -219,6 +219,9 @@ pub struct Limits {
     pub max_compute_workgroup_storage_size: u32,
     /// Maximum value of the product of the `workgroup_size` dimensions for a compute entry-point.
     /// Defaults to 256. Higher is "better".
+    /// **Note:** On Metal, this is used to emit the `[[max_total_threads_per_threadgroup]]`
+    /// attribute. This is how we guarantee that compute dispatches will always run
+    /// and not silently fail due to hardware register pressure or occupancy limits.
     pub max_compute_invocations_per_workgroup: u32,
     /// The maximum value of the `workgroup_size` X dimension for a compute stage `ShaderModule` entry-point.
     /// Defaults to 256. Higher is "better".