[Feat] PDL Support #1494
Open: w169q169 wants to merge 25 commits into tile-ai:main from w169q169:main.
Diff: +512 −4
Commits (the diff below shows changes from 22 of the 25 commits):
2a67387  [Feat] PDL Support (w169q169)
4898b56  remove unused comments/codes (w169q169)
f50aa99  fix mismatch funcname (w169q169)
b846450  Resolve duplicate config issue for AI review (w169q169)
cb052f4  fix typo error && clang-tidy error (w169q169)
8ed4124  chore: retrigger CI (w169q169)
466c94a  change file comment (w169q169)
05b8dc0  rename test_tilelang_jit_ctypes.py to test_tilelang_jit_cython.py (w169q169)
62ed58f  remove ctype backend (w169q169)
46fe83f  Move pdl attributes to header and remove redundant VisitStmt_ (w169q169)
898135e  chore: trigger CI (w169q169)
1c75231  Merge branch 'main' of https://github.com/tile-ai/tilelang (silentCoder-dev)
7ed6748  modify the cuda kernel with pdl due to nvcc's bug (silentCoder-dev)
8978b11  throw an error when invoking __ldg with pdl due to nvcc's bug (silentCoder-dev)
af5f26d  Throw an error when pdl is not supported (silentCoder-dev)
a84034b  fix checking about pdl (silentCoder-dev)
c123fd7  add comments about MarkCudaSyncCalls (silentCoder-dev)
e0f058f  remove pdl support with compute_capability<90 (silentCoder-dev)
017d314  Merge branch 'main' of https://github.com/tile-ai/tilelang (silentCoder-dev)
877962b  ruff check (silentCoder-dev)
8ad39ab  fix nvrtc about pdl (w169q169)
22455f2  Merge branch 'w169q169/main' into HEAD (w169q169)
2f83ea7  update tvm & extend pdl support for tvm_ffi & add test for tvm_ffi (silentCoder-dev)
24117e6  ensure that kUseDynamicSharedMemoryTag is the last tag in launch_para… (silentCoder-dev)
c1cdf7c  Merge remote-tracking branch 'upstream' (silentCoder-dev)
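Background for reviewers: Programmatic Dependent Launch (PDL), available on compute capability 9.0+ (Hopper), lets a dependent kernel begin launching before its predecessor has fully completed. The producer grid calls cudaTriggerProgrammaticLaunchCompletion() to signal that dependents may start, and the consumer grid calls cudaGridDependencySynchronize() to wait for the producer's work. This PR exposes the pair in tilelang as T.pdl_trigger() and T.pdl_sync(). A condensed sketch of the user-facing API, mirroring the tests in this diff:

```python
# Producer/consumer sketch of the PDL API added by this PR; the
# T.pdl_trigger()/T.pdl_sync() intrinsics and the kernel structure
# mirror the tests included in the diff below.
import tilelang
import tilelang.language as T


@tilelang.jit(execution_backend="cython")
def add_then_scale(N, block_size=256, dtype=T.float32):

    @T.prim_func
    def main(
            A: T.Tensor((N,), dtype),
            B: T.Tensor((N,), dtype),
            C: T.Tensor((N,), dtype),
    ):
        # Producer grid: compute B, then signal that dependent kernels may
        # begin launching (lowers to cudaTriggerProgrammaticLaunchCompletion).
        with T.Kernel(T.ceildiv(N, block_size), threads=block_size) as (bx,):
            for i in T.Parallel(block_size):
                idx = bx * block_size + i
                if idx < N:
                    B[idx] = A[idx] + 1.0
            T.pdl_trigger()

        # Consumer grid: wait for the producer before reading B
        # (lowers to cudaGridDependencySynchronize).
        with T.Kernel(T.ceildiv(N, block_size), threads=block_size) as (bx2,):
            T.pdl_sync()
            for i in T.Parallel(block_size):
                idx = bx2 * block_size + i
                if idx < N:
                    C[idx] = B[idx] * 2.0

    return main
```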
New file: lower_pdl.cc (+99 lines). The pass scans each PrimFunc for call_extern calls to the two CUDA PDL intrinsics and tags matching functions with the kHasTriggerLaunch / kHasGridSync attributes; it rejects __ldg in kernels that use grid sync (NVCC bug, see commit 8978b11) and raises an error if PDL intrinsics appear when support_pdl is false.

```cpp
/*!
 * \file lower_pdl.cc
 * \brief Mark device PrimFuncs with attributes when CUDA PDL functions are called
 */

#include "../op/builtin.h"
#include "../target/utils.h"
#include "common/attr.h"
#include "tvm/ir/type.h"
#include "tvm/tir/builtin.h"
#include "tvm/tir/expr.h"
#include "tvm/tir/stmt.h"
#include <tvm/ffi/reflection/registry.h>
#include <tvm/tir/analysis.h>
#include <tvm/tir/builtin.h>
#include <tvm/tir/stmt_functor.h>
#include <tvm/tir/transform.h>

namespace tvm {
namespace tl {

using namespace tir;

// NVCC has issues with __ldg when using PDL (Programmatic Dependent Launch)
// synchronization, so reject __ldg calls when kHasGridSync is set.
class CheckLDGCalls : public StmtExprVisitor {
public:
  void VisitExpr_(const tir::CallNode *op) final {
    if (op->op.same_as(tl::__ldg())) {
      LOG(FATAL) << "Cannot invoke __ldg function with pdl_sync";
    }
    StmtExprVisitor::VisitExpr_(op);
  }
};

class MarkCudaSyncCalls : public StmtExprMutator {
public:
  static PrimFunc Substitute(PrimFunc f, bool support_pdl) {
    MarkCudaSyncCalls mutator;
    PrimFunc new_f = f;
    new_f.CopyOnWrite()->body = mutator.VisitStmt(f->body);

    if (!support_pdl) {
      ICHECK(!mutator.has_trigger_launch_ && !mutator.has_grid_sync_)
          << "PDL is not supported";
    }

    if (mutator.has_trigger_launch_) {
      new_f = WithAttr(std::move(new_f), attr::kHasTriggerLaunch, 1);
    }
    if (mutator.has_grid_sync_) {
      new_f = WithAttr(std::move(new_f), attr::kHasGridSync, 1);
      CheckLDGCalls analyzer;
      analyzer(f->body);
    }
    return new_f;
  }

  PrimExpr VisitExpr_(const tir::CallNode *op) final {
    if (op && op->op.same_as(builtin::call_extern())) {
      if (!op->args.empty()) {
        if (const auto *str_node = op->args[0].as<tvm::tir::StringImmNode>()) {
          std::string func_name = str_node->value;
          if (func_name == "cudaTriggerProgrammaticLaunchCompletion") {
            has_trigger_launch_ = true;
          } else if (func_name == "cudaGridDependencySynchronize") {
            has_grid_sync_ = true;
          }
        }
      }
    }
    return StmtExprMutator::VisitExpr_(op);
  }

private:
  bool has_trigger_launch_ = false;
  bool has_grid_sync_ = false;

  MarkCudaSyncCalls() = default;
};

using namespace tir::transform;

tvm::transform::Pass MarkCudaSyncCallsPass(bool support_pdl) {
  auto pass_func = [=](PrimFunc f, const IRModule &m, const PassContext &ctx) {
    return MarkCudaSyncCalls::Substitute(f, support_pdl);
  };

  return CreatePrimFuncPass(pass_func, 0, "tl.MarkCudaSyncCalls", {});
}

TVM_FFI_STATIC_INIT_BLOCK() {
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef().def("tl.transform.MarkCudaSyncCalls",
                        MarkCudaSyncCallsPass);
}

} // namespace tl
} // namespace tvm
```
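For orientation, a minimal sketch of driving the registered pass from Python. The global name "tl.transform.MarkCudaSyncCalls" comes from the FFI registration above; everything else here (the tvm.get_global_func entry point and how tilelang actually wires the pass into its lowering pipeline) is an assumption:

```python
# Minimal sketch (not tilelang's actual pipeline wiring) of invoking the
# MarkCudaSyncCalls pass registered above via TVM's global function table.
from tilelang import tvm as tvm  # assumption: loads the library that registers the pass
from tvm.script import tir as S


@S.prim_func
def trivial(A: S.Buffer((1,), "float32")):
    A[0] = 0.0


mod = tvm.IRModule({"main": trivial})

# The packed function takes support_pdl and returns a tvm.transform.Pass.
mark_cuda_sync_calls = tvm.get_global_func("tl.transform.MarkCudaSyncCalls")(True)

# No PDL intrinsics appear in `trivial`, so nothing is tagged; a PrimFunc whose
# body calls cudaTriggerProgrammaticLaunchCompletion or
# cudaGridDependencySynchronize would gain kHasTriggerLaunch / kHasGridSync.
mod = mark_cuda_sync_calls(mod)
```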
New test file (+71 lines): an end-to-end PDL test on the cython execution backend, skipped when the device's compute capability is below 9.0.

```python
from tilelang import tvm as tvm
import tilelang.language as T
import tilelang.testing
import tilelang
import torch
import pytest


def check_pdl():
    if not torch.cuda.is_available():
        return False
    props = torch.cuda.get_device_properties(0)
    compute_capability = props.major, props.minor
    return compute_capability[0] >= 9


def test_cython_pdl():
    """Test pdl."""

    if not check_pdl():
        pytest.skip("PDL Test requires compute capability >= 9")

    N = 64

    @tilelang.jit(execution_backend="cython")
    def multi_kernels_with_pdl(N, block_size=256, dtype=T.float32):

        @T.prim_func
        def main(
                A: T.Tensor((N,), dtype),
                B: T.Tensor((N,), dtype),
                C: T.Tensor((N,), dtype),
        ):
            with T.Kernel(T.ceildiv(N, block_size), threads=block_size) as (bx,):
                for i in T.Parallel(block_size):
                    idx = bx * block_size + i
                    if idx < N:
                        B[idx] = A[idx] + 1.0
                T.pdl_trigger()

            with T.Kernel(T.ceildiv(N, block_size), threads=block_size) as (bx2,):
                T.pdl_sync()
                for i in T.Parallel(block_size):
                    idx = bx2 * block_size + i
                    if idx < N:
                        C[idx] = B[idx] * 2.0

        return main

    # Compile the kernel
    kernel = multi_kernels_with_pdl(N)

    # Create test tensors
    a = torch.randn(N, dtype=torch.float32).cuda()
    b = torch.randn(N, dtype=torch.float32).cuda()
    c = torch.randn(N, dtype=torch.float32).cuda()

    ref_b = a + 1.0
    ref_c = ref_b * 2.0

    kernel(a, b, c)

    # Verify correctness
    tilelang.testing.torch_assert_close(b, ref_b, atol=1e-5, rtol=1e-5)
    tilelang.testing.torch_assert_close(c, ref_c, atol=1e-5, rtol=1e-5)

    print("pdl test passed!")


if __name__ == "__main__":
    tilelang.testing.main()
```
New test file (+56 lines): compiles PDL kernels for sm_90 and asserts that the CUDA PDL intrinsics appear in the generated kernel source.

```python
import tilelang.testing
import tilelang.language as T


def kernels_with_pdl_trigger(N, block_size=256, dtype=T.float32):

    @T.prim_func
    def main(
            A: T.Tensor((N,), dtype),
            B: T.Tensor((N,), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_size), threads=block_size) as (bx,):
            for i in T.Parallel(block_size):
                idx = bx * block_size + i
                if idx < N:
                    B[idx] = A[idx] + 1.0
            T.pdl_trigger()

    return main


def kernels_with_pdl_sync(N, block_size=256, dtype=T.float32):

    @T.prim_func
    def main(
            A: T.Tensor((N,), dtype),
            B: T.Tensor((N,), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_size), threads=block_size) as (bx2,):
            T.pdl_sync()
            for i in T.Parallel(block_size):
                idx = bx2 * block_size + i
                if idx < N:
                    B[idx] = A[idx] * 2.0

    return main


def test_pdl_trigger():
    N = 64
    program = kernels_with_pdl_trigger(N)

    pdl_kernel = tilelang.compile(program, target="cuda -arch=sm_90")
    code = pdl_kernel.get_kernel_source()
    assert "cudaTriggerProgrammaticLaunchCompletion" in code


def test_pdl_sync():
    N = 64
    program = kernels_with_pdl_sync(N)

    pdl_kernel = tilelang.compile(program, target="cuda -arch=sm_90")
    code = pdl_kernel.get_kernel_source()
    assert "cudaGridDependencySynchronize" in code


if __name__ == "__main__":
    tilelang.testing.main()
```