
Commit bbd2c4e

Fix some bugs on ci and open rocm ci test
1 parent 869f021 commit bbd2c4e

31 files changed: +365 -89 lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
@@ -383,7 +383,7 @@ jobs:
           pytest --verbose --color=yes --durations=0 --showlocals --cache-clear
         )
         "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \
-          ./python/amd
+          ./python

       # Apple Metal tests
       - name: Run Metal tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }})

src/op/logical.cc

Lines changed: 4 additions & 2 deletions
@@ -42,14 +42,16 @@ TVM_REGISTER_OP("tl.any_of")
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kPure))
     .set_attr<TScriptPrinterName>("TScriptPrinterName", "any_of")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", any_of_op);
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", any_of_op)
+    .set_attr<FLowerIntrinsic>("hip.FLowerIntrinsic", any_of_op);

 TVM_REGISTER_OP("tl.all_of")
     .set_num_inputs(1)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kPure))
     .set_attr<TScriptPrinterName>("TScriptPrinterName", "all_of")
-    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", all_of_op);
+    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic", all_of_op)
+    .set_attr<FLowerIntrinsic>("hip.FLowerIntrinsic", all_of_op);

 } // namespace tl
 } // namespace tvm
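The new "hip.FLowerIntrinsic" entries register the same lowering functions for the ROCm backend that were already used for CUDA, presumably so that generated HIP kernels can call the tl::Any / tl::All helpers added to src/tl_templates/hip/common.h in this same commit. A minimal sketch of what a lowered call might look like in generated HIP code; the kernel name and predicate buffer are purely illustrative:

#include "common.h" // provides tl::Any / tl::All (added by this commit)

// Illustrative only: reduce a per-thread predicate buffer of length `size`.
__global__ void any_all_example(bool *pred, int *out, int size) {
  bool any = tl::Any(pred, size); // true if any pred[i] is non-zero
  bool all = tl::All(pred, size); // true only if every pred[i] is non-zero
  if (threadIdx.x == 0) {
    out[0] = any ? 1 : 0;
    out[1] = all ? 1 : 0;
  }
}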

src/tl_templates/hip/atomic.h

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+template <typename T1, typename T2>
+__forceinline__ __device__ void AtomicAdd(
+    T1 *address, T2 val, int memory_order = 0) {
+  atomicAdd(reinterpret_cast<T1 *>(address), static_cast<T1>(val));
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+// Overload for when the first argument is a value instead of a pointer
+template <typename T1, typename T2>
+__forceinline__ __device__ void AtomicAdd(
+    T1 &address, T2 val, int memory_order = 0) {
+  atomicAdd(reinterpret_cast<T1 *>(&address), static_cast<T1>(val));
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+template <typename T1, typename T2>
+__forceinline__ __device__ T1 AtomicAddRet(
+    T1 &ref, T2 val, int memory_order = 0) {
+  return atomicAdd(&ref, static_cast<T1>(val));
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+template <typename T1, typename T2>
+__forceinline__ __device__ void AtomicMax(
+    T1 *address, T2 val, int memory_order = 0) {
+  atomicMax(reinterpret_cast<T1 *>(address), static_cast<T1>(val));
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+// Overload for when the first argument is a value instead of a pointer
+template <typename T1, typename T2>
+__forceinline__ __device__ void AtomicMax(
+    T1 &address, T2 val, int memory_order = 0) {
+  atomicMax(reinterpret_cast<T1 *>(&address), static_cast<T1>(val));
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+template <typename T1, typename T2>
+__forceinline__ __device__ void AtomicMin(
+    T1 *address, T2 val, int memory_order = 0) {
+  atomicMin(reinterpret_cast<T1 *>(address), static_cast<T1>(val));
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+// Overload for when the first argument is a value instead of a pointer
+template <typename T1, typename T2>
+__forceinline__ __device__ void AtomicMin(
+    T1 &address, T2 val, int memory_order = 0) {
+  atomicMin(reinterpret_cast<T1 *>(&address), static_cast<T1>(val));
+}
+
+__forceinline__ __device__ void AtomicAddx2(
+    float *ref, float *val, int memory_order = 0) {
+  float2 add_val = *reinterpret_cast<float2 *>(val);
+  atomicAdd(ref + 0, add_val.x);
+  atomicAdd(ref + 1, add_val.y);
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+__forceinline__ __device__ float2 AtomicAddx2Ret(
+    float *ref, float *val, int memory_order = 0) {
+  float2 add_val = *reinterpret_cast<float2 *>(val);
+  float2 ret;
+  ret.x = atomicAdd(ref + 0, add_val.x);
+  ret.y = atomicAdd(ref + 1, add_val.y);
+  return ret;
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+__forceinline__ __device__ void AtomicAddx4(
+    float *ref, float *val, int memory_order = 0) {
+  float4 add_val = *reinterpret_cast<float4 *>(val);
+  atomicAdd(ref + 0, add_val.x);
+  atomicAdd(ref + 1, add_val.y);
+  atomicAdd(ref + 2, add_val.z);
+  atomicAdd(ref + 3, add_val.w);
+}
+
+// Add an extra unused input to accommodate the additional 'memory_order'
+// argument during lowering.
+__forceinline__ __device__ float4 AtomicAddx4Ret(
+    float *ref, float *val, int memory_order = 0) {
+  float4 add_val = *reinterpret_cast<float4 *>(val);
+  float4 ret;
+  ret.x = atomicAdd(ref + 0, add_val.x);
+  ret.y = atomicAdd(ref + 1, add_val.y);
+  ret.z = atomicAdd(ref + 2, add_val.z);
+  ret.w = atomicAdd(ref + 3, add_val.w);
+  return ret;
+}
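These wrappers appear to mirror the CUDA-side helpers: they forward to HIP's atomicAdd / atomicMax / atomicMin and simply discard the trailing memory_order argument that the lowering pass appends. A minimal usage sketch with a hypothetical kernel name and buffer layout, assuming float data (which HIP's hardware atomicAdd supports):

#include "atomic.h"

// Illustrative kernel: every thread accumulates its contribution into out[0]
// and out[1], and adds a float4 into out[4..7] via the vectorized helper.
__global__ void atomic_example(float *out, const float *contrib) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Pointer form; the memory_order argument is accepted and ignored.
  AtomicAdd(&out[0], contrib[i], /*memory_order=*/0);
  // Reference overload: adds into out[1], taking the destination by reference.
  AtomicAdd(out[1], contrib[i]);
  // Vectorized form: four consecutive atomicAdds on out[4..7].
  float4 v = make_float4(contrib[i], contrib[i], contrib[i], contrib[i]);
  AtomicAddx4(&out[4], reinterpret_cast<float *>(&v));
}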

src/tl_templates/hip/common.h

Lines changed: 99 additions & 10 deletions
@@ -5,6 +5,7 @@
 #include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
 #include <rocwmma/rocwmma.hpp>
+#include "atomic.h"

 #define HIPRT_INF_F __int_as_float(0x7f800000)
 #define HIPRT_NEGINF_F __int_as_float(0xff800000)
@@ -105,18 +106,106 @@ TL_DEVICE unsigned __pack_bfloat162(const bfloat16_t x, const bfloat16_t y) {
   return (v1 << 16) | v0;
 }

-template <typename T1, typename T2>
-TL_DEVICE void AtomicAdd(T1 *address, T2 val) {
-  atomicAdd(reinterpret_cast<T1 *>(address), static_cast<T1>(val));
+namespace tl {
+
+// Any
+template <typename T> TL_DEVICE bool Any(T *a, int size) {
+  for (int i = 0; i < size; i++) {
+    if (a[i]) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// All
+template <typename T> TL_DEVICE bool All(T *a, int size) {
+  for (int i = 0; i < size; i++) {
+    if (!a[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Shuffle functions for HIP
+template <typename T>
+TL_DEVICE T shfl_xor_sync(unsigned mask, T val, int laneMask) {
+  return __shfl_xor_sync(mask, val, laneMask);
+}
+
+template <typename T>
+TL_DEVICE T shfl_down_sync(unsigned mask, T val, int delta) {
+  return __shfl_down_sync(mask, val, delta);
+}
+
+template <typename T>
+TL_DEVICE T shfl_up_sync(unsigned mask, T val, int delta) {
+  return __shfl_up_sync(mask, val, delta);
+}
+
+template <typename T>
+TL_DEVICE T shfl_sync(unsigned mask, T val, int srcLane) {
+  return __shfl_sync(mask, val, srcLane);
+}
+
+// Specializations for half_t (float16_t)
+template <>
+TL_DEVICE half_t shfl_xor_sync(unsigned mask, half_t val, int laneMask) {
+  float f = static_cast<float>(val);
+  float r = __shfl_xor_sync(mask, f, laneMask);
+  return half_t(r);
+}
+
+template <>
+TL_DEVICE half_t shfl_down_sync(unsigned mask, half_t val, int delta) {
+  float f = static_cast<float>(val);
+  float r = __shfl_down_sync(mask, f, delta);
+  return half_t(r);
+}
+
+template <>
+TL_DEVICE half_t shfl_up_sync(unsigned mask, half_t val, int delta) {
+  float f = static_cast<float>(val);
+  float r = __shfl_up_sync(mask, f, delta);
+  return half_t(r);
+}
+
+template <>
+TL_DEVICE half_t shfl_sync(unsigned mask, half_t val, int srcLane) {
+  float f = static_cast<float>(val);
+  float r = __shfl_sync(mask, f, srcLane);
+  return half_t(r);
+}
+
+// Specializations for bfloat16_t
+template <>
+TL_DEVICE bfloat16_t shfl_xor_sync(unsigned mask, bfloat16_t val,
+                                   int laneMask) {
+  float f = static_cast<float>(val);
+  float r = __shfl_xor_sync(mask, f, laneMask);
+  return bfloat16_t(r);
 }

-// Overload for when the first argument is a value instead of a pointer
-template <typename T1, typename T2>
-TL_DEVICE void AtomicAdd(T1 address, T2 val) {
-  atomicAdd(reinterpret_cast<T1 *>(&address), static_cast<T1>(val));
+template <>
+TL_DEVICE bfloat16_t shfl_down_sync(unsigned mask, bfloat16_t val, int delta) {
+  float f = static_cast<float>(val);
+  float r = __shfl_down_sync(mask, f, delta);
+  return bfloat16_t(r);
 }

-template <typename T1, typename T2>
-TL_DEVICE T1 AtomicAddRet(T1 *address, T2 val) {
-  return atomicAdd(reinterpret_cast<T1 *>(address), static_cast<T1>(val));
+template <>
+TL_DEVICE bfloat16_t shfl_up_sync(unsigned mask, bfloat16_t val, int delta) {
+  float f = static_cast<float>(val);
+  float r = __shfl_up_sync(mask, f, delta);
+  return bfloat16_t(r);
 }
+
+template <>
+TL_DEVICE bfloat16_t shfl_sync(unsigned mask, bfloat16_t val, int srcLane) {
+  float f = static_cast<float>(val);
+  float r = __shfl_sync(mask, f, srcLane);
+  return bfloat16_t(r);
+}
+
+} // namespace tl
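The specializations route half_t and bfloat16_t through float, presumably because the raw __shfl_* builtins do not handle those types directly; callers get one uniform interface regardless of element type. A minimal sketch of a wavefront sum over half_t using the wrapper; the 64-lane width is an assumption (AMD wave64) and the function name is illustrative:

#include "common.h"

// Illustrative wavefront reduction: after the loop, lane 0 holds the sum of
// the half_t values contributed by all 64 lanes.
__device__ half_t wave_sum_half(half_t v) {
  constexpr unsigned kFullMask = 0xffffffff; // same full mask reduce.h now uses
  for (int offset = 64 / 2; offset > 0; offset >>= 1) {
    // The half_t specialization converts to float, shuffles, and converts back,
    // so this call site looks identical to the float or int case.
    half_t other = tl::shfl_down_sync(kFullMask, v, offset);
    v = half_t(static_cast<float>(v) + static_cast<float>(other));
  }
  return v;
}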

src/tl_templates/hip/debug.h

Lines changed: 25 additions & 0 deletions
@@ -47,6 +47,17 @@ __device__ void debug_print_var<unsigned int>(const char *msg,
          (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, var);
 }

+// Specialization for unsigned short type
+template <>
+__device__ void debug_print_var<half_t>(const char *msg, half_t var) {
+  const char *safe_msg = msg;
+  float value = static_cast<float>(var);
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): "
+         "dtype=half_t value=%f\n",
+         safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
+         (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, value);
+}
+
 // Specialization for float type
 template <> __device__ void debug_print_var<float>(const char *msg, float var) {
   const char *safe_msg = msg;
@@ -133,6 +144,20 @@ debug_print_buffer_value<unsigned char>(const char *msg, const char *buf_name,
          index, value);
 }

+// Specialization for bool type
+template <>
+__device__ void debug_print_buffer_value<bool>(const char *msg,
+                                               const char *buf_name, int index,
+                                               bool var) {
+  const char *safe_msg = msg;
+  const char *safe_buf_name = buf_name;
+  printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, "
+         "index=%d, dtype=bool value=%s\n",
+         safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
+         (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name,
+         index, var ? "true" : "false");
+}
+
 // Specialization for integer type
 template <>
 __device__ void debug_print_buffer_value<int>(const char *msg,
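The new specializations follow the existing debug_print pattern: convert the value into something printf can format (float for half_t, a "true"/"false" string for bool). A minimal sketch of how device code might call them; the kernel, messages, and buffer name are illustrative, not the actual output of TileLang's codegen:

#include "debug.h"

// Illustrative call sites for the two new specializations.
__global__ void debug_example(half_t *x, bool *flags) {
  // Prints the half value converted to float.
  debug_print_var<half_t>("x[0]", x[0]);
  // Prints "true"/"false" for the bool element at index 3 of buffer "flags".
  debug_print_buffer_value<bool>("check", "flags", 3, flags[3]);
}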

src/tl_templates/hip/reduce.h

Lines changed: 4 additions & 2 deletions
@@ -73,7 +73,8 @@ struct SharedReduceWarp {
     }

     for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) {
-      T other = __shfl_down(partial, offset, kWarpSize);
+      constexpr uint32_t mask = 0xffffffff;
+      T other = tl::shfl_down_sync(mask, partial, offset, kWarpSize);
       partial = Reducer()(partial, other);
     }

@@ -104,7 +105,8 @@ struct AllReduce {
       __syncthreads();
       x = Reducer()(x, red_buf[threadIdx.x ^ offset]);
     } else {
-      x = Reducer()(x, __shfl_xor(x, offset));
+      constexpr uint32_t mask = 0xffffffff;
+      x = Reducer()(x, tl::shfl_xor_sync(mask, x, offset));
     }
     if constexpr (offset == scale) {
       return x;
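Both reduction paths now go through the tl:: shuffle wrappers with an explicit full mask instead of the maskless __shfl_down / __shfl_xor builtins, which also makes the half_t / bfloat16_t specializations from common.h usable here. A standalone sketch of the xor-butterfly step that AllReduce relies on, assuming a 32-lane width and a plain sum reducer (both illustrative):

#include "common.h"

// Illustrative stand-in for the Reducer functor used by AllReduce.
struct SumOp {
  template <typename T> __device__ T operator()(T a, T b) { return a + b; }
};

// After log2(32) xor-exchanges, every lane holds the reduction of all 32 lanes.
template <typename T> __device__ T butterfly_allreduce(T x) {
  constexpr uint32_t mask = 0xffffffff;
  for (int offset = 16; offset > 0; offset >>= 1) {
    x = SumOp()(x, tl::shfl_xor_sync(mask, x, offset));
  }
  return x;
}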

testing/python/autotune/test_tilelang_autotune.py

Lines changed: 2 additions & 0 deletions
@@ -260,11 +260,13 @@ def main(
     return autotuner.run(warmup=3, rep=20)


+@tilelang.testing.requires_cuda
 def test_autotune_get_configs():
     get_configs(1024, 1024, 1024, with_roller=True)
     get_configs(1024, 1024, 1024, with_roller=False)


+@tilelang.testing.requires_cuda
 def test_autotune_matmul():
     matmul(1024, 1024, 1024, with_roller=True)
     matmul(1024, 1024, 1024, with_roller=False)

testing/python/carver/test_tilelang_carver_recommend_hints.py

Lines changed: 1 addition & 0 deletions
@@ -132,6 +132,7 @@ def run_fmha_recommend_hints(
     assert len(hints) > 0, "Hints length should be greater than 0"


+@tilelang.testing.requires_cuda
 def test_fmha_recommend_hints():
     run_fmha_recommend_hints(4, 32, 512, 512, 128, "float16", "float16", "float16")
     run_fmha_recommend_hints(4, 32, 512, 512, 128, "int8", "int32", "int32")

testing/python/components/test_storage_rewrite_detect_inplace.py

Lines changed: 6 additions & 2 deletions
@@ -1,6 +1,9 @@
 import tilelang
 import tilelang.testing
 from tilelang import language as T
+from tilelang.utils.target import check_hip_availability
+
+_IS_HIP_AVAILABLE = check_hip_availability()


 @tilelang.jit
@@ -54,8 +57,9 @@ def test_storage_rewrite_detect_inplace_toggle():
     script_off = _get_device_kernel_script(detect_inplace=False)
     script_on = _get_device_kernel_script(detect_inplace=True)

-    assert script_off.count("read = (read * 2);") == 0
-    assert script_on.count("read = (read * 2);") > 0
+    pattern = "read[0] = (read[0] * 2);" if _IS_HIP_AVAILABLE else "read = (read * 2);"
+    assert script_off.count(pattern) == 0
+    assert script_on.count(pattern) > 0


 if __name__ == "__main__":

testing/python/debug/test_device_assert.py

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ def program():
         tid = T.get_thread_binding()
         T.device_assert(tid > 0, "Assertion Trigger !")

-    jit_kernel = tilelang.compile(program, target="cuda")
+    jit_kernel = tilelang.compile(program, target="auto")
     profiler = jit_kernel.get_profiler()
     profiler.run_once()

@@ -25,7 +25,7 @@ def program():
         tid = T.get_thread_binding()
         T.device_assert(tid == tid)

-    jit_kernel = tilelang.compile(program, target="cuda")
+    jit_kernel = tilelang.compile(program, target="auto")
     profiler = jit_kernel.get_profiler()
     profiler.run_once()

0 commit comments