13 changes: 8 additions & 5 deletions examples/gemm_fp8/example_tilelang_gemm_amd.py
@@ -2,6 +2,7 @@
import tilelang
import tilelang.language as T
from tilelang.utils.tensor import torch_assert_close
from tilelang.utils import select_fp8_e4m3_dtype, select_torch_fp8_e4m3_dtype
import itertools


@@ -17,8 +18,9 @@ def supply_prog(args):
a_param, b_param = args
M, K = a_param.shape
N, _ = b_param.shape
a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz)
b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz)
fp8_dtype = select_torch_fp8_e4m3_dtype()
a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype)
b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype)
return [a, b]


@@ -53,7 +55,7 @@ def get_configs()
)
@tilelang.jit(out_idx=[-1])
def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pack, gemm_type):
dtype = T.float8_e4m3fnuz
dtype = select_fp8_e4m3_dtype()
accum_dtype = T.float32

@T.prim_func
@@ -104,8 +106,9 @@ def gemm_fp8_ss(

def test_gemm_fp8(M, N, K):
kernel = fp8_matmul(M, N, K)
a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz)
b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz)
fp8_dtype = select_torch_fp8_e4m3_dtype()
a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype)
b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype)
c = kernel(a, b)
ref_c = ref_program(a, b)
torch_assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
7 changes: 5 additions & 2 deletions examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py
@@ -7,6 +7,7 @@
from tilelang.tileop.base import GemmWarpPolicy
from tilelang.layout import make_swizzled_layout
from tilelang.intrinsics.mfma_macro_generator import MatrixCorePreshuffleIntrinEmitter
from tilelang.utils import select_fp8_e4m3_dtype

tilelang.testing.set_random_seed(0)

@@ -45,12 +46,14 @@ def tl_matmul(
num_stages,
k_pack=2,
num_threads=256,
in_dtype=T.float8_e4m3fnuz,
in_dtype=None,
out_dtype=T.float32,
accum_dtype=T.float32,
a_transposed=False,
b_transposed=True,
):
if in_dtype is None:
in_dtype = select_fp8_e4m3_dtype()
b_preshuffle = True
warp_size = 64
num_warps = num_threads // warp_size
@@ -164,7 +167,7 @@ def shuffle_weight(


def assert_tl_matmul_correctness(M, N, K, k_pack=1, a_transposed=False, b_transposed=True):
in_dtype = T.float8_e4m3fnuz
in_dtype = select_fp8_e4m3_dtype()
out_dtype = T.float32
accum_dtype = T.float32
kernel = tl_matmul(
105 changes: 69 additions & 36 deletions src/tl_templates/hip/hip_fp8.h
@@ -1,5 +1,6 @@
#pragma once
#include <hip/amd_detail/amd_hip_fp8.h>
#include <stdint.h>

#define HIP_FP8_ENABLED 1

@@ -16,53 +17,84 @@

#if (TILELANG_FP8_E4M3_VARIANT == TILELANG_FP8_E4M3_VARIANT_FN)
#if defined(__clang__) && defined(__HIPCC__)
#if __is_identifier(__hip_fp8_e4m3)
#if !__is_identifier(__hip_fp8_e4m3)
#define TILELANG_HAVE_FP8_E4M3_FN 1
#endif
#endif
#endif

#if defined(TILELANG_HAVE_FP8_E4M3_FN)
using fp8_e4_t = __hip_fp8_e4m3;
using fp8_e4_2_t = __hip_fp8x2_e4m3;
using fp8_e4_4_storage_t = __hip_fp8x4_e4m3;
using hip_fp8_e4_t = __hip_fp8_e4m3;
using hip_fp8x2_e4_t = __hip_fp8x2_e4m3;
using hip_fp8x4_e4_t = __hip_fp8x4_e4m3;
#else
// FNUZ path (MI300X and universal fallback)
using fp8_e4_t = __hip_fp8_e4m3_fnuz;
using fp8_e4_2_t = __hip_fp8x2_e4m3_fnuz;
using fp8_e4_4_storage_t = __hip_fp8x4_e4m3_fnuz;
using hip_fp8_e4_t = __hip_fp8_e4m3_fnuz;
using hip_fp8x2_e4_t = __hip_fp8x2_e4m3_fnuz;
using hip_fp8x4_e4_t = __hip_fp8x4_e4m3_fnuz;
#endif

struct fp8_e4_t {
unsigned char data;
__device__ fp8_e4_t() {}
__device__ fp8_e4_t(hip_fp8_e4_t val) {
data = *reinterpret_cast<unsigned char *>(&val);
}
__device__ fp8_e4_t(float val) {
constexpr __hip_fp8_interpretation_t interp =
#if (TILELANG_FP8_E4M3_VARIANT == TILELANG_FP8_E4M3_VARIANT_FNUZ)
__HIP_E4M3_FNUZ;
#else
__HIP_E4M3;
#endif
data = __hip_cvt_float_to_fp8(val, __HIP_SATFINITE, interp);
}
__device__ operator hip_fp8_e4_t() const {
return *reinterpret_cast<const hip_fp8_e4_t *>(&data);
}
__device__ operator float() const {
return static_cast<float>(static_cast<hip_fp8_e4_t>(*this));
}
};

using fp8_e4_2_t = hip_fp8x2_e4_t;
using fp8_e4_4_storage_t = uint32_t;

// Additional FP8 types for compatibility
using fp8_e5_t = __hip_fp8_e5m2_fnuz;
using hip_fp8_e5_t = __hip_fp8_e5m2_fnuz;
using fp8_e5_2_t = __hip_fp8x2_e5m2_fnuz;

struct fp8_e5_t {
unsigned char data;
__device__ fp8_e5_t() {}
__device__ fp8_e5_t(hip_fp8_e5_t val) {
data = *reinterpret_cast<unsigned char *>(&val);
}
__device__ operator hip_fp8_e5_t() const {
return *reinterpret_cast<const hip_fp8_e5_t *>(&data);
}
__device__ operator float() const {
return static_cast<float>(static_cast<hip_fp8_e5_t>(*this));
}
};
// Note: E8M0 types are not supported in current HIP version
// using fp8_e8_t = __hip_fp8_e8m0_fnuz;
// using fp8_e8_2_t = __hip_fp8x2_e8m0_fnuz;

// Simple wrapper that provides member access for generated code
struct fp8_e4_4_t {
union {
// __hip_fp8x4_e4m3_fnuz data;
fp8_e4_4_storage_t data;
struct {
fp8_e4_t x, y, z, w;
};
};

// Default constructor
__device__ fp8_e4_4_t() = default;
struct __align__(4) fp8_e4_4_t {
fp8_e4_4_storage_t data;

// Constructor from __hip_fp8x4_e4m3_fnuz
__device__ fp8_e4_4_t() {}
__device__ fp8_e4_4_t(const fp8_e4_4_storage_t &val) : data(val) {}
__device__ fp8_e4_4_t(const hip_fp8x4_e4_t &val) {
data = *reinterpret_cast<const fp8_e4_4_storage_t *>(&val);
}

// Constructor from float4
__device__ fp8_e4_4_t(const float4 &val) : data(val) {}

// Conversion operator to __hip_fp8x4_e4m3_fnuz
__device__ operator fp8_e4_4_storage_t() const { return data; }
__device__ operator hip_fp8x4_e4_t() const {
return *reinterpret_cast<const hip_fp8x4_e4_t *>(&data);
}

// Assignment operator
__device__ fp8_e4_4_t &operator=(const fp8_e4_4_storage_t &val) {
data = val;
return *this;
@@ -80,16 +112,17 @@ struct __align__(16) fp8_e4_16_t {
};

// FP8 E5M2 vector types
struct fp8_e5_4_t {
union {
__hip_fp8x4_e5m2_fnuz data;
struct {
fp8_e5_t x, y, z, w;
};
};
__device__ fp8_e5_4_t() = delete;
__device__ fp8_e5_4_t(const __hip_fp8x4_e5m2_fnuz &val) : data(val) {}
__device__ operator __hip_fp8x4_e5m2_fnuz() const { return data; }
using fp8_e5_4_storage_t = uint32_t;

struct __align__(4) fp8_e5_4_t {
fp8_e5_4_storage_t data;
__device__ fp8_e5_4_t() {}
__device__ fp8_e5_4_t(const __hip_fp8x4_e5m2_fnuz &val) {
data = *reinterpret_cast<const fp8_e5_4_storage_t *>(&val);
}
__device__ operator __hip_fp8x4_e5m2_fnuz() const {
return *reinterpret_cast<const __hip_fp8x4_e5m2_fnuz *>(&data);
}
};

struct __align__(8) fp8_e5_8_t {
4 changes: 3 additions & 1 deletion tilelang/intrinsics/mfma_macro_generator.py
@@ -49,6 +49,7 @@ class MatrixCoreIntrinEmitter:
"int32": "int32",
"float8_e4m3": "e4m3",
"float8_e5m2": "e5m2",
"float8_e4m3fn": "e4m3fn",
"float8_e4m3fnuz": "e4m3fnuz",
"float8_e5m2fnuz": "e5m2fnuz",
}
@@ -108,7 +109,7 @@ def __init__(

def _initialize_k_dim(self, a_dtype=T.float16):
if isinstance(a_dtype, str):
if a_dtype in ["float8_e4m3fnuz", "float8_e5m2fnuz", T.int8]:
if a_dtype in ["float8_e4m3fn", "float8_e4m3fnuz", "float8_e5m2fnuz", T.int8]:
self.k_dim = 32
return
a_dtype = DataType(a_dtype)
@@ -141,6 +142,7 @@ def _initialize_mfma_prefix(self, k_dim=16):
"float32": "f32",
"int8": "i8",
"int32": "i32",
"float8_e4m3fn": "fp8",
"float8_e4m3fnuz": "fp8",
"float8_e5m2fnuz": "fp8",
}[in_dtype]
2 changes: 1 addition & 1 deletion tilelang/utils/__init__.py
@@ -1,6 +1,6 @@
"""The profiler and convert to torch utils"""

from .target import determine_target # noqa: F401
from .target import determine_target, select_fp8_e4m3_dtype, select_torch_fp8_e4m3_dtype # noqa: F401
⚠️ Potential issue | 🟡 Minor

Remove the unused # noqa: F401 to satisfy Ruff.

Ruff flags the directive as unused on this line, which can fail linting. Either drop it or enable F401 in the config.

🧹 Proposed fix
-from .target import determine_target, select_fp8_e4m3_dtype, select_torch_fp8_e4m3_dtype  # noqa: F401
+from .target import determine_target, select_fp8_e4m3_dtype, select_torch_fp8_e4m3_dtype
🧰 Tools
🪛 Ruff (0.14.14)

3-3: Unused noqa directive (non-enabled: F401)

Remove unused noqa directive

(RUF100)


from .tensor import TensorSupplyType, torch_assert_close, map_torch_type # noqa: F401
from .language import (
is_global, # noqa: F401
25 changes: 25 additions & 0 deletions tilelang/utils/target.py
@@ -64,6 +64,31 @@ def check_metal_availability() -> bool:
return arch == "arm64"


def select_fp8_e4m3_dtype() -> str:
It would be better to rename this to determine_fp8_type.

"""
Select the correct FP8 E4M3 dtype string for the current platform.
- CUDA defaults to FP8 E4M3FN.
- ROCm uses FNUZ except gfx950 (OCP), which requires FN.
"""
if torch.version.hip is None:
return "float8_e4m3fn"
if not torch.cuda.is_available():
return "float8_e4m3fnuz"
props = torch.cuda.get_device_properties(0)
gcn_arch = getattr(props, "gcnArchName", "")
if gcn_arch.startswith("gfx950"):
Comment on lines +73 to +79
⚠️ Potential issue | 🟠 Major

Use the current device when querying GPU architecture for dtype selection.

In multi-GPU ROCm/HIP systems, device 0 may not be the active device. When selecting the FP8 dtype based on GPU architecture (gfx950 vs. other), the function must query the current device instead of hardcoding device 0; otherwise it may return the wrong dtype.

🔧 Proposed fix
-    props = torch.cuda.get_device_properties(0)
+    device = torch.cuda.current_device()
+    props = torch.cuda.get_device_properties(device)
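For reviewers who want to sanity-check the selection on their own node, a minimal sketch that uses only the helpers added in this PR (it queries the current device, as suggested above; printed values depend on the local GPU):

import torch
from tilelang.utils import select_fp8_e4m3_dtype, select_torch_fp8_e4m3_dtype

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(torch.cuda.current_device())
    print(getattr(props, "gcnArchName", ""))  # arch string the helper inspects (gfx950 -> FN variant)
print(select_fp8_e4m3_dtype())        # "float8_e4m3fn" or "float8_e4m3fnuz"
print(select_torch_fp8_e4m3_dtype())  # matching torch dtype, e.g. torch.float8_e4m3fnuz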

return "float8_e4m3fn"
return "float8_e4m3fnuz"


def select_torch_fp8_e4m3_dtype() -> torch.dtype:
dtype_name = select_fp8_e4m3_dtype()
torch_dtype = getattr(torch, dtype_name, None)
if torch_dtype is None:
raise RuntimeError(f"PyTorch does not expose dtype {dtype_name}")
return torch_dtype


def normalize_cutedsl_target(target: str | Target) -> Target | None:
if isinstance(target, Target):
if target.kind.name == "cuda" and "cutedsl" in target.keys: