From 041557be0bbf2faae89c58e3bde19a4dcc434441 Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Wed, 30 Oct 2024 04:34:18 +0000
Subject: [PATCH 1/2] add functionality test

---
 test/mlir/test/gc/Integration/op/binary.mlir | 57 ++++++++++++++
 test/mlir/test/gc/Integration/op/matmul.mlir | 79 ++++++++++++++++++++
 test/mlir/test/gc/Integration/op/relu.mlir   | 77 +++++++++++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 test/mlir/test/gc/Integration/op/binary.mlir
 create mode 100644 test/mlir/test/gc/Integration/op/matmul.mlir
 create mode 100644 test/mlir/test/gc/Integration/op/relu.mlir

diff --git a/test/mlir/test/gc/Integration/op/binary.mlir b/test/mlir/test/gc/Integration/op/binary.mlir
new file mode 100644
index 00000000..fc67a7d5
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/binary.mlir
@@ -0,0 +1,57 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @multiply(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.mul ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @add(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.add ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @subtract(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.sub ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @divide(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.div ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
diff --git a/test/mlir/test/gc/Integration/op/matmul.mlir b/test/mlir/test/gc/Integration/op/matmul.mlir
new file mode 100644
index 00000000..69816d32
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/matmul.mlir
@@ -0,0 +1,79 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @matmul_f16(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf16>
+    %2 = tensor.empty() : tensor<4096x4096xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf16>, tensor<4096x4096xf16>) outs(%3 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf16>, memref<4096x4096xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @corner_shape_matmul_f16(%arg0: memref<521x521xf16>, %arg1: memref<521x521xf16>, %arg2: memref<521x521xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<521x521xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<521x521xf16>
+    %2 = tensor.empty() : tensor<521x521xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<521x521xf16>) -> tensor<521x521xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<521x521xf16>, tensor<521x521xf16>) outs(%3 : tensor<521x521xf16>) -> tensor<521x521xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<521x521xf16>, memref<521x521xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @dynamic_matmul_f16(%arg0: memref<?x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<?x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x1024xf16>
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %0, %c0 : tensor<?x1024xf16>
+    %c1 = arith.constant 1 : index
+    %dim_0 = tensor.dim %0, %c1 : tensor<?x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = tensor.empty(%dim) : tensor<?x1024xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<?x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<?x1024xf16>, memref<?x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @matmul_bf16(%arg0: memref<4096x4096xbf16>, %arg1: memref<4096x4096xbf16>, %arg2: memref<4096x4096xbf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xbf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xbf16>
+    %2 = tensor.empty() : tensor<4096x4096xbf16>
+    %cst = arith.constant 0.000000e+00 : bf16
+    %3 = linalg.fill ins(%cst : bf16) outs(%2 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xbf16>, tensor<4096x4096xbf16>) outs(%3 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xbf16>, memref<4096x4096xbf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @matmul_f32(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf32>
+    %2 = tensor.empty() : tensor<4096x4096xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf32>, tensor<4096x4096xf32>) outs(%3 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf32>, memref<4096x4096xf32>) -> ()
+    return
+  }
+}
\ No newline at end of file
diff --git a/test/mlir/test/gc/Integration/op/relu.mlir b/test/mlir/test/gc/Integration/op/relu.mlir
new file mode 100644
index 00000000..76dd17ef
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/relu.mlir
@@ -0,0 +1,77 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_f16(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = tensor.empty() : tensor<1024x1024xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %2 = linalg.fill ins(%cst : f16) outs(%1 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%1 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @dynamic_relu(%arg0: memref<?x?xf16>, %arg1: memref<?x?xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf16>
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %0, %c0 : tensor<?x?xf16>
+    %c1 = arith.constant 1 : index
+    %dim_0 = tensor.dim %0, %c1 : tensor<?x?xf16>
+    %1 = tensor.empty(%dim, %dim_0) : tensor<?x?xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %2 = linalg.fill ins(%cst : f16) outs(%1 : tensor<?x?xf16>) -> tensor<?x?xf16>
+    %3 = linalg.max ins(%0, %2 : tensor<?x?xf16>, tensor<?x?xf16>) outs(%1 : tensor<?x?xf16>) -> tensor<?x?xf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<?x?xf16>, memref<?x?xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_bf16(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xbf16>
+    %1 = tensor.empty() : tensor<1024x1024xbf16>
+    %cst = arith.constant 0.000000e+00 : bf16
+    %2 = linalg.fill ins(%cst : bf16) outs(%1 : tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xbf16>, tensor<1024x1024xbf16>) outs(%1 : tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xbf16>, memref<1024x1024xbf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_f32(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf32>
+    %1 = tensor.empty() : tensor<1024x1024xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xf32>, memref<1024x1024xf32>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_f32_corner_shape(%arg0: memref<1061x1061xf32>, %arg1: memref<1061x1061xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1061x1061xf32>
+    %1 = tensor.empty() : tensor<1061x1061xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1061x1061xf32>) -> tensor<1061x1061xf32>
+    %3 = linalg.max ins(%0, %2 : tensor<1061x1061xf32>, tensor<1061x1061xf32>) outs(%1 : tensor<1061x1061xf32>) -> tensor<1061x1061xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1061x1061xf32>, memref<1061x1061xf32>) -> ()
+    return
+  }
+}
+

From 0cb9c8787f7d2409fe4f91ddeadf3db2855157d2 Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Mon, 11 Nov 2024 07:10:41 +0000
Subject: [PATCH 2/2] update

---
 test/mlir/test/gc/Integration/op/binary.mlir |  8 ++++----
 test/mlir/test/gc/Integration/op/matmul.mlir | 12 ++++++------
 test/mlir/test/gc/Integration/op/relu.mlir   | 11 +++++------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/test/mlir/test/gc/Integration/op/binary.mlir b/test/mlir/test/gc/Integration/op/binary.mlir
index fc67a7d5..536de72f 100644
--- a/test/mlir/test/gc/Integration/op/binary.mlir
+++ b/test/mlir/test/gc/Integration/op/binary.mlir
@@ -1,7 +1,7 @@
 // RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
 
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @multiply(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
@@ -16,7 +16,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 // -----
 
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @add(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
@@ -30,7 +30,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @subtract(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
@@ -44,7 +44,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @divide(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
diff --git a/test/mlir/test/gc/Integration/op/matmul.mlir b/test/mlir/test/gc/Integration/op/matmul.mlir
index 69816d32..ed0c2391 100644
--- a/test/mlir/test/gc/Integration/op/matmul.mlir
+++ b/test/mlir/test/gc/Integration/op/matmul.mlir
@@ -1,7 +1,7 @@
 // RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
 
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @matmul_f16(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf16>
@@ -16,7 +16,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @corner_shape_matmul_f16(%arg0: memref<521x521xf16>, %arg1: memref<521x521xf16>, %arg2: memref<521x521xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<521x521xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<521x521xf16>
@@ -31,7 +31,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>}{
   func.func @dynamic_matmul_f16(%arg0: memref<?x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<?x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<?x1024xf16>
     %c0 = arith.constant 0 : index
@@ -50,7 +50,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @matmul_bf16(%arg0: memref<4096x4096xbf16>, %arg1: memref<4096x4096xbf16>, %arg2: memref<4096x4096xbf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xbf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xbf16>
@@ -65,7 +65,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @matmul_f32(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf32>
     %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf32>
@@ -76,4 +76,4 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf32>, memref<4096x4096xf32>) -> ()
     return
   }
-}
\ No newline at end of file
+}
diff --git a/test/mlir/test/gc/Integration/op/relu.mlir b/test/mlir/test/gc/Integration/op/relu.mlir
index 76dd17ef..d49f2f41 100644
--- a/test/mlir/test/gc/Integration/op/relu.mlir
+++ b/test/mlir/test/gc/Integration/op/relu.mlir
@@ -3,7 +3,7 @@
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_f16(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = tensor.empty() : tensor<1024x1024xf16>
@@ -17,7 +17,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @dynamic_relu(%arg0: memref<?x?xf16>, %arg1: memref<?x?xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf16>
     %c0 = arith.constant 0 : index
@@ -35,7 +35,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_bf16(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xbf16>
     %1 = tensor.empty() : tensor<1024x1024xbf16>
@@ -49,7 +49,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_f32(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf32>
     %1 = tensor.empty() : tensor<1024x1024xf32>
@@ -63,7 +63,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_f32_corner_shape(%arg0: memref<1061x1061xf32>, %arg1: memref<1061x1061xf32>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1061x1061xf32>
     %1 = tensor.empty() : tensor<1061x1061xf32>
@@ -74,4 +74,3 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
     return
   }
 }
-