From 041557be0bbf2faae89c58e3bde19a4dcc434441 Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Wed, 30 Oct 2024 04:34:18 +0000
Subject: [PATCH 1/2] add functionality test

---
 test/mlir/test/gc/Integration/op/binary.mlir | 57 ++++++++++++++
 test/mlir/test/gc/Integration/op/matmul.mlir | 79 ++++++++++++++++++++
 test/mlir/test/gc/Integration/op/relu.mlir   | 77 +++++++++++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 test/mlir/test/gc/Integration/op/binary.mlir
 create mode 100644 test/mlir/test/gc/Integration/op/matmul.mlir
 create mode 100644 test/mlir/test/gc/Integration/op/relu.mlir

diff --git a/test/mlir/test/gc/Integration/op/binary.mlir b/test/mlir/test/gc/Integration/op/binary.mlir
new file mode 100644
index 00000000..fc67a7d5
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/binary.mlir
@@ -0,0 +1,57 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @multiply(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.mul ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @add(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.add ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @subtract(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.sub ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @divide(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.div ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
diff --git a/test/mlir/test/gc/Integration/op/matmul.mlir b/test/mlir/test/gc/Integration/op/matmul.mlir
new file mode 100644
index 00000000..69816d32
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/matmul.mlir
@@ -0,0 +1,79 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @matmul_f16(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf16>
+    %2 = tensor.empty() : tensor<4096x4096xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf16>, tensor<4096x4096xf16>) outs(%3 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf16>, memref<4096x4096xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @corner_shape_matmul_f16(%arg0: memref<521x521xf16>, %arg1: memref<521x521xf16>, %arg2: memref<521x521xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<521x521xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<521x521xf16>
+    %2 = tensor.empty() : tensor<521x521xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<521x521xf16>) -> tensor<521x521xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<521x521xf16>, tensor<521x521xf16>) outs(%3 : tensor<521x521xf16>) -> tensor<521x521xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<521x521xf16>, memref<521x521xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @dynamic_matmul_f16(%arg0: memref<?x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<?x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x1024xf16>
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %0, %c0 : tensor<?x1024xf16>
+    %c1 = arith.constant 1 : index
+    %dim_0 = tensor.dim %0, %c1 : tensor<?x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = tensor.empty(%dim) : tensor<?x1024xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<?x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<?x1024xf16>, memref<?x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @matmul_bf16(%arg0: memref<4096x4096xbf16>, %arg1: memref<4096x4096xbf16>, %arg2: memref<4096x4096xbf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xbf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xbf16>
+    %2 = tensor.empty() : tensor<4096x4096xbf16>
+    %cst = arith.constant 0.000000e+00 : bf16
+    %3 = linalg.fill ins(%cst : bf16) outs(%2 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xbf16>, tensor<4096x4096xbf16>) outs(%3 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xbf16>, memref<4096x4096xbf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @matmul_f32(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf32>
+    %2 = tensor.empty() : tensor<4096x4096xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf32>, tensor<4096x4096xf32>) outs(%3 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf32>, memref<4096x4096xf32>) -> ()
+    return
+  }
+}
\ No newline at end of file
diff --git a/test/mlir/test/gc/Integration/op/relu.mlir b/test/mlir/test/gc/Integration/op/relu.mlir
new file mode 100644
index 00000000..76dd17ef
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/relu.mlir
@@ -0,0 +1,77 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_f16(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = tensor.empty() : tensor<1024x1024xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %2 = linalg.fill ins(%cst : f16) outs(%1 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%1 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @dynamic_relu(%arg0: memref<?x?xf16>, %arg1: memref<?x?xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf16>
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %0, %c0 : tensor<?x?xf16>
+    %c1 = arith.constant 1 : index
+    %dim_0 = tensor.dim %0, %c1 : tensor<?x?xf16>
+    %1 = tensor.empty(%dim, %dim_0) : tensor<?x?xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %2 = linalg.fill ins(%cst : f16) outs(%1 : tensor<?x?xf16>) -> tensor<?x?xf16>
+    %3 = linalg.max ins(%0, %2 : tensor<?x?xf16>, tensor<?x?xf16>) outs(%1 : tensor<?x?xf16>) -> tensor<?x?xf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<?x?xf16>, memref<?x?xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_bf16(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xbf16>
+    %1 = tensor.empty() : tensor<1024x1024xbf16>
+    %cst = arith.constant 0.000000e+00 : bf16
+    %2 = linalg.fill ins(%cst : bf16) outs(%1 : tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xbf16>, tensor<1024x1024xbf16>) outs(%1 : tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xbf16>, memref<1024x1024xbf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_f32(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf32>
+    %1 = tensor.empty() : tensor<1024x1024xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xf32>, memref<1024x1024xf32>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @relu_f32_corner_shape(%arg0: memref<1061x1061xf32>, %arg1: memref<1061x1061xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1061x1061xf32>
+    %1 = tensor.empty() : tensor<1061x1061xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1061x1061xf32>) -> tensor<1061x1061xf32>
+    %3 = linalg.max ins(%0, %2 : tensor<1061x1061xf32>, tensor<1061x1061xf32>) outs(%1 : tensor<1061x1061xf32>) -> tensor<1061x1061xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1061x1061xf32>, memref<1061x1061xf32>) -> ()
+    return
+  }
+}
+

From 0cb9c8787f7d2409fe4f91ddeadf3db2855157d2 Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Mon, 11 Nov 2024 07:10:41 +0000
Subject: [PATCH 2/2] update

---
 test/mlir/test/gc/Integration/op/binary.mlir |  8 ++++----
 test/mlir/test/gc/Integration/op/matmul.mlir | 12 ++++++------
 test/mlir/test/gc/Integration/op/relu.mlir   | 11 +++++------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/test/mlir/test/gc/Integration/op/binary.mlir b/test/mlir/test/gc/Integration/op/binary.mlir
index fc67a7d5..536de72f 100644
--- a/test/mlir/test/gc/Integration/op/binary.mlir
+++ b/test/mlir/test/gc/Integration/op/binary.mlir
@@ -1,7 +1,7 @@
 // RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
 
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @multiply(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
@@ -16,7 +16,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 // -----
 
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @add(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
@@ -30,7 +30,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @subtract(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
@@ -44,7 +44,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @divide(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
diff --git a/test/mlir/test/gc/Integration/op/matmul.mlir b/test/mlir/test/gc/Integration/op/matmul.mlir
index 69816d32..ed0c2391 100644
--- a/test/mlir/test/gc/Integration/op/matmul.mlir
+++ b/test/mlir/test/gc/Integration/op/matmul.mlir
@@ -1,7 +1,7 @@
 // RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
 
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @matmul_f16(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf16>
@@ -16,7 +16,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @corner_shape_matmul_f16(%arg0: memref<521x521xf16>, %arg1: memref<521x521xf16>, %arg2: memref<521x521xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<521x521xf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<521x521xf16>
@@ -31,7 +31,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>}{
   func.func @dynamic_matmul_f16(%arg0: memref<?x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<?x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<?x1024xf16>
     %c0 = arith.constant 0 : index
@@ -50,7 +50,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @matmul_bf16(%arg0: memref<4096x4096xbf16>, %arg1: memref<4096x4096xbf16>, %arg2: memref<4096x4096xbf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xbf16>
     %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xbf16>
@@ -65,7 +65,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @matmul_f32(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf32>
     %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf32>
@@ -76,4 +76,4 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
     bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf32>, memref<4096x4096xf32>) -> ()
     return
   }
-}
\ No newline at end of file
+}
diff --git a/test/mlir/test/gc/Integration/op/relu.mlir b/test/mlir/test/gc/Integration/op/relu.mlir
index 76dd17ef..d49f2f41 100644
--- a/test/mlir/test/gc/Integration/op/relu.mlir
+++ b/test/mlir/test/gc/Integration/op/relu.mlir
@@ -3,7 +3,7 @@
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_f16(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
     %1 = tensor.empty() : tensor<1024x1024xf16>
@@ -17,7 +17,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @dynamic_relu(%arg0: memref<?x?xf16>, %arg1: memref<?x?xf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf16>
     %c0 = arith.constant 0 : index
@@ -35,7 +35,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_bf16(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xbf16>
     %1 = tensor.empty() : tensor<1024x1024xbf16>
@@ -49,7 +49,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_f32(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf32>
     %1 = tensor.empty() : tensor<1024x1024xf32>
@@ -63,7 +63,7 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
 
 // -----
 // CHECK-LABEL: llvm
-module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
   func.func @relu_f32_corner_shape(%arg0: memref<1061x1061xf32>, %arg1: memref<1061x1061xf32>) {
     %0 = bufferization.to_tensor %arg0 restrict : memref<1061x1061xf32>
     %1 = tensor.empty() : tensor<1061x1061xf32>
@@ -74,4 +74,3 @@ module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"C
     return
   }
 }
-