diff --git a/test/mlir/test/gc/Integration/op/binary.mlir b/test/mlir/test/gc/Integration/op/binary.mlir
new file mode 100644
index 00000000..536de72f
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/binary.mlir
@@ -0,0 +1,57 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @multiply(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.mul ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @add(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.add ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @subtract(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.sub ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @divide(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf16>, %arg3: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = bufferization.to_tensor %arg2 restrict : memref<1024x1024xf16>
+    %3 = tensor.empty() : tensor<1024x1024xf16>
+    %4 = linalg.div ins(%0, %1 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg3 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
diff --git a/test/mlir/test/gc/Integration/op/matmul.mlir b/test/mlir/test/gc/Integration/op/matmul.mlir
new file mode 100644
index 00000000..ed0c2391
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/matmul.mlir
@@ -0,0 +1,79 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @matmul_f16(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf16>
+    %2 = tensor.empty() : tensor<4096x4096xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf16>, tensor<4096x4096xf16>) outs(%3 : tensor<4096x4096xf16>) -> tensor<4096x4096xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf16>, memref<4096x4096xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @corner_shape_matmul_f16(%arg0: memref<521x521xf16>, %arg1: memref<521x521xf16>, %arg2: memref<521x521xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<521x521xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<521x521xf16>
+    %2 = tensor.empty() : tensor<521x521xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<521x521xf16>) -> tensor<521x521xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<521x521xf16>, tensor<521x521xf16>) outs(%3 : tensor<521x521xf16>) -> tensor<521x521xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<521x521xf16>, memref<521x521xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>}{
+  func.func @dynamic_matmul_f16(%arg0: memref<?x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<?x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x1024xf16>
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %0, %c0 : tensor<?x1024xf16>
+    %c1 = arith.constant 1 : index
+    %dim_0 = tensor.dim %0, %c1 : tensor<?x1024xf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
+    %2 = tensor.empty(%dim) : tensor<?x1024xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %3 = linalg.fill ins(%cst : f16) outs(%2 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<?x1024xf16>, tensor<1024x1024xf16>) outs(%3 : tensor<?x1024xf16>) -> tensor<?x1024xf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<?x1024xf16>, memref<?x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @matmul_bf16(%arg0: memref<4096x4096xbf16>, %arg1: memref<4096x4096xbf16>, %arg2: memref<4096x4096xbf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xbf16>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xbf16>
+    %2 = tensor.empty() : tensor<4096x4096xbf16>
+    %cst = arith.constant 0.000000e+00 : bf16
+    %3 = linalg.fill ins(%cst : bf16) outs(%2 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xbf16>, tensor<4096x4096xbf16>) outs(%3 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xbf16>, memref<4096x4096xbf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @matmul_f32(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<4096x4096xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<4096x4096xf32>
+    %2 = tensor.empty() : tensor<4096x4096xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<4096x4096xf32>, tensor<4096x4096xf32>) outs(%3 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+    bufferization.materialize_in_destination %4 in restrict writable %arg2 : (tensor<4096x4096xf32>, memref<4096x4096xf32>) -> ()
+    return
+  }
+}
diff --git a/test/mlir/test/gc/Integration/op/relu.mlir b/test/mlir/test/gc/Integration/op/relu.mlir
new file mode 100644
index 00000000..d49f2f41
--- /dev/null
+++ b/test/mlir/test/gc/Integration/op/relu.mlir
@@ -0,0 +1,76 @@
+// RUN: gc-opt %s --gc-gpu-pipeline -split-input-file | FileCheck %s
+
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @relu_f16(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf16>
+    %1 = tensor.empty() : tensor<1024x1024xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %2 = linalg.fill ins(%cst : f16) outs(%1 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%1 : tensor<1024x1024xf16>) -> tensor<1024x1024xf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xf16>, memref<1024x1024xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @dynamic_relu(%arg0: memref<?x?xf16>, %arg1: memref<?x?xf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf16>
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %0, %c0 : tensor<?x?xf16>
+    %c1 = arith.constant 1 : index
+    %dim_0 = tensor.dim %0, %c1 : tensor<?x?xf16>
+    %1 = tensor.empty(%dim, %dim_0) : tensor<?x?xf16>
+    %cst = arith.constant 0.000000e+00 : f16
+    %2 = linalg.fill ins(%cst : f16) outs(%1 : tensor<?x?xf16>) -> tensor<?x?xf16>
+    %3 = linalg.max ins(%0, %2 : tensor<?x?xf16>, tensor<?x?xf16>) outs(%1 : tensor<?x?xf16>) -> tensor<?x?xf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<?x?xf16>, memref<?x?xf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @relu_bf16(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xbf16>
+    %1 = tensor.empty() : tensor<1024x1024xbf16>
+    %cst = arith.constant 0.000000e+00 : bf16
+    %2 = linalg.fill ins(%cst : bf16) outs(%1 : tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xbf16>, tensor<1024x1024xbf16>) outs(%1 : tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xbf16>, memref<1024x1024xbf16>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @relu_f32(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1024x1024xf32>
+    %1 = tensor.empty() : tensor<1024x1024xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+    %3 = linalg.max ins(%0, %2 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1024x1024xf32>, memref<1024x1024xf32>) -> ()
+    return
+  }
+}
+
+// -----
+// CHECK-LABEL: llvm
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"num_exec_units", 448 : i32>, #dlti.dl_entry<"num_exec_units_per_slice", 32 : i32>, #dlti.dl_entry<"num_threads_per_eu", 8 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 67108864 : i32>, #dlti.dl_entry<"max_vector_op_width", 256 : i32>, #dlti.dl_entry<"max_work_group_size", 1024 : i32>>>} {
+  func.func @relu_f32_corner_shape(%arg0: memref<1061x1061xf32>, %arg1: memref<1061x1061xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<1061x1061xf32>
+    %1 = tensor.empty() : tensor<1061x1061xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1061x1061xf32>) -> tensor<1061x1061xf32>
+    %3 = linalg.max ins(%0, %2 : tensor<1061x1061xf32>, tensor<1061x1061xf32>) outs(%1 : tensor<1061x1061xf32>) -> tensor<1061x1061xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg1 : (tensor<1061x1061xf32>, memref<1061x1061xf32>) -> ()
+    return
+  }
+}