diff --git a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
new file mode 100644
index 000000000..b01b9f2c6
--- /dev/null
+++ b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -0,0 +1,29 @@
+//===-- GpuOclRuntime.h - GPU OpenCL runtime --------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_GPUOCLRUNTIME_H
+#define GC_GPUOCLRUNTIME_H
+
+namespace mlir::gc::gpu {
+constexpr char GPU_OCL_MALLOC[] = "gcGpuOclMalloc";
+constexpr char GPU_OCL_DEALLOC[] = "gcGpuOclDealloc";
+constexpr char GPU_OCL_MEMCPY[] = "gcGpuOclMemcpy";
+constexpr char GPU_OCL_KERNEL_CREATE[] = "gcGpuOclKernelCreate";
+constexpr char GPU_OCL_KERNEL_DESTROY[] = "gcGpuOclKernelDestroy";
+constexpr char GPU_OCL_KERNEL_LAUNCH[] = "gcGpuOclKernelLaunch";
+constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
+} // namespace mlir::gc::gpu
+
+#ifndef GC_GPU_OCL_CONST_ONLY
+
+// TBD
+
+#else
+#undef GC_GPU_OCL_CONST_ONLY
+#endif
+#endif
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index 5151a0335..2ddf0a06e 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -93,6 +93,20 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
                            "DPAS register block sizes MxNxK">,
   ];
 }
+
+def AddContextArg : Pass<"add-ctx-arg", "func::FuncOp"> {
+  let summary = "Add a context argument.";
+  let description = [{
+    Add a new memref argument to the function, which can be used to pass some context.
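+
+    For example, a function may be rewritten as follows (an illustrative
+    sketch only; the function and argument names below are hypothetical):
+
+    ```mlir
+    // Before:
+    func.func @example(%arg0: memref<32x32xf32>) { ... }
+    // After: a zero-rank memref<i8> context argument is appended.
+    func.func @example(%arg0: memref<32x32xf32>, %ctx: memref<i8>) { ... }
+    ```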
+  }];
+}
+
+def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
+  let summary = "Convert the GPU operations to GpuOclRuntime calls.";
+  let description = [{
+    Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
+  }];
+}
 #endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
diff --git a/lib/gc/Transforms/GPU/AddContextArg.cpp b/lib/gc/Transforms/GPU/AddContextArg.cpp
new file mode 100644
index 000000000..d731fbb62
--- /dev/null
+++ b/lib/gc/Transforms/GPU/AddContextArg.cpp
@@ -0,0 +1,45 @@
+//===-- AddContextArg.cpp - Add context argument ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+
+namespace mlir::gc {
+#define GEN_PASS_DECL_ADDCONTEXTARG
+#define GEN_PASS_DEF_ADDCONTEXTARG
+#include "gc/Transforms/Passes.h.inc"
+} // namespace mlir::gc
+
+using namespace mlir;
+
+namespace {
+struct AddContextArg final : gc::impl::AddContextArgBase<AddContextArg> {
+  void runOnOperation() override {
+    auto func = getOperation();
+    auto funcType = func.getFunctionType();
+    auto argTypes = llvm::to_vector<8>(funcType.getInputs());
+    auto resultTypes = llvm::to_vector<1>(funcType.getResults());
+    auto ctx = func->getContext();
+    auto newArgType = MemRefType::get({}, IntegerType::get(ctx, 8));
+    argTypes.emplace_back(newArgType);
+    auto newFuncType = FunctionType::get(ctx, argTypes, resultTypes);
+    func.setType(newFuncType);
+
+    if (func.getBody().hasOneBlock()) {
+      func.getBody().front().addArgument(newArgType, func.getLoc());
+    }
+
+    // Find all function calls and append the last argument of the current
+    // function to the call.
+    func.walk([&](func::CallOp call) {
+      auto args = llvm::to_vector<8>(call.getOperands());
+      args.emplace_back(func.getArgument(func.getNumArguments() - 1));
+      call->setOperands(args);
+    });
+  }
+};
+} // namespace
diff --git a/lib/gc/Transforms/GPU/CMakeLists.txt b/lib/gc/Transforms/GPU/CMakeLists.txt
index 13f9c2981..3909681e3 100644
--- a/lib/gc/Transforms/GPU/CMakeLists.txt
+++ b/lib/gc/Transforms/GPU/CMakeLists.txt
@@ -1,4 +1,6 @@
 gc_add_mlir_library(GcGpuPasses
+  AddContextArg.cpp
+  GpuToGpuOcl.cpp
   LinalgToXeGPU.cpp
   Pipeline.cpp
 
diff --git a/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
new file mode 100644
index 000000000..dfcd1daba
--- /dev/null
+++ b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
@@ -0,0 +1,541 @@
+//===-- GpuToGpuOcl.cpp - GpuToGpuOcl path ----------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <unordered_set>
+
+#define GC_GPU_OCL_CONST_ONLY
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+using namespace mlir;
+using namespace mlir::gc::gpu;
+
+namespace mlir::gc {
+#define GEN_PASS_DECL_GPUTOGPUOCL
+#define GEN_PASS_DEF_GPUTOGPUOCL
+#include "gc/Transforms/Passes.h.inc"
+} // namespace mlir::gc
+
+namespace {
+LLVM::CallOp funcCall(OpBuilder &builder, const StringRef name,
+                      const Type returnType, const ArrayRef<Type> argTypes,
+                      const Location loc, const ArrayRef<Value> arguments,
+                      bool isVarArg = false) {
+  auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
+  auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(name);
+  if (!function) {
+    auto type = LLVM::LLVMFunctionType::get(returnType, argTypes, isVarArg);
+    function = OpBuilder::atBlockEnd(module.getBody())
+                   .create<LLVM::LLVMFuncOp>(loc, name, type);
+  }
+  return builder.create<LLVM::CallOp>(loc, function, arguments);
+}
+
+// Assuming that the pointer to the context is passed as the last argument
+// of the current function of type memref with zero dims. When lowering
+// to LLVM, the memref arg is replaced with 3 args of types ptr, ptr, i64.
+// Returning the first one.
+Value getCtxPtr(const OpBuilder &rewriter) {
+  auto func =
+      rewriter.getBlock()->getParent()->getParentOfType<LLVM::LLVMFuncOp>();
+  return func.getArgument(func.getNumArguments() - 3);
+}
+
+struct Helper final {
+  LLVMTypeConverter &converter;
+  Type voidType;
+  Type ptrType;
+  Type idxType;
+  mutable std::unordered_set<std::string> kernelNames;
+
+  explicit Helper(MLIRContext *ctx, LLVMTypeConverter &converter)
+      : converter(converter), voidType(LLVM::LLVMVoidType::get(ctx)),
+        ptrType(LLVM::LLVMPointerType::get(ctx)),
+        idxType(IntegerType::get(ctx, converter.getPointerBitwidth())) {}
+
+  Value idxConstant(OpBuilder &rewriter, const Location loc,
+                    size_t value) const {
+    return rewriter.create<LLVM::ConstantOp>(
+        loc, idxType,
+        rewriter.getIntegerAttr(idxType, static_cast<int64_t>(value)));
+  }
+
+  void destroyKernels(OpBuilder &rewriter, Location loc,
+                      ArrayRef<Value> kernelPtrs) const {
+    auto size = idxConstant(rewriter, loc, kernelPtrs.size());
+    auto kernelPtrsArray =
+        rewriter.create<LLVM::AllocaOp>(loc, ptrType, ptrType, size);
+    for (size_t i = 0, n = kernelPtrs.size(); i < n; i++) {
+      auto elementPtr =
+          rewriter.create<LLVM::GEPOp>(loc, ptrType, ptrType, kernelPtrsArray,
+                                       idxConstant(rewriter, loc, i));
+      rewriter.create<LLVM::StoreOp>(loc, kernelPtrs[i], elementPtr);
+    }
+
+    funcCall(rewriter, GPU_OCL_KERNEL_DESTROY, voidType, {idxType, ptrType},
+             loc, {size, kernelPtrsArray});
+  }
+};
+
+template <typename SourceOp>
+struct ConvertOpPattern : ConvertOpToLLVMPattern<SourceOp> {
+  const Helper &helper;
+
+  explicit ConvertOpPattern(const Helper &helper)
+      : ConvertOpToLLVMPattern<SourceOp>(helper.converter), helper(helper) {}
+};
+
+struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
+  explicit ConvertAlloc(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::AllocOp allocOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = allocOp.getLoc();
+    MemRefType type = allocOp.getType();
+    auto shape = type.getShape();
+    auto dynamics = adaptor.getDynamicSizes();
+
+    if (shape.empty() || dynamics.empty()) {
+      int64_t staticSize;
+      if (shape.empty()) {
+        staticSize = 0;
+      } else {
+        staticSize = type.getElementType().getIntOrFloatBitWidth() / 8;
+        for (auto dim : shape) {
+          assert(dim != ShapedType::kDynamic);
+          staticSize *= dim;
+        }
+      }
+      auto size = helper.idxConstant(rewriter, loc, staticSize);
+      auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+                          {helper.ptrType, helper.idxType}, loc,
+                          {getCtxPtr(rewriter), size})
+                     .getResult();
+      Value replacement = MemRefDescriptor::fromStaticShape(
+          rewriter, loc, helper.converter, type, ptr, ptr);
+      rewriter.replaceOp(allocOp, replacement);
+      return success();
+    }
+
+    auto ndims = shape.size();
+    SmallVector<Value> newShape;
+    SmallVector<Value> newStrides(ndims);
+    auto staticSize = type.getElementType().getIntOrFloatBitWidth() / 8;
+    auto size = dynamics[0];
+
+    auto idxMul = [&](Value x, Value y) -> Value {
+      if (auto xConst = getConstantIntValue(x)) {
+        if (auto yConst = getConstantIntValue(y)) {
+          return helper.idxConstant(rewriter, loc,
+                                    xConst.value() * yConst.value());
+        }
+      }
+      return rewriter.create<LLVM::MulOp>(loc, x, y);
+    };
+
+    for (size_t i = 0, j = 0; i < ndims; i++) {
+      auto dim = shape[i];
+      if (dim == ShapedType::kDynamic) {
+        auto dynSize = dynamics[j++];
+        newShape.emplace_back(dynSize);
+        if (j != 1) {
+          size = idxMul(size, dynSize);
+        }
+      } else {
+        staticSize *= dim;
+        newShape.emplace_back(helper.idxConstant(rewriter, loc, dim));
+      }
+    }
+
+    size = idxMul(size, helper.idxConstant(rewriter, loc, staticSize));
+    auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+                        {helper.ptrType, helper.idxType}, loc,
+                        {getCtxPtr(rewriter), size})
+                   .getResult();
+
+    // Row-major strides: the innermost stride is 1, each outer stride is the
+    // product of the inner stride and the next inner dimension.
+    newStrides[ndims - 1] = helper.idxConstant(rewriter, loc, 1);
+    for (int i = static_cast<int>(ndims) - 2; i >= 0; i--) {
+      newStrides[i] = idxMul(newStrides[i + 1], newShape[i + 1]);
+    }
+
+    auto dsc = MemRefDescriptor::undef(rewriter, loc,
+                                       helper.converter.convertType(type));
+    dsc.setAllocatedPtr(rewriter, loc, ptr);
+    dsc.setAlignedPtr(rewriter, loc, ptr);
+    dsc.setOffset(rewriter, loc, helper.idxConstant(rewriter, loc, 0));
+
+    for (unsigned i = 0, n = static_cast<unsigned>(ndims); i < n; i++) {
+      dsc.setSize(rewriter, loc, i, newShape[i]);
+      dsc.setStride(rewriter, loc, i, newStrides[i]);
+    }
+
+    rewriter.replaceOp(allocOp, static_cast<Value>(dsc));
+    return success();
+  }
+};
+
+struct ConvertDealloc final : ConvertOpPattern<gpu::DeallocOp> {
+  explicit ConvertDealloc(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::DeallocOp gpuDealloc, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = gpuDealloc.getLoc();
+    MemRefDescriptor dsc(adaptor.getMemref());
+    auto ptr = dsc.allocatedPtr(rewriter, loc);
+    auto oclDealloc = funcCall(rewriter, GPU_OCL_DEALLOC, helper.voidType,
+                               {helper.ptrType, helper.ptrType}, loc,
+                               {getCtxPtr(rewriter), ptr});
+    rewriter.replaceOp(gpuDealloc, oclDealloc);
+    return success();
+  }
+};
+
+struct ConvertMemcpy final : ConvertOpPattern<gpu::MemcpyOp> {
+  explicit ConvertMemcpy(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::MemcpyOp gpuMemcpy, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = gpuMemcpy.getLoc();
+    auto srcType = gpuMemcpy.getSrc().getType();
+    auto elementSize = srcType.getElementType().getIntOrFloatBitWidth() / 8;
+    uint64_t numElements = 0;
+    for (auto dim : srcType.getShape()) {
+      if (dim == ShapedType::kDynamic) {
+        gpuMemcpy.emitOpError()
+            << "dynamic shapes are not currently supported";
+        return failure();
+      }
+      numElements = numElements ? numElements * dim : dim;
+    }
+
+    MemRefDescriptor srcDsc(adaptor.getSrc());
+    MemRefDescriptor dstDsc(adaptor.getDst());
+    auto srcPtr = srcDsc.alignedPtr(rewriter, loc);
+    auto dstPtr = dstDsc.alignedPtr(rewriter, loc);
+    auto size = helper.idxConstant(rewriter, loc, elementSize * numElements);
+    auto oclMemcpy = funcCall(
+        rewriter, GPU_OCL_MEMCPY, helper.voidType,
+        {helper.ptrType, helper.ptrType, helper.ptrType, helper.idxType}, loc,
+        {getCtxPtr(rewriter), srcPtr, dstPtr, size});
+    rewriter.replaceOp(gpuMemcpy, oclMemcpy);
+    return success();
+  }
+};
+
+struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
+
+  explicit ConvertLaunch(const Helper &helper) : ConvertOpPattern(helper) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::LaunchFuncOp gpuLaunch, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto kernelPtr = getKernel(gpuLaunch, adaptor, rewriter);
+    if (!kernelPtr) {
+      return failure();
+    }
+
+    const Location loc = gpuLaunch.getLoc();
+    auto kernelArgs = adaptor.getKernelOperands();
+    SmallVector<Value> args;
+    args.reserve(kernelArgs.size() + 2);
+    args.emplace_back(getCtxPtr(rewriter));
+    args.emplace_back(kernelPtr.value());
+
+    int i = 0;
+    for (auto arg : kernelArgs) {
+      if (auto type = gpuLaunch.getKernelOperand(i++).getType();
+          isa<MemRefType>(type)) {
+        MemRefDescriptor desc(arg);
+        args.emplace_back(desc.alignedPtr(rewriter, loc));
+      } else {
+        // Store the arg on the stack and pass the pointer
+        auto ptr = rewriter.create<LLVM::AllocaOp>(
+            loc, helper.ptrType, typeConverter->convertType(type),
+            helper.idxConstant(rewriter, loc, 1));
+        rewriter.create<LLVM::StoreOp>(loc, arg, ptr);
+        args.emplace_back(ptr);
+      }
+    }
+
+    const auto gpuOclLaunch =
+        funcCall(rewriter, GPU_OCL_KERNEL_LAUNCH, helper.voidType,
+                 {helper.ptrType, helper.ptrType}, loc, args, true);
+    rewriter.replaceOp(gpuLaunch, gpuOclLaunch);
+    return success();
+  }
+
+private:
+  // Returns the kernel pointer stored in the global var ...name_Ptr.
+  // If it's NULL, calls the createKernel() function.
+  std::optional<Value> getKernel(gpu::LaunchFuncOp &gpuLaunch,
+                                 OpAdaptor &adaptor,
+                                 ConversionPatternRewriter &rewriter) const {
+    auto loc = gpuLaunch.getLoc();
+    auto ctx = getCtxPtr(rewriter);
+    auto mod = rewriter.getBlock()->getParent()->getParentOfType<ModuleOp>();
+    auto kernelModName = gpuLaunch.getKernelModuleName();
+    SmallString<128> getFuncName("getGcGpuOclKernel_");
+    getFuncName.append(kernelModName);
+
+    if (helper.kernelNames
+            .insert(std::string(kernelModName.begin(), kernelModName.end()))
+            .second) {
+      auto insPoint = rewriter.saveInsertionPoint();
+      SmallString<128> strBuf("gcGpuOclKernel_");
+      strBuf.append(kernelModName);
+      strBuf.append("_");
+      auto strBufStart = strBuf.size();
+      auto str = [&strBuf,
+                  strBufStart](const char *chars) -> SmallString<128> & {
+        strBuf.truncate(strBufStart);
+        strBuf.append(chars);
+        return strBuf;
+      };
+
+      SmallString<128> createFuncName("createGcGpuOclKernel_");
+      createFuncName.append(kernelModName);
+      if (!createKernel(gpuLaunch, adaptor, rewriter, loc, mod, createFuncName,
+                        str)) {
+        return std::nullopt;
+      }
+
+      auto function = rewriter.create<LLVM::LLVMFuncOp>(
+          loc, getFuncName,
+          LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
+          LLVM::Linkage::Internal);
+      function.setAlwaysInline(true);
+      rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));
+
+      auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
+      assert(ptr);
+      auto null = rewriter.create<LLVM::ZeroOp>(loc, helper.ptrType);
+      auto ptrPtr = rewriter.create<LLVM::AddressOfOp>(loc, ptr);
+      auto ptrVal = rewriter.create<LLVM::LoadOp>(loc, helper.ptrType, ptrPtr);
+      auto cmp = rewriter.create<LLVM::ICmpOp>(loc, LLVM::ICmpPredicate::eq,
+                                               ptrVal, null);
+
+      auto body = &function.getBody();
+      auto thenBlock = rewriter.createBlock(body);
+      auto elseBlock = rewriter.createBlock(body);
+      rewriter.setInsertionPointToEnd(&body->front());
+      rewriter.create<LLVM::CondBrOp>(loc, cmp, thenBlock, elseBlock);
+
+      // Then block
+      rewriter.setInsertionPointToStart(thenBlock);
+      auto result = funcCall(rewriter, createFuncName, helper.ptrType,
+                             {helper.ptrType}, loc, {function.getArgument(0)});
+      rewriter.create<LLVM::ReturnOp>(loc, result.getResult());
+
+      // Else block
+      rewriter.setInsertionPointToStart(elseBlock);
+      rewriter.create<LLVM::ReturnOp>(loc, ptrVal);
+
+      rewriter.restoreInsertionPoint(insPoint);
+    }
+
+    auto kernelFunc = mod.lookupSymbol<LLVM::LLVMFuncOp>(getFuncName);
+    if (!kernelFunc) {
+      gpuLaunch.emitOpError() << "Function " << getFuncName << " not found!";
+      return std::nullopt;
+    }
+    return rewriter.create<LLVM::CallOp>(loc, kernelFunc, ValueRange(ctx))
+        .getResult();
+  }
+
+  // Create a new kernel and save the pointer to the global variable
+  // ...name_Ptr.
+  bool createKernel(
+      gpu::LaunchFuncOp &gpuLaunch, OpAdaptor &adaptor,
+      ConversionPatternRewriter &rewriter, const Location &loc, ModuleOp &mod,
+      StringRef funcName,
+      const std::function<SmallString<128> &(const char *chars)> &str) const {
+    auto kernelModName = gpuLaunch.getKernelModuleName();
+    auto kernelMod = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
+        gpuLaunch, kernelModName);
+    if (!kernelMod) {
+      gpuLaunch.emitOpError() << "Module " << kernelModName << " not found!";
+      return false;
+    }
+    const auto binaryAttr = kernelMod->getAttrOfType<StringAttr>("gpu.binary");
+    if (!binaryAttr) {
+      kernelMod.emitOpError() << "missing 'gpu.binary' attribute";
+      return false;
+    }
+
+    rewriter.setInsertionPointToStart(mod.getBody());
+    // The kernel pointer is stored here
+    rewriter.create<LLVM::GlobalOp>(loc, helper.ptrType, /*isConstant=*/false,
+                                    LLVM::Linkage::Internal, str("Ptr"),
+                                    rewriter.getZeroAttr(helper.ptrType));
+    rewriter.eraseOp(kernelMod);
+
+    auto function = rewriter.create<LLVM::LLVMFuncOp>(
+        loc, funcName,
+        LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
+        LLVM::Linkage::Internal);
+    rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));
+
+    auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
+    assert(ptr);
+    SmallVector<char> nameChars(kernelModName.getValue().begin(),
+                                kernelModName.getValue().end());
+    nameChars.emplace_back('\0');
+    // Kernel name and SPIRV are stored as global strings
+    auto name = LLVM::createGlobalString(
+        loc, rewriter, str("Name"),
+        StringRef(nameChars.data(), nameChars.size()), LLVM::Linkage::Internal);
+    auto spirv = LLVM::createGlobalString(loc, rewriter, str("SPIRV"),
+                                          binaryAttr.getValue(),
+                                          LLVM::Linkage::Internal);
+    auto spirvSize = rewriter.create<LLVM::ConstantOp>(
+        loc, helper.idxType,
+        IntegerAttr::get(helper.idxType,
+                         static_cast<int64_t>(binaryAttr.size())));
+
+    SmallVector<Value> gridSize;
+    SmallVector<Value> blockSize;
+    SmallVector<Value> argSize;
+    gridSize.emplace_back(gpuLaunch.getGridSizeX());
+    gridSize.emplace_back(gpuLaunch.getGridSizeY());
+    gridSize.emplace_back(gpuLaunch.getGridSizeZ());
+    blockSize.emplace_back(gpuLaunch.getBlockSizeX());
+    blockSize.emplace_back(gpuLaunch.getBlockSizeY());
+    blockSize.emplace_back(gpuLaunch.getBlockSizeZ());
+
+    for (auto arg : adaptor.getKernelOperands()) {
+      auto type = arg.getType();
+      // Assuming the value is either an integer, a float or a pointer.
+      // In the latter case, the size is 0 bytes.
+      auto size = type.isIntOrFloat() ? type.getIntOrFloatBitWidth() / 8 : 0;
+      argSize.emplace_back(helper.idxConstant(rewriter, loc, size));
+    }
+
+    auto array = [&](SmallVector<Value> &values) {
+      auto size = helper.idxConstant(rewriter, loc, values.size());
+      auto arrayPtr = rewriter.create<LLVM::AllocaOp>(loc, helper.ptrType,
+                                                      helper.idxType, size);
+      for (size_t i = 0, n = values.size(); i < n; i++) {
+        auto elementPtr = rewriter.create<LLVM::GEPOp>(
+            loc, helper.ptrType, helper.idxType, arrayPtr,
+            helper.idxConstant(rewriter, loc, i));
+        auto value = values[i];
+        if (auto cast = value.getDefiningOp<UnrealizedConversionCastOp>()) {
+          assert(getConstantIntValue(cast.getOperand(0)));
+          value = helper.idxConstant(
+              rewriter, loc, getConstantIntValue(cast.getOperand(0)).value());
+        }
+        rewriter.create<LLVM::StoreOp>(loc, value, elementPtr);
+      }
+      return arrayPtr.getResult();
+    };
+
+    auto ctx = function.getArgument(0);
+    auto argNum =
+        helper.idxConstant(rewriter, loc, adaptor.getKernelOperands().size());
+    auto createKernelCall = funcCall(
+        rewriter, GPU_OCL_KERNEL_CREATE, helper.ptrType,
+        {helper.ptrType, helper.idxType, helper.ptrType, helper.ptrType,
+         helper.ptrType, helper.ptrType, helper.idxType, helper.ptrType},
+        loc,
+        {ctx, spirvSize, spirv, name, array(gridSize), array(blockSize), argNum,
+         array(argSize)});
+    auto result = createKernelCall.getResult();
+
+    // Save the kernel pointer to the global var using CAS
+    auto null = rewriter.create<LLVM::ZeroOp>(loc, helper.ptrType);
+    auto ptrPtr = rewriter.create<LLVM::AddressOfOp>(loc, ptr);
+    auto casResult = rewriter.create<LLVM::AtomicCmpXchgOp>(
+        loc, ptrPtr, null, result, LLVM::AtomicOrdering::acq_rel,
+        LLVM::AtomicOrdering::monotonic);
+    auto casFlag = rewriter.create<LLVM::ExtractValueOp>(
+        loc, rewriter.getI1Type(), casResult, 1);
+
+    auto body = &function.getBody();
+    auto thenBlock = rewriter.createBlock(body);
+    auto elseBlock = rewriter.createBlock(body);
+    rewriter.setInsertionPointToEnd(&body->front());
+    rewriter.create<LLVM::CondBrOp>(loc, casFlag, thenBlock, elseBlock);
+
+    // Then block
+    rewriter.setInsertionPointToStart(thenBlock);
+    rewriter.create<LLVM::ReturnOp>(loc, result);
+
+    // Else block
+    // The kernel has already been created by another thread; destroy this
+    // one.
+    rewriter.setInsertionPointToStart(elseBlock);
+    helper.destroyKernels(rewriter, loc, result);
+    result = rewriter.create<LLVM::ExtractValueOp>(loc, helper.ptrType,
+                                                   casResult, 0);
+    rewriter.create<LLVM::ReturnOp>(loc, result);
+
+    rewriter.setInsertionPointAfter(function);
+    return true;
+  }
+};
+
+struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase<GpuToGpuOcl> {
+
+  void runOnOperation() override {
+    const auto ctx = &getContext();
+    const LLVMConversionTarget target(getContext());
+    LLVMTypeConverter converter(ctx);
+    Helper helper(ctx, converter);
+    RewritePatternSet patterns(ctx);
+
+    populateGpuToLLVMConversionPatterns(converter, patterns);
+    patterns.insert<ConvertAlloc, ConvertDealloc, ConvertMemcpy, ConvertLaunch>(
+        helper);
+
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+
+    // Add gpuOclDestructor() function that destroys all the kernels
+    auto mod = llvm::dyn_cast<ModuleOp>(getOperation());
+    assert(mod);
+    OpBuilder rewriter(mod.getBody(), mod.getBody()->end());
+    auto destruct = rewriter.create<LLVM::LLVMFuncOp>(
+        mod.getLoc(), GPU_OCL_MOD_DESTRUCTOR,
+        LLVM::LLVMFunctionType::get(helper.voidType, {}),
+        LLVM::Linkage::External);
+    auto loc = destruct.getLoc();
+    rewriter.setInsertionPointToStart(destruct.addEntryBlock(rewriter));
+    // Add memory fence
+    rewriter.create<LLVM::FenceOp>(loc, LLVM::AtomicOrdering::acquire);
+
+    SmallVector<Value> kernelPtrs;
+    SmallString<128> strBuf("gcGpuOclKernel_");
+    auto strBufStart = strBuf.size();
+    kernelPtrs.reserve(helper.kernelNames.size());
+    for (auto &name : helper.kernelNames) {
+      strBuf.truncate(strBufStart);
+      strBuf.append(name);
+      strBuf.append("_Ptr");
+      auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(strBuf);
+      assert(ptr);
+      auto ptrVal = rewriter.create<LLVM::LoadOp>(
+          loc, helper.ptrType, rewriter.create<LLVM::AddressOfOp>(loc, ptr));
+      kernelPtrs.emplace_back(ptrVal);
+    }
+
+    helper.destroyKernels(rewriter, loc, kernelPtrs);
+    rewriter.create<LLVM::ReturnOp>(loc, ValueRange{});
+  }
+};
+} // namespace
\ No newline at end of file
diff --git a/lib/gc/Transforms/GPU/Pipeline.cpp b/lib/gc/Transforms/GPU/Pipeline.cpp
index 061c115b8..a752bed86 100644
--- a/lib/gc/Transforms/GPU/Pipeline.cpp
+++ b/lib/gc/Transforms/GPU/Pipeline.cpp
@@ -39,6 +39,9 @@ namespace mlir::gc {
 
 void populateGPUPipeline(mlir::OpPassManager &pm) {
+  // Add an argument for the GPU context
+  pm.addNestedPass<func::FuncOp>(createAddContextArg());
+
   pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());
 
   pm.addPass(bufferization::createEmptyTensorEliminationPass());
diff --git a/test/mlir/test/gc/gpu-runner/XeGPU/lit.local.cfg b/test/mlir/test/gc/gpu-runner/XeGPU/lit.local.cfg
new file mode 100644
index 000000000..152c26255
--- /dev/null
+++ b/test/mlir/test/gc/gpu-runner/XeGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+# GPUX is currently disabled
+config.unsupported = True
diff --git a/test/mlir/test/gc/gpu-runner/gpu-to-gpuocl.mlir b/test/mlir/test/gc/gpu-runner/gpu-to-gpuocl.mlir
new file mode 100644
index 000000000..7742b8d19
--- /dev/null
+++ b/test/mlir/test/gc/gpu-runner/gpu-to-gpuocl.mlir
@@ -0,0 +1,63 @@
+// RUN: gc-opt %s --gc-gpu-pipeline | FileCheck %s
+
+module @test {
+  func.func @entry(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<32x32xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<32x32xf32>
+    %2 = tensor.empty() : tensor<32x32xf32>
+    %3 = linalg.add ins(%1, %0 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%2 : tensor<32x32xf32>) -> tensor<32x32xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<32x32xf32>, memref<32x32xf32>) -> ()
+    return
+  }
+}
+
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
+// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
+
+// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
+// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
+// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
+// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: llvm.store [[NEW_PTR]], [[ARRAY]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
+// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
+// CHECK: llvm.return [[OLD_PTR]]
+
+// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
+// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: llvm.return [[PTR]]
+
+// CHECK: llvm.func @entry
+// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
+// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],
+
+// CHECK: llvm.func @gcGpuOclKernelCreate
+// CHECK: llvm.func @gcGpuOclKernelDestroy
+// CHECK: llvm.func @gcGpuOclKernelLaunch
+
+
+// CHECK: llvm.func @gcGpuOclModuleDestructor()
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: llvm.fence acquire
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: llvm.store [[PTR]], [[ARRAY]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
diff --git a/test/mlir/test/gc/gpu-runner/lit.local.cfg b/test/mlir/test/gc/gpu-runner/lit.local.cfg
index f180dd41b..5ed13b0d2 100644
--- a/test/mlir/test/gc/gpu-runner/lit.local.cfg
+++ b/test/mlir/test/gc/gpu-runner/lit.local.cfg
@@ -1,2 +1,5 @@
 if not config.gc_use_imex:
-    config.unsupported = True
\ No newline at end of file
+    config.unsupported = True
+else:
+    # FIXME: Enable when the GPU runner is implemented.
+    config.excludes = ['mlp.mlir']