Commit

Implemented GPU OpenCL runtime
AndreyPavlenko committed Sep 15, 2024
1 parent 8635ad2 commit 048f66d
Showing 20 changed files with 1,403 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/clang-tidy.yml
@@ -14,7 +14,7 @@ jobs:

steps:
- name: Install OpenMP
run: "sudo apt install -y libomp-dev"
run: "sudo apt install -y libomp-dev opencl-c-headers"

- name: Fetch sources
uses: actions/checkout@v4
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -98,12 +98,15 @@ get_property(GC_TOOLS GLOBAL PROPERTY GC_TOOLS)
get_property(GC_MLIR_LIBS GLOBAL PROPERTY GC_MLIR_LIBS)
get_property(GC_PASS_LIBS GLOBAL PROPERTY GC_PASS_LIBS)
get_property(GC_DIALECT_LIBS GLOBAL PROPERTY GC_DIALECT_LIBS)
get_property(IMEX_LIBS GLOBAL PROPERTY IMEX_LIBS)

install(TARGETS
GcInterface
${GC_TOOLS}
${GC_MLIR_LIBS}
${GC_PASS_LIBS}
${GC_DIALECT_LIBS}
${IMEX_LIBS}
EXPORT ${PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
1 change: 1 addition & 0 deletions cmake/imex.cmake
@@ -24,4 +24,5 @@ if (NOT DEFINED IMEX_INCLUDES)
${imex_SOURCE_DIR}/src
)
set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
target_compile_options(GcInterface INTERFACE -DGC_USE_IMEX)
endif ()
294 changes: 293 additions & 1 deletion include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -20,9 +20,301 @@ constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
} // namespace mlir::gc::gpu

#ifndef GC_GPU_OCL_CONST_ONLY
#include <cstdarg>
#include <limits>
#include <memory>
#include <shared_mutex>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// TBD
#include <CL/cl.h>

#include <llvm/ADT/SmallString.h>

#include "mlir/ExecutionEngine/ExecutionEngine.h"
#include "mlir/IR/BuiltinOps.h"

namespace mlir::gc::gpu {
struct OclDevCtxPair {
cl_device_id device;
cl_context context;
explicit OclDevCtxPair(cl_device_id device, cl_context context)
: device(device), context(context) {}

bool operator==(const OclDevCtxPair &other) const {
return device == other.device && context == other.context;
}
};
} // namespace mlir::gc::gpu
template <> struct std::hash<const mlir::gc::gpu::OclDevCtxPair> {
std::size_t
operator()(const mlir::gc::gpu::OclDevCtxPair &pair) const noexcept {
return std::hash<cl_device_id>()(pair.device) ^
std::hash<cl_context>()(pair.context);
}
}; // struct std::hash
namespace mlir::gc::gpu {
struct OclModule;
struct OclContext;
struct OclModuleBuilder;

struct OclRuntime {
// Returns the available Intel GPU device ids.
[[nodiscard]] static llvm::Expected<SmallVector<cl_device_id, 2>>
gcIntelDevices(size_t max = std::numeric_limits<size_t>::max());

[[nodiscard]] static llvm::Expected<OclRuntime> get();

[[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device);

[[nodiscard]] static llvm::Expected<OclRuntime> get(cl_command_queue queue);

[[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device,
cl_context context);

static bool isOutOfOrder(cl_command_queue queue);

[[nodiscard]] cl_context getContext() const;

[[nodiscard]] cl_device_id getDevice() const;

[[nodiscard]] llvm::Expected<cl_command_queue>
createQueue(bool outOfOrder = false) const;

[[nodiscard]] static llvm::Expected<bool>
releaseQueue(cl_command_queue queue);

[[nodiscard]] llvm::Expected<void *> usmAllocDev(size_t size) const;

[[nodiscard]] llvm::Expected<void *> usmAllocShared(size_t size) const;

[[nodiscard]] llvm::Expected<bool> usmFree(const void *ptr) const;

[[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const void *src,
void *dst, size_t size) const;

template <typename T>
[[nodiscard]] llvm::Expected<T *> usmNewDev(size_t size) const {
auto expected = usmAllocDev(size * sizeof(T));
if (expected) {
return static_cast<T *>(*expected);
}
return expected.takeError();
}

template <typename T>
[[nodiscard]] llvm::Expected<T *> usmNewShared(size_t size) const {
auto expected = usmAllocShared(size * sizeof(T));
if (expected) {
return static_cast<T *>(*expected);
}
return expected.takeError();
}

template <typename T>
[[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const T *src,
T *dst, size_t size) const {
return usmCpy(ctx, static_cast<const void *>(src), static_cast<void *>(dst),
size * sizeof(T));
}

// Use with caution! This is reliable for checking the validity of USM
// pointers, but may produce false positives for any other kind of pointer.
bool isUsm(const void *ptr) const;

bool operator==(const OclRuntime &other) const {
return getDevice() == other.getDevice() &&
getContext() == other.getContext();
}

private:
struct Ext;
struct Exports;
friend OclContext;
friend OclModuleBuilder;
explicit OclRuntime(const Ext &ext);
const Ext &ext;
};

static constexpr int64_t ZERO = 0;
static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);

// NOTE: The context is mutable and not thread-safe! It's expected to be used in
// a single thread only.
struct OclContext {
const OclRuntime &runtime;
cl_command_queue const queue;
// Whether to preserve the execution order. This is required in case of
// out-of-order execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the
// execution is completed, the 'lastEvent' field contains the event of the last
// enqueued command. If 'preserveOrder' is false, 'waitList' is ignored.
const bool preserveOrder;
cl_uint waitListLen;
cl_event *waitList;
cl_event lastEvent;

explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
cl_uint waitListLen = 0, cl_event *waitList = nullptr)
: OclContext(runtime, queue, OclRuntime::isOutOfOrder(queue), waitListLen,
waitList) {}

explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
bool preserveOrder, cl_uint waitListLen,
cl_event *waitList)
: runtime(runtime), queue(queue), preserveOrder(preserveOrder),
waitListLen(preserveOrder ? waitListLen : 0),
waitList(preserveOrder ? waitList : nullptr), lastEvent(nullptr) {
assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
}

OclContext(const OclContext &) = delete;
OclContext &operator=(const OclContext &) = delete;

void finish();

private:
friend OclRuntime;
friend OclRuntime::Exports;
template <unsigned N> friend struct OclModuleArgs;
// Contains the pointers of all non-USM arguments. The arguments are expected
// to be either USM or CL pointers, and most probably USM; thus, in most cases,
// this set will be empty.
std::unordered_set<void *> clPtrs;

void setLastEvent(cl_event event) {
lastEvent = event;
if (event) {
waitListLen = 1;
waitList = &lastEvent;
} else {
waitListLen = 0;
waitList = nullptr;
}
}
};

// The main function arguments, in the format described at
// https://mlir.llvm.org/docs/TargetLLVMIR/#c-compatible-wrapper-emission.
// NOTE: The values are not copied, only the pointers are stored!
// NOTE: This class is mutable and not thread-safe!
template <unsigned N = 64> struct OclModuleArgs {
explicit OclModuleArgs(OclContext &ctx) : ctx(ctx) {}
OclModuleArgs(const OclModuleArgs &) = delete;
OclModuleArgs &operator=(const OclModuleArgs &) = delete;

void add(void *&alignedPtr, size_t rank, const int64_t *shape,
const int64_t *strides, bool isUsm = true) {
add(alignedPtr, alignedPtr, ZERO, rank, shape, strides, isUsm);
}

void add(void *&allocatedPtr, void *&alignedPtr, const int64_t &offset,
size_t rank, const int64_t *shape, const int64_t *strides,
bool isUsm = true) {
#ifndef NDEBUG
assert(!isUsm || ctx.runtime.isUsm(alignedPtr));
// It's recommended to have at least 16-byte alignment
assert(reinterpret_cast<std::uintptr_t>(alignedPtr) % 16 == 0);
#endif

args.emplace_back(&allocatedPtr);
args.emplace_back(&alignedPtr);
args.emplace_back(const_cast<int64_t *>(&offset));
for (size_t i = 0; i < rank; i++) {
args.emplace_back(const_cast<int64_t *>(&shape[i]));
}
for (size_t i = 0; i < rank; i++) {
args.emplace_back(const_cast<int64_t *>(&strides[i]));
}
if (!isUsm) {
ctx.clPtrs.insert(alignedPtr);
}
}

template <typename T>
void add(T *&alignedPtr, size_t rank, const int64_t *shape,
const int64_t *strides, bool isUsm = true) {
add(reinterpret_cast<void *&>(alignedPtr), rank, shape, strides, isUsm);
}

template <typename T>
void add(T *&allocatedPtr, T *&alignedPtr, const int64_t &offset, size_t rank,
const int64_t *shape, const int64_t *strides, bool isUsm = true) {
add(reinterpret_cast<void *&>(allocatedPtr),
reinterpret_cast<void *&>(alignedPtr), offset, rank, shape, strides,
isUsm);
}

void clear() {
args.clear();
ctx.clPtrs.clear();
}

private:
friend OclModule;
OclContext &ctx;
SmallVector<void *, N + 3> args;
};

struct OclModule {
const OclRuntime runtime;

using MainFunc = void (*)(void **);

explicit OclModule(const OclRuntime &runtime,
std::unique_ptr<ExecutionEngine> engine, MainFunc main)
: runtime(runtime), engine(std::move(engine)), main(main) {}

template <unsigned N> void exec(OclModuleArgs<N> &args) const {
OclContext &ctx = args.ctx;
#ifndef NDEBUG
auto rt = OclRuntime::get(ctx.queue);
assert(rt);
assert(*rt == this->runtime);
#endif
auto size = args.args.size();
auto ctxPtr = &ctx;
args.args.emplace_back(&ctxPtr);
args.args.emplace_back(&ctxPtr);
args.args.emplace_back(ZERO_PTR);
main(args.args.data());
args.args.truncate(size);
}

~OclModule();
OclModule(const OclModule &) = delete;
OclModule &operator=(const OclModule &) = delete;
OclModule(const OclModule &&) = delete;
OclModule &operator=(const OclModule &&) = delete;

private:
std::unique_ptr<ExecutionEngine> engine;
MainFunc main;
};

struct OclModuleBuilder {
friend OclRuntime;
explicit OclModuleBuilder(ModuleOp module);
explicit OclModuleBuilder(OwningOpRef<ModuleOp> &module)
: OclModuleBuilder(module.release()) {}

llvm::Expected<std::shared_ptr<const OclModule>>
build(const OclRuntime &runtime);

llvm::Expected<std::shared_ptr<const OclModule>>
build(cl_command_queue queue);

llvm::Expected<std::shared_ptr<const OclModule>> build(cl_device_id device,
cl_context context);

private:
std::shared_mutex mux;
ModuleOp mlirModule;
SmallString<32> funcName;
std::unordered_map<const OclDevCtxPair, std::shared_ptr<const OclModule>>
cache;
llvm::Expected<std::shared_ptr<const OclModule>>
build(const OclRuntime::Ext &ext);
};
} // namespace mlir::gc::gpu
#else
#undef GC_GPU_OCL_CONST_ONLY
#endif
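
For context, a minimal usage sketch of the runtime API declared above: it assumes an already-lowered mlir::ModuleOp whose entry function follows the C-compatible wrapper format, and the helper name runOnGpu, buffer size, and shape are illustrative only, not part of this header.

#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"

#include <llvm/Support/Error.h>

using namespace mlir::gc::gpu;

// Illustrative driver: pick an Intel GPU, JIT the module, run it on a USM
// buffer and wait for completion. Error handling is simplified.
static llvm::Error runOnGpu(mlir::ModuleOp mlirModule) {
  auto devices = OclRuntime::gcIntelDevices(1);
  if (!devices)
    return devices.takeError();
  if (devices->empty())
    return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                   "No Intel GPU devices found");

  auto runtime = OclRuntime::get((*devices)[0]);
  if (!runtime)
    return runtime.takeError();

  // Compile the module once; the builder caches per device/context pair.
  OclModuleBuilder builder(mlirModule);
  auto mod = builder.build(*runtime);
  if (!mod)
    return mod.takeError();

  auto queue = runtime->createQueue();
  if (!queue)
    return queue.takeError();
  auto buf = runtime->usmNewDev<float>(64); // 64 floats, illustrative size
  if (!buf)
    return buf.takeError();

  // Describe the buffer as a 1-D memref argument (shape [64], stride 1).
  int64_t shape[] = {64};
  int64_t strides[] = {1};
  OclContext ctx(*runtime, *queue);
  OclModuleArgs<> args(ctx);
  args.add(*buf, 1, shape, strides);

  (*mod)->exec(args);
  ctx.finish(); // wait for the enqueued commands to complete

  if (auto freed = runtime->usmFree(*buf); !freed)
    return freed.takeError();
  if (auto released = OclRuntime::releaseQueue(*queue); !released)
    return released.takeError();
  return llvm::Error::success();
}
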
4 changes: 4 additions & 0 deletions include/gc/Transforms/Passes.h
@@ -115,6 +115,10 @@ std::unique_ptr<Pass> createMergeAllocPass();
void populateFrontendPasses(mlir::OpPassManager &);
void populateCPUPipeline(mlir::OpPassManager &);

#ifdef GC_USE_IMEX
void populateGPUPipeline(mlir::OpPassManager &);
#endif

#define GEN_PASS_DECL
#include "gc/Transforms/Passes.h.inc"
