Add Buffer Device Address Backend #311

base: master

Conversation
Codecov Report

Attention: Patch coverage is …

Additional details and impacted files:

@@            Coverage Diff             @@
##           master     #311      +/-   ##
==========================================
- Coverage   75.32%   73.83%    -1.50%
==========================================
  Files          12       12
  Lines         616      642       +26
==========================================
+ Hits          464      474       +10
- Misses        152      168       +16
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/lib/cl/buffer.jl b/lib/cl/buffer.jl
index 0d4774f..46c6c24 100644
--- a/lib/cl/buffer.jl
+++ b/lib/cl/buffer.jl
@@ -144,7 +144,8 @@ end
## memory operations
# reading from buffer to host array, return an event
-function enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, src_off::Int, nbytes::Int;
+function enqueue_read(
+ dst::Ptr, src::Union{Buffer, cl_mem}, src_off::Int, nbytes::Int;
blocking::Bool=false, wait_for::Vector{Event}=Event[])
n_evts = length(wait_for)
evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for]
@@ -159,7 +160,8 @@ enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, nbytes; kwargs...) =
enqueue_read(dst, src, 0, nbytes; kwargs...)
# writing from host array to buffer, return an event
-function enqueue_write(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Ptr, nbytes::Int;
+function enqueue_write(
+ dst::Union{Buffer, cl_mem}, dst_off::Int, src::Ptr, nbytes::Int;
blocking::Bool=false, wait_for::Vector{Event}=Event[])
n_evts = length(wait_for)
evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for]
@@ -174,7 +176,8 @@ enqueue_write(dst::Union{Buffer, cl_mem}, src::Ptr, nbytes; kwargs...) =
enqueue_write(dst, 0, src, nbytes; kwargs...)
# copying between two buffers, return an event
-function enqueue_copy(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Union{Buffer, cl_mem}, src_off::Int,
+function enqueue_copy(
+ dst::Union{Buffer, cl_mem}, dst_off::Int, src::Union{Buffer, cl_mem}, src_off::Int,
nbytes::Int; blocking::Bool=false,
wait_for::Vector{Event}=Event[])
n_evts = length(wait_for)
@@ -231,7 +234,8 @@ function enqueue_unmap(b::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[])
end
# fill a buffer with a pattern, returning an event
-function enqueue_fill(b::Union{Buffer, cl_mem}, offset::Integer, pattern::T, N::Integer;
+function enqueue_fill(
+ b::Union{Buffer, cl_mem}, offset::Integer, pattern::T, N::Integer;
wait_for::Vector{Event}=Event[]) where {T}
nbytes = N * sizeof(T)
nbytes_pattern = sizeof(T)
diff --git a/lib/cl/libopencl.jl b/lib/cl/libopencl.jl
index f34b1bb..70a32e3 100644
--- a/lib/cl/libopencl.jl
+++ b/lib/cl/libopencl.jl
@@ -8,8 +8,9 @@ end
function check(f)
res = retry_reclaim(err -> err == CL_OUT_OF_RESOURCES ||
- err == CL_MEM_OBJECT_ALLOCATION_FAILURE ||
- err == CL_OUT_OF_HOST_MEMORY) do
+ err == CL_MEM_OBJECT_ALLOCATION_FAILURE ||
+ err == CL_OUT_OF_HOST_MEMORY
+ ) do
return f()
end
@@ -1330,13 +1331,17 @@ const clIcdSetPlatformDispatchDataKHR_fn = Ptr{clIcdSetPlatformDispatchDataKHR_t
end
function clIcdGetFunctionAddressForPlatformKHR(platform, func_name)
- @ext_ccall libopencl.clIcdGetFunctionAddressForPlatformKHR(platform::cl_platform_id,
- func_name::Ptr{Cchar})::Ptr{Cvoid}
+ return @ext_ccall libopencl.clIcdGetFunctionAddressForPlatformKHR(
+ platform::cl_platform_id,
+ func_name::Ptr{Cchar}
+ )::Ptr{Cvoid}
end
@checked function clIcdSetPlatformDispatchDataKHR(platform, dispatch_data)
- @ext_ccall libopencl.clIcdSetPlatformDispatchDataKHR(platform::cl_platform_id,
- dispatch_data::Ptr{Cvoid})::cl_int
+ @ext_ccall libopencl.clIcdSetPlatformDispatchDataKHR(
+ platform::cl_platform_id,
+ dispatch_data::Ptr{Cvoid}
+ )::cl_int
end
# typedef cl_program CL_API_CALL clCreateProgramWithILKHR_t ( cl_context context , const void * il , size_t length , cl_int * errcode_ret )
@@ -2054,7 +2059,8 @@ end
@checked function clGetMemAllocInfoINTEL(context, ptr, param_name, param_value_size,
param_value, param_value_size_ret)
- @ext_ccall libopencl.clGetMemAllocInfoINTEL(context::cl_context, ptr::PtrOrCLPtr{Cvoid},
+ @ext_ccall libopencl.clGetMemAllocInfoINTEL(
+ context::cl_context, ptr::PtrOrCLPtr{Cvoid},
param_name::cl_mem_info_intel,
param_value_size::Csize_t,
param_value::Ptr{Cvoid},
@@ -2064,15 +2070,16 @@ end
@checked function clSetKernelArgMemPointerINTEL(kernel, arg_index, arg_value)
@ext_ccall libopencl.clSetKernelArgMemPointerINTEL(kernel::cl_kernel,
arg_index::cl_uint,
- arg_value::PtrOrCLPtr{Cvoid})::cl_int
+ arg_value::PtrOrCLPtr{Cvoid}
+ )::cl_int
end
@checked function clEnqueueMemFillINTEL(command_queue, dst_ptr, pattern, pattern_size, size,
num_events_in_wait_list, event_wait_list, event)
@ext_ccall libopencl.clEnqueueMemFillINTEL(command_queue::cl_command_queue,
- dst_ptr::PtrOrCLPtr{Cvoid},
- pattern::Ptr{Cvoid}, pattern_size::Csize_t,
- size::Csize_t,
+ dst_ptr::PtrOrCLPtr{Cvoid},
+ pattern::Ptr{Cvoid}, pattern_size::Csize_t,
+ size::Csize_t,
num_events_in_wait_list::cl_uint,
event_wait_list::Ptr{cl_event},
event::Ptr{cl_event})::cl_int
@@ -2091,7 +2098,7 @@ end
@checked function clEnqueueMemAdviseINTEL(command_queue, ptr, size, advice,
num_events_in_wait_list, event_wait_list, event)
@ext_ccall libopencl.clEnqueueMemAdviseINTEL(command_queue::cl_command_queue,
- ptr::PtrOrCLPtr{Cvoid}, size::Csize_t,
+ ptr::PtrOrCLPtr{Cvoid}, size::Csize_t,
advice::cl_mem_advice_intel,
num_events_in_wait_list::cl_uint,
event_wait_list::Ptr{cl_event},
@@ -2106,7 +2113,7 @@ const clEnqueueMigrateMemINTEL_fn = Ptr{clEnqueueMigrateMemINTEL_t}
@checked function clEnqueueMigrateMemINTEL(command_queue, ptr, size, flags,
num_events_in_wait_list, event_wait_list, event)
@ext_ccall libopencl.clEnqueueMigrateMemINTEL(command_queue::cl_command_queue,
- ptr::PtrOrCLPtr{Cvoid}, size::Csize_t,
+ ptr::PtrOrCLPtr{Cvoid}, size::Csize_t,
flags::cl_mem_migration_flags,
num_events_in_wait_list::cl_uint,
event_wait_list::Ptr{cl_event},
@@ -2248,9 +2255,11 @@ const clSetKernelArgDevicePointerEXT_t = Cvoid
const clSetKernelArgDevicePointerEXT_fn = Ptr{clSetKernelArgDevicePointerEXT_t}
@checked function clSetKernelArgDevicePointerEXT(kernel, arg_index, arg_value)
- @ext_ccall libopencl.clSetKernelArgDevicePointerEXT(kernel::cl_kernel,
- arg_index::cl_uint,
- arg_value::cl_mem_device_address_ext)::cl_int
+ @ext_ccall libopencl.clSetKernelArgDevicePointerEXT(
+ kernel::cl_kernel,
+ arg_index::cl_uint,
+ arg_value::cl_mem_device_address_ext
+ )::cl_int
end
# typedef cl_int CL_API_CALL clCancelCommandsIMG_t ( const cl_event * event_list , size_t num_events_in_list )
@@ -2271,8 +2280,10 @@ const clSetPerfHintQCOM_t = Cvoid
const clSetPerfHintQCOM_fn = Ptr{clSetPerfHintQCOM_t}
@checked function clSetPerfHintQCOM(context, perf_hint)
- @ext_ccall libopencl.clSetPerfHintQCOM(context::cl_context,
- perf_hint::cl_perf_hint_qcom)::cl_int
+ @ext_ccall libopencl.clSetPerfHintQCOM(
+ context::cl_context,
+ perf_hint::cl_perf_hint_qcom
+ )::cl_int
end
const CL_NAME_VERSION_MAX_NAME_SIZE = 64
diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl
index 6cbd0bb..9985e75 100644
--- a/lib/cl/memory/bda.jl
+++ b/lib/cl/memory/bda.jl
@@ -7,7 +7,8 @@ end
BufferDeviceMemory() = BufferDeviceMemory(C_NULL, CL_NULL, 0, context())
-function bda_alloc(bytesize::Integer;
+function bda_alloc(
+ bytesize::Integer;
alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw
)
@@ -33,7 +34,7 @@ function bda_alloc(bytesize::Integer;
throw(ArgumentError("Host access flag must be one of :rw, :r, or :w"))
end
-
+
err_code = Ref{Cint}()
properties = cl_mem_properties[CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT, CL_TRUE, 0]
mem_id = clCreateBufferWithProperties(context(), properties, flags, bytesize, C_NULL, err_code)
diff --git a/lib/cl/state.jl b/lib/cl/state.jl
index 4cc9a68..5ad6f40 100644
--- a/lib/cl/state.jl
+++ b/lib/cl/state.jl
@@ -173,14 +173,14 @@ function default_memory_backend(dev::Device)
end
bda = bda_supported(dev)
-
+
# determine if SVM is available (if needed)
svm = let
caps = svm_capabilities(dev)
caps.coarse_grain_buffer
end
- if usm
+ return if usm
USMBackend()
else
if svm
diff --git a/src/array.jl b/src/array.jl
index 5e17bfa..8793272 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -52,7 +52,7 @@ mutable struct CLArray{T, N, M} <: AbstractGPUArray{T, N}
maxsize
end
data = GPUArrays.cached_alloc((CLArray, cl.context(), M, bufsize)) do
- buf = managed_alloc(M, bufsize; alignment=Base.datatype_alignment(T))
+ buf = managed_alloc(M, bufsize; alignment = Base.datatype_alignment(T))
DataRef(free, buf)
end
obj = new{T, N, M}(data, maxsize, 0, dims)
@@ -506,7 +506,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T}
# replace the data with a new CL. this 'unshares' the array.
# as a result, we can safely support resizing unowned buffers.
new_data = cl.context!(context(a)) do
- mem = managed_alloc(memtype(a), bufsize; alignment=Base.datatype_alignment(T))
+ mem = managed_alloc(memtype(a), bufsize; alignment = Base.datatype_alignment(T))
ptr = convert(CLPtr{T}, mem)
m = min(length(a), n)
if m > 0
@@ -514,7 +514,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T}
if memtype(a) == cl.SharedVirtualMemory
cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false)
elseif memtype(a) == cl.BufferDeviceMemory
- cl.enqueue_bda_copy(convert(cl.cl_mem, mem), 0, convert(cl.cl_mem, a.data[]), 0, m*sizeof(T); blocking=false)
+ cl.enqueue_bda_copy(convert(cl.cl_mem, mem), 0, convert(cl.cl_mem, a.data[]), 0, m * sizeof(T); blocking = false)
else
cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false)
end
diff --git a/src/memory.jl b/src/memory.jl
index 8fc2087..17045c6 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -112,7 +112,7 @@ end
## public interface
-function managed_alloc(t::Type{T}, bytes::Int; kwargs...) where T <: cl.AbstractMemory
+function managed_alloc(t::Type{T}, bytes::Int; kwargs...) where {T <: cl.AbstractMemory}
if bytes == 0
return Managed(T())
else
@maleadt Can you take a look at the PR to see what's missing? Most tests pass now. Regarding BDA-specific tests, I think what we have for normal … Here's the latest one:
Couple of initial thoughts. I haven't had the time to thoroughly test this.
@@ -1263,424 +1263,6 @@
const cl_device_partition_property_ext = cl_ulong
const cl_device_command_buffer_capabilities_khr = cl_bitfield
How come all these other extension methods are gone from the wrappers now?
No idea, I just ran the main function from the provided scripts in `res`. Is that not how it works?
lib/cl/memory/bda.jl (outdated)

function bda_alloc(bytesize::Integer;
        alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw
    )
    bytesize == 0 && error("size 0 is not supported for BufferDeviceMemory.")
Can we return a dummy object instead? Zero-size allocations are generally expected to be supported by the layers above, so if this back-end doesn't support it we should somehow spoof the object (e.g. putting in a null pointer, as we do in CUDA).
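For illustration, a minimal sketch of that spoofing, with the field layout assumed from the lib/cl/memory/bda.jl hunk above (the `bda_alloc_or_dummy` helper name is hypothetical):

```julia
# Hypothetical helper: spoof zero-size allocations with a dummy object
# (null host pointer, null cl_mem handle, zero size) instead of erroring,
# so the layers above can keep assuming zero-size allocations work.
function bda_alloc_or_dummy(bytesize::Integer; kwargs...)
    if bytesize == 0
        # field order assumed from the diff: (ptr, mem handle, bytesize, context)
        return BufferDeviceMemory(C_NULL, CL_NULL, 0, context())
    end
    return bda_alloc(bytesize; kwargs...)
end
```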
I was doing that initially, but it errors when doing any operations, like `fill!`, `read`, `write`, etc. Is that still ok?
I think we'll need to include a condition to check for size and return a dummy CLArray in all the functions that are being used to enable array support, specifically `read`, `write`, `copy`, and `fill`. I'll take a look at this too.
In CUDA.jl I've tried to confine all that logic to the high-level wrappers in `src`, e.g., https://github.com/JuliaGPU/CUDA.jl/blob/8b6a2a06c8f9d247081d7c92d8f8a82ab68153d5/src/array.jl#L541

The way we allocate dummy objects is by catching the `sz == 0` case in the high-level pool allocator and similarly returning from there: https://github.com/JuliaGPU/CUDA.jl/blob/8b6a2a06c8f9d247081d7c92d8f8a82ab68153d5/src/memory.jl#L618-L619

This only requires that the Buffer objects have a dummy constructor. IMO a reasonably clean design that keeps the API wrappers truthful (not supporting 0-size allocations) while confining the logic working around that to the high-level wrappers.
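Translated to this PR, a sketch of that zero-size catch as it would look here (the `alloc` call stands in for the real allocation path and is assumed; the shape matches the src/memory.jl hunk above):

```julia
# Sketch: only the high-level allocator spoofs empty allocations; the API
# wrappers below it stay truthful and never see bytesize == 0.
function managed_alloc(::Type{T}, bytes::Int; kwargs...) where {T <: cl.AbstractMemory}
    if bytes == 0
        return Managed(T())                # dummy object with null handles
    else
        mem = alloc(T, bytes; kwargs...)   # assumed: the real per-backend path
        return Managed(mem)
    end
end
```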
> In CUDA.jl I've tried to confine all that logic to the high-level wrappers in `src`

Apparently we already do that in OpenCL, but it was still letting many edge cases slip. I've pushed a solution to this in the latest commit. Do let me know if you see issues with it.

That being said, the KernelAbstractions and event tests just segfault on Mesa-git. I don't think they depend on BDA, so it's probably a Mesa bug. There are two errors in base, but I think they're caused by the mentioned 'clarray views with range index' bug.
If PoCL supports this, we should probably have a way to pick the back-end (e.g. using a preference, asserting at run time that the device supports said back-end) and test this functionality on CI. The test failures are problematic though. Indexing really shouldn't behave differently on this back-end.
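Rough sketch of what a preference-based override could look like, assuming Preferences.jl; the `memory_backend` preference key and `BDABackend` name are hypothetical (`default_memory_backend` and `bda_supported` are from the lib/cl/state.jl hunk above):

```julia
using Preferences

function default_memory_backend(dev::Device)
    # Hypothetical preference: let users force a back-end, asserting at
    # run time that the device actually supports it.
    pref = @load_preference("memory_backend", nothing)
    if pref == "bda"
        bda_supported(dev) ||
            error("device does not support cl_ext_buffer_device_address")
        return BDABackend()
    end
    # ... fall through to the existing USM/SVM/BDA detection ...
end
```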
I fixed the indexing part. :) The only remaining errors come from: …
PoCL does support this.
lib/cl/memory/bda.jl (outdated)

function bda_alloc(bytesize::Integer;
        alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw
    )
    bytesize == 0 && error("size 0 is not supported for BufferDeviceMemory.")
Can you do this from `alloc` in `src/memory.jl` instead? We probably ought to do this for other memories as well, in order to flush out issues more quickly, and be compatible with hypothetical drivers not supporting zero-size allocations (unless this is mandated by the spec).
Can you look at the latest commit to see if there are any issues with the approach?
More on the extension here: https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#cl_ext_buffer_device_address
This PR introduces a new backend for the array interface, and bumps the generated wrapper version for the OpenCL Headers. The main purpose of the new backend is to support more devices than currently possible.

This is heavily inspired by the recent merge of BDA into Mesa's Rusticl, specifically the Zink backend, which means OpenCL.jl can use Vulkan as a backend and ideally work on a large number of previously unsupported devices.
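For context, the heart of the new back-end is a single property passed at buffer creation. The first two statements below are taken from the lib/cl/memory/bda.jl hunk above; the address query is my reading of the extension spec, so treat that exact call as an assumption:

```julia
# Request a device-private address at creation time (cl_ext_buffer_device_address);
# `flags` and `bytesize` are as computed earlier in bda_alloc:
err_code = Ref{Cint}()
properties = cl_mem_properties[CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT, CL_TRUE, 0]
mem_id = clCreateBufferWithProperties(context(), properties, flags, bytesize,
                                      C_NULL, err_code)

# The buffer's fixed device address can then be queried (per the spec) and
# later passed to kernels via clSetKernelArgDevicePointerEXT, wrapped above:
addr = Ref{cl_mem_device_address_ext}()
clGetMemObjectInfo(mem_id, CL_MEM_DEVICE_ADDRESS_EXT,
                   sizeof(cl_mem_device_address_ext), addr, C_NULL)
```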
This is still a work in progress, and mostly meant for review or question purposes at the current stage. Current status: mostly works, more testing needed. Feel free to try! This is supported both by PoCL 7.0 and by mesa-git using the environment variable `RUSTICL_ENABLE=zink` (this can work with any compliant Vulkan driver).