diff --git a/src/arraymancer/tensor/data_structure.nim b/src/arraymancer/tensor/data_structure.nim index 5d7166cf..e17a4821 100644 --- a/src/arraymancer/tensor/data_structure.nim +++ b/src/arraymancer/tensor/data_structure.nim @@ -16,73 +16,87 @@ import ../laser/dynamic_stack_arrays, ../laser/tensor/datatypes, nimblas, - nimcuda/cuda12_5/[cuda_runtime_api, check], # Standard library std/[complex] export nimblas.OrderType, complex export datatypes, dynamic_stack_arrays -type - CudaTensorRefTrackerObj*[T: SomeFloat] = object - value*: ptr UncheckedArray[T] - - CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T] - - CudaStorage*[T: SomeFloat] = object - ## Opaque seq-like structure for storage on the Cuda backend. - ## - ## Nim garbage collector will automatically ask cuda to clear GPU memory if data becomes unused. - ## - # TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working - Flen*: int - Fdata*: ptr UncheckedArray[T] - Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection. - - CudaTensor*[T: SomeFloat] = object - ## Tensor data structure stored on Nvidia GPU (Cuda) - ## - ``shape``: Dimensions of the CudaTensor - ## - ``strides``: Numbers of items to skip to get the next item along a dimension. - ## - ``offset``: Offset to get the first item of the CudaTensor. Note: offset can be negative, in particular for slices. - ## - ``storage``: An opaque data storage for the CudaTensor - ## - ## Warning ⚠: - ## Assignment ``var a = b`` does not copy the data. Data modification on one CudaTensor will be reflected on the other. - ## However modification on metadata (shape, strides or offset) will not affect the other tensor. 
- ## Explicit copies can be made with ``clone``: ``var a = b.clone`` - shape*: Metadata - strides*: Metadata - offset*: int - storage*: CudaStorage[T] - - ClStorage*[T: SomeFloat] = object - ## Opaque seq-like structure for storage on the OpenCL backend. - Flen*: int - Fdata*: ptr UncheckedArray[T] - Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection. - - ClTensor*[T: SomeFloat] = object - ## Tensor data structure stored on OpenCL (CPU, GPU, FPGAs or other accelerators) - ## - ``shape``: Dimensions of the CudaTensor - ## - ``strides``: Numbers of items to skip to get the next item along a dimension. - ## - ``offset``: Offset to get the first item of the CudaTensor. Note: offset can be negative, in particular for slices. - ## - ``storage``: An opaque data storage for the CudaTensor - ## - ## Warning ⚠: - ## Assignment ``var a = b`` does not copy the data. Data modification on one CudaTensor will be reflected on the other. - ## However modification on metadata (shape, strides or offset) will not affect the other tensor. - ## Explicit copies can be made with ``clone``: ``var a = b.clone`` - shape*: Metadata - strides*: Metadata - offset*: int - storage*: ClStorage[T] - - AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T] - - -proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}= - if not p.value.isNil: - check cudaFree(p.value) +when defined(cuda): + import nimcuda/cuda12_5/[cuda_runtime_api, check] + + type + CudaTensorRefTrackerObj*[T: SomeFloat] = object + value*: ptr UncheckedArray[T] + + CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T] + + CudaStorage*[T: SomeFloat] = object + ## Opaque seq-like structure for storage on the Cuda backend. + ## + ## Nim garbage collector will automatically ask cuda to clear GPU memory if data becomes unused. 
+ ## + # TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working + Flen*: int + Fdata*: ptr UncheckedArray[T] + Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection. + + CudaTensor*[T: SomeFloat] = object + ## Tensor data structure stored on Nvidia GPU (Cuda) + ## - ``shape``: Dimensions of the CudaTensor + ## - ``strides``: Numbers of items to skip to get the next item along a dimension. + ## - ``offset``: Offset to get the first item of the CudaTensor. Note: offset can be negative, in particular for slices. + ## - ``storage``: An opaque data storage for the CudaTensor + ## + ## Warning ⚠: + ## Assignment ``var a = b`` does not copy the data. Data modification on one CudaTensor will be reflected on the other. + ## However modification on metadata (shape, strides or offset) will not affect the other tensor. + ## Explicit copies can be made with ``clone``: ``var a = b.clone`` + shape*: Metadata + strides*: Metadata + offset*: int + storage*: CudaStorage[T] + + proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}= + if not p.value.isNil: + check cudaFree(p.value) + +when defined(opencl): + type + ClStorage*[T: SomeFloat] = object + ## Opaque seq-like structure for storage on the OpenCL backend. + Flen*: int + Fdata*: ptr UncheckedArray[T] + Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection. + + ClTensor*[T: SomeFloat] = object + ## Tensor data structure stored on OpenCL (CPU, GPU, FPGAs or other accelerators) + ## - ``shape``: Dimensions of the ClTensor + ## - ``strides``: Numbers of items to skip to get the next item along a dimension. + ## - ``offset``: Offset to get the first item of the ClTensor. Note: offset can be negative, in particular for slices. 
+ ## - ``storage``: An opaque data storage for the ClTensor + ## + ## Warning ⚠: + ## Assignment ``var a = b`` does not copy the data. Data modification on one ClTensor will be reflected on the other. + ## However modification on metadata (shape, strides or offset) will not affect the other tensor. + ## Explicit copies can be made with ``clone``: ``var a = b.clone`` + shape*: Metadata + strides*: Metadata + offset*: int + storage*: ClStorage[T] + +when defined(cuda) and defined(opencl): + type AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T] +elif defined(cuda): + type AnyTensor*[T] = Tensor[T] or CudaTensor[T] +elif defined(opencl): + type AnyTensor*[T] = Tensor[T] or ClTensor[T] +else: + type AnyTensor*[T] = Tensor[T] + +type GpuTensor[T] = AnyTensor[T] and not Tensor[T] + + # ############### @@ -102,10 +116,10 @@ proc `data=`*[T](t: var Tensor[T], s: seq[T]) {.deprecated: "Use copyFromRaw ins # Tensor Metadata # ################ -func rank*[T](t: CudaTensor[T] or ClTensor[T]): range[0 .. LASER_MAXRANK] {.inline.} = +func rank*[T](t: GpuTensor[T]): range[0 .. 
LASER_MAXRANK] {.inline.} = t.shape.len -func size*[T](t: CudaTensor[T] or ClTensor[T]): Natural {.inline.} = +func size*[T](t: GpuTensor[T]): Natural {.inline.} = t.shape.product proc shape_to_strides*(shape: Metadata, layout: OrderType = rowMajor, result: var Metadata) {.noSideEffect.} = @@ -131,7 +145,7 @@ proc shape_to_strides*(shape: Metadata, layout: OrderType = rowMajor, result: va accum *= shape[i] return -func is_C_contiguous*(t: CudaTensor or ClTensor): bool = +func is_C_contiguous*(t: GpuTensor): bool = ## Check if the tensor follows C convention / is row major var cur_size = 1 for i in countdown(t.rank - 1,0): @@ -182,14 +196,14 @@ proc get_offset_ptr*[T: KnownSupportsCopyMem](t: Tensor[T]): ptr T {.noSideEffec proc get_offset_ptr*[T: not KnownSupportsCopyMem](t: AnyTensor[T]): ptr T {.error: "`get_offset_ptr`" & " cannot be safely used for GC'ed types!".} -proc get_data_ptr*[T](t: CudaTensor[T] or ClTensor[T]): ptr T {.noSideEffect, inline.}= +proc get_data_ptr*[T](t: GpuTensor[T]): ptr T {.noSideEffect, inline.}= ## Input: ## - A tensor ## Returns: ## - A pointer to the real start of its data (no offset) cast[ptr T](t.storage.Fdata) -proc get_offset_ptr*[T](t: CudaTensor[T] or ClTensor[T]): ptr T {.noSideEffect, inline.}= +proc get_offset_ptr*[T](t: GpuTensor[T]): ptr T {.noSideEffect, inline.}= ## Input: ## - A tensor ## Returns: diff --git a/src/arraymancer/tensor/exporting.nim b/src/arraymancer/tensor/exporting.nim index d0b5bbd6..bd622d40 100644 --- a/src/arraymancer/tensor/exporting.nim +++ b/src/arraymancer/tensor/exporting.nim @@ -33,17 +33,12 @@ proc toRawSeq*[T](t:Tensor[T]): seq[T] {.noSideEffect, deprecated: "This proc ca ## or that you raise your use-case in the issue tracker https://github.com/mratsim/Arraymancer/issues ## so that more suitable primitives can be crafted - # Due to forward declaration this proc must be declared - # after "cpu" proc are declared in init_cuda - when t is Tensor: - result = newSeq[T](t.size) - for i in 0 ..< 
t.size: - when T is KnownSupportsCopyMem: - result[i] = t.unsafe_raw_offset()[i] - else: - result[i] = t.storage.raw_buffer[i] - elif t is CudaTensor: - return t.cpu.data + result = newSeq[T](t.size) + for i in 0 ..< t.size: + when T is KnownSupportsCopyMem: + result[i] = t.unsafe_raw_offset()[i] + else: + result[i] = t.storage.raw_buffer[i] proc toFlatSeq*[T](t: Tensor[T]) : seq[T] = ## Export the data of the Tensor flattened as a Seq diff --git a/src/arraymancer/tensor/private/p_checks.nim b/src/arraymancer/tensor/private/p_checks.nim index c49c2ffa..0fc46b04 100644 --- a/src/arraymancer/tensor/private/p_checks.nim +++ b/src/arraymancer/tensor/private/p_checks.nim @@ -19,7 +19,10 @@ import ../../laser/private/nested_containers, when (NimMajor, NimMinor) < (1, 4): import ../../std_version_types -include ./p_checks_cuda, ./p_checks_opencl +when defined(cuda): + include ./p_checks_cuda +when defined(opencl): + include ./p_checks_opencl func check_nested_elements*(shape: Metadata, len: int) {.inline.}= ## Compare the detected shape from flatten with the real length of the data