From 52c97e994d4c25d64c7aa192ef6193e06ca356ea Mon Sep 17 00:00:00 2001
From: GarethCabournDavies
Date: Tue, 10 Sep 2024 08:33:33 -0700
Subject: [PATCH] Save for some fixes

---
 pycbc/fft/core.py         |   6 +-
 pycbc/psd/variation.py    |   2 +-
 pycbc/scheme.py           |   6 +-
 pycbc/types/array_cuda.py | 471 +++++++++++++++++++-------------------
 4 files changed, 243 insertions(+), 242 deletions(-)

diff --git a/pycbc/fft/core.py b/pycbc/fft/core.py
index 92e3cdcb229..3584efaaf86 100644
--- a/pycbc/fft/core.py
+++ b/pycbc/fft/core.py
@@ -46,12 +46,11 @@ def _list_available(possible_list, possible_dict):
     available_list = []
     available_dict = {}
     for backend in possible_list:
-        try:
-            mod = __import__('pycbc.fft.' + possible_dict[backend],
-                             fromlist = ['pycbc.fft'])
-            available_dict.update({backend:mod})
-            available_list.append(backend)
-        except (ImportError, OSError):
-            pass
+        # Exception handling temporarily removed so that backend import
+        # failures are visible while debugging.
+        mod = __import__('pycbc.fft.' + possible_dict[backend],
+                         fromlist=['pycbc.fft'])
+        available_dict.update({backend: mod})
+        available_list.append(backend)
     return available_list, available_dict
 
 # The main purpose of the top-level module is to present a
diff --git a/pycbc/psd/variation.py b/pycbc/psd/variation.py
index 4f8dfb07c20..2c1100b8459 100644
--- a/pycbc/psd/variation.py
+++ b/pycbc/psd/variation.py
@@ -165,7 +165,7 @@ def calc_filt_psd_variation(strain, segment, short_segment, psd_long_segment,
     # Create a bandpass filter between low_freq and high_freq
     filt = sig.firwin(4 * srate, [low_freq, high_freq], pass_zero=False,
                       window='hann', fs=srate)
-    filt.resize(int(psd_duration * srate))
+    filt.resize(int(psd_duration * srate), refcheck=False)
     # Fourier transform the filter and take the absolute value to get
     # rid of the phase.
     filt = abs(rfft(filt))
diff --git a/pycbc/scheme.py b/pycbc/scheme.py
index e6eb6f87f86..65ffe252c2c 100644
--- a/pycbc/scheme.py
+++ b/pycbc/scheme.py
@@ -212,10 +212,10 @@ def _scheming_function(*args, **kwds):
             return schemed_fn(*args, **kwds)
 
-        err = """Failed to find implementation of (%s)
-        for %s scheme." % (str(fn), current_prefix())"""
+        err = (f"Failed to find implementation of ({fn}) "
+               f"for {current_prefix()} scheme.")
         for emsg in exc_errors:
-            err += print(emsg)
+            err += repr(emsg)
         raise RuntimeError(err)
 
     return _scheming_function
diff --git a/pycbc/types/array_cuda.py b/pycbc/types/array_cuda.py
index 45a89b73456..2b37cffabad 100644
--- a/pycbc/types/array_cuda.py
+++ b/pycbc/types/array_cuda.py
@@ -12,42 +12,27 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-
-#
-# =============================================================================
-#
-#                                   Preamble
-#
-# =============================================================================
-#
-"""Pycuda based """
+"""CuPy based
+"""
+#import pycuda.driver
+#from pycuda.elementwise import ElementwiseKernel
+#from pycuda.reduction import ReductionKernel
+#from pycuda.tools import get_or_register_dtype
+#from pycuda.tools import context_dependent_memoize
+#from pycuda.tools import dtype_to_ctype
-import pycuda.driver
-from pycuda.elementwise import ElementwiseKernel
-from pycuda.reduction import ReductionKernel
-from pycuda.tools import get_or_register_dtype
-from pycuda.tools import context_dependent_memoize
-from pycuda.tools import dtype_to_ctype
 from pytools import match_precision
-from pycuda.gpuarray import _get_common_dtype, empty, GPUArray
-import pycuda.gpuarray
-from pycuda.scan import InclusiveScanKernel
+#from pycuda.gpuarray import _get_common_dtype, empty, GPUArray
+#import pycuda.gpuarray
+#from pycuda.scan import InclusiveScanKernel
 import numpy as np
+import cupy as cp
 
 include_complex = """
 #include <pycuda-complex.hpp>
 """
 
-@context_dependent_memoize
-def get_cumsum_kernel(dtype):
-    return InclusiveScanKernel(dtype, "a+b", preamble=include_complex)
-
-def icumsum(vec):
-    krnl = get_cumsum_kernel(vec.dtype)
-    return krnl(vec)
-
-@context_dependent_memoize
+#@context_dependent_memoize
 def call_prepare(self, sz, allocator):
     MAX_BLOCK_COUNT = 1024
     SMALL_SEQ_COUNT = 4
@@ -71,62 +56,80 @@ def call_prepare(self, sz, allocator):
 
     return result, block_count, seq_count, grid_size, block_size
 
-class LowerLatencyReductionKernel(ReductionKernel):
-    def __init__(self, dtype_out,
-            neutral, reduce_expr, map_expr=None, arguments=None,
-            name="reduce_kernel", keep=False, options=None, preamble=""):
-        ReductionKernel.__init__(self, dtype_out,
-            neutral, reduce_expr, map_expr, arguments,
-            name, keep, options, preamble)
-
-        self.shared_size=self.block_size*self.dtype_out.itemsize
-
-
-    def __call__(self, *args, **kwargs):
-        f = self.stage1_func
-        s1_invocation_args = []
-        for arg in args:
-            s1_invocation_args.append(arg.gpudata)
-        sz = args[0].size
-
-        result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
-
-        f(grid_size, block_size, None,
-                *([result.gpudata]+s1_invocation_args+[seq_count, sz]),
-                shared_size=self.shared_size)
-
-        while True:
-            f = self.stage2_func
-            sz = result.size
-            result2 = result
-            result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
-
-            f(grid_size, block_size, None,
-                    *([result.gpudata, result2.gpudata]+s1_invocation_args+[seq_count, sz]),
-                    shared_size=self.shared_size)
-
-            if block_count == 1:
-                return result
 
+#class LowerLatencyReductionKernel(ReductionKernel):
+#    def __init__(self, dtype_out,
+#            neutral, reduce_expr, map_expr=None, arguments=None,
+#            name="reduce_kernel", keep=False, options=None, preamble=""):
+#        ReductionKernel.__init__(self, dtype_out,
+#            neutral, reduce_expr, map_expr, arguments,
+#            name, keep, options, preamble)
+#
+#        self.shared_size=self.block_size*self.dtype_out.itemsize
+#
+#
+#    def __call__(self, *args, **kwargs):
+#        f = self.stage1_func
+#        s1_invocation_args = []
+#        for arg in args:
+#            s1_invocation_args.append(arg.gpudata)
+#        sz = args[0].size
+#
+#        result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
+#
+#        f(grid_size, block_size, None,
+#                *([result.gpudata]+s1_invocation_args+[seq_count, sz]),
+#                shared_size=self.shared_size)
+#
+#        while True:
+#            f = self.stage2_func
+#            sz = result.size
+#            result2 = result
+#            result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
+#
+#            f(grid_size, block_size, None,
+#                    *([result.gpudata, result2.gpudata]+s1_invocation_args+[seq_count, sz]),
+#                    shared_size=self.shared_size)
+#
+#            if block_count == 1:
+#                return result
+#
+#@context_dependent_memoize
+#def get_norm_kernel(dtype_x, dtype_out):
+#    return ElementwiseKernel(
+#            "%(tp_x)s *x, %(tp_z)s *z" % {
+#                "tp_x": dtype_to_ctype(dtype_x),
+#                "tp_z": dtype_to_ctype(dtype_out),
+#                },
+#            "z[i] = norm(x[i])",
+#            "normalize")
+
+#def squared_norm(self):
+#    a = self.data
+#    dtype_out = match_precision(np.dtype('float64'), a.dtype)
+#    out = a._new_like_me(dtype=dtype_out)
+#    krnl = get_norm_kernel(a.dtype, dtype_out)
+#    krnl(a, out)
+#    return out
 
-@context_dependent_memoize
 def get_norm_kernel(dtype_x, dtype_out):
-    return ElementwiseKernel(
-            "%(tp_x)s *x, %(tp_z)s *z" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_z": dtype_to_ctype(dtype_out),
-                },
-            "z[i] = norm(x[i])",
-            "normalize")
+    # Element-wise kernel computing the squared norm of each element of
+    # x. Note that CuPy's ElementwiseKernel takes typed value
+    # parameters rather than raw pointers and handles the indexing
+    # itself; norm() comes from CuPy's bundled complex-number header.
+    return cp.ElementwiseKernel(
+        in_params=f"{cp.dtype(dtype_x).name} x",
+        out_params=f"{cp.dtype(dtype_out).name} z",
+        operation="z = norm(x)",
+        name="normalize"
+    )
 
 def squared_norm(self):
     a = self.data
     dtype_out = match_precision(np.dtype('float64'), a.dtype)
-    out = a._new_like_me(dtype=dtype_out)
+    out = cp.empty_like(a, dtype=dtype_out)
     krnl = get_norm_kernel(a.dtype, dtype_out)
     krnl(a, out)
     return out
 
 # FIXME: Write me!
 #def multiply_and_add(self, other, mult_fac):
@@ -135,200 +138,200 @@ def squared_norm(self):
 #    Self will be modified in place. This requires all inputs to be of the same
 #    precision.
# """ - -@context_dependent_memoize -def get_weighted_inner_kernel(dtype_x, dtype_y, dtype_w, dtype_out): - if (dtype_x == np.complex64) or (dtype_x == np.complex128): - inner_map="conj(x[i])*y[i]/w[i]" - else: - inner_map="x[i]*y[i]/w[i]" - return LowerLatencyReductionKernel(dtype_out, - neutral="0", - arguments="%(tp_x)s *x, %(tp_y)s *y, %(tp_w)s *w" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_w": dtype_to_ctype(dtype_w), - }, - reduce_expr="a+b", - map_expr=inner_map, - name="weighted_inner") - -@context_dependent_memoize -def get_inner_kernel(dtype_x, dtype_y, dtype_out): - if (dtype_x == np.complex64) or (dtype_x == np.complex128): - inner_map="conj(x[i])*y[i]" + +def weighted_inner(self, b, w): + if isinstance(self.data, cp.complex64) or isinstance(self.data == cp.complex128): + return cp.conj(self.data) * b / w else: - inner_map="x[i]*y[i]" - return LowerLatencyReductionKernel(dtype_out, - neutral="0", - arguments="%(tp_x)s *x, %(tp_y)s *y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - }, - reduce_expr="a+b", - map_expr=inner_map, - name="inner") + return self.data * b / w def inner(self, b): - a = self.data - dtype_out = _get_common_dtype(a,b) - krnl = get_inner_kernel(a.dtype, b.dtype, dtype_out) - return krnl(a, b).get().max() - -vdot = inner - -def weighted_inner(self, b, w): - if w is None: - return self.inner(b) - a = self.data - dtype_out = _get_common_dtype(a, b) - krnl = get_weighted_inner_kernel(a.dtype, b.dtype, w.dtype, dtype_out) - return krnl(a, b, w).get().max() + if isinstance(self.data, cp.complex64) or isinstance(self.data == cp.complex128): + return cp.conj(self.data) * b + else: + return self.data * b + + +#@context_dependent_memoize +#def get_weighted_inner_kernel(dtype_x, dtype_y, dtype_w, dtype_out): +# if (dtype_x == np.complex64) or (dtype_x == np.complex128): +# inner_map="conj(x[i])*y[i]/w[i]" +# else: +# inner_map="x[i]*y[i]/w[i]" +# return LowerLatencyReductionKernel(dtype_out, +# neutral="0", +# arguments="%(tp_x)s *x, %(tp_y)s *y, %(tp_w)s *w" % { +# "tp_x": dtype_to_ctype(dtype_x), +# "tp_y": dtype_to_ctype(dtype_y), +# "tp_w": dtype_to_ctype(dtype_w), +# }, +# reduce_expr="a+b", +# map_expr=inner_map, +# name="weighted_inner") + +#@context_dependent_memoize +#def get_inner_kernel(dtype_x, dtype_y, dtype_out): +# if (dtype_x == np.complex64) or (dtype_x == np.complex128): +# inner_map="conj(x[i])*y[i]" +# else: +# inner_map="x[i]*y[i]" +# return LowerLatencyReductionKernel(dtype_out, +# neutral="0", +# arguments="%(tp_x)s *x, %(tp_y)s *y" % { +# "tp_x": dtype_to_ctype(dtype_x), +# "tp_y": dtype_to_ctype(dtype_y), +# }, +# reduce_expr="a+b", +# map_expr=inner_map, +# name="inner") +# +#def inner(self, b): +# a = self.data +# dtype_out = _get_common_dtype(a,b) +# krnl = get_inner_kernel(a.dtype, b.dtype, dtype_out) +# return krnl(a, b).get().max() +# +#vdot = inner +# +#def weighted_inner(self, b, w): +# if w is None: +# return self.inner(b) +# a = self.data +# dtype_out = _get_common_dtype(a, b) +# krnl = get_weighted_inner_kernel(a.dtype, b.dtype, w.dtype, dtype_out) +# return krnl(a, b, w).get().max() # Define PYCUDA MAXLOC for both single and double precission ################## -maxloc_preamble = """ - - struct MAXLOCN{ - TTYPE max; - LTYPE loc; - - __device__ - MAXLOCN(){} - - __device__ - MAXLOCN(MAXLOCN const &src): max(src.max), loc(src.loc){} - __device__ - MAXLOCN(MAXLOCN const volatile &src): max(src.max), loc(src.loc){} - - __device__ - MAXLOCN volatile &operator=( MAXLOCN 
-            max = src.max;
-            loc = src.loc;
-            return *this;
-        }
-    };
-
-    __device__
-    MAXLOCN maxloc_red(MAXLOCN a, MAXLOCN b){
-        if (a.max > b.max)
-            return a;
-        else
-            return b;
-    }
-
-    __device__
-    MAXLOCN maxloc_start(){
-        MAXLOCN t;
-        t.max=0;
-        t.loc=0;
-        return t;
-    }
-
-    __device__
-    MAXLOCN maxloc_map(TTYPE val, LTYPE loc){
-        MAXLOCN t;
-        t.max = val;
-        t.loc = loc;
-        return t;
-    }
-
-    """
+#maxloc_preamble = """
+#
+#    struct MAXLOCN{
+#        TTYPE max;
+#        LTYPE loc;
+#
+#        __device__
+#        MAXLOCN(){}
+#
+#        __device__
+#        MAXLOCN(MAXLOCN const &src): max(src.max), loc(src.loc){}
+#        __device__
+#        MAXLOCN(MAXLOCN const volatile &src): max(src.max), loc(src.loc){}
+#
+#        __device__
+#        MAXLOCN volatile &operator=( MAXLOCN const &src) volatile{
+#            max = src.max;
+#            loc = src.loc;
+#            return *this;
+#        }
+#    };
+#
+#    __device__
+#    MAXLOCN maxloc_red(MAXLOCN a, MAXLOCN b){
+#        if (a.max > b.max)
+#            return a;
+#        else
+#            return b;
+#    }
+#
+#    __device__
+#    MAXLOCN maxloc_start(){
+#        MAXLOCN t;
+#        t.max=0;
+#        t.loc=0;
+#        return t;
+#    }
+#
+#    __device__
+#    MAXLOCN maxloc_map(TTYPE val, LTYPE loc){
+#        MAXLOCN t;
+#        t.max = val;
+#        t.loc = loc;
+#        return t;
+#    }
+#
+#    """
 
-maxloc_preamble_single = """
-    #define MAXLOCN maxlocs
-    #define TTYPE float
-    #define LTYPE int
-""" + maxloc_preamble
-
-maxloc_preamble_double = """
-    #define MAXLOCN maxlocd
-    #define TTYPE double
-    #define LTYPE long
-""" + maxloc_preamble
+#maxloc_preamble_single = """
+#    #define MAXLOCN maxlocs
+#    #define TTYPE float
+#    #define LTYPE int
+#""" + maxloc_preamble
+
+#maxloc_preamble_double = """
+#    #define MAXLOCN maxlocd
+#    #define TTYPE double
+#    #define LTYPE long
+#""" + maxloc_preamble
 
-maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
-maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])
+#maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
+#maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])
 
-maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single)
-maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double)
+#maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single)
+#maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double)
 
-mls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
-        arguments="float *x", preamble=maxloc_preamble_single)
+#mls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
+#        arguments="float *x", preamble=maxloc_preamble_single)
 
-mld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
-        arguments="double *x", preamble=maxloc_preamble_double)
+#mld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
+#        arguments="double *x", preamble=maxloc_preamble_double)
 
-max_loc_map = {'single':mls,'double':mld}
+#max_loc_map = {'single':mls,'double':mld}
 
-amls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
-        arguments="float *x", preamble=maxloc_preamble_single)
+#amls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", +# arguments="float *x", preamble=maxloc_preamble_single) -amld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()", - reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", - arguments="double *x", preamble=maxloc_preamble_double) +#amld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()", +# reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", +# arguments="double *x", preamble=maxloc_preamble_double) -amlsc = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()", - reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", - arguments="pycuda::complex *x", preamble=maxloc_preamble_single) +#amlsc = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()", +# reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", +# arguments="pycuda::complex *x", preamble=maxloc_preamble_single) -amldc = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()", - reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", - arguments="pycuda::complex *x", preamble=maxloc_preamble_double) +#amldc = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()", +# reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)", +# arguments="pycuda::complex *x", preamble=maxloc_preamble_double) -abs_max_loc_map = {'single':{ 'real':amls, 'complex':amlsc }, 'double':{ 'real':amld, 'complex':amldc }} +#abs_max_loc_map = {'single':{ 'real':amls, 'complex':amlsc }, 'double':{ 'real':amld, 'complex':amldc }} def zeros(length, dtype=np.float64): - result = GPUArray(length, dtype=dtype) - nwords = result.nbytes / 4 - pycuda.driver.memset_d32(result.gpudata, 0, nwords) - return result + return cp.zeros(length, dtype=dtype) def ptr(self): - return self._data.ptr + return self._data.data.ptr def dot(self, other): - return pycuda.gpuarray.dot(self._data,other).get().max() + return cp.dot(self._data,other).item() def min(self): - return pycuda.gpuarray.min(self._data).get().max() + return cp.min(self._data).item() def abs_max_loc(self): - maxloc = abs_max_loc_map[self.precision][self.kind](self._data) - maxloc = maxloc.get() - return float(maxloc['max']),int(maxloc['loc']) + return cp.max(cp.abs(self._data)).item(), int(cp.argmax(self._data)) def cumsum(self): - tmp = self.data*1 - return icumsum(tmp) + return cp.cumsum(self.data) def max(self): - return pycuda.gpuarray.max(self._data).get().max() + return cp.max(self._data).item() def max_loc(self): - maxloc = max_loc_map[self.precision](self._data) - maxloc = maxloc.get() - return float(maxloc['max']),int(maxloc['loc']) + return cp.max(self._data).item(), int(cp.argmax(self._data)) def take(self, indices): - if not isinstance(indices, pycuda.gpuarray.GPUArray): - indices = pycuda.gpuarray.to_gpu(indices) - return pycuda.gpuarray.take(self.data, indices) + indices = cp.asarray(indices) + return cp.take(self.data, indices) def numpy(self): - return self._data.get() + return cp.asnumpy(self._data) def _copy(self, self_ref, other_ref): if (len(other_ref) <= len(self_ref)) : - from pycuda.elementwise import get_copy_kernel - func = get_copy_kernel(self.dtype, other_ref.dtype) - func.prepared_async_call(self_ref._grid, self_ref._block, None, - self_ref.gpudata, other_ref.gpudata, - self_ref.mem_size) + cp.copyto(other_ref, self_ref) else: raise RuntimeError("The arrays must the same length") @@ -336,26 +339,24 @@ 
     return self._data.get()[index]
 
 def sum(self):
-    return pycuda.gpuarray.sum(self._data).get().max()
+    return cp.sum(self._data).item()
 
 def clear(self):
-    n32 = self.data.nbytes / 4
-    pycuda.driver.memset_d32(self.data.gpudata, 0, n32)
+    # Zero the array in place; cp.zeros_like has no `out` argument.
+    self.data.fill(0)
 
 def _scheme_matches_base_array(array):
-    if isinstance(array, pycuda.gpuarray.GPUArray):
+    if isinstance(array, cp.ndarray):
         return True
     else:
         return False
 
 def _copy_base_array(array):
-    data = pycuda.gpuarray.GPUArray((array.size), array.dtype)
-    if len(array) > 0:
-        pycuda.driver.memcpy_dtod(data.gpudata, array.gpudata, array.nbytes)
+    data = cp.empty_like(array)
+    cp.copyto(data, array)
     return data
 
 def _to_device(array):
-    return pycuda.gpuarray.to_gpu(array)
+    return cp.array(array)
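
Reviewer note, not part of the patch: below is a minimal host-side
sanity check for the reductions this patch ports from PyCUDA kernels to
CuPy calls, assuming a working CuPy installation. The file name
check_port.py and the tolerances are arbitrary choices for illustration;
the script exercises the same cp.vdot / cp.sum / cp.argmax patterns the
new code uses and compares them against NumPy.

# check_port.py -- hypothetical helper, not part of the patch.
import numpy as np
import cupy as cp

n = 1 << 12
a = (np.random.randn(n) + 1j * np.random.randn(n)).astype(np.complex64)
b = (np.random.randn(n) + 1j * np.random.randn(n)).astype(np.complex64)
w = (np.abs(np.random.randn(n)) + 0.1).astype(np.float32)

ga, gb, gw = cp.asarray(a), cp.asarray(b), cp.asarray(w)

# inner(): cp.vdot conjugates its first argument, matching the old
# conj(x[i])*y[i] reduction.
assert np.allclose(cp.vdot(ga, gb).item(), np.vdot(a, b),
                   rtol=1e-3, atol=1e-3)

# weighted_inner(): sum(conj(a) * b / w) reduced to a scalar.
got = cp.sum(cp.conj(ga) * gb / gw).item()
want = np.sum(np.conj(a) * b / w)
assert np.allclose(got, want, rtol=1e-3, atol=1e-3)

# abs_max_loc(): the argmax must be taken over abs(data), not data.
tmp = cp.abs(ga)
ind = int(cp.argmax(tmp))
assert ind == int(np.argmax(np.abs(a)))
assert np.isclose(tmp[ind].item(), np.abs(a).max(), rtol=1e-5)

print("CuPy port sanity checks passed")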