diff --git a/pycbc/fft/core.py b/pycbc/fft/core.py
index 92e3cdcb229..3584efaaf86 100644
--- a/pycbc/fft/core.py
+++ b/pycbc/fft/core.py
@@ -46,12 +46,14 @@ def _list_available(possible_list, possible_dict):
     available_list = []
     available_dict = {}
     for backend in possible_list:
         try:
             mod = __import__('pycbc.fft.' + possible_dict[backend],
                              fromlist = ['pycbc.fft'])
             available_dict.update({backend:mod})
             available_list.append(backend)
-        except (ImportError, OSError):
-            pass
+        except (ImportError, OSError):
+            # The backend cannot be used on this system; skip it quietly.
+            # (Re-raise here temporarily when debugging backend imports.)
+            pass
     return available_list, available_dict
 # The main purpose of the top-level module is to present a
diff --git a/pycbc/psd/variation.py b/pycbc/psd/variation.py
index 4f8dfb07c20..2c1100b8459 100644
--- a/pycbc/psd/variation.py
+++ b/pycbc/psd/variation.py
@@ -165,7 +165,7 @@ def calc_filt_psd_variation(strain, segment, short_segment, psd_long_segment,
     # Create a bandpass filter between low_freq and high_freq
     filt = sig.firwin(4 * srate, [low_freq, high_freq], pass_zero=False,
                       window='hann', fs=srate)
-    filt.resize(int(psd_duration * srate))
+    filt.resize(int(psd_duration * srate), refcheck=False)
     # Fourier transform the filter and take the absolute value to get
     # rid of the phase.
     filt = abs(rfft(filt))
diff --git a/pycbc/scheme.py b/pycbc/scheme.py
index e6eb6f87f86..65ffe252c2c 100644
--- a/pycbc/scheme.py
+++ b/pycbc/scheme.py
@@ -212,10 +212,10 @@ def _scheming_function(*args, **kwds):
 
             return schemed_fn(*args, **kwds)
 
-        err = """Failed to find implementation of (%s)
-              for %s scheme." % (str(fn), current_prefix())"""
+        err = (f"Failed to find implementation of {func} "
+               f"for {current_prefix()} scheme. ")
         for emsg in exc_errors:
-            err += print(emsg)
+            err += repr(emsg)
         raise RuntimeError(err)
 
     return _scheming_function
diff --git a/pycbc/types/array_cuda.py b/pycbc/types/array_cuda.py
index 45a89b73456..2b37cffabad 100644
--- a/pycbc/types/array_cuda.py
+++ b/pycbc/types/array_cuda.py
@@ -12,42 +12,29 @@
 # You should have received a copy of the GNU General Public License along
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-
-#
-# =============================================================================
-#
-# Preamble
-#
-# =============================================================================
-#
-"""Pycuda based
 """
-import pycuda.driver
-from pycuda.elementwise import ElementwiseKernel
-from pycuda.reduction import ReductionKernel
-from pycuda.tools import get_or_register_dtype
-from pycuda.tools import context_dependent_memoize
-from pycuda.tools import dtype_to_ctype
+CuPy based
+"""
+#import pycuda.driver
+#from pycuda.elementwise import ElementwiseKernel
+#from pycuda.reduction import ReductionKernel
+#from pycuda.tools import get_or_register_dtype
+#from pycuda.tools import context_dependent_memoize
+#from pycuda.tools import dtype_to_ctype
 from pytools import match_precision
-from pycuda.gpuarray import _get_common_dtype, empty, GPUArray
-import pycuda.gpuarray
-from pycuda.scan import InclusiveScanKernel
+#from pycuda.gpuarray import _get_common_dtype, empty, GPUArray
+#import pycuda.gpuarray
+#from pycuda.scan import InclusiveScanKernel
 import numpy as np
+import cupy as cp
 
 include_complex = """
 #include <pycuda-complex.hpp>
 """
 
-@context_dependent_memoize
-def get_cumsum_kernel(dtype):
-    return InclusiveScanKernel(dtype, "a+b", preamble=include_complex)
-
-def icumsum(vec):
-    krnl = get_cumsum_kernel(vec.dtype)
-    return krnl(vec)
-
-@context_dependent_memoize
+#@context_dependent_memoize
+# NOTE: call_prepare is only used by the commented-out pycuda reduction
+# kernels below and will fail if called, since the pycuda imports it relies
+# on are disabled.
 def call_prepare(self, sz, allocator):
     MAX_BLOCK_COUNT = 1024
     SMALL_SEQ_COUNT = 4
@@ -71,62 +56,83 @@ def call_prepare(self, sz, allocator):
 
     return result, block_count, seq_count, grid_size, block_size
 
-class LowerLatencyReductionKernel(ReductionKernel):
-    def __init__(self, dtype_out,
-            neutral, reduce_expr, map_expr=None, arguments=None,
-            name="reduce_kernel", keep=False, options=None, preamble=""):
-        ReductionKernel.__init__(self, dtype_out,
-            neutral, reduce_expr, map_expr, arguments,
-            name, keep, options, preamble)
-
-        self.shared_size=self.block_size*self.dtype_out.itemsize
-
-
-    def __call__(self, *args, **kwargs):
-        f = self.stage1_func
-        s1_invocation_args = []
-        for arg in args:
-            s1_invocation_args.append(arg.gpudata)
-        sz = args[0].size
-
-        result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
-
-        f(grid_size, block_size, None,
-            *([result.gpudata]+s1_invocation_args+[seq_count, sz]),
-            shared_size=self.shared_size)
-
+#class LowerLatencyReductionKernel(ReductionKernel):
+#    def __init__(self, dtype_out,
+#            neutral, reduce_expr, map_expr=None, arguments=None,
+#            name="reduce_kernel", keep=False, options=None, preamble=""):
+#        ReductionKernel.__init__(self, dtype_out,
+#            neutral, reduce_expr, map_expr, arguments,
+#            name, keep, options, preamble)
+#
+#        self.shared_size=self.block_size*self.dtype_out.itemsize
+#
+#
+#    def __call__(self, *args, **kwargs):
+#        f = self.stage1_func
+#        s1_invocation_args = []
+#        for arg in args:
+#            s1_invocation_args.append(arg.gpudata)
+#        sz = args[0].size
+#
+#        result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
+#
+#        f(grid_size, block_size, None,
+#            *([result.gpudata]+s1_invocation_args+[seq_count, sz]),
+#            shared_size=self.shared_size)
+#
+#        while True:
+#            f = self.stage2_func
+#            sz = result.size
+#            result2 = result
+#            result, block_count, seq_count, grid_size, block_size = call_prepare(self, sz, args[0].allocator)
+#
+#            f(grid_size, block_size, None,
+#                *([result.gpudata, result2.gpudata]+s1_invocation_args+[seq_count, sz]),
+#                shared_size=self.shared_size)
+#
+#            if block_count == 1:
+#                return result
+#
+#@context_dependent_memoize
+#def get_norm_kernel(dtype_x, dtype_out):
+#    return ElementwiseKernel(
+#            "%(tp_x)s *x, %(tp_z)s *z" % {
+#                "tp_x": dtype_to_ctype(dtype_x),
+#                "tp_z": dtype_to_ctype(dtype_out),
+#                },
+#            "z[i] = norm(x[i])",
+#            "normalize")
+
+#def squared_norm(self):
+#    a = self.data
+#    dtype_out = match_precision(np.dtype('float64'), a.dtype)
+#    out = a._new_like_me(dtype=dtype_out)
+#    krnl = get_norm_kernel(a.dtype, dtype_out)
+#    krnl(a, out)
+#    return out
 
-@context_dependent_memoize
 def get_norm_kernel(dtype_x, dtype_out):
-    return ElementwiseKernel(
-            "%(tp_x)s *x, %(tp_z)s *z" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_z": dtype_to_ctype(dtype_out),
-                },
-            "z[i] = norm(x[i])",
-            "normalize")
+    # Element-wise kernel computing the squared magnitude of each element.
+    # CuPy declares kernel arguments as "type name" (no pointers) and applies
+    # the operation element-wise, so no explicit indexing is needed; norm()
+    # is provided by CuPy's bundled complex-number header.
+    return cp.ElementwiseKernel(
+        in_params=f"{cp.dtype(dtype_x).name} x",
+        out_params=f"{cp.dtype(dtype_out).name} z",
+        operation="z = norm(x)",
+        name="normalize"
+    )
 
 def squared_norm(self):
     a = self.data
     dtype_out = match_precision(np.dtype('float64'), a.dtype)
-    out = a._new_like_me(dtype=dtype_out)
+    out = cp.empty_like(a, dtype=dtype_out)
     krnl = get_norm_kernel(a.dtype, dtype_out)
     krnl(a, out)
-    return out
+    return out
+
 
 # FIXME: Write me!
 #def multiply_and_add(self, other, mult_fac):
@@ -135,200 +138,214 @@ def squared_norm(self):
 #    """ Return other multiplied by mult_fac and with self added.
 #    Self will be modified in place. This requires all inputs to be of the same
 #    precision.
 #    """
 
-@context_dependent_memoize
-def get_weighted_inner_kernel(dtype_x, dtype_y, dtype_w, dtype_out):
-    if (dtype_x == np.complex64) or (dtype_x == np.complex128):
-        inner_map="conj(x[i])*y[i]/w[i]"
-    else:
-        inner_map="x[i]*y[i]/w[i]"
-    return LowerLatencyReductionKernel(dtype_out,
-            neutral="0",
-            arguments="%(tp_x)s *x, %(tp_y)s *y, %(tp_w)s *w" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_y": dtype_to_ctype(dtype_y),
-                "tp_w": dtype_to_ctype(dtype_w),
-                },
-            reduce_expr="a+b",
-            map_expr=inner_map,
-            name="weighted_inner")
-
-@context_dependent_memoize
-def get_inner_kernel(dtype_x, dtype_y, dtype_out):
-    if (dtype_x == np.complex64) or (dtype_x == np.complex128):
-        inner_map="conj(x[i])*y[i]"
-    else:
-        inner_map="x[i]*y[i]"
-    return LowerLatencyReductionKernel(dtype_out,
-            neutral="0",
-            arguments="%(tp_x)s *x, %(tp_y)s *y" % {
-                "tp_x": dtype_to_ctype(dtype_x),
-                "tp_y": dtype_to_ctype(dtype_y),
-                },
-            reduce_expr="a+b",
-            map_expr=inner_map,
-            name="inner")
+
+def weighted_inner(self, b, w):
+    # Weighted inner product <a, b/w>, reduced to a scalar on the GPU as the
+    # old pycuda reduction kernel did. Complex data is detected from the
+    # array dtype; isinstance() checks against scalar types never match a
+    # CuPy ndarray.
+    if w is None:
+        return self.inner(b)
+    if self.data.dtype.kind == 'c':
+        return cp.sum(cp.conj(self.data) * b / w)
+    else:
+        return cp.sum(self.data * b / w)
 
 def inner(self, b):
-    a = self.data
-    dtype_out = _get_common_dtype(a,b)
-    krnl = get_inner_kernel(a.dtype, b.dtype, dtype_out)
-    return krnl(a, b).get().max()
-
-vdot = inner
-
-def weighted_inner(self, b, w):
-    if w is None:
-        return self.inner(b)
-    a = self.data
-    dtype_out = _get_common_dtype(a, b)
-    krnl = get_weighted_inner_kernel(a.dtype, b.dtype, w.dtype, dtype_out)
-    return krnl(a, b, w).get().max()
+    # Inner product <a, b>, conjugating the left argument for complex data.
+    if self.data.dtype.kind == 'c':
+        return cp.sum(cp.conj(self.data) * b)
+    else:
+        return cp.sum(self.data * b)
+
+vdot = inner
+
+#@context_dependent_memoize
+#def get_weighted_inner_kernel(dtype_x, dtype_y, dtype_w, dtype_out):
+#    if (dtype_x == np.complex64) or (dtype_x == np.complex128):
+#        inner_map="conj(x[i])*y[i]/w[i]"
+#    else:
+#        inner_map="x[i]*y[i]/w[i]"
+#    return LowerLatencyReductionKernel(dtype_out,
+#            neutral="0",
+#            arguments="%(tp_x)s *x, %(tp_y)s *y, %(tp_w)s *w" % {
+#                "tp_x": dtype_to_ctype(dtype_x),
+#                "tp_y": dtype_to_ctype(dtype_y),
+#                "tp_w": dtype_to_ctype(dtype_w),
+#                },
+#            reduce_expr="a+b",
+#            map_expr=inner_map,
+#            name="weighted_inner")
+
+#@context_dependent_memoize
+#def get_inner_kernel(dtype_x, dtype_y, dtype_out):
+#    if (dtype_x == np.complex64) or (dtype_x == np.complex128):
+#        inner_map="conj(x[i])*y[i]"
+#    else:
+#        inner_map="x[i]*y[i]"
+#    return LowerLatencyReductionKernel(dtype_out,
+#            neutral="0",
+#            arguments="%(tp_x)s *x, %(tp_y)s *y" % {
+#                "tp_x": dtype_to_ctype(dtype_x),
+#                "tp_y": dtype_to_ctype(dtype_y),
+#                },
+#            reduce_expr="a+b",
+#            map_expr=inner_map,
+#            name="inner")
+#
+#def inner(self, b):
+#    a = self.data
+#    dtype_out = _get_common_dtype(a,b)
+#    krnl = get_inner_kernel(a.dtype, b.dtype, dtype_out)
+#    return krnl(a, b).get().max()
+#
+#vdot = inner
+#
+#def weighted_inner(self, b, w):
+#    if w is None:
+#        return self.inner(b)
+#    a = self.data
+#    dtype_out = _get_common_dtype(a, b)
+#    krnl = get_weighted_inner_kernel(a.dtype, b.dtype, w.dtype, dtype_out)
+#    return krnl(a, b, w).get().max()
 
 # Define PYCUDA MAXLOC for both single and double precission ##################
 
-maxloc_preamble = """
-
-    struct MAXLOCN{
-        TTYPE max;
-        LTYPE loc;
-
-        __device__
-        MAXLOCN(){}
-
-        __device__
-        MAXLOCN(MAXLOCN const &src): max(src.max), loc(src.loc){}
-        __device__
-        MAXLOCN(MAXLOCN const volatile &src): max(src.max), loc(src.loc){}
-
-        __device__
-        MAXLOCN volatile &operator=( MAXLOCN const &src) volatile{
-            max = src.max;
-            loc = src.loc;
-            return *this;
-        }
-    };
-
-    __device__
-    MAXLOCN maxloc_red(MAXLOCN a, MAXLOCN b){
-        if (a.max > b.max)
-            return a;
-        else
-            return b;
-    }
-
-    __device__
-    MAXLOCN maxloc_start(){
-        MAXLOCN t;
-        t.max=0;
-        t.loc=0;
-        return t;
-    }
-
-    __device__
-    MAXLOCN maxloc_map(TTYPE val, LTYPE loc){
-        MAXLOCN t;
-        t.max = val;
-        t.loc = loc;
-        return t;
-    }
-
-    """
+#maxloc_preamble = """
+#
+#    struct MAXLOCN{
+#        TTYPE max;
+#        LTYPE loc;
+#
+#        __device__
+#        MAXLOCN(){}
+#
+#        __device__
+#        MAXLOCN(MAXLOCN const &src): max(src.max), loc(src.loc){}
+#        __device__
+#        MAXLOCN(MAXLOCN const volatile &src): max(src.max), loc(src.loc){}
+#
+#        __device__
+#        MAXLOCN volatile &operator=( MAXLOCN const &src) volatile{
+#            max = src.max;
+#            loc = src.loc;
+#            return *this;
+#        }
+#    };
+#
+#    __device__
+#    MAXLOCN maxloc_red(MAXLOCN a, MAXLOCN b){
+#        if (a.max > b.max)
+#            return a;
+#        else
+#            return b;
+#    }
+#
+#    __device__
+#    MAXLOCN maxloc_start(){
+#        MAXLOCN t;
+#        t.max=0;
+#        t.loc=0;
+#        return t;
+#    }
+#
+#    __device__
+#    MAXLOCN maxloc_map(TTYPE val, LTYPE loc){
+#        MAXLOCN t;
+#        t.max = val;
+#        t.loc = loc;
+#        return t;
+#    }
+#
+#    """
 
-maxloc_preamble_single = """
-    #define MAXLOCN maxlocs
-    #define TTYPE float
-    #define LTYPE int
-""" + maxloc_preamble
-
-maxloc_preamble_double = """
-    #define MAXLOCN maxlocd
-    #define TTYPE double
-    #define LTYPE long
-""" + maxloc_preamble
+#maxloc_preamble_single = """
+#    #define MAXLOCN maxlocs
+#    #define TTYPE float
+#    #define LTYPE int
+#""" + maxloc_preamble
+
+#maxloc_preamble_double = """
+#    #define MAXLOCN maxlocd
+#    #define TTYPE double
+#    #define LTYPE long
+#""" + maxloc_preamble
 
-maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
-maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])
+#maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
+#maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])
 
-maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single)
-maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double)
+#maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single)
+#maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double)
 
-mls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
-        arguments="float *x", preamble=maxloc_preamble_single)
+#mls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
+#        arguments="float *x", preamble=maxloc_preamble_single)
 
-mld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
-        arguments="double *x", preamble=maxloc_preamble_double)
+#mld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
+#        arguments="double *x", preamble=maxloc_preamble_double)
 
-max_loc_map = {'single':mls,'double':mld}
+#max_loc_map = {'single':mls,'double':mld}
 
-amls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
-        arguments="float *x", preamble=maxloc_preamble_single)
+#amls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
+#        arguments="float *x", preamble=maxloc_preamble_single)
 
-amld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
-        arguments="double *x", preamble=maxloc_preamble_double)
+#amld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
+#        arguments="double *x", preamble=maxloc_preamble_double)
 
-amlsc = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
-        arguments="pycuda::complex<float> *x", preamble=maxloc_preamble_single)
+#amlsc = LowerLatencyReductionKernel(maxloc_dtype_single, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
+#        arguments="pycuda::complex<float> *x", preamble=maxloc_preamble_single)
 
-amldc = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
-        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
-        arguments="pycuda::complex<double> *x", preamble=maxloc_preamble_double)
+#amldc = LowerLatencyReductionKernel(maxloc_dtype_double, neutral = "maxloc_start()",
+#        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(abs(x[i]), i)",
+#        arguments="pycuda::complex<double> *x", preamble=maxloc_preamble_double)
 
-abs_max_loc_map = {'single':{ 'real':amls, 'complex':amlsc }, 'double':{ 'real':amld, 'complex':amldc }}
+#abs_max_loc_map = {'single':{ 'real':amls, 'complex':amlsc }, 'double':{ 'real':amld, 'complex':amldc }}
 
 def zeros(length, dtype=np.float64):
-    result = GPUArray(length, dtype=dtype)
-    nwords = result.nbytes / 4
-    pycuda.driver.memset_d32(result.gpudata, 0, nwords)
-    return result
+    return cp.zeros(length, dtype=dtype)
 
 def ptr(self):
-    return self._data.ptr
+    # CuPy exposes the raw device pointer as arr.data.ptr.
+    return self._data.data.ptr
 
 def dot(self, other):
-    return pycuda.gpuarray.dot(self._data,other).get().max()
+    return cp.dot(self._data, other).item()
 
 def min(self):
-    return pycuda.gpuarray.min(self._data).get().max()
+    return cp.min(self._data).item()
 
 def abs_max_loc(self):
-    maxloc = abs_max_loc_map[self.precision][self.kind](self._data)
-    maxloc = maxloc.get()
-    return float(maxloc['max']),int(maxloc['loc'])
+    # The location must be the argmax of abs(data), not of the raw
+    # (possibly negative or complex) values.
+    tmp = cp.abs(self._data)
+    return tmp.max().item(), int(tmp.argmax())
 
 def cumsum(self):
-    tmp = self.data*1
-    return icumsum(tmp)
+    return cp.cumsum(self.data)
 
 def max(self):
-    return pycuda.gpuarray.max(self._data).get().max()
+    return cp.max(self._data).item()
 
 def max_loc(self):
-    maxloc = max_loc_map[self.precision](self._data)
-    maxloc = maxloc.get()
-    return float(maxloc['max']),int(maxloc['loc'])
+    return cp.max(self._data).item(), int(cp.argmax(self._data))
 
 def take(self, indices):
-    if not isinstance(indices, pycuda.gpuarray.GPUArray):
-        indices = pycuda.gpuarray.to_gpu(indices)
-    return pycuda.gpuarray.take(self.data, indices)
+    indices = cp.asarray(indices)
+    return cp.take(self.data, indices)
 
 def numpy(self):
-    return self._data.get()
+    return cp.asnumpy(self._data)
 
 def _copy(self, self_ref, other_ref):
     if (len(other_ref) <= len(self_ref)) :
-        from pycuda.elementwise import get_copy_kernel
-        func = get_copy_kernel(self.dtype, other_ref.dtype)
-        func.prepared_async_call(self_ref._grid, self_ref._block, None,
-                self_ref.gpudata, other_ref.gpudata,
-                self_ref.mem_size)
+        # Copy other_ref into self_ref (the direction the pycuda kernel
+        # used); slicing handles the case where other_ref is shorter.
+        self_ref[:len(other_ref)] = other_ref
     else:
         raise RuntimeError("The arrays must the same length")
 
@@ -336,26 +339,26 @@ def _getvalue(self, index):
     return self._data.get()[index]
 
 def sum(self):
-    return pycuda.gpuarray.sum(self._data).get().max()
+    return cp.sum(self._data).item()
 
 def clear(self):
-    n32 = self.data.nbytes / 4
-    pycuda.driver.memset_d32(self.data.gpudata, 0, n32)
+    # Zero the buffer in place; cp.zeros_like() would allocate a new array
+    # and does not accept an out= argument.
+    self.data.fill(0)
 
 def _scheme_matches_base_array(array):
-    if isinstance(array, pycuda.gpuarray.GPUArray):
+    if isinstance(array, cp.ndarray):
         return True
     else:
         return False
 
 def _copy_base_array(array):
-    data = pycuda.gpuarray.GPUArray((array.size), array.dtype)
-    if len(array) > 0:
-        pycuda.driver.memcpy_dtod(data.gpudata, array.gpudata, array.nbytes)
+    data = cp.empty_like(array)
+    cp.copyto(data, array)
     return data
 
 def _to_device(array):
-    return pycuda.gpuarray.to_gpu(array)
+    return cp.array(array)
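
As a quick sanity check of the CuPy port above, the module-level helpers can be
exercised directly against NumPy references on a machine with cupy installed.
This is a hypothetical sketch, not part of the patch: "Holder" is a stand-in
name for a PyCBC Array, assuming only that the helpers touch a .data attribute
holding the GPU buffer.

    # Hypothetical smoke test for the ported reductions; assumes cupy is
    # installed and that pycbc.types.array_cuda imports cleanly.
    import numpy as np
    import cupy as cp
    from pycbc.types import array_cuda

    class Holder:
        # Minimal stand-in for a PyCBC Array: only .data is used by the
        # module-level helpers exercised here.
        def __init__(self, values):
            self.data = cp.asarray(values)

    rng = np.random.default_rng(0)
    a = (rng.standard_normal(256) + 1j * rng.standard_normal(256)).astype(np.complex64)
    b = (rng.standard_normal(256) + 1j * rng.standard_normal(256)).astype(np.complex64)
    w = (np.abs(rng.standard_normal(256)) + 0.1).astype(np.float32)

    x = Holder(a)
    # Each ported reduction should agree with its NumPy reference result.
    assert np.isclose(array_cuda.inner(x, cp.asarray(b)).item(),
                      np.sum(np.conj(a) * b), rtol=1e-3)
    assert np.isclose(array_cuda.weighted_inner(x, cp.asarray(b), cp.asarray(w)).item(),
                      np.sum(np.conj(a) * b / w), rtol=1e-3)
    assert np.allclose(array_cuda.squared_norm(x).get(), np.abs(a) ** 2, rtol=1e-5)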